Compare commits

...

396 Commits

Author SHA1 Message Date
Dmitry Kovalev
422da555b0 BITSTREAM - RESTORING BILINEAR INTERPOLATION FILTER SUPPORT
Adding appropriate test vector vp90-2-06-bilinear.webm.

(cherry picked from commit 68a3e4a888)

Change-Id: I10feb6ad2fcb1c2e14e51f550d1a8869aeaf6488
2013-10-03 14:09:32 -07:00
Adrian Grange
29611db9f8 Modified resize unit test to output test vector
Modified the resize unit test so that it optionally
writes the encoded bitstream to file. The macro
WRITE_COMPRESSED_STREAM should be set to 1 to enable
output of the test bitstream; it is set to 0 by default.

(cherry picked from commit 88c8ff2508)

Change-Id: I72f58f9ff8559aacd1b67422c36b9745f4662fee
2013-10-03 14:09:28 -07:00
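
A minimal sketch of such a compile-time switch (only the WRITE_COMPRESSED_STREAM name comes from the message above; the helper function is illustrative, not the actual test source):

  #include <stdio.h>

  /* set to 1 to dump the encoded bitstream for use as a test vector */
  #define WRITE_COMPRESSED_STREAM 0

  static void maybe_write_frame(FILE *out, const void *buf, size_t len) {
  #if WRITE_COMPRESSED_STREAM
    fwrite(buf, 1, len, out);
  #else
    (void)out; (void)buf; (void)len;  /* compiled out by default */
  #endif
  }
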
hkuang
cbf394574d Merge "Speed up iht8x8 by rearranging instructions. Speed improves from 282% to 302% faster based on assembly-perf." 2013-09-16 14:39:45 -07:00
hkuang
23e1a29fc7 Speed up iht8x8 by rearranging instructions.
Speed improves from 282% to 302% faster based on assembly-perf.

Change-Id: I08c5c1a542d43361611198f750b725e4303d19e2
2013-09-16 14:23:26 -07:00
Yaowu Xu
eeae6f946d fix a problem where an invalid mv was used in search
The commit adds a reset of pred_mv at the beginning of each SB64x64
partition mv search, and limits the use of pred_mv to cases where the
search on the largest partition is already done. This fixes a crash
in the speed 1/2 encoder where an invalid mv was used in mv search.

Change-Id: I39010177da76d054e3c90b7899a44feb2e3a5b1b
2013-09-16 12:49:27 -07:00
James Zern
c73e4412b3 Merge "Revert "Improved 8t filters"" 2013-09-13 16:06:27 -07:00
Yaowu Xu
9ae985b23a Merge "Minor adjustment in unit tests" 2013-09-13 15:20:24 -07:00
James Zern
2d58761993 Revert "Improved 8t filters"
This is incompatible with most toolchains other than gcc.

Revert "Deleted #include <inttypes.h>"

This reverts commit 4d018be950.

This reverts commit d22a504d11.

Change-Id: I1751dc6831f4395ee064e6748281418e967e1dcf
2013-09-13 15:13:06 -07:00
Jingning Han
e8a967d960 Merge "Adaptive motion search control" 2013-09-13 14:43:23 -07:00
Jingning Han
c4826c5941 Adaptive motion search control
This commit enables adaptive constraint on motion search range for
smaller partitions, given the motion vectors of collocated larger
partition as a candidate initial search point.

It makes the speed 0 runtime of bus at CIF and 2000 kbps go from
167s down to 162s (3% speed-up), at 0.01dB performance gains. At
speed 1, it makes the runtime go from 33687 ms to 32142 ms
(4.5% speed-up), at 0.03dB performance gains.

Compression performance wise, it gains at speed 1:
derf  0.118%
yt    0.237%
hd    0.203%
stdhd 0.438%

Change-Id: Ic8b34c67810d9504a9579bef2825d3fa54b69454
2013-09-13 13:58:10 -07:00
Deb Mukherjee
0c3038234d Merge "Clean up of the search best filter speed feature" 2013-09-13 11:03:59 -07:00
Yaowu Xu
040ffb6326 Minor adjustment in unit tests
The CpuSpeedTest is extended to cover 2-pass good quality with CpuUsed
from 0 to 4. The BordersTest is changed to use CpuUsed 1 for faster
turnaround.

Change-Id: I005e89adee7fe63af4b1f2a76a3a13ea826feadf
2013-09-13 09:32:16 -07:00
Paul Wilkins
5d8642354e Merge "Fix VP9_mode_order[]" 2013-09-13 09:19:31 -07:00
Scott LaVarnway
8fc95a1b11 Merge "New mode_info_context storage -- undo revert" 2013-09-13 08:56:20 -07:00
Paul Wilkins
1407cf8588 Fix VP9_mode_order[]
A mis-merge of the following change managed to break the mode order
and delete two mode options (new alt ref and near alt ref).
It also created a situation where we could test two undefined
modes off the end of the VP9_mode_order[] data structure.
  "clang warnings : remove split and i4x4_pred fake modes"
  "Change Id: I8ef3c*"

Initial testing on Akiyo at speed 2.
101.35	 44.567	 44.447 improves to
96.82	 44.915	 44.815

Approx 0.3-0.4dB gain and 2.5% size reduction

Change-Id: Icff813e7c0778d140ad4f0eea18cf1ed203c4e34
2013-09-13 13:33:26 +01:00
Paul Wilkins
9c9a3b2775 Merge "Deleted #include <inttypes.h>" 2013-09-13 01:05:31 -07:00
Jim Bankoski
324ebb704a Merge "fix clang warning in rdopt" 2013-09-12 16:39:05 -07:00
hkuang
86fb12b600 Merge "Add neon optimized iht8x8 which is 282% faster than C." 2013-09-12 15:42:44 -07:00
Christian Duvivier
25655e5794 Merge "First draft of vp9_short_idct32x32_add_neon." 2013-09-12 14:23:00 -07:00
hkuang
182366c736 Add neon optimized iht8x8 which is 282% faster than C.
Change-Id: I963dd4a6e8671957403ccbb9a16ea7de703e3530
2013-09-12 11:49:05 -07:00
Jim Bankoski
9ee9918dad fix clang warning in rdopt
Either missed this or it crept back in.

Change-Id: I6cc1519d09e558be7250254c25bde2ae720555ea
2013-09-12 06:39:42 -07:00
Jim Bankoski
e7f2aa0fb8 clang warnings : ref frame enum mismatch
Convert from refframe_type_t to VP9_REFFRAME

Change-Id: Iff4043c3fdb3e1c9c2b412bdffd5da8ed913ec13
2013-09-12 06:29:07 -07:00
Jim Bankoski
cddde51ec5 Merge "clang warnings : remove split and i4x4_pred fake modes" 2013-09-12 06:20:45 -07:00
Paul Wilkins
4d018be950 Deleted #include <inttypes.h>
This seems not to be needed and is not supported
in the Windows build.

Change-Id: Iaca3bbf8cca283aee6bc336cb31ba9dd4610322b
2013-09-12 13:43:07 +01:00
Paul Wilkins
66755abff4 Merge "Changes in speed 2 settings" 2013-09-12 02:22:45 -07:00
Jim Bankoski
7fb42d909e clang warnings : remove split and i4x4_pred fake modes
Change-Id: I8ef3c7c0f08f0f1f4ccb8ea4deca4cd8143526ee
2013-09-11 16:34:55 -07:00
Christian Duvivier
6a501462f8 First draft of vp9_short_idct32x32_add_neon.
Lots of TODOs which will be taken care of in upcoming changes. As is,
about 6x faster than the C version.

Change-Id: Ie2557b72fd2d8edca376dbf400a4d173aa5e63e0
2013-09-11 15:19:38 -07:00
Deb Mukherjee
b964646756 Clean up of the search best filter speed feature
Removes this speed feature since it is very slow and unlikely
to be used in practice. This cleanup removes a bunch of unnecessary
complications in the outer encode loop.

Change-Id: I3c66ef1ca924fbfad7dadff297c9e7f652d308a1
2013-09-11 15:16:36 -07:00
Scott LaVarnway
23845947c4 Merge "Improved 8t filters" 2013-09-11 14:34:54 -07:00
Jim Bankoski
d09abfa9f7 Merge "resolve clang issue : implicit convert tx_mode -> tx_size" 2013-09-11 13:40:11 -07:00
Scott LaVarnway
d22a504d11 Improved 8t filters
Reformatted version of a patch submitted by Erik/Tamar
from Intel.  For the test clips used, the decoder
performance improved by ~2%.

Change-Id: Ifbc37ac6311bca9ff1cfefe3f2e9b7f13a4a511b
2013-09-11 13:56:32 -04:00
Deb Mukherjee
69fe840ec4 Changes in speed 2 settings
Proposes some changes to the speed 2 settings to improve quality.
In particular, turns off the adjust_thresholds_by_speed feature,
which improves results by 6%. Also removes the code for
adjust_thresholds_by_speed since it conflicts with the adaptive
rd thresh feature.

Overall, with this change speed 2 is -15.2% from speed 0 settings
on derf, which is significantly better than the -21.6% drop before.

Change-Id: I6e90a563470979eb0c258ec32d6183ed7ce9a505
2013-09-11 10:54:07 -07:00
Scott LaVarnway
ac6093d179 New mode_info_context storage -- undo revert
mode_info_context was stored as a grid of MODE_INFO structs.
The grid now consists of pointers to MODE_INFO structs. The
MODE_INFO structs are now stored as a stream (decoder only),
eliminating unnecessary copies and making the layout a little
more cache friendly.

Change-Id: I031d376284c6eb98a38ad5595b797f048a6cfc0d
2013-09-11 13:45:44 -04:00
Yunqing Wang
079183c1a8 code cleanup
Removed unused function.

Change-Id: Icb12a09e4d303968be6aec9fae1ef05935913a4f
2013-09-11 09:32:00 -07:00
Jingning Han
65fe7d7605 Merge "Remove redundant condition check in 32x32 quant" 2013-09-10 16:39:18 -07:00
James Zern
db487188b1 Merge "vpx_mem: increase default alignment" 2013-09-10 14:03:31 -07:00
Adrian Grange
321c2fd178 Merge "Enable and fix resize_test for VP9" 2013-09-10 12:46:38 -07:00
Jingning Han
cb24406da5 Merge "Remove the use of uninitialized_safe in encode_sb_" 2013-09-10 12:05:22 -07:00
Jingning Han
5d93feb6ad Remove redundant condition check in 32x32 quant
The C code implementation of 32x32 quantization does the zbin check
of all coefficients prior to the quant/dequant loop, so the redundant
zbin check inside the loop is removed. This only affects the C code
version; the SSSE3 version does not separate the zbin check out.

Change-Id: Ic197a7d61d0b25fcac3cc092987651378cb56e4e
2013-09-10 12:04:33 -07:00
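
A hedged sketch of the two-pass shape described above (names and the
quantizer stand-in are illustrative, not vp9's actual code):

  #include <stdint.h>
  #include <stdlib.h>

  /* Pass 1: run the zbin ("dead zone") test once over all coefficients;
     pass 2: quant/dequant only the survivors, with no zbin re-check. */
  static void quantize_32x32_sketch(const int16_t *coeff,
                                    const int16_t zbin[2],
                                    const int16_t dequant[2],
                                    int16_t *qcoeff, int16_t *dqcoeff) {
    int live[32 * 32], n_live = 0;
    for (int i = 0; i < 32 * 32; ++i) {
      qcoeff[i] = dqcoeff[i] = 0;
      if (abs(coeff[i]) >= zbin[i != 0]) live[n_live++] = i;
    }
    for (int k = 0; k < n_live; ++k) {
      const int i = live[k];
      qcoeff[i] = (int16_t)(coeff[i] / dequant[i != 0]); /* stand-in quant */
      dqcoeff[i] = (int16_t)(qcoeff[i] * dequant[i != 0]);
    }
  }
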
Adrian Grange
93ffd371eb Enable and fix resize_test for VP9
Added the resize_test unit test to the VP9 set.

Set g_in_frames = 0 to avoid a problem when the total
number of frames being encoded is smaller than
g_in_frames. In this case the test will not have
access to the encoded frames and will not be able to
compare them for testing for encoder/decoder mismatch.

Change-Id: I0d2ff8ef058de7002c5faa894ed6ea794d5f900b
2013-09-10 12:02:37 -07:00
Deb Mukherjee
3d22d3ae0c Merge "Small tweaks on the constant quality mode" 2013-09-10 11:16:47 -07:00
Deb Mukherjee
09830aa0ea Small tweaks on the constant quality mode
Improves results a little.
derf is now +1.078% over bitrate control.

Change-Id: I4812136f3e67be21d14ec089419976a32a841785
2013-09-10 10:16:19 -07:00
Yunqing Wang
0607abc3dd Stop partition checking when distortion is small
If the current obtained distortion is very small, which happens
for static image case, we pick the current partition type without
further split checking.

This won't affect regular videos. For static videos, we got 10%~12%
encoding speed gain. PSNR was better for some clips, and worse for
others. Overall it was even.

Change-Id: If787a57bedf46fc595ca4f5ded2b0c0a69e9fdef
2013-09-10 10:13:24 -07:00
Yunqing Wang
f6bc783d63 Merge "Modify encode breakout for static frames" 2013-09-10 10:04:30 -07:00
Yunqing Wang
939791a129 Modify encode breakout for static frames
Thanks to Paul for the suggestions. When static-thresh was turned on
for static-image videos, a big jump in bitrate was seen. In this
patch, we detect static frames in the video using first-pass stats.
Depending on the case, encode breakout is disabled or the encode
breakout threshold is reduced to limit the skipping.

More modification needs to be done to break the incorrect partition
picking pattern for static frames while skipping happens.

Change-Id: Ia25f47041af0f04e229c70a0185e12b0ffa6047f
2013-09-10 09:06:03 -07:00
Jingning Han
2873d5608b Merge "Enable accuracy/memory check for 16x16 transforms" 2013-09-10 09:05:34 -07:00
Jingning Han
87bc705fb5 Merge "Rework 16x16 transform unit test" 2013-09-10 09:05:04 -07:00
hkuang
f4a6f936b5 Merge "Speed up idct16x16 by rearranging instructions." 2013-09-10 08:23:57 -07:00
Paul Wilkins
4f660cc018 Modified mode skip functionality.
A previous speed feature skipped modes not used in earlier
partitions, but this no longer worked as intended following
changes to the partition coding order and in conjunction
with some other speed features (especially speed 2 and above).

This modified mode skip feature sets a mask after the first X
modes have been tested in each partition depending on the
reference frame of the current best case.

This patch also makes some changes to the order modes are
tested to fit better with this skip functionality.

Initial testing suggests speed and rd hit count improvements
of up to 20% at speed 1. Quality results: derf -1.9%, std hd +0.23%.

Change-Id: Idd8efa656cbc0c28f06d09690984c1f18b1115e1
2013-09-10 13:30:10 +01:00
Paul Wilkins
901c495482 Added extra check to rd_auto_partition_range()
Added a check that the returned maximum and minimum are
valid in bottom and right border cases.

Change-Id: I2d6cdc9b5f04c7d0ff512ddcf3228331e028bf9b
2013-09-10 13:29:23 +01:00
James Zern
563c273738 test/idct_test: add missing vpx_integer.h include
Change-Id: I9de764638ec981bb34fc8e183985d8c285b006fb
2013-09-09 22:20:41 -07:00
hkuang
fc5ec206a7 Speed up idct16x16 by rearranging instructions.
Speed improves from 376% to 400% faster based on assembly-perf.

Change-Id: If0b2eccc39d5793dc101ce9feb7fcadf88396ea2
2013-09-09 18:00:13 -07:00
Jingning Han
37705a3bc5 Enable accuracy/memory check for 16x16 transforms
This commit completes the per-coefficient accuracy check and memory
overflow check for SSE2 and other implemented versions of the 16x16
transform.

Change-Id: If26a3e4f6ba82ccecc13f0b73cb8f7bb6ac14584
2013-09-09 17:07:55 -07:00
Ivan Maltz
20abe595ec Merge "API extensions and sample app for spatial scalable encoder" 2013-09-09 16:57:01 -07:00
Jingning Han
8f92a7efdb Rework 16x16 transform unit test
This commit refactors the 16x16 transform unit test. It enables the
test on all implemented versions of forward and inverse 16x16 transform
modules.

Change-Id: I0c7d5f3c5fdd5d789a25f73e287aeeaf463b9d69
2013-09-09 16:12:32 -07:00
Ivan Maltz
01b35c3c16 API extensions and sample app for spatial scalable encoder
Sample app: vp9_spatial_scalable_encoder
vpx_codec_control extensions:
  VP9E_SET_SVC
  VP9E_SET_WIDTH, VP9E_SET_HEIGHT, VP9E_SET_LAYER
  VP9E_SET_MIN_Q, VP9E_SET_MAX_Q
expanded buffer size for vp9_convolve

modified setting of initial width in vp9_onyx_if.c so that layer size
can be set prior to initial encode

Default number of layers set to 3 (VPX_SS_DEFAULT_LAYERS)
Number of layers set explicitly in vpx_codec_enc_cfg.ss_number_layers

Change-Id: I2c7a6fe6d665113671337032f7ad032430ac4197
2013-09-09 15:57:56 -07:00
Jingning Han
18c780a0ff Remove the use of uninitialized_safe in encode_sb_
Initialize the probability model context with the default value in
encode_sb.

Change-Id: Id826114024dfc21c7ef41aea9f4a0316d4a5cb95
2013-09-09 15:41:16 -07:00
James Zern
c1913c9cf4 Merge "Revert "New mode_info_context storage"" 2013-09-09 14:38:01 -07:00
James Zern
54a03e20dd Revert "New mode_info_context storage"
This reverts commit dae17734ec

Encode crashes, leaks, and an increase in integer overflow errors.

Change-Id: I595aa2649bb8d0b6552ff91652837a74c103fda2
2013-09-09 13:37:01 -07:00
Yaowu Xu
132ef4295a changed to enable vp9_postproc
In configure, vp9_postproc is enabled when internal-stats is enabled,
because the postprocessing code is needed for computing those stats.

Change-Id: I3601dc5a4aa65feb99465452486a21e75eb62c1f
2013-09-09 08:12:56 -07:00
Yaowu Xu
b19126b291 Merge "Reduce the amount of extension in src frames" 2013-09-09 08:09:56 -07:00
Paul Wilkins
740acd6891 Merge "Enable kf restrictions at speed 4" 2013-09-09 05:39:13 -07:00
Yaowu Xu
65c2444e15 Reduce the amount of extension in src frames
The commit changes the border pixel extension from 160 pixels on each
side to what is necessary for the arnr filter or the motion estimation
portion, i.e. 16 pixels on the top and left sides. For the right and
bottom sides, the extension is changed to either round the image size
up to a multiple of 64 or at least 16 pixels.

Change-Id: Ic05e19b94368c1ab4df568723aae5734e6c3d2c5
2013-09-08 15:51:54 -07:00
Jim Bankoski
9faa7e8186 resolve clang issue : implicit convert tx_mode -> tx_size
Change-Id: Ifc9da470358f58e800e3d0d70a565b61e5f7834a
2013-09-08 07:17:12 -07:00
Jim Bankoski
e378566060 Merge "New mode_info_context storage" 2013-09-08 07:16:25 -07:00
Jingning Han
09bc942b47 Fix overflow issue in 16x16 quantization SSSE3
The 16x16 transform unit test suggested that the peak coefficient
value can reach 32639. This could cause a potential overflow issue
in the SSSE3 implementation of 16x16 block quantization. This commit
fixes the issue by replacing addition with saturated addition.

Change-Id: I6d5bb7c5faad4a927be53292324bd2728690717e
2013-09-06 21:06:10 -07:00
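
In miniature, the difference between the two additions (a sketch using
SSE2 intrinsics, which SSSE3 code can also use; not the actual patch):
with a coefficient of 32639, a plain 16-bit add of a rounding term
wraps past INT16_MAX, while the saturated add clamps at 32767.

  #include <emmintrin.h>

  /* 32639 + 200 wraps to -32697 with the plain 16-bit add ... */
  static __m128i add_round_wrapping(__m128i coeff, __m128i round) {
    return _mm_add_epi16(coeff, round);
  }

  /* ... but clamps to 32767 with the saturated add. */
  static __m128i add_round_saturated(__m128i coeff, __m128i round) {
    return _mm_adds_epi16(coeff, round);
  }
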
James Zern
fb550ee620 vpx_mem: increase default alignment
This prevents vpx_malloc from returning an address with less than the
natural heap alignment on, e.g., x86_64.

Change-Id: I283e858664a8529f28b22060c3815116a7798c0d
2013-09-06 19:11:51 -07:00
Deb Mukherjee
d1268c5921 Merge "Support a constant quality mode in VP9" 2013-09-06 11:22:54 -07:00
Paul Wilkins
f15cdc7451 Enable kf restrictions at speed 4
Change-Id: I453409d3be3f5fe118b15affde45cb52184aef20
2013-09-06 11:16:04 -07:00
Deb Mukherjee
e378a89bd6 Support a constant quality mode in VP9
Adds a new end-usage option for constant quality encoding in vpx. This
first version, implemented for VP9, encodes all regular inter frames
using the quality specified in the --cq-level= option, while encoding
all key frames and golden/altref frames at a quality better than that.

The current performance on derfraw300 is +0.910% up from bitrate control,
but achieved without multiple recode loops per frame.

The decision for qp for each altref/golden/key frame will be improved
in subsequent patches based on better use of stats from the first pass.
Further, the qp for regular inter frames may also be varied around the
provided cq-level.

Change-Id: I6c4a2a68563679d60e0616ebcb11698578615fb3
2013-09-06 10:30:53 -07:00
Yaowu Xu
afffa3d9b0 cleanup cpplint warnings
Suggested by James Zern to clear out cpplint warnings for all unit
test code.

Change-Id: I731a3fa4d2a257eb9ef733426ba84286fbd7ea34
2013-09-06 10:13:49 -07:00
Scott LaVarnway
dae17734ec New mode_info_context storage
mode_info_context was stored as a grid of MODE_INFO structs.
The grid now consists of a pointer to a MODE_INFO struct and
an "in the image" flag. The MODE_INFO structs are now stored
as a stream, eliminating unnecessary copies and making the
layout a little more cache friendly.

For the test clips used, the decoder performance improved
by ~4.3% (1080p) and ~9.7% (720p).

Patch Set 2: Re-encoded clips with latest. Now ~1.7% (1080p)
and 5.9% (720p).

Change-Id: I846f29e88610fce2523ca697a9a9ef2a182e9256
2013-09-06 12:33:34 -04:00
Jim Bankoski
e4e864586c Merge "fix issue where loop filter setup_mask could reach out of bounds" 2013-09-06 06:21:28 -07:00
hkuang
3476404912 Merge "Speed up idct8x8 by rearranging instructions. Speed improves from 264% ~ 270% to 280% ~ 300% based on assembly-perf." 2013-09-05 17:37:13 -07:00
Jim Bankoski
736114f44b fix issue where loop filter setup_mask could reach out of bounds
Change-Id: Ic8446c4f26b6782a6dc482c19ea73c77646df418
2013-09-05 15:53:31 -07:00
Jingning Han
170be56a74 Merge "Enable 32x32 Transform unit test" 2013-09-05 15:23:27 -07:00
Jingning Han
4ad52a8f18 Enable 32x32 Transform unit test
This commit enabled a full functional test on 32x32 forward/inverse
transform, including round-trip error and memory overflow check. It
tests the prototype functions in C and all other implementations if
applicable.

Change-Id: I9cc50b05abdb4863e7abbcb29209a19b1fe90da7
2013-09-05 14:46:51 -07:00
Jingning Han
1c263d6918 Merge "Use saturated addition in SSSE3 of 32x32 quant" 2013-09-05 14:09:40 -07:00
Jim Bankoski
2156ccaa4a Merge "resolve clang warnings : uninitialized vars in vp9_entropy.h" 2013-09-05 12:55:32 -07:00
Jingning Han
458c2833c0 Use saturated addition in SSSE3 of 32x32 quant
The 32x32 forward transform can potentially reach a peak coefficient
value close to 32700, while the rounding factor can go up to 610.
This could cause an overflow issue in the SSSE3 implementation of the
32x32 quantization process.

This commit resolves this issue by replacing the addition operations
with saturated addition operations in 32x32 block quantization.

Change-Id: Id6b98996458e16c5b6241338ca113c332bef6e70
2013-09-05 12:49:12 -07:00
Jim Bankoski
9fc3d32a50 Merge "faster accounting of inc_mv" 2013-09-05 12:38:56 -07:00
Yaowu Xu
9158b8956f Merge "make bsize requirement for SEG_LVL_SKIP explicit" 2013-09-05 08:15:03 -07:00
Yaowu Xu
7bc775d93d Merge "Added ClearSystemState in a unit test" 2013-09-05 08:14:44 -07:00
Jim Bankoski
2e4ca9d1a5 resolve clang warnings : uninitialized vars in vp9_entropy.h
This helps clear out some of the warnings.

Change-Id: Ie7ccaca8fd92542386a7f1b257398e1bdf2f55dc
2013-09-04 18:38:41 -07:00
Jim Bankoski
e8feb2932f Merge "wrap non420 loop filter code in macro" 2013-09-04 17:20:53 -07:00
Paul Wilkins
e5deed06c0 Merge "Attempt to fix speed 4" 2013-09-04 17:19:22 -07:00
Yaowu Xu
1ee66933c1 make bsize requirement for SEG_LVL_SKIP explicit
The segment feature SEG_LVL_SKIP requires the prediction unit size
to be at least BLOCK_8X8. This commit makes the requirement explicit,
to prevent future encoder implementations from making wrong choices.

Change-Id: I0127f0bd4c66e130b81f0cb0a8d3dbfe3b2da5c2
2013-09-04 16:32:26 -07:00
hkuang
01c4e04424 Speed up idct8x8 by rearranging instructions.
Speed improves from 264% ~ 270% to 280% ~ 300% faster based on assembly-perf.

Change-Id: I3e2cc818ec14b432204ff43732f39b6438db685d
2013-09-04 15:57:22 -07:00
Yaowu Xu
e494df1a37 Added ClearSystemState in a unit test
Another unit test had been failing randomly on the win32 build.
Investigation showed that the failure was caused by SIMD register
state not being reset appropriately in the fdct8x8 test. This commit
adds ClearSystemState() to the teardown of this test; tests showed
it resolves the random failure issue for the win32 build.

Related issue: https://code.google.com/p/webm/issues/detail?id=614

Change-Id: I9381d0c1a6f4b855ccaeef1aca8c417ac8c71ee2
2013-09-04 15:07:34 -07:00
Yaowu Xu
72872d3d8c Merge "Fixing problem with invalid delta_q reading." 2013-09-04 14:21:30 -07:00
hkuang
3c05bda058 Merge "Add neon optimized vp9_short_iht4x4_add." 2013-09-04 13:35:09 -07:00
hkuang
3b8614a8f6 Add neon optimized vp9_short_iht4x4_add.
Change-Id: I42c497b68ae1ee645b59c9968ad805db0a43e37e
2013-09-04 12:37:58 -07:00
Dmitry Kovalev
890eee3b47 Fixing problem with invalid delta_q reading.
This is a bitstream change, but no currently produced videos should
be affected. https://code.google.com/p/webm/issues/detail?id=610

Change-Id: Ic85a6477df6c201cdf7f70f6bd84607b71f4593c
2013-09-04 11:25:43 -07:00
Yaowu Xu
76a437a31b Merge "Replacing init_dequantizer() with setup_plane_dequants()." 2013-09-04 10:42:12 -07:00
Jim Bankoski
872c6d85c0 Merge "speed up inc_mv_component" 2013-09-04 10:35:51 -07:00
Jim Bankoski
bb2313db28 Merge "make vp9 postproc a config option" 2013-09-04 10:35:26 -07:00
Yunqing Wang
9fd2767200 Merge "Use correct bit cost while static-thresh is on" 2013-09-04 10:26:37 -07:00
Jim Bankoski
c3c21e3c14 wrap non420 loop filter code in macro
Change-Id: I62bca0e7a4bffc1a78b750dbb9df9d2378e92423
2013-09-04 10:24:42 -07:00
Jim Bankoski
79401542f7 make vp9 postproc a config option
VP9 postproc is disabled for now as it has not been shown to help,
and it may be merged with vp8.

Change-Id: I25620d6cd34c6e10331b18c7b5ef7482e39c6057
2013-09-04 10:02:08 -07:00
Jim Bankoski
532179e845 faster accounting of inc_mv
Moves counting of mv branches to where we have a new mv, instead of after
the whole frame is summed.

Change-Id: I945d9f6d9199ba2443fe816c92d5849340d17bbd
2013-09-04 09:47:57 -07:00
Dmitry Kovalev
d6606d1ea7 Replacing init_dequantizer() with setup_plane_dequants().
Change-Id: Ib67e996b4a6dcb6f481889f5a0d84811a9e3c5d1
2013-09-04 09:22:59 -07:00
Jim Bankoski
5dda1d2394 speed up inc_mv_component
Convert mv_class if statements to a lookup. Reorder to avoid ifs.

Change-Id: I76966a21bf517bb1f9a7957c08c476c7bb3e9a63
2013-09-04 07:11:30 -07:00
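
An illustration of the if-to-lookup idea (the table below is just an
integer log2, not vp9's actual mv class table):

  #include <stdint.h>

  static uint8_t class_lut[1024];

  static void init_class_lut(void) {
    class_lut[0] = 0;
    for (int z = 1; z < 1024; ++z)
      class_lut[z] = (uint8_t)(class_lut[z >> 1] + 1); /* 1 + floor(log2 z) */
  }

  /* one table load replaces a cascade of compare-and-branch pairs */
  static int classify(int z) { return class_lut[z]; }
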
James Zern
1cf2272347 Merge "Fix intermediate height in convolve_c" 2013-09-03 15:50:33 -07:00
Paul Wilkins
49317cddad Attempt to fix speed 4
Speed 4 uses a fixed partition size. Use the fixed size unless it does
not fit inside the image, in which case use the largest size that does.

Change-Id: I250f7a80506750dd82ab355721624a1344247223
2013-09-03 17:46:25 +01:00
Jingning Han
010c0ad0eb Merge "Fix 32x32 forward transform SSE2 version" 2013-09-03 08:58:03 -07:00
Scott LaVarnway
948aaab4ca Merge "Improved mb_lpf_horizontal_edge_w_sse2_8" 2013-09-03 05:44:01 -07:00
Jingning Han
3cf46fa591 Fix 32x32 forward transform SSE2 version
This commit fixes a potential overflow issue in the SSE2
implementation of the 32x32 forward DCT. It resolves the corrupted
coded frames at the borders of scenes.

Change-Id: If87eef2d46209269f74ef27e7295b6707fbf56f9
2013-08-31 18:47:08 -07:00
Yunqing Wang
0ca7855f67 Use correct bit cost while static-thresh is on
While static-thresh is on, we only need to transmit the skip
flag if skip = 1. The cost of the skip bit is added to the
total rate cost.

Change-Id: I64e73e482bc297eba22907026298a15fa8cc3920
2013-08-30 15:25:13 -07:00
Paul Wilkins
2b9baca4f0 Merge "Added per pixel inter rd hit count stats" 2013-08-30 08:56:01 -07:00
Jingning Han
e22bb0dc8e Merge "Refactor 16x16 unit tests" 2013-08-30 08:53:19 -07:00
Tero Rintaluoma
e326cecf18 Fix intermediate height in convolve_c
- Intermediate height was not correct, e.g. when block size is 4 and
  y_step_q4 is 6. In this case the intermediate height was
  (4*6) >> 4 = 1, while vertical interpolation needs two source pixels
  plus 7 extra pixels for taps (see the sketch after this entry).
- Also, if the current output block is 16x16 and we are using 4x
  upscaling, we need only 12 rows after horizontal filtering instead of 16.

  Patch Set 2: Intermediate_height updated after CL 66723
               "Fix bug in convolution functions (filter selection)"

Change-Id: I5a1a1bc2ac9d5edb3a6e0818de618bf318fdd589
2013-08-30 10:31:21 +03:00
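
The arithmetic in the first bullet, restated as a hedged sketch
(function name illustrative; not the exact patch): the last source
row indexed by the final output row is ((h - 1) * y_step_q4) >> 4,
and the vertical filter taps reach `taps` rows beyond that index.

  /* A sufficient intermediate height for h output rows at vertical
     step y_step_q4 (in 1/16-pel units) with a `taps`-tap filter. */
  static int intermediate_height(int h, int y_step_q4, int taps) {
    return (((h - 1) * y_step_q4) >> 4) + taps; /* h=4, step=6: 1 + 8 = 9 */
  }
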
Jim Bankoski
1d44fc0c49 Merge "rework filter_block_plane" 2013-08-29 20:11:09 -07:00
Jim Bankoski
bc50961a74 rework filter_block_plane
Change-Id: I55c3b60c4c0f4910d3dfb70e3edaae00cfa8dc4d
2013-08-29 17:00:05 -07:00
Jingning Han
ec4b2742e7 Refactor 16x16 unit tests
Make the new test module comply with the unit test rules.

Change-Id: Id79ff7f03f870973ffbc74f26d64edb418b75299
2013-08-29 16:49:11 -07:00
Jingning Han
c86c5443eb Merge "Fix overflow issue in SSSE3 32x32 quantization" 2013-08-29 16:49:04 -07:00
Paul Wilkins
1f4bf79d65 Added per pixel inter rd hit count stats
Added some code to output normalized rd hit count stats.
In effect this approximates the average number of rd
operations/tests per pixel for the sequence.

The results are not quite accurate, and I have not bothered
to account for partial SB64s at frame edges or for key frames.
However, they do give some idea of the number of modes /
prediction methods being tested for each pixel across the
different partition sizes. This indicates how much scope there
is for further gains, either by reducing the number of partitions
examined or the modes per partition through heuristics.

Patch 3 moved the place where the count is incremented, so partial
rd tests that abort with an INT_MAX return are also counted.

Example numbers for first 50 frames of Akiyo.
Speed 0 ~84.4 rd operations / pixel
Speed 1 ~28.8
Speed 2 ~11.9

Change-Id: Ib956e787e12f7fa8b12d3a1a2f6cda19a65a6cb8
2013-08-30 00:13:51 +01:00
Deb Mukherjee
b6dbf11ed5 Merge "Adds a speed feature for fast 1-loop forw updates" 2013-08-29 15:54:04 -07:00
James Zern
e83e8f0426 Merge changes Ib1e853f9,Ifd75c809,If3e83404
* changes:
  consistently name VP9_COMMON variables #3
  consistently name VP9_COMMON variables #2
  consistently name VP9_COMMON variables #1
2013-08-29 15:50:56 -07:00
Yaowu Xu
ee961599e1 Merge "Fixed potential overflows" 2013-08-29 15:43:26 -07:00
James Zern
d765df2796 consistently name VP9_COMMON variables #3
stragglers

Change-Id: Ib1e853f9a331b7b66639dc34d79568d84d1930f1
2013-08-29 13:27:41 -07:00
James Zern
aa05321262 consistently name VP9_COMMON variables #2
oci -> cm

Change-Id: Ifd75c809d9cc99034d3c2fccc4653a78b3aec21f
2013-08-29 13:25:58 -07:00
James Zern
924d74516a consistently name VP9_COMMON variables #1
pc -> cm

Change-Id: If3e83404f574316fdd3b9aace2487b64efdb66f3
2013-08-29 13:25:57 -07:00
Dmitry Kovalev
e80bf802a9 Merge "Renaming txfm_size to tx_size." 2013-08-29 12:30:18 -07:00
Jingning Han
abff678866 Fix overflow issue in SSSE3 32x32 quantization
The 32x32 quantization process can potentially produce intermediate
values beyond the 16-bit range, thereby causing enc/dec mismatch.
This commit fixes this overflow issue in both the SSSE3 implementation
and the prototype of 32x32 quantization.

This fixes issue 607 from webm@googlecode.

Change-Id: I85635e6ca236b90c3dcfc40d449215c7b9caa806
2013-08-29 11:00:54 -07:00
Yaowu Xu
aaa7b44460 Fixed potential overflows
The two arrays are typically initialized to INT64_MAX; if they are not
filled with valid values before the addition, the values can overflow
and lead to wrong results.

Change-Id: I515de22cf3e8f55af4b74bdb2c8eb821a02d3059
2013-08-29 10:26:52 -07:00
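
A hedged illustration of the failure mode (array and function names
are made up here):

  #include <stdint.h>

  /* Entries still holding the INT64_MAX "unset" sentinel must be
     skipped; sentinel + anything wraps to a large negative value. */
  static int64_t total_rd(const int64_t *rd, int n) {
    int64_t total = 0;
    for (int i = 0; i < n; ++i)
      if (rd[i] != INT64_MAX) total += rd[i];
    return total;
  }
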
Scott LaVarnway
22dc946a7e Improved mb_lpf_horizontal_edge_w_sse2_8
This patch is a reformatted version of optimizations done by
engineers at Intel (Erik/Tamar) who have been providing
performance feedback for VP9.  For the test clips used (720p, 1080p),
up to 1.2% performance improvement was seen.

Change-Id: Ic1a7149098740079d5453b564da6fbfdd0b2f3d2
2013-08-29 08:30:17 -04:00
Dmitry Kovalev
b71807082c Merge "General code cleanup." 2013-08-28 12:57:49 -07:00
Dmitry Kovalev
db20806710 Merge "Removing unnecessary call to vp9_setup_interp_filters." 2013-08-28 12:31:08 -07:00
Dmitry Kovalev
b62ddd5f8b General code cleanup.
Switching from mi_{width, height}_log2 and b_{width, height}_log2 to
num_8x8_blocks_{wide, high} and num_4x4_blocks_{wide, high}. Removing
redundant code, adding const.

Change-Id: Iaab2207590fd24d0b76999071778d1395dc5cd5d
2013-08-28 12:22:37 -07:00
Deb Mukherjee
e02dc84c1a Adds a speed feature for fast 1-loop forw updates
Incorporates a speed feature for fast forward updates of
coefficients. This feature takes 3 values:
0 - use standard 2-loop version
1 - use a 1-loop version
2 - use a 1-loop version with reduced updates

Results: derfraw300 +0.007% (on speed 0) at feature value = 1
                    -0.160% (on speed 0) at feature value = 2

There is substantial speed up at speeds 2 and above for low
resolution sequences where the entropy updates are a big part
of the overall computations.

Change-Id: Ie96fc50777088a5bd441288bca6111e43d03bcae
2013-08-28 10:56:52 -07:00
Dmitry Kovalev
851a2fd72c Renaming txfm_size to tx_size.
Change-Id: I752e374867d459960995b24d197301d65ad535e3
2013-08-27 19:47:53 -07:00
Jingning Han
eb7acb5524 Merge "Fix buf alignment in sub8x8 comp inter-inter pred" 2013-08-27 19:03:12 -07:00
Dmitry Kovalev
1d3f94efe2 Merge "Adding get_entropy_context function." 2013-08-27 17:02:36 -07:00
Frank Galligan
7d058ef86c Merge "Fix Windows warning." 2013-08-27 15:39:58 -07:00
Frank Galligan
f1560ce035 Fix Windows warning.
Const is not needed on the function parameter.

Change-Id: I38c2a7317cb6f42f70bbddfde9a2cd18d65ceb1c
2013-08-27 15:19:55 -07:00
Dmitry Kovalev
a93992e725 Adding get_entropy_context function.
Moving common code from encoder and decoder to this function.

Change-Id: I60fa643fb1ddf7ebbff5e83b6c4710137b0195ef
2013-08-27 14:17:53 -07:00
hkuang
3a679e56b2 Add neon optimized vp9_short_idct16x16_1_add.
Change-Id: Ib9354c1d975d03e8081df20d50b6a77dfe2dc7e5
2013-08-27 14:00:27 -07:00
hkuang
ce04b1aa62 Merge "Add neon optimized vp9_short_idct8x8_1_add." 2013-08-27 12:10:07 -07:00
Dmitry Kovalev
7b95f9bf39 Renaming BLOCK_SIZE_TYPE to BLOCK_SIZE in the encoder.
Change-Id: I62bb07c377f947cb72fac68add7a6b199e42c6b9
2013-08-27 11:05:08 -07:00
Dmitry Kovalev
ba10aed86d Merge "Using num_8x8_* lookup tables instead of mi_*_log2." 2013-08-27 10:49:36 -07:00
Dmitry Kovalev
12e5931a9a Merge "Using existing functions instead of raw expressions." 2013-08-27 10:33:34 -07:00
Dmitry Kovalev
f77c6973a1 Merge "Cleaning up decode_block_intra function." 2013-08-27 10:17:56 -07:00
Dmitry Kovalev
f389ca2acc Merge "Cleaning up model_rd_for_sb_y_tx." 2013-08-27 10:17:10 -07:00
Dmitry Kovalev
bfebe7e927 Merge "Renaming BLOCK_SIZE_TYPE to BLOCK_SIZE in the common/decoder." 2013-08-27 10:15:21 -07:00
Dmitry Kovalev
78e670fcf8 Merge "Renaming D27 to D207." 2013-08-27 10:03:57 -07:00
Jingning Han
2d6aadd7e2 Fix buf alignment in sub8x8 comp inter-inter pred
This commit resolves a misalignment issue in compound inter-inter
prediction of sub8x8 blocks. The patch follows the solution from dkovalev@.

Change-Id: I3cc0cf7e55b84110e0c42ef4b2e6ca7ac3f8f932
2013-08-27 09:28:05 -07:00
Yaowu Xu
45125ee573 Merge "fixed the reading of too many bytes" 2013-08-27 09:09:18 -07:00
Yaowu Xu
9482c07953 fixed the reading of too many bytes
In subpel_avg_variance functions, code similar to the following:

punpckldq m2, [addr]

actually reads 8 bytes. For functions that are supposed to work on
buffers that have fewer than 8 bytes per line, this caused a valgrind
error of reading uninitialized memory.

Change-Id: I2a4c079dbdbc747829bd9e2ed85f0018ad2a3a34
2013-08-27 08:39:20 -07:00
Jim Bankoski
3e43e49ffd Merge "Add a test vector that tests color space 444" 2013-08-27 08:33:12 -07:00
Dmitry Kovalev
44b7854c84 Removing unnecessary call to vp9_setup_interp_filters.
vp9_setup_interp_filters is called before each inter block is decoded,
so it is not necessary to call it just before decoding the whole frame.

Change-Id: Id1b0ee62f987474e27eafba0013a4896b492c400
2013-08-26 17:25:49 -07:00
hkuang
36e9b82080 Add neon optimized vp9_short_idct8x8_1_add.
Change-Id: I0b15d5e3b0eb97abb9ab5ec08e88b61f8723aaf4
2013-08-26 16:28:57 -07:00
hkuang
ba8fc71979 Merge "Add neon optimized vp9_short_idct4x4_1_add." 2013-08-26 16:26:38 -07:00
Dmitry Kovalev
657ee2d719 Cleaning up model_rd_for_sb_y_tx.
Removing references to plane_block_width and plane_block_height (we are
going to delete the latter ones).

Change-Id: I7982da4d373aebb54d2209dc8886f6192df4d287
2013-08-26 16:18:28 -07:00
hkuang
69384f4fad Add neon optimized vp9_short_idct4x4_1_add.
Change-Id: I6ecb5c4a1a472feb8e84e9f3352b536d5e28a4a5
2013-08-26 15:55:16 -07:00
Jim Bankoski
bbb490f6a3 Merge "Fix Chroma plane md5 check" 2013-08-26 15:27:14 -07:00
Jim Bankoski
a5cb05c45d Add a test vector that tests color space 444
This adds a test vector for 444 color space.

Change-Id: I1e2ac3883211989a062cfafc0e58151b14d294b8
2013-08-26 15:24:35 -07:00
Dmitry Kovalev
242460cb66 Cleaning up decode_block_intra function.
Change-Id: Ia41ea5d526d15fcbc9b56d74079593cf8b2fdf66
2013-08-26 15:24:12 -07:00
Jim Bankoski
af13fbb70f Fix Chroma plane md5 check
Chroma plane MD5 calculation was incorrect for 444 and 422
yuv color spaces.

Change-Id: If985396871a2f57db85108a4355172f9793d3007
2013-08-26 14:26:38 -07:00
Dmitry Kovalev
b25589c6bb Using num_8x8_* lookup tables instead of mi_*_log2.
Change-Id: I8a246b3d056c98be614d05a90bc261e2441ffc10
2013-08-26 14:22:54 -07:00
Yaowu Xu
4505e8accb Merge "Fix the reading of too many input pixels" 2013-08-26 14:01:50 -07:00
Paul Wilkins
aa823f8667 Merge "Changes to adaptive inter rd thresholds." 2013-08-26 12:48:11 -07:00
Yaowu Xu
6c5433c836 Fix the reading of too many input pixels
in VP9_get4x4var_mmx

Change-Id: I4b4a8f45f25ebdfad281f169cc87aba5e2d6f227
2013-08-26 12:35:27 -07:00
Paul Wilkins
642696b678 Merge "Limit Key frame Intra modes checks." 2013-08-26 12:34:56 -07:00
Dmitry Kovalev
45870619f3 Renaming BLOCK_SIZE_TYPE to BLOCK_SIZE in the common/decoder.
Adding a temporary "typedef BLOCK_SIZE BLOCK_SIZE_TYPE" which will go
away after the encoder's patch.

Change-Id: I06ec6a6f079401439843ec981d1496234fd7775c
2013-08-26 11:33:16 -07:00
Jingning Han
4681197a58 Merge "Temporarily disable SSSE3 quant_32x32" 2013-08-26 11:19:53 -07:00
Dmitry Kovalev
5eed6e2224 Merge "Removing redundant calls to clamp_mv2." 2013-08-26 10:48:37 -07:00
Jingning Han
166dc85bed Temporarily disable SSSE3 quant_32x32
Keep the current head working properly while an issue in the SSSE3
implementation of 32x32 quantization is being fixed.

Change-Id: Ic029da3fd7f1f5e58bc641341cbd226ec49a16bc
2013-08-26 10:45:59 -07:00
James Zern
66ccf5ddcf Merge "cosmetics: yv12extend add some const" 2013-08-24 18:13:41 -07:00
James Zern
8b970da40d cosmetics: yv12extend add some const
Change-Id: I87f1ce2ceca80d3869dd72ba862329a98eb3e0c2
2013-08-24 12:09:01 -07:00
James Zern
b19babe5e6 Merge "cosmetics: strip 'VP9_' from defines in vp9 only code" 2013-08-23 19:59:16 -07:00
James Zern
55b5a68d72 Merge "yv12extend: name variables consistently" 2013-08-23 19:40:03 -07:00
James Zern
c8ba8c513c cosmetics: strip 'VP9_' from defines in vp9 only code
Change-Id: I481d9bb2fa3ec72b6a83d5f04d545ad8013f295c
2013-08-23 19:16:49 -07:00
James Zern
2c6ba737f8 Merge "vp9: remove unnecessary wait w/threaded loopfilter" 2013-08-23 18:52:10 -07:00
James Zern
5724b7e292 yv12extend: name variables consistently
- source -> src
- dest -> dst
- use verbose names in extend_plane, dropping the redundant comments

+ light cosmetics:
- join a few lines / assignments
- drop some unnecessary comments & includes

Change-Id: I6d979a85a0223a0a79a22f79a6d9c7512fd04532
2013-08-23 18:45:11 -07:00
Dmitry Kovalev
50ee61db4c Renaming D27 to D207.
I've already renamed d27_predictor to d207_predictor but forgot about the
corresponding constant.

Change-Id: Id312aa80fc5b5a1ab8a709a33418a029552a6857
2013-08-23 17:33:48 -07:00
Dmitry Kovalev
480dd8ffbe Using existing functions instead of raw expressions.
Change-Id: Ifa50b04bac1a6ff2abef989073cbf1f37a89eb50
2013-08-23 17:26:53 -07:00
Dmitry Kovalev
e6c435b506 Merge "Cleanup in mvref_common.{h, c}." 2013-08-23 17:09:49 -07:00
Dmitry Kovalev
7194da2167 Merge "Fixing display size setting problem." 2013-08-23 17:08:51 -07:00
Yaowu Xu
13930cf569 Limit mv range to be based on partition size
The previous change c4048dbd limited the mv search range assuming a
max block size of 64x64; this commit changes the search range to use
the actual block size instead.

Change-Id: Ibe07ab02b62bf64bd9f8675d2b997af20a2c7e11
2013-08-23 15:43:57 -07:00
Dmitry Kovalev
cd2cc27af1 Removing redundant calls to clamp_mv2.
We can avoid calling clamp_mv2 because it has already been called
inside the vp9_find_best_ref_mvs function.

Change-Id: I08edeaf3e11e98c19e67b9711b2523ca5fb1416e
2013-08-23 15:18:35 -07:00
Yaowu Xu
8e04257bc5 Merge "Added border extension" 2013-08-23 14:43:58 -07:00
Adrian Grange
78debf246b Merge "Fix bug in convolution functions (filter selection)" 2013-08-23 13:41:47 -07:00
Dmitry Kovalev
fb481913f0 Merge "Removing useless calls to setup_{pre, dst}_planes." 2013-08-23 13:37:32 -07:00
Dmitry Kovalev
11e3ac62a5 Fixing display size setting problem.
Fix for https://code.google.com/p/webm/issues/detail?id=608. We could
have used an invalid display size equal to the previous frame size
(instead of the current frame size).

Change-Id: I91b576be5032e47084214052a1990dc51213e2f0
2013-08-23 13:12:46 -07:00
Dmitry Kovalev
21d8e8590b Cleanup in mvref_common.{h, c}.
Making code more compact, adding consts, removing redundant arguments,
adding do/while(0) for macros.

Change-Id: Ic9ec0bc58cee0910a5450b7fb8cfbf35fa9d0d16
2013-08-23 12:00:30 -07:00
Yaowu Xu
656632b776 Added border extension
To the source buffer to be encoded as an alt ref frame. This fixes
the problem of using uninitialized memory in the encoder.

See https://code.google.com/p/webm/issues/detail?id=605

Change-Id: I97618a2fc207e08abcf5301b734aa9e3ad695e2c
2013-08-23 11:31:28 -07:00
Adrian Grange
3f10831308 Fix bug in convolution functions (filter selection)
(In response to Issue 604:
 https://code.google.com/p/webm/issues/detail?id=604)

There were bugs in the convolution code for two cases:

1. Where the filter table was assumed to be aligned to a
   256 byte boundary. The offset of the pixel in the
   source buffer was computed incorrectly.

2. Where no such alignment assumption was made. An
   incorrect address for the filter table base was used.

To fix both problems, I now assume that the filter table is
256-byte aligned and modify the pixel offset calculation to
match.

A later patch should remove the restriction that the filter
table is aligned to a 256-byte boundary.

There was also a bug in the ConvolveTest unit test
(convolve_test.cc).

(Bug & initial fix suggestion submitted by Tero Rintaluoma
and Sami Pietilä).

Change-Id: I71985551e62846e55e40de9e7e3959d4805baa82
2013-08-23 11:16:08 -07:00
Dmitry Kovalev
1c159c470a Merge "Checking scale factors on access." 2013-08-23 11:05:17 -07:00
James Zern
bef320aa07 Merge "vpx_scale: correct pixel spelling" 2013-08-23 11:01:27 -07:00
hkuang
b85367a608 Merge "Optimise idct4x4: rearrange the instructions a bit to improve instruction scheduling." 2013-08-23 10:08:43 -07:00
Paul Wilkins
aa5b67add0 Changes to adaptive inter rd thresholds.
Values are now carried over frame to frame.
Changed the algorithm for decreasing the threshold after
a hit, and the max threshold (now based on speed).

Removed some old commented out code relating to
VP8 adaptive thresholds.

The impact of these changes tested on Akiyo (50 frames)
and measured in terms of unit rd hits is as follows:

Speed 0 84.36 -> 84.67
Speed 1 29.48 -> 22.22
Speed 2 11.76 -> 8.21
Speed 3 12.32 -> 7.21

Encode speed impact is broadly in line with these.

Change-Id: I5b886efee3077a11553fa950d796fd6d00c8cb19
2013-08-23 16:18:45 +01:00
Paul Wilkins
f76f52df61 Limit Key frame Intra modes checks.
Most of the focus so far has been on inter frames.

At high speed settings the key frame now takes a high %
of the cycles.

This patch puts in some masking to reduce the number
of INTRA modes searched during key frame coding (as already
happens for inter frames) at higher speed settings.

TODO: Develop this further with either adaptive rd thresholds
when choosing which intra modes to consider or some other
heuristic.

Impact.
At high speed settings on some clips the key frame was starting
to dominate. In a coding of the first 50 frames of AKIYO at speed 2,
limiting the key frame intra modes to DC or TM_PRED resulted in
~30% overall speedup. For Bus the number was lower at ~4-5%.

Change-Id: I7bde68aee04995f9d9beb13a1902143112e341e2
2013-08-23 16:10:30 +01:00
James Zern
735b3a710a vpx_scale: correct pixel spelling
Change-Id: Idcfab16da37134f943a4314674e2e2fcbff3a0f8
2013-08-22 19:39:52 -07:00
Jingning Han
9655c2c7a6 Merge "Fix rectangular partition check flag" 2013-08-22 18:59:18 -07:00
Dmitry Kovalev
33104cdd42 Merge "vp9_encodeframe.c cleanup." 2013-08-22 18:07:35 -07:00
James Zern
711aff9d9d Merge "vp9/encoder: fix last_frame_seg_map mem leak" 2013-08-22 18:04:03 -07:00
James Zern
d843ac5132 Merge "rename LOG2_* defines to *_LOG2" 2013-08-22 18:02:42 -07:00
Jingning Han
84f3b76e1c Fix rectangular partition check flag
Put rectangular partition check flag change according to the rd
costs of NONE and SPLIT partition types under the speed feature.

Change-Id: If681e1e078a8d43d86961ea4b748da5cd1b6c331
2013-08-22 17:15:01 -07:00
Dmitry Kovalev
53f6f8ac93 Merge "check_bsize_coverage cleanup." 2013-08-22 16:18:24 -07:00
hkuang
4205d79273 Merge "Add neon optimized vp9_short_idct10_16x16_add." 2013-08-22 15:57:28 -07:00
hkuang
4082bf9d7c Add neon optimized vp9_short_idct10_16x16_add.
vp9_short_idct10_16x16_add is used to handle blocks that only have
valid data in the top-left 4x4 block. All the other data are 0, so we
can cut many unnecessary calculations to save instructions.

Change-Id: I6e30a3fee1ece5af7f258532416d0bfddd1143f0
2013-08-22 15:53:22 -07:00
Dmitry Kovalev
604022d40b vp9_encodeframe.c cleanup.
Removing unused get_sbuv_perpixel_variance function, using has_second_ref/
is_inter_block functions, organizing includes.

Change-Id: I016de4af12fbbb8b4ece26a70759b2392651b095
2013-08-22 15:50:51 -07:00
Dmitry Kovalev
335b1d360b check_bsize_coverage cleanup.
Change-Id: Ib7803857b35c00e317c9deb8630e777e25eb278f
2013-08-22 15:45:56 -07:00
Dmitry Kovalev
3c42657207 Checking scale factors on access.
It is possible to have invalid scale factors and never access them
during decoding. An error is reported only if we actually try to use
invalid scale factors.

Change-Id: Ie532d3ea7325ee0c7a6ada08269f804350c80fdf
2013-08-22 15:19:05 -07:00
James Zern
40ae02c247 rename LOG2_* defines to *_LOG2
gets rid of a mix of styles

Change-Id: I3591d312157bc6f53a25438bf047765c671fd8a8
2013-08-22 14:45:24 -07:00
Dmitry Kovalev
13eed79c77 Merge "Adding vp9_is_scaled function." 2013-08-22 14:39:55 -07:00
Dmitry Kovalev
09858c239b Removing useless calls to setup_{pre, dst}_planes.
The comment is wrong; we don't initialize any xd pointers here. We
only initialize xd->planes[i]->dst and xd->planes[i]->pre[], which
are actually initialized for every block during decoding.

Change-Id: If152ea872ebef1f83ca70712fa6f8df1b6855f56
2013-08-22 14:39:05 -07:00
James Zern
a5726ac453 vp9/encoder: fix last_frame_seg_map mem leak
remove the duplicate allocation from vp9_create_compressor; it was
added to vp9_alloc_frame_buffers in:

d5bec52 Added resizing & initialization of last frame segment map

Change-Id: I996723226a16a62aff8f9a52ac74e0b73cc98fdf
2013-08-22 14:13:04 -07:00
Dmitry Kovalev
640dea4d9d Adding vp9_is_scaled function.
Change-Id: Ieb7077ca3586b9491912027eed450a4f6fd38d30
2013-08-22 14:04:59 -07:00
Jingning Han
8adc20ce35 Merge "Refactor rd_pick_partition for parameter control" 2013-08-22 13:54:48 -07:00
James Zern
da9a6ac9e7 Merge "vp9_peek_si: add bitstream v1 support" 2013-08-22 13:28:00 -07:00
Jingning Han
01a37177d1 Refactor rd_pick_partition for parameter control
This commit changes the partition search order of superblocks from
{SPLIT, NONE, HORZ, VERT} to {NONE, SPLIT, HORZ, VERT} for
consistency with that of the sub8x8 partition search. It enables the
use of early termination in partition search for all block sizes.

For ped_area_1080p, 50 frames coded at 4000 kbps, it makes the runtime
go down from 844305ms to 818003ms (3% speed-up) at speed 0.

This will further move towards making the in-search partition types
configurable, hence unifying various speed-up approaches.

Some speed 1 and 2 features are turned off during the refactoring
process, including:
disable_split_var_thresh
using_small_partition_info

Stricter constraints are applied to use_square_partition_only for
right/bottom boundary blocks. Will bring back/refine these features
subsequently. At this point, it makes derf set at speed 1 about
0.45% higher in compression performance, and 9% down in run-time.

Change-Id: I3db9f9d1d1a0d6cbe2e50e49bd9eda1cf705f37c
2013-08-22 12:36:02 -07:00
hkuang
610642c130 Optimise idct4x4: rearrange the instructions a bit
to improve instruction scheduling.

Change-Id: I5ea881a6e419f9e8ed4b3b619406403b4de24134
2013-08-22 11:02:22 -07:00
Deb Mukherjee
8b810c7a78 Fixes on feature disabling split based on variance
Adds a couple of minor fixes, which may be absorbed in Jingning's
patch. Thanks to Guillaume for pointing these out.
Also adjusts the thresholds for speed 1 and 2 to 16 and 32
respectively, to keep quality drops small.

Results:
--------
derfraw300:  threshold = 16, psnr -0.082%, speedup 2-3%
             threshold = 32, psnr -0.218%, speedup 5-6%
stdhdraw250: threshold = 16, psnr -0.031%, speedup 2-3%
             threshold = 32, psnr -0.273%, speedup 5-6%

Change-Id: I4b11ae8296cca6c2a9f644be7e40de7c423b8330
2013-08-22 07:05:44 -07:00
Scott LaVarnway
f39bf458e5 Merge "Initialize mb_skip_coeff before picking modes" 2013-08-22 06:26:04 -07:00
Scott LaVarnway
94bfbaa84e Initialize mb_skip_coeff before picking modes
It appears that the above/left mb_skip_coeff used during pick modes
is left over from the previously encoded frame. This patch
initializes the flag to the default value of zero.

Change-Id: Ida4684cc99611d6e3e82628db35ed717e28ce550
2013-08-22 08:51:04 -04:00
Dmitry Kovalev
96a1a59d21 Merge "Using has_second_ref function to simplify the code." 2013-08-22 01:39:14 -07:00
Dmitry Kovalev
a33f178491 Merge "Cleaning up foreach_transformed_block_in_plane." 2013-08-22 01:37:21 -07:00
Dmitry Kovalev
359b571448 Merge "Cleaning up reset_skip_context function." 2013-08-22 01:36:25 -07:00
Dmitry Kovalev
596c51087b Merge "Removing unused foreach_predicted_block function." 2013-08-22 01:35:41 -07:00
Dmitry Kovalev
cb05a451c6 Merge "Cleaning up optimize_init_b function." 2013-08-22 01:35:27 -07:00
Dmitry Kovalev
64c0f5c592 Merge "Cleaning up sum_intra_stats function." 2013-08-22 01:34:39 -07:00
Jingning Han
fcb890d751 Merge "Enable zero coeff check in sub8x8 UV rd loop" 2013-08-21 22:07:00 -07:00
James Zern
ccb6bdca75 configure: fix action expansion
enable|disable -> (enable|disable)_feature

Change-Id: I7494913c78ebe8bb532fa6545e0ae53a79ccd388
2013-08-21 19:00:08 -07:00
James Zern
42ab401fd3 configure: rename enable() to enable_feature()
+ disable() -> disable_feature() for balance

this avoids shadowing the bash builtin 'enable', allowing the scripts
to be linted with checkbashisms

Change-Id: Ia11cf86c92ec25bd14e69427b0ac0a9a61a5f7a5
2013-08-21 18:11:45 -07:00
James Zern
85640f1c9d vp9: remove unnecessary wait w/threaded loopfilter
The final macroblock rows are scheduled in the main thread. Prior to
this change, one additional macroblock row would be scheduled in the
worker, forcing the main thread to wait before finishing.

Change-Id: I05f3168e5c629b898fcebb0d77eb6d6a90d6105e
2013-08-21 17:43:44 -07:00
Dmitry Kovalev
4172d7c584 Cleaning up foreach_transformed_block_in_plane.
Change-Id: I9f45af3894c57f35cb266c255e2b904295d39c34
2013-08-21 17:16:02 -07:00
James Zern
6167355309 vp9_peek_si: add bitstream v1 support
currently protected by CONFIG_NON420 as v1 is still not entirely stable

Change-Id: Id1c5081b04a2c47a842822048b8804be67d23a6d
2013-08-21 17:04:10 -07:00
Dmitry Kovalev
be60924f29 Cleaning up optimize_init_b function.
Change-Id: Ib2c975e1d96deefb7ac4d6b600c8c5388035d111
2013-08-21 16:40:16 -07:00
Dmitry Kovalev
c43da352ab Cleaning up reset_skip_context function.
Change-Id: Ib3e72671eb8da6f2e9767a6de292ec7c7cde6bc7
2013-08-21 16:31:51 -07:00
Dmitry Kovalev
048ccb2849 Cleaning up sum_intra_stats function.
Using size_group_lookup table and better variable names.

Change-Id: I6e67f2ce091845db43ace7d21b7ae31c6f165aec
2013-08-21 16:25:02 -07:00
Dmitry Kovalev
3286abd82e Merge "Adding scale factor check." 2013-08-21 14:11:13 -07:00
Dmitry Kovalev
687891238c Merge "Removing PLANE_TYPE argument from cost_coeffs function." 2013-08-21 14:10:05 -07:00
Deb Mukherjee
a2f7619860 Merge "Make "good" quality 2-pass vpxenc encoding default" 2013-08-21 13:58:49 -07:00
James Zern
ac12f3926b Merge "vp9 rtcd: remove non-existent sad functions" 2013-08-21 13:55:59 -07:00
Dmitry Kovalev
2f1a0a0e2c Removing PLANE_TYPE argument from cost_coeffs function.
We can determine plane_type from the other function arguments.

Change-Id: I85331877aedb357632ae916a37b5b15f22c0bb1f
2013-08-21 13:02:28 -07:00
Deb Mukherjee
0d8723f8d5 Make "good" quality 2-pass vpxenc encoding default
Currently, the best quality mode in VP9 is not very well developed,
and unnecessarily makes the encode too slow. Hence the command line
default is changed to "good" quality. Also, the number of passes
default is changed to 2 passes as well, since 1-pass encoding is
not very efficient in VP9.

Besides, a number of VP9 defaults are set to the currently
recommended settings. With these changes, vpxenc
run with --codec=vp9 --kf-max-dist=9999 --cpu-used=0 should
work about the same as our borg results.
Note when the --cpu-used=0 option is dropped there will be a slight
difference in the output, because of a difference in the cpu-used
value for the first pass. Specifically, the default when unspecified
is to use cpu_used=1 for the first pass and cpu_used=0 for the
second pass. But when specified, both passes will use the cpu-used
value specified.

Note that this also changes the default for VP8 as being "good"
but other options stay unchanged.

Change-Id: Ib23c1a05ae2f36ee076c0e34403efbda518c5066
2013-08-21 12:41:26 -07:00
Dmitry Kovalev
27a984fbd3 Removing a lot of duplicated code.
Adding a set_contexts function and calling it instead of
set_contexts_on_border. Calling txfrm_block_to_raster_xy to get aoff
and loff.

Change-Id: I41897e344afd2cae1f923f4fdbe63daccf6fe80e
2013-08-21 11:55:12 -07:00
Dmitry Kovalev
a3ae4c87fd Adding scale factor check.
We support only [1/16, 2] scale factors, enforcing this now.

Change-Id: I0822eb7cea51720df6814e42d3f35ff340963061
2013-08-21 11:24:47 -07:00
Adrian Grange
ce28d0ca89 Fix typos and minor stylistic cleanup
Change-Id: I32e43474e8651ef2eb181d24860a8f118cfea7bf
2013-08-21 08:45:42 -07:00
Adrian Grange
5b63963573 Merge "Further correct bug in loopfilter initialization" 2013-08-21 07:17:43 -07:00
James Zern
ae455fabd8 vp9 rtcd: remove non-existent sad functions
vp9_sad32x3, vp9_sad3x32

+ remove unnecessary sad include from vp9_findnearmv.c

Change-Id: Idef2a89cadc3fec64eff82ba9be60ffff50b3468
2013-08-20 18:07:53 -07:00
Dmitry Kovalev
90027be251 Removing unused foreach_predicted_block function.
Moving foreach_predicted_block_in_plane function to vp9_reconinter.c
because there is only one usage.

Change-Id: I9852feae43fc3cf809b817fc541d043bc5496209
2013-08-20 17:20:47 -07:00
Dmitry Kovalev
7f814c6bf8 Merge "Passing plane_bsize to foreach_transformed_block_visitor." 2013-08-20 14:25:01 -07:00
Dmitry Kovalev
27de4fe922 Using has_second_ref function to simplify the code.
Updating implementation of vp9_get_pred_context_single_ref_p2 using
has_second_ref function to make code easier to read.

Change-Id: I5ba642712f59861a48aab974e73aa01640d086fe
2013-08-20 14:09:56 -07:00
hkuang
62a2cd9ed2 Merge "Add neon optimized vp9_short_idct10_8x8_add." 2013-08-20 14:06:57 -07:00
Dmitry Kovalev
381d3b8b7d Merge "vp9_filter.{h, c} cleanup + adding SUBPEL_TAPS constant." 2013-08-20 13:46:53 -07:00
Dmitry Kovalev
d19ac4b66d vp9_filter.{h, c} cleanup + adding SUBPEL_TAPS constant.
Change-Id: Ib394ea23f464591dad50b5c65c316701378d06d7
2013-08-20 12:29:57 -07:00
hkuang
37cda6dc4c Add neon optimized vp9_short_idct10_8x8_add.
vp9_short_idct10_8x8_add is used to handle blocks that only have valid
data in the top-left 4x4 block. All the other data are 0, so we can
cut several unnecessary calculations to save instructions.

Change-Id: I34fda95e29082b789aded97c2df193991c2d9195
2013-08-20 11:51:07 -07:00
Jingning Han
1bf1428654 Enable zero coeff check in sub8x8 UV rd loop
Check the minimum rate-distortion cost of regular quantization and
all zero coeffs cases in the sub8x8 inter prediction rd loop for
luma components. Use this as the cumulative rdcost sent to UV rd
estimation.

Change-Id: Ia4bc7700437d5e13d7cdad4cf9ae57ab036d3e97
2013-08-20 10:33:42 -07:00
Deb Mukherjee
246381faf2 Merge "Cleanup/enhancements of switchable filter search" 2013-08-20 10:16:51 -07:00
Dmitry Kovalev
5826407f2a Merge "Moving plane_block_idx from vp9_blockd.h to vp9_quantize.c." 2013-08-20 10:06:22 -07:00
Dmitry Kovalev
5baf510f74 Merge "Adding has_second_ref function." 2013-08-20 10:06:14 -07:00
Dmitry Kovalev
039b0c4c9e Merge "Adding VP9_FILTER_BITS constant." 2013-08-20 10:05:09 -07:00
Deb Mukherjee
2ffe64ad5c Cleanup/enhancements of switchable filter search
Cleans up the switchable filter search logic. Also adds a
speed feature - a variance threshold - to disable filter search
if source variance is lower than this value.

Results: derfraw300
threshold = 16, psnr -0.238%, 4-5% speedup (tested on football)
threshold = 32, psnr -0.381%, 8-9% speedup (tested on football)
threshold = 64, psnr -0.611%, 12-13% speedup (tested on football)
threshold = 96, psnr -0.804%, 16-17% speedup (tested on football)

Based on these results, the threshold is chosen as 16 for speed 1,
32 for speed 2, 64 for speed 3 and 96 for speed 4.

Change-Id: Ib630d39192773b1983d3d349b97973768e170c04
2013-08-20 09:47:04 -07:00
Jingning Han
bb64c9a355 Merge "Enable early termination in uv rd loop" 2013-08-20 09:07:26 -07:00
Jim Bankoski
be5dc2321b Merge "fix the mv_ref_idx issue" 2013-08-20 09:00:57 -07:00
Jim Bankoski
f167433d9c fix the mv_ref_idx issue
The following issue was reported :
https://code.google.com/p/webm/issues/detail?id=601&q=jimbankoski&sort=-id&colspec=ID%20Pri%20mstone%20ReleaseBlock%20Type%20Component%20Status%20Owner%20Summary

This change makes the choice and the code cleaner, and removes any
question about whether the border needs to be checked.

Change-Id: Ia7aecfb3168e340618805bd318499176c2989597
2013-08-20 08:14:52 -07:00
Paul Wilkins
e8923fe492 Changes to auto partition size selection.
Changes to code to auto select a partition size range
based on data from spatial neighbors.

Now looks at the sb_type in each 8x8 block of above
and left SB64.

The effect on speed 1 is now weaker giving better
quality but less speed gain. Now also used in speed 2.

Change-Id: Iace33a97d5c3498dd2a9a8a4067351941abcbabc
2013-08-20 14:05:39 +01:00
Dmitry Kovalev
2612b99cc7 Adding VP9_FILTER_BITS constant.
Removing VP9_FILTER_WEIGHT, VP9_FILTER_SHIFT, BLOCK_WIDTH_HEIGHT
constants. Using ROUND_POWER_OF_TWO for rounding.

Change-Id: I2e8d6858dcd600a87096138209731137d7decc24
2013-08-20 00:42:25 -07:00
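
For reference, the usual shape of this macro, sketched from its name
(libvpx's definition may differ in details):

  /* round-half-up division by 2^n: add half the divisor, then shift */
  #define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

  /* e.g. a filter sum scaled by 128 (assuming VP9_FILTER_BITS is 7):
     ROUND_POWER_OF_TWO(135, 7) == (135 + 64) >> 7 == 1 */
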
Dmitry Kovalev
d8286dd56d Adding has_second_ref function.
Updating implementation of vp9_get_pred_context_single_ref_p1 using
has_second_ref function to make code easier to read.

Change-Id: Ie8f60403a7195117ceb2c6c43176ca9a9e70b909
2013-08-19 18:39:34 -07:00
Yaowu Xu
c4048dbdd3 Change to limit the mv search range
As the pixel values beyond the image border are duplicates of the edge
pixels, the change limits the mv search range: any mv beyond the
limits no longer produces new/different prediction values, since the
entire block, including the pixels used for subpel interpolation, lies
outside the image border.

Change-Id: I4c6fdf06e33c1cef1489f5470ce0fb4e5e01fb79
2013-08-19 17:19:36 -07:00
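
A hedged sketch of that bound (names and the border quantity are
illustrative, not libvpx's): once the whole block, plus the rows and
columns the subpel filter reads, lies in the replicated border, larger
mvs cannot change the prediction, so the search limits can stop there.

  /* All quantities in full pixels; interp_extra covers the subpel taps. */
  static void limit_mv_col_range(int x, int block_w, int img_w,
                                 int interp_extra,
                                 int *col_min, int *col_max) {
    *col_min = -(x + block_w + interp_extra); /* block fully left of image */
    *col_max = (img_w - x) + interp_extra;    /* block fully right of image */
  }
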
Yaowu Xu
f70330a906 fix a bug when a null function pointer is used.
For certain partition sizes, the function pointer may not be
initialized at all. The patch prevents the call if the pointer is not set.

Change-Id: I78b8c3992b639e8799a16b3c74f0973d07b8b9ac
2013-08-19 17:16:12 -07:00
Dmitry Kovalev
569ca37d09 Moving plane_block_idx from vp9_blockd.h to vp9_quantize.c.
Change-Id: Ib8af21f2e7f603c2fb407e5d15a3bba64b545b49
2013-08-19 16:44:10 -07:00
Jingning Han
3275ad701a Enable early termination in uv rd loop
This commit enables early termination in the rate-distortion
optimization search loop for chroma components. When the cumulative
rd cost is above the current best value, skip the rest of the per-block
transform/quantization/coeff_cost steps and continue to the next
prediction mode.

For bus_cif at 2000 kbps, the average run-time goes down from
168546ms -> 164678ms, (2% speed-up) at speed 0
 36197ms ->  34465ms, (4% speed-up) at speed 1

Change-Id: I9d3043864126e62bd0166250d66b3170d520b3c0
2013-08-19 16:31:19 -07:00
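
In outline, the early-out works like the sketch below (names and the per-block cost helper are illustrative, not the actual loop):

#include <stdint.h>

/* Accumulate per-block chroma rd cost; bail out as soon as the running
 * total can no longer beat the best mode found so far. */
static int64_t chroma_rd_with_early_exit(const int64_t *block_rd, int n,
                                         int64_t best_rd) {
  int64_t this_rd = 0;
  int i;
  for (i = 0; i < n; ++i) {
    this_rd += block_rd[i];  /* per-block transform/quant/coeff cost */
    if (this_rd > best_rd)
      return INT64_MAX;      /* not competitive: skip the remaining blocks */
  }
  return this_rd;
}
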
Dmitry Kovalev
82d4d9a008 Passing plane_bsize to foreach_transformed_block_visitor.
Updating all foreach_transformed_block_visitor functions to work with
plane block size instead of general block. Removing a lot of duplicated
code.

Change-Id: I6a9069e27528c611f5a648e1da0c5a5fd17f1bb4
2013-08-19 15:47:24 -07:00
Jingning Han
31c97c2bdf Merge "Fix potential use of uninitialized value" 2013-08-19 15:15:58 -07:00
Jingning Han
5dc0b309ab Merge "Fix the returned distortion value in rd_pick_intra" 2013-08-19 14:34:19 -07:00
Dmitry Kovalev
2e3478a593 Using plane_bsize instead of bsize.
This change set is intermediate. The next one will remove all repetitive
plane_bsize calculations, because plane_bsize will be passed as an
argument to foreach_transformed_block_visitor.

Change-Id: Ifc12e0b330e017c6851a28746b3a5460b9bf7f0b
2013-08-19 13:20:21 -07:00
Adrian Grange
5a1a269f67 Further correct bug in loopfilter initialization
The intent was to initialize the deltas for the
segment to the computed value, irrespective of mode
and reference frame, when (mode_ref_delta_enabled == 0).

(In response to bug posted by Manjit Hota to codec-devel
and webm-discuss lists)

Change-Id: I10435cb63d0f88359bb4c14f22181878a1988e72
2013-08-19 11:58:52 -07:00
Jingning Han
b34ce04378 Fix potential use of uninitialized value
Initialize the best mode and tx_size values in the rate-distortion
optimization search loop.

Change-Id: Ibfb5c0895691f172abcd4265c23aef4cb99fa8af
2013-08-19 11:15:53 -07:00
Jingning Han
f67919ae86 Fix the returned distortion value in rd_pick_intra
Return the distortion value in vp9_rd_pick_intra_mode_sb as the sum of
dist_y and dist_uv. Remove the right shift operation on dist_uv,
and make it consistent with that of vp9_rd_pick_inter_mode_sb.

Change-Id: I9d564e242d9add38e32595d33b0e0dddb1d55e5b
2013-08-16 21:23:22 -07:00
Dmitry Kovalev
26e5b5e25d Removing unused or redundant arguments from *_args structures.
Redundant dst, pre[2] from build_inter_predictors_args, unused cm from
encode_b_args.

Change-Id: I2c476cd328c5c0cca4c78ba451ca6ba2a2c37e2d
2013-08-16 12:51:20 -07:00
Dmitry Kovalev
367cb10fcf Merge "Moving from ss_txfrm_size to tx_size." 2013-08-16 12:46:45 -07:00
Dmitry Kovalev
1462433370 Merge "Renaming d27 predictor to d207." 2013-08-16 12:07:24 -07:00
Johann
d514b778c4 Merge "Reduce the instructions of idct8x8. Also add the saving and restoring of D registers." 2013-08-16 11:30:21 -07:00
Johann
65aa89af1a Merge "Reduce instructions of idct4x4." 2013-08-16 11:28:35 -07:00
Frank Galligan
bdc785e976 Merge "vp9: neon: optimise vp9_wide_mbfilter_neon" 2013-08-16 11:16:48 -07:00
hkuang
df0715204c Reduce instructions of idct4x4.
Change-Id: Ia26a2526804e7e2f656b0051618a615fca8fc79d
2013-08-16 10:54:56 -07:00
hkuang
60ecd60c9a Reduce the instructions of idct8x8. Also add the
saving and restoring of D registers.

Change-Id: Id3630c90fcb160ef939fef55411342608af5f990
2013-08-16 10:32:12 -07:00
Johann
bba68342ce Merge "vp9: neon: use aligned stores in convolve functions" 2013-08-16 10:29:59 -07:00
Adrian Grange
79f4c1b9a4 Fixed typos and formatting
Change-Id: I3814984a624bc64147c57efa74fbdda8eda47262
2013-08-16 09:15:26 -07:00
Adrian Grange
3e340880a8 Merge "Added resizing & initialization of last frame segment map" 2013-08-16 09:07:36 -07:00
Mans Rullgard
4fa93bcef4 vp9: neon: use aligned stores in convolve functions
The destination is block-aligned so it is safe to use aligned
stores.

Change-Id: I38261e4fa40bc60e6472edffece59e372908da7e
2013-08-16 14:25:08 +01:00
Dmitry Kovalev
afd9bd3e3c Moving from ss_txfrm_size to tx_size.
Updating foreach_transformed_block_visitor and corresponding functions
to accept tx_size instead of ss_txfrm_size. List of functions per file:

vp9_decodframe.c
  decode_block
  decode_block_intra

vp9_detokenize.c
  decode_block

vp9_encodemb.c
  optimize_block
  vp9_xform_quant
  vp9_encode_block_intra

vp9_rdopt.c
  dist_block
  rate_block
  block_yrd_txfm

vp9_tokenize.c
  set_entropy_context_b
  tokenize_b
  is_skippable

Change-Id: I351bf563eb36cf34db71c3f06b9bbc9a61b55b73
2013-08-15 17:03:03 -07:00
Jingning Han
5e80a49307 Merge "Refactor rd loop for chroma components" 2013-08-15 16:02:12 -07:00
Adrian Grange
d5bec522da Added resizing & initialization of last frame segment map
When the frame size changes, the last frame segment map must
be resized to match and initialized to 0.

Change-Id: Idc10de109f55dbe9af3a6caae355a2974712243d
2013-08-15 15:35:21 -07:00
Dmitry Kovalev
9451e8d37e Merge "Converting code from using ss_txfrm_size to tx_size." 2013-08-15 15:21:09 -07:00
Dmitry Kovalev
939b1e4a8c Merge "Moving segmentation struct from MACROBLOCKD to VP9_COMMON." 2013-08-15 15:14:32 -07:00
Johann
a9aa7d07d0 Merge "vp9: neon: add vp9_convolve_avg_neon" 2013-08-15 14:55:15 -07:00
Johann
63e140eaa7 Merge "vp9: neon: add vp9_convolve_copy_neon" 2013-08-15 14:55:08 -07:00
Jingning Han
68369ca897 Refactor rd loop for chroma components
This commit makes the rate-distortion optimization search of chroma
components consistent across all block sizes. It removes redundant
code.

Change-Id: I7e76f54d045e8efdd41d84a164c71f55b484471b
2013-08-15 14:54:48 -07:00
Jingning Han
c2ff1882ff Merge "Remove unused RDCOST_8X8 macro" 2013-08-15 13:48:25 -07:00
Jingning Han
ca983f34f7 Merge "Unify luma and chroma rd-cost estimation" 2013-08-15 13:48:15 -07:00
Dmitry Kovalev
bb3b817c1e Converting code from using ss_txfrm_size to tx_size.
Updated function signatures:
  txfrm_block_to_raster_block
  txfrm_block_to_raster_xy
  extend_for_intra
  vp9_optimize_b

Change-Id: I7213f4c4b1b9ec802f90621d5ba61d5e4dac5e0a
2013-08-15 11:44:57 -07:00
Dmitry Kovalev
6f4fa44c42 Using { 0 } for initialization instead of memset.
Change-Id: I4fad357465022d14bfc7e13b348c6da267587314
2013-08-15 11:37:56 -07:00
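
For example (the two forms are equivalent; the struct here is an illustrative stand-in):

#include <stdint.h>
#include <string.h>

typedef struct { int rate; int64_t dist; } RD_STATS;  /* illustrative */

void zero_init_examples(void) {
  RD_STATS a = { 0 };        /* aggregate zero-initialization */
  RD_STATS b;
  memset(&b, 0, sizeof(b));  /* the memset form being replaced */
  (void)a;
  (void)b;
}
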
Dmitry Kovalev
81d7bd50f5 Renaming d27 predictor to d207.
The 27-degree intra predictor is actually 207 degrees, so renaming it.

Change-Id: Ife96a910437eb80ccdc0b7a5b7a62c77542ae5be
2013-08-15 11:09:49 -07:00
Mans Rullgard
67e53716e0 vp9: neon: optimise vp9_wide_mbfilter_neon
Break up long dependency chains to improve instruction scheduling.

Change-Id: I0e0cb66943df24af920767bb4167b25c38af9630
2013-08-15 19:07:22 +01:00
James Zern
89a1fcf884 Merge "vp9_dx_iface: check for NULL/0-size input" 2013-08-15 10:59:22 -07:00
James Zern
cefaaa86c7 Merge "vpxenc: open output file after setting pass #" 2013-08-15 10:58:53 -07:00
Dmitry Kovalev
b7616e387e Moving segmentation struct from MACROBLOCKD to VP9_COMMON.
VP9_COMMON is the right place for the segmentation struct because it has
global segmentation parameters, not something specific to macroblock
processing.

Change-Id: Ib9ada0c06c253996eb3b5f6cccf6a323fbbba708
2013-08-15 10:47:48 -07:00
Jingning Han
b0646f9e98 Remove unused RDCOST_8X8 macro
Change-Id: I17c7d7eaa60fe69c543403c340f7c1078bfd339f
2013-08-15 10:40:44 -07:00
Dmitry Kovalev
4d73416099 Merge "Quantization code cleanup." 2013-08-15 10:23:01 -07:00
Deb Mukherjee
24856b6abc Speed feature to skip split partition based on var
Adds a speed feature to disable split partition search based on a
given threshold on the source variance. A tighter threshold derived
from the threshold provided is used to also disable horizontal and
vertical partitions.

Results on derfraw300:
threshold = 16, psnr = -0.057%, speedup ~1% (football)
threshold = 32, psnr = -0.150%, speedup ~4-5% (football)
threshold = 64, psnr = -0.570%, speedup ~10-12% (football)

Results on stdhdraw250:
threshold = 32, psnr = -0.18%, speedup is somewhat more than derf
because of a larger number of smoother blocks at higher resolution.

Based on these results, a threshold of 32 is chosen for speed 1,
and a threshold of 64 is chosen for speeds 2 and above.

Change-Id: If08912fb6c67fd4242d12a0d094783a99f52f6c6
2013-08-15 10:01:45 -07:00
Jingning Han
ec01f52ffa Unify luma and chroma rd-cost estimation
This commit unifies the rate-distortion cost calculation process of
luma and chroma components. It allows early termination to be enabled
later in the rd search loop of chroma components, consistent with
luma pixels.

Change-Id: I2e52a7c6496176bf2a5e3ef338d34ceb8aad9b3d
2013-08-15 09:41:33 -07:00
Paul Wilkins
1a3641d91b Merge "Renaming in MB_MODE_INFO" 2013-08-15 02:12:48 -07:00
James Zern
adfc54a464 Merge "Get rid of bashisms in armlink_adapter.sh" 2013-08-14 22:22:15 -07:00
Guillaume Martres
eb2fbea621 Get rid of bashisms in armlink_adapter.sh
Change-Id: If3cd84bb873fbad5622172c9b79ad8ae5485235a
2013-08-14 18:57:10 -07:00
James Zern
ab21378a2e Merge "Get rid of bashisms in the main build scripts" 2013-08-14 18:49:52 -07:00
James Zern
20395189cd vp9_dx_iface: check for NULL/0-size input
avoids a crash caused by issue #585

Change-Id: I301595ee0227699b0da6f0dad6d870dd546e94ef
2013-08-14 18:35:22 -07:00
James Zern
8cb09719a3 vpxenc: open output file after setting pass #
write_ivf_file_header would incorrectly skip writing the file header in
the 2nd pass, causing the initial frame header to be overwritten on
close, potentially causing an overly large frame header to be read and a
crash.

most likely broken since:
9e50ed7 vpxenc: initial implementation of multistream support

fixes issue #585

Change-Id: I7e863e295dd6344c33b3e9c07f9f0394ec496e7b
2013-08-14 18:34:44 -07:00
hkuang
39f42c8713 Merge "Add neon optimized vp9_short_idct16x16_add." 2013-08-14 14:16:20 -07:00
hkuang
cf6beea661 Add neon optimized vp9_short_idct16x16_add.
Change-Id: I27134b9a5cace2bdad53534562c91d829b48838d
2013-08-14 13:52:16 -07:00
Dmitry Kovalev
bb072000e8 foreach_transformed_block_in_plane cleanup, explicit tx_size var.
Making foreach_transformed_block_in_plane clearer (it's not finished
yet). Using an explicit tx_size variable consistently instead of the
(ss_txfrm_size / 2) or (ss_txfrm_size >> 1) expression.

Change-Id: I1b9bba2c0a9f817fca72c88324bbe6004766fb7d
2013-08-14 11:39:31 -07:00
Dmitry Kovalev
f2c073efaa Adding const to arguments of intra prediction functions.
Adding const to above and left pointers. Cleanup.

Change-Id: I51e195fa2e2923048043fe68b4e38a47ee82cda1
2013-08-14 10:35:56 -07:00
Mans Rullgard
0f1deccf86 vp9: neon: add vp9_convolve_avg_neon
Change-Id: I33cff9ac4f2234558f6f87729f9b2e88a33fbf58
2013-08-14 16:27:55 +01:00
Mans Rullgard
635ba269be vp9: neon: add vp9_convolve_copy_neon
Change-Id: I15adbbda15d1842e9f15f21878a5ffbb75c3c0c9
2013-08-14 16:27:55 +01:00
Paul Wilkins
26fead7ecf Renaming in MB_MODE_INFO
The macroblock mode info context originally contained an
entry for each 16x16 macroblock. In VP9 each entry refers
to an 8x8 region, not a macroblock, so the naming is misleading.

This first stage clean up changes the names of 3 entries in the
structure to remove the mb_ prefix.

TODO clean up the nomenclature more widely in respect of
mbmi and bmi.

Change-Id: Ia7305c6d0cb805dfe8cdc98dad21338f502e49c6
2013-08-14 12:47:52 +01:00
Paul Wilkins
54979b4350 Merge "Honor min_partition_size properly for non-square splits" 2013-08-14 04:45:18 -07:00
Guillaume Martres
3526f1cd5e Get rid of bashisms in the main build scripts
The conversion was done with the help of the checkbashisms script
and https://wiki.ubuntu.com/DashAsBinSh .

Change-Id: Id64ecefb35c8d72302f343cd2ec442e7ef989d47
2013-08-13 18:48:35 -07:00
Guillaume Martres
fc50477082 Honor min_partition_size properly for non-square splits
Don't do vertical or horizontal splits if subsize < min_partition_size,
except for edge blocks where it makes sense.

Change-Id: I479aa66ba1838d227b5de8312d46be184a8d6401
2013-08-13 15:24:03 -07:00
Dmitry Kovalev
bcc8e9d9c6 Merge "Little cleanup inside decode_tile() function." 2013-08-13 14:43:10 -07:00
Guillaume Martres
ecb78b3e0c Merge "Trivial clean up." 2013-08-13 12:40:37 -07:00
Jingning Han
7e0f88b6be Use lookup table to find largest txfm size
Refactor choose_largest_txfm_size_ and make it find the largest
transform size via lookup table.

Change-Id: I685e0396d71111b599d5367ab1b9c934bd5490c8
2013-08-13 10:32:14 -07:00
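
The shape of such a lookup, as a hedged sketch (the enum ordering and table values illustrate the idea and are not copied from the patch):

typedef enum { TX_4X4, TX_8X8, TX_16X16, TX_32X32 } TX_SIZE;

typedef enum {
  BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, BLOCK_8X8, BLOCK_8X16, BLOCK_16X8,
  BLOCK_16X16, BLOCK_16X32, BLOCK_32X16, BLOCK_32X32, BLOCK_32X64,
  BLOCK_64X32, BLOCK_64X64, BLOCK_SIZES
} BLOCK_SIZE;

/* Largest transform that fits in each block size: one array lookup
 * replaces a chain of conditionals. */
static const TX_SIZE max_txsize_lookup[BLOCK_SIZES] = {
  TX_4X4,   TX_4X4,   TX_4X4,   TX_8X8,   TX_8X8,   TX_8X8,
  TX_16X16, TX_16X16, TX_16X16, TX_32X32, TX_32X32, TX_32X32, TX_32X32
};
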
Dmitry Kovalev
8105ce6dce Merge "Using is_inter_block() instead of repetitive code." 2013-08-13 10:00:01 -07:00
Jingning Han
dc70fbe42d Merge "Refactor model based tx search in super_block_yrd" 2013-08-13 08:48:49 -07:00
Paul Wilkins
5459f68d71 Trivial clean up.
Delete unused / commented-out variable references.

Change-Id: Iaf20c0c3744f89adb296d153b516b5ea41b4f3b4
2013-08-13 13:26:18 +01:00
Paul Wilkins
8e35263bed Merge "Honor min_partition_size properly" 2013-08-13 05:19:51 -07:00
Paul Wilkins
902f9c7cbd Merge "Broken loop filter case." 2013-08-13 01:56:29 -07:00
Jingning Han
39fe235032 Merge "SSE2 high precision 32x32 forward DCT" 2013-08-12 23:03:47 -07:00
Dmitry Kovalev
2c7ae8c29a Little cleanup inside decode_tile() function.
Change-Id: I3ed4beb59371fe21ca3e82253aa98e0cbd5e0630
2013-08-12 18:28:13 -07:00
Johann
4417c04531 Merge "vp9: neon: optimise convolve8_vert functions" 2013-08-12 17:54:47 -07:00
Johann
4cabbca4ce Merge "vp9: neon: optimise convolve8_horiz functions" 2013-08-12 17:54:42 -07:00
Dmitry Kovalev
32006aadd8 Using is_inter_block() instead of repetitive code.
Change-Id: If0b04c476c34fb8c102c9f750d7fe5669a86a532
2013-08-12 17:42:14 -07:00
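
The factored-out helper, sketched in the codebase's idiom (the MB_MODE_INFO fields are abbreviated and INTRA_FRAME's value is assumed):

#include <stdint.h>

#define INTRA_FRAME 0
typedef struct { int8_t ref_frame[2]; /* ... */ } MB_MODE_INFO;

static int is_inter_block(const MB_MODE_INFO *mbmi) {
  /* Intra blocks carry INTRA_FRAME in ref_frame[0]; anything greater
   * names a real reference frame. */
  return mbmi->ref_frame[0] > INTRA_FRAME;
}
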
Jingning Han
78136edcdc SSE2 high precision 32x32 forward DCT
Enable SSE2 implementation of the high precision 32x32 forward DCT. The
intermediate values are kept at 32-bit depth. The run-time goes down from
32126 cycles to 13442 cycles.

Change-Id: Ib5ccafe3176c65bd6f2dbdef790bd47bbc880e56
2013-08-12 16:52:53 -07:00
Jingning Han
14cc7b319f Refactor model based tx search in super_block_yrd
Remove unnecessary conditional branches in model-based transform
size search.

Change-Id: Ic862dc33ed6710a186f6248239dd5f09b5c19981
2013-08-12 16:34:48 -07:00
Dmitry Kovalev
b89eef8f82 Merge "Simplifying vp9_mvref_common.c." 2013-08-12 16:24:22 -07:00
Dmitry Kovalev
b214cd0dab Merge "Removing foreach_predicted_block_uv function." 2013-08-12 15:54:01 -07:00
Dmitry Kovalev
98e3d73e16 Merge "Using MV* instead of int_mv* as argument of vp9_clamp_mv_min_max." 2013-08-12 15:53:25 -07:00
Dmitry Kovalev
1a5e6ffb02 Simplifying vp9_mvref_common.c.
Change-Id: I272df2e33fa05310466acf06c179728514dd7494
2013-08-12 15:52:08 -07:00
Dmitry Kovalev
9d5885b0ab Quantization code cleanup.
Change-Id: I77b42418b852093f79260cbd880533a0bd86678f
2013-08-12 15:23:47 -07:00
Dmitry Kovalev
c66320b3e4 Merge "Entropy context related cleanups." 2013-08-12 15:18:24 -07:00
Dmitry Kovalev
bd1bc1d303 Merge "Making scaling code more clear." 2013-08-12 15:17:26 -07:00
Dmitry Kovalev
9a31d05e24 Removing unused convolve_avg_c function + cleanup.
Change-Id: Id2b126c6456627c25e4041a82e304d0151d951ba
2013-08-12 14:28:00 -07:00
Dmitry Kovalev
1aedfc992a Using MV* instead of int_mv* as argument of vp9_clamp_mv_min_max.
Change-Id: I3c45916a9059f11b41e9d798e34ffee052969a44
2013-08-12 13:56:04 -07:00
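
For context, the two types, as a sketch consistent with vp9_mv.h at the time (the union packs a whole mv into one 32-bit word for fast compares):

#include <stdint.h>

typedef struct mv {
  int16_t row;
  int16_t col;
} MV;

typedef union int_mv {
  uint32_t as_int;  /* whole-vector compare/assign in one operation */
  MV as_mv;
} int_mv;

Passing MV* advertises that vp9_clamp_mv_min_max only needs the row/col components, not the packed integer view.
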
Dmitry Kovalev
76d166e413 Removing foreach_predicted_block_uv function.
Adding function build_inter_predictors_for_planes to build inter
predictors for the specified planes. This function makes it possible to
remove the condition "#if CONFIG_ALPHA" and use MAX_MB_PLANE for the
general case. Renaming the 'which_mv' local var to 'ref', and the
'weight' argument to 'ref'.

Change-Id: I1a97160c9263006929d38953f266bc68e9c56c7d
2013-08-12 13:54:13 -07:00
Dmitry Kovalev
a72e269318 Making scaling code more clear.
Reusing existing functions, using constants instead of magic numbers.

Change-Id: Idc689ffba52c9a8b203fcf26bd67110ecb5635f9
2013-08-12 13:30:26 -07:00
Paul Wilkins
c3b5ef7600 Broken loop filter case.
The loop filter level was moved to common, but this case was missed.

Change-Id: I7fcb557e46ef4ed8e2b5e9c3e82cb042b55bbd7f
2013-08-12 19:46:47 +01:00
Jingning Han
3984b41c87 Fix a compile failure in vp9_get_compressed_data
The lf struct is now with VP9_COMMON, instead of MACROBLOCKD.

Change-Id: Idfdd4f91f78f486078a138322d58bb61e93e1bc9
2013-08-12 11:42:17 -07:00
Dmitry Kovalev
8b0e6035a2 Entropy context related cleanups.
Adding set_skip_context() function used from both encoder and decoder.

Change-Id: Ia22cfad3211a00a63eb294f64f857b78f4aa9b85
2013-08-12 11:24:24 -07:00
Mans Rullgard
ad7021dd6c vp9: neon: optimise convolve8_vert functions
Invert loops to operate vertically in the inner loop.  This allows
removing redundant loads.

Also add preloading of data.

Change-Id: I4fa85c0ab1735bcb1dd6ea58937efac949172bdc
2013-08-12 15:37:48 +01:00
Dmitry Kovalev
097046ae28 Merge "Removing redundant code and function arguments." 2013-08-11 12:20:58 -07:00
Mans Rullgard
b84dc949c8 vp9: neon: optimise convolve8_horiz functions
Each iteration of the horizontal loop reuses 7 of the 11 source
values.  Loading only the 4 new values saves some time.

Also add preload for source data.

Overall 4% faster on Chromebook.

Change-Id: I8f69e749f2b7f79e9734620dcee51dbfcd716b44
2013-08-11 16:21:55 +01:00
Dmitry Kovalev
3c43ec206c Renaming BLOCK_SIZE_TYPES constant to BLOCK_SIZES.
There will be another change set to rename BLOCK_SIZE_TYPE enum to
BLOCK_SIZE.

Change-Id: I8d1dfc873d6186fa5e554262f5169e929978085e
2013-08-09 17:47:32 -07:00
Guillaume Martres
58b07a6f9d Honor min_partition_size properly
It represents the minimum partition size, so don't split if
bsize == min_partition_size.

Change-Id: Id77c32d6afef7d2ddec0368eaae18fb13227d30e
2013-08-09 17:28:33 -07:00
Dmitry Kovalev
67fe9d17cb Removing redundant code and function arguments.
Change-Id: Ia5cdda0f755befcd1e64397452c42cb7031ca574
2013-08-09 17:24:40 -07:00
Dmitry Kovalev
e7c5ca8983 Merge "Inlining 16 as a stride for BLOCK_OFFSET macro." 2013-08-09 17:22:46 -07:00
James Zern
ef101af8ae Merge "vp9_rd_pick_inter_mode_sb: fix uninitialized value" 2013-08-09 17:13:32 -07:00
Dmitry Kovalev
f1559bdeaf Inlining 16 as a stride for BLOCK_OFFSET macro.
Change-Id: I7f23d174eb089e5500f268a10db09648634c1b82
2013-08-09 16:40:05 -07:00
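
The change, sketched (assuming 16 is the number of coefficients per 4x4 block, which every caller was passing):

/* before: callers always supplied n == 16 */
#define BLOCK_OFFSET_OLD(x, i, n) ((x) + (i) * (n))
/* after: the constant stride is folded into the macro */
#define BLOCK_OFFSET(x, i) ((x) + (i) * 16)
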
James Zern
f295774d43 vp9_rd_pick_inter_mode_sb: fix uninitialized value
'skippable' can remain unset and negatively affect later decisions

address one aspect of issue #599

Change-Id: Iffdf0ac2e49ac481c27dc27c87fa546d4167bb28
2013-08-09 16:26:22 -07:00
Dmitry Kovalev
125146034e Merge "Using MV struct instead of int[2] array." 2013-08-09 15:33:08 -07:00
Dmitry Kovalev
cd0629fe68 Merge "Removing plane_block_{width, height}_log2by4 functions." 2013-08-09 15:26:51 -07:00
Dmitry Kovalev
ff7df102d9 Merge "Moving loopfilter struct to VP9_COMMON." 2013-08-09 15:23:00 -07:00
Dmitry Kovalev
816d6c989c Moving loopfilter struct to VP9_COMMON.
Loop filter configuration doesn't belong to the macroblock, so moving it
from MACROBLOCKD to VP9_COMMON. Also moving the declaration of the
loopfilter struct from vp9_blockd.h to vp9_loopfilter.h.

Change-Id: I4b3e34be9623b47cda35f9b1f9951f8c5b1d5d28
2013-08-09 14:41:51 -07:00
Dmitry Kovalev
8ffe85ad00 Moving scale_factors and related code to separate files.
Change-Id: I531829e5aee2a4a7a112d528ecccbddf052d0e74
2013-08-09 14:07:09 -07:00
Scott LaVarnway
ace93a175d Merge "Bug fix: call set_offsets before rd_auto_partition_range" 2013-08-09 12:30:52 -07:00
Dmitry Kovalev
fa0cd61087 Merge "Using buf_2d struct instead of separate buffer and stride vars." 2013-08-09 11:50:58 -07:00
Scott LaVarnway
41251ae558 Bug fix: call set_offsets before rd_auto_partition_range
The set_offsets call is necessary in order to set the
mode_info_context pointer correctly.

Change-Id: I644910cc5bacc50ee9cd78458843274ad8ee636d
2013-08-09 14:09:49 -04:00
Adrian Grange
0eef1acbef Merge "Correct bug in loopfilter initialization" 2013-08-09 09:51:58 -07:00
Adrian Grange
12eb2d0267 Correct bug in loopfilter initialization
The memset sets 16 bytes rather than the correct size of the
final array dimension (MAX_MODE_LF_DELTAS).

(In response to bug posted by Manjit Hota to codec-devel
and webm-discuss lists)

Change-Id: I8980f5aa71ddc9d7ef57c5b4700bc28ddf8651c7
2013-08-09 09:21:15 -07:00
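
The class of bug, in a hedged sketch (the array name and the value of MAX_MODE_LF_DELTAS are assumptions for illustration; the point is that 16 overruns the final array dimension):

#include <string.h>

#define MAX_MODE_LF_DELTAS 2  /* assumed value */

static signed char mode_deltas[MAX_MODE_LF_DELTAS];

static void reset_mode_deltas(void) {
  /* buggy form: memset(mode_deltas, 0, 16); writes past the array */
  memset(mode_deltas, 0, sizeof(mode_deltas));  /* fixed: derive the size */
}
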
Yaowu Xu
6ec2b85bad Added lpf level picking using partial frame
Change-Id: I599ab1bd22b5f3f10d5962c609952abdef8ff67a
2013-08-09 07:37:08 -07:00
Yaowu Xu
6a7a4ba753 renamed vp8_yv12_copy_y to vpx_yv12_copy_y
Because the routine is used by both vp8 and vp9

Change-Id: I2d35b287b5bc2394865d931a27da61f4ce7edeeb
2013-08-09 07:37:08 -07:00
Yaowu Xu
c7c9901845 added a speed feature on lpf level picking
Change-Id: Id578f8afdeab3702fc8386969f2d832d8f1b5420
2013-08-09 07:36:32 -07:00
Yaowu Xu
e3c92bd21e Merge "fix unit test failure on win32 vs2008 build" 2013-08-09 07:19:59 -07:00
Dmitry Kovalev
6fd2407035 Using buf_2d struct instead of separate buffer and stride vars.
Change-Id: Id5cc3566cc16d1e3030ddb4d1c58459320321dca
2013-08-08 21:25:48 -07:00
Dmitry Kovalev
6a8ec3eac2 General code cleanup.
Removing redundant parentheses and curly braces. Combining declarations
with initializations. Adding useful intermediate variables instead of
recalculating expressions every time.

Change-Id: I00106f404afd60bfc189905b0fded881684f941a
2013-08-08 21:12:34 -07:00
Yaowu Xu
bc484ebf06 fix unit test failure on win32 vs2008 build
The mixed use of the double type and SIMD code caused invalid values to
be stored in double variables, which in turn caused unit tests to fail.
The failures were only observed on the x86-win32-vs9 build with vs2008.

Change-Id: If0131754a3bf217a5ace303b7963e8f5162c34b5
2013-08-08 18:51:51 -07:00
Dmitry Kovalev
ee40e1a637 Merge "Cleanup inside vp9_reconinter.c." 2013-08-08 14:59:38 -07:00
Deb Mukherjee
2158909fc3 Merge "Adds a new subpel motion function" 2013-08-08 12:26:55 -07:00
Dmitry Kovalev
9e3bcdd135 Merge "Removing unneeded intermediate entropy_nodes_adapt var." 2013-08-08 12:16:57 -07:00
Dmitry Kovalev
47fad4c2d7 Using MV struct instead of int[2] array.
Change-Id: Iab951c555037e36b154f319f351c5e67f9abb931
2013-08-08 12:01:56 -07:00
Dmitry Kovalev
ac008f0030 Removing unneeded intermediate entropy_nodes_adapt var.
Change-Id: I541a178d997b4541e0e2d4d5b854e2ed6b113c3a
2013-08-08 11:52:02 -07:00
Deb Mukherjee
1ba91a84ad Adds a new subpel motion function
Adds a new subpel motion estimation function that uses a 2-level
tree-structured decision process to eliminate redundant computations.
It searches fewer points than the iterative search (which can search
the same point multiple times) but has roughly the same quality.

This is made the default setting at speeds 0 and 1, while at
speed 2 and above only a 1-level search is used.

Also includes various cleanups for consistency and redundancy removal.

Results:
derf: +0.012% psnr
stdhd: +0.09% psnr
Speedup of about 2-3%

Change-Id: Iedde4866f5475586dea0f0ba4cb7428fba24eee9
2013-08-08 11:41:49 -07:00
Adrian Grange
83ee80c045 Moved fast motion search level decision to function
Moving this block of code into a function makes the
code easier to read and change.

Change-Id: If4ede570cce1eab1982b188c4d3e4fd3d4db236e
2013-08-08 11:01:44 -07:00
Adrian Grange
aae6a4c895 Simplify & fix potential bug in rd_pick_partition
Different partitionings were not being evaluated against
best_rd and there were unnecessary calls to RDCOST. This
could have resulted in a non-optimal partitioning being
selected.

I simplified the variables used to track the rate,
distortion and RD values throughout the function.

Change-Id: Ifa7085ee80d824e86791432a5bc6d8fea5a3e313
2013-08-08 09:55:45 -07:00
Jingning Han
6bfcce8c7a Merge "Use low precision 32x32fdct for encodemb in speed1" 2013-08-07 19:05:14 -07:00
Dmitry Kovalev
61c33d0ad5 Removing plane_block_{width, height}_log2by4 functions.
Change-Id: I040b82b8e32aee272d10cbb021c7ba1c76343d7a
2013-08-07 17:06:33 -07:00
Dmitry Kovalev
a766d8918e Cleanup inside vp9_reconinter.c.
Using block width and block height instead of their logarithms. Using
SUBPEL_BITS and SUBPEL_SHIFTS constants instead of magic numbers.

Change-Id: I4e10e93c907c8a5e1cb27dfe74d1fcdcc4995448
2013-08-07 17:02:28 -07:00
Dmitry Kovalev
82d7c6fb3c Merge "Using only one scale function in scale_factors struct." 2013-08-07 16:32:09 -07:00
Dmitry Kovalev
1492698ed3 Merge "Adding ss_size_lookup table." 2013-08-07 16:08:24 -07:00
Jingning Han
debb9c68c8 Use low precision 32x32fdct for encodemb in speed1
The low precision 32x32 fdct keeps all the intermediate steps within
16-bit depth, hence allowing a faster SSE2 implementation, at the
expense of a larger round-trip error. It is used in the rate-distortion
optimization search loop only.

Using the low precision version, in place of the high precision one,
affects the compression performance by about 0.7% (derf, stdhd) at
speed 0. For speed 1, it brings the derf set down by only 0.017%.

Change-Id: I4e7d18fac5bea5317b91c8e7dabae143bc6b5c8b
2013-08-07 15:34:12 -07:00
Dmitry Kovalev
8db2675b97 Adding ss_size_lookup table.
Removing the old one, bsize_from_dim_lookup. Now we have a way to determine
the block size for a plane using its subsampling values (ss_size_lookup),
and then we can find the number of pixels in the block (num_pels_log2_lookup).

Change-Id: I6fc981da2ae093de81741d3d78eaefed11015db9
2013-08-07 15:33:17 -07:00
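
The relationship the table encodes, written as an illustrative helper (the patch precomputes this per block size in ss_size_lookup[bsize][ss_x][ss_y] rather than computing it on the fly; the struct and function here are assumptions):

typedef struct { int w, h; } BlockDim;

/* Each subsampling step halves one dimension of the luma block to get
 * the plane block size. */
static BlockDim plane_block_dim(BlockDim luma, int ss_x, int ss_y) {
  const BlockDim d = { luma.w >> ss_x, luma.h >> ss_y };
  return d;
}
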
Dmitry Kovalev
ea2348ca29 Merge "Removing NMS_STATS defines." 2013-08-07 15:28:30 -07:00
Christian Duvivier
78182538d6 Neon version of vp9_short_idct4x4_add.
Change-Id: Idec4cae0cb9b3a29835fd2750d354c1393d47aa4
2013-08-06 18:41:27 -07:00
Dmitry Kovalev
1c552e79bd Using only one scale function in scale_factors struct.
Functions scale_mv_q4 and scale_mv_q3_to_q4 were almost identical except
for the q3->q4 conversion in scale_mv_q3_to_q4. Now the q3->q4 conversion
happens directly in vp9_build_inter_predictor.

Also adding useful constants: SUBPEL_BITS and SUBPEL_MASK.

Change-Id: Ia0a6ad2ac07c45fdf95a5139ece6286c035e9639
2013-08-06 15:43:56 -07:00
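
A hedged sketch of the constants and the promotion (SUBPEL_BITS == 4 means 1/16-pel mv precision; the helper name and MV stand-in are illustrative):

#include <stdint.h>

#define SUBPEL_BITS 4
#define SUBPEL_SHIFTS (1 << SUBPEL_BITS)
#define SUBPEL_MASK (SUBPEL_SHIFTS - 1)

typedef struct mv { int16_t row, col; } MV;

/* Promote a q3 (1/8-pel) mv to q4 (1/16-pel) with one shift, so a single
 * q4 scale function can serve both callers. */
static MV mv_q3_to_q4(const MV *mv_q3) {
  const MV res = { (int16_t)(mv_q3->row << 1), (int16_t)(mv_q3->col << 1) };
  return res;
}
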
Dmitry Kovalev
5edc65d00d Removing NMS_STATS defines.
Change-Id: Iabab0e59042a33456df1d449c0d0f01debc00c7c
2013-08-02 17:10:15 -07:00
165 changed files with 14398 additions and 7141 deletions

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#!/bin/sh
##
## Copyright (c) 2010 The WebM project authors. All Rights Reserved.
##
@@ -13,20 +13,20 @@
verbose=0
set -- $*
for i; do
if [ "$i" == "-o" ]; then
if [ "$i" = "-o" ]; then
on_of=1
elif [ "$i" == "-v" ]; then
elif [ "$i" = "-v" ]; then
verbose=1
elif [ "$i" == "-g" ]; then
elif [ "$i" = "-g" ]; then
args="${args} --debug"
elif [ "$on_of" == "1" ]; then
elif [ "$on_of" = "1" ]; then
outfile=$i
on_of=0
elif [ -f "$i" ]; then
infiles="$infiles $i"
elif [ "${i:0:2}" == "-l" ]; then
elif [ "${i#-l}" != "$i" ]; then
libs="$libs ${i#-l}"
elif [ "${i:0:2}" == "-L" ]; then
elif [ "${i#-L}" != "$i" ]; then
libpaths="${libpaths} ${i#-L}"
else
args="${args} ${i}"

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#!/bin/sh
##
## configure.sh
##
@@ -198,11 +198,11 @@ add_extralibs() {
#
# Boolean Manipulation Functions
#
enable(){
enable_feature(){
set_all yes $*
}
disable(){
disable_feature(){
set_all no $*
}
@@ -219,7 +219,7 @@ soft_enable() {
for var in $*; do
if ! disabled $var; then
log_echo " enabling $var"
enable $var
enable_feature $var
fi
done
}
@@ -228,7 +228,7 @@ soft_disable() {
for var in $*; do
if ! enabled $var; then
log_echo " disabling $var"
disable $var
disable_feature $var
fi
done
}
@@ -251,10 +251,10 @@ tolower(){
# Temporary File Functions
#
source_path=${0%/*}
enable source_path_used
enable_feature source_path_used
if test -z "$source_path" -o "$source_path" = "." ; then
source_path="`pwd`"
disable source_path_used
disable_feature source_path_used
fi
if test ! -z "$TMPDIR" ; then
@@ -264,12 +264,13 @@ elif test ! -z "$TEMPDIR" ; then
else
TMPDIRx="/tmp"
fi
TMP_H="${TMPDIRx}/vpx-conf-$$-${RANDOM}.h"
TMP_C="${TMPDIRx}/vpx-conf-$$-${RANDOM}.c"
TMP_CC="${TMPDIRx}/vpx-conf-$$-${RANDOM}.cc"
TMP_O="${TMPDIRx}/vpx-conf-$$-${RANDOM}.o"
TMP_X="${TMPDIRx}/vpx-conf-$$-${RANDOM}.x"
TMP_ASM="${TMPDIRx}/vpx-conf-$$-${RANDOM}.asm"
RAND=$(awk 'BEGIN { srand(); printf "%d\n",(rand() * 32768)}')
TMP_H="${TMPDIRx}/vpx-conf-$$-${RAND}.h"
TMP_C="${TMPDIRx}/vpx-conf-$$-${RAND}.c"
TMP_CC="${TMPDIRx}/vpx-conf-$$-${RAND}.cc"
TMP_O="${TMPDIRx}/vpx-conf-$$-${RAND}.o"
TMP_X="${TMPDIRx}/vpx-conf-$$-${RAND}.x"
TMP_ASM="${TMPDIRx}/vpx-conf-$$-${RAND}.asm"
clean_temp_files() {
rm -f ${TMP_C} ${TMP_CC} ${TMP_H} ${TMP_O} ${TMP_X} ${TMP_ASM}
@@ -316,8 +317,8 @@ check_header(){
header=$1
shift
var=`echo $header | sed 's/[^A-Za-z0-9_]/_/g'`
disable $var
check_cpp "$@" <<EOF && enable $var
disable_feature $var
check_cpp "$@" <<EOF && enable_feature $var
#include "$header"
int x;
EOF
@@ -479,7 +480,7 @@ process_common_cmdline() {
for opt in "$@"; do
optval="${opt#*=}"
case "$opt" in
--child) enable child
--child) enable_feature child
;;
--log*)
logging="$optval"
@@ -491,7 +492,7 @@ process_common_cmdline() {
;;
--target=*) toolchain="${toolchain:-${optval}}"
;;
--force-target=*) toolchain="${toolchain:-${optval}}"; enable force_toolchain
--force-target=*) toolchain="${toolchain:-${optval}}"; enable_feature force_toolchain
;;
--cpu)
;;
@@ -511,7 +512,7 @@ process_common_cmdline() {
echo "${CMDLINE_SELECT}" | grep "^ *$option\$" >/dev/null ||
die_unknown $opt
fi
$action $option
${action}_feature $option
;;
--require-?*)
eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'`
@@ -523,11 +524,11 @@ process_common_cmdline() {
;;
--force-enable-?*|--force-disable-?*)
eval `echo "$opt" | sed 's/--force-/action=/;s/-/ option=/;s/-/_/g'`
$action $option
${action}_feature $option
;;
--libc=*)
[ -d "${optval}" ] || die "Not a directory: ${optval}"
disable builtin_libc
disable_feature builtin_libc
alt_libc="${optval}"
;;
--as=*)
@@ -696,13 +697,13 @@ process_common_toolchain() {
# Mark the specific ISA requested as enabled
soft_enable ${tgt_isa}
enable ${tgt_os}
enable ${tgt_cc}
enable_feature ${tgt_os}
enable_feature ${tgt_cc}
# Enable the architecture family
case ${tgt_isa} in
arm*) enable arm;;
mips*) enable mips;;
arm*) enable_feature arm;;
mips*) enable_feature mips;;
esac
# PIC is probably what we want when building shared libs
@@ -765,7 +766,7 @@ process_common_toolchain() {
case ${toolchain} in
sparc-solaris-*)
add_extralibs -lposix4
disable fast_unaligned
disable_feature fast_unaligned
;;
*-solaris-*)
add_extralibs -lposix4
@@ -790,7 +791,7 @@ process_common_toolchain() {
;;
armv5te)
soft_enable edsp
disable fast_unaligned
disable_feature fast_unaligned
;;
esac
@@ -805,7 +806,7 @@ process_common_toolchain() {
arch_int=${arch_int%%te}
check_add_asflags --defsym ARCHITECTURE=${arch_int}
tune_cflags="-mtune="
if [ ${tgt_isa} == "armv7" ]; then
if [ ${tgt_isa} = "armv7" ]; then
if [ -z "${float_abi}" ]; then
check_cpp <<EOF && float_abi=hard || float_abi=softfp
#ifndef __ARM_PCS_VFP
@@ -842,8 +843,8 @@ EOF
asm_conversion_cmd="${source_path}/build/make/ads2armasm_ms.pl"
AS_SFX=.s
msvs_arch_dir=arm-msvs
disable multithread
disable unit_tests
disable_feature multithread
disable_feature unit_tests
;;
rvct)
CC=armcc
@@ -855,7 +856,7 @@ EOF
tune_cflags="--cpu="
tune_asflags="--cpu="
if [ -z "${tune_cpu}" ]; then
if [ ${tgt_isa} == "armv7" ]; then
if [ ${tgt_isa} = "armv7" ]; then
if enabled neon
then
check_add_cflags --fpu=softvfp+vfpv3
@@ -880,8 +881,8 @@ EOF
case ${tgt_os} in
none*)
disable multithread
disable os_support
disable_feature multithread
disable_feature os_support
;;
android*)
@@ -913,9 +914,9 @@ EOF
# Cortex-A8 implementations (NDK Dev Guide)
add_ldflags "-Wl,--fix-cortex-a8"
enable pic
enable_feature pic
soft_enable realtime_only
if [ ${tgt_isa} == "armv7" ]; then
if [ ${tgt_isa} = "armv7" ]; then
soft_enable runtime_cpu_detect
fi
if enabled runtime_cpu_detect; then
@@ -969,7 +970,7 @@ EOF
;;
linux*)
enable linux
enable_feature linux
if enabled rvct; then
# Check if we have CodeSourcery GCC in PATH. Needed for
# libraries
@@ -1000,14 +1001,14 @@ EOF
tune_cflags="-mtune="
if enabled dspr2; then
check_add_cflags -mips32r2 -mdspr2
disable fast_unaligned
disable_feature fast_unaligned
fi
check_add_cflags -march=${tgt_isa}
check_add_asflags -march=${tgt_isa}
check_add_asflags -KPIC
;;
ppc*)
enable ppc
enable_feature ppc
bits=${tgt_isa##ppc}
link_with_cc=gcc
setup_gnu_toolchain
@@ -1155,7 +1156,7 @@ EOF
;;
universal*|*-gcc|generic-gnu)
link_with_cc=gcc
enable gcc
enable_feature gcc
setup_gnu_toolchain
;;
esac
@@ -1191,7 +1192,7 @@ EOF
# default use_x86inc to yes if pic is no or 64bit or we are not on darwin
echo " checking here for x86inc \"${tgt_isa}\" \"$pic\" "
if [ ${tgt_isa} = x86_64 -o ! "$pic" == "yes" -o ! ${tgt_os:0:6} = darwin ]; then
if [ ${tgt_isa} = x86_64 -o ! "$pic" = "yes" -o "${tgt_os#darwin}" = "${tgt_os}" ]; then
soft_enable use_x86inc
fi
@@ -1204,14 +1205,14 @@ EOF
enabled linux && check_add_cflags -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=0
# Check for strip utility variant
${STRIP} -V 2>/dev/null | grep GNU >/dev/null && enable gnu_strip
${STRIP} -V 2>/dev/null | grep GNU >/dev/null && enable_feature gnu_strip
# Try to determine target endianness
check_cc <<EOF
unsigned int e = 'O'<<24 | '2'<<16 | 'B'<<8 | 'E';
EOF
[ -f "${TMP_O}" ] && od -A n -t x1 "${TMP_O}" | tr -d '\n' |
grep '4f *32 *42 *45' >/dev/null 2>&1 && enable big_endian
grep '4f *32 *42 *45' >/dev/null 2>&1 && enable_feature big_endian
# Try to find which inline keywords are supported
check_cc <<EOF && INLINE="inline"
@@ -1236,7 +1237,7 @@ EOF
if enabled dspr2; then
if enabled big_endian; then
echo "dspr2 optimizations are available only for little endian platforms"
disable dspr2
disable_feature dspr2
fi
fi
;;
@@ -1287,8 +1288,8 @@ print_config_h() {
print_webm_license() {
local destination=$1
local prefix=$2
local suffix=$3
local prefix="$2"
local suffix="$3"
shift 3
cat <<EOF > ${destination}
${prefix} Copyright (c) 2011 The WebM project authors. All Rights Reserved.${suffix}
@@ -1309,7 +1310,7 @@ process_detect() {
true;
}
enable logging
enable_feature logging
logfile="config.log"
self=$0
process() {

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#!/bin/sh
##
## Copyright (c) 2010 The WebM project authors. All Rights Reserved.
##

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#!/bin/sh
##
## Copyright (c) 2010 The WebM project authors. All Rights Reserved.
##

configure vendored
View File

@@ -1,4 +1,4 @@
#!/bin/bash
#!/bin/sh
##
## configure
##
@@ -38,6 +38,7 @@ Advanced options:
${toggle_internal_stats} output of encoder internal stats for debug, if supported (encoders)
${toggle_mem_tracker} track memory usage
${toggle_postproc} postprocessing
${toggle_vp9_postproc} vp9 specific postprocessing
${toggle_multithread} multithreaded encoding and decoding
${toggle_spatial_resampling} spatial sampling (scaling) support
${toggle_realtime_only} enable this option while building for real-time encoding
@@ -153,7 +154,7 @@ all_targets="libs examples docs"
# all targets available are enabled, by default.
for t in ${all_targets}; do
[ -f ${source_path}/${t}.mk ] && enable ${t}
[ -f ${source_path}/${t}.mk ] && enable_feature ${t}
done
# check installed doxygen version
@@ -164,30 +165,30 @@ if [ ${doxy_major:-0} -ge 1 ]; then
doxy_minor=${doxy_version%%.*}
doxy_patch=${doxy_version##*.}
[ $doxy_major -gt 1 ] && enable doxygen
[ $doxy_minor -gt 5 ] && enable doxygen
[ $doxy_minor -eq 5 ] && [ $doxy_patch -ge 3 ] && enable doxygen
[ $doxy_major -gt 1 ] && enable_feature doxygen
[ $doxy_minor -gt 5 ] && enable_feature doxygen
[ $doxy_minor -eq 5 ] && [ $doxy_patch -ge 3 ] && enable_feature doxygen
fi
# install everything except the sources, by default. sources will have
# to be enabled when doing dist builds, since that's no longer a common
# case.
enabled doxygen && php -v >/dev/null 2>&1 && enable install_docs
enable install_bins
enable install_libs
enabled doxygen && php -v >/dev/null 2>&1 && enable_feature install_docs
enable_feature install_bins
enable_feature install_libs
enable static
enable optimizations
enable fast_unaligned #allow unaligned accesses, if supported by hw
enable md5
enable spatial_resampling
enable multithread
enable os_support
enable temporal_denoising
enable_feature static
enable_feature optimizations
enable_feature fast_unaligned #allow unaligned accesses, if supported by hw
enable_feature md5
enable_feature spatial_resampling
enable_feature multithread
enable_feature os_support
enable_feature temporal_denoising
[ -d ${source_path}/../include ] && enable alt_tree_layout
[ -d ${source_path}/../include ] && enable_feature alt_tree_layout
for d in vp8 vp9; do
[ -d ${source_path}/${d} ] && disable alt_tree_layout;
[ -d ${source_path}/${d} ] && disable_feature alt_tree_layout;
done
if ! enabled alt_tree_layout; then
@@ -200,10 +201,10 @@ else
[ -f ${source_path}/../include/vpx/vp8dx.h ] && CODECS="${CODECS} vp8_decoder"
[ -f ${source_path}/../include/vpx/vp9cx.h ] && CODECS="${CODECS} vp9_encoder"
[ -f ${source_path}/../include/vpx/vp9dx.h ] && CODECS="${CODECS} vp9_decoder"
[ -f ${source_path}/../include/vpx/vp8cx.h ] || disable vp8_encoder
[ -f ${source_path}/../include/vpx/vp8dx.h ] || disable vp8_decoder
[ -f ${source_path}/../include/vpx/vp9cx.h ] || disable vp9_encoder
[ -f ${source_path}/../include/vpx/vp9dx.h ] || disable vp9_decoder
[ -f ${source_path}/../include/vpx/vp8cx.h ] || disable_feature vp8_encoder
[ -f ${source_path}/../include/vpx/vp8dx.h ] || disable_feature vp8_decoder
[ -f ${source_path}/../include/vpx/vp9cx.h ] || disable_feature vp9_encoder
[ -f ${source_path}/../include/vpx/vp9dx.h ] || disable_feature vp9_decoder
[ -f ${source_path}/../lib/*/*mt.lib ] && soft_enable static_msvcrt
fi
@@ -279,6 +280,7 @@ CONFIG_LIST="
dc_recon
runtime_cpu_detect
postproc
vp9_postproc
multithread
internal_stats
${CODECS}
@@ -333,6 +335,7 @@ CMDLINE_SELECT="
dequant_tokens
dc_recon
postproc
vp9_postproc
multithread
internal_stats
${CODECS}
@@ -358,12 +361,12 @@ process_cmdline() {
for opt do
optval="${opt#*=}"
case "$opt" in
--disable-codecs) for c in ${CODECS}; do disable $c; done ;;
--disable-codecs) for c in ${CODECS}; do disable_feature $c; done ;;
--enable-?*|--disable-?*)
eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'`
if echo "${EXPERIMENT_LIST}" | grep "^ *$option\$" >/dev/null; then
if enabled experimental; then
$action $option
${action}_feature $option
else
log_echo "Ignoring $opt -- not in experimental mode."
fi
@@ -384,8 +387,8 @@ post_process_cmdline() {
# If the codec family is enabled, enable all components of that family.
log_echo "Configuring selected codecs"
for c in ${CODECS}; do
disabled ${c%%_*} && disable ${c}
enabled ${c%%_*} && enable ${c}
disabled ${c%%_*} && disable_feature ${c}
enabled ${c%%_*} && enable_feature ${c}
done
# Enable all detected codecs, if they haven't been disabled
@@ -393,12 +396,12 @@ post_process_cmdline() {
# Enable the codec family if any component of that family is enabled
for c in ${CODECS}; do
enabled $c && enable ${c%_*}
enabled $c && enable_feature ${c%_*}
done
# Set the {en,de}coders variable if any algorithm in that class is enabled
for c in ${CODECS}; do
enabled ${c} && enable ${c##*_}s
enabled ${c} && enable_feature ${c##*_}s
done
}
@@ -438,7 +441,7 @@ process_targets() {
done
enabled debug_libs && DIST_DIR="${DIST_DIR}-debug"
enabled codec_srcs && DIST_DIR="${DIST_DIR}-src"
! enabled postproc && DIST_DIR="${DIST_DIR}-nopost"
! enabled postproc && ! enabled vp9_postproc && DIST_DIR="${DIST_DIR}-nopost"
! enabled multithread && DIST_DIR="${DIST_DIR}-nomt"
! enabled install_docs && DIST_DIR="${DIST_DIR}-nodocs"
DIST_DIR="${DIST_DIR}-${tgt_isa}-${tgt_os}"
@@ -508,13 +511,13 @@ process_detect() {
fi
if [ -z "$CC" ] || enabled external_build; then
echo "Bypassing toolchain for environment detection."
enable external_build
enable_feature external_build
check_header() {
log fake_check_header "$@"
header=$1
shift
var=`echo $header | sed 's/[^A-Za-z0-9_]/_/g'`
disable $var
disable_feature $var
# Headers common to all environments
case $header in
stdio.h)
@@ -526,7 +529,7 @@ process_detect() {
[ -f "${d##-I}/$header" ] && result=true && break
done
${result:-true}
esac && enable $var
esac && enable_feature $var
# Specialize windows and POSIX environments.
case $toolchain in
@@ -534,7 +537,7 @@ process_detect() {
case $header-$toolchain in
stdint*-gcc) true;;
*) false;;
esac && enable $var
esac && enable_feature $var
;;
*)
case $header in
@@ -543,7 +546,7 @@ process_detect() {
sys/mman.h) true;;
unistd.h) true;;
*) false;;
esac && enable $var
esac && enable_feature $var
esac
enabled $var
}
@@ -561,7 +564,7 @@ EOF
check_header sys/mman.h
check_header unistd.h # for sysconf(3) and friends.
check_header vpx/vpx_integer.h -I${source_path} && enable vpx_ports
check_header vpx/vpx_integer.h -I${source_path} && enable_feature vpx_ports
}
process_toolchain() {
@@ -643,14 +646,18 @@ process_toolchain() {
# ccache only really works on gcc toolchains
enabled gcc || soft_disable ccache
if enabled mips; then
enable dequant_tokens
enable dc_recon
enable_feature dequant_tokens
enable_feature dc_recon
fi
if enabled internal_stats; then
enable_feature vp9_postproc
fi
# Enable the postbuild target if building for visual studio.
case "$tgt_cc" in
vs*) enable msvs
enable solution
vs*) enable_feature msvs
enable_feature solution
vs_version=${tgt_cc##vs}
case $vs_version in
[789])

View File

@@ -49,6 +49,9 @@ vpxenc.DESCRIPTION = Full featured encoder
UTILS-$(CONFIG_VP8_ENCODER) += vp8_scalable_patterns.c
vp8_scalable_patterns.GUID = 0D6A210B-F482-4D6F-8570-4A9C01ACC88C
vp8_scalable_patterns.DESCRIPTION = Temporal Scalability Encoder
UTILS-$(CONFIG_VP8_ENCODER) += vp9_spatial_scalable_encoder.c
vp8_scalable_patterns.GUID = 4A38598D-627D-4505-9C7B-D4020C84100D
vp8_scalable_patterns.DESCRIPTION = Spatial Scalable Encoder
# Clean up old ivfenc, ivfdec binaries.
ifeq ($(CONFIG_MSVS),yes)

View File

@@ -8,8 +8,8 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef LIBVPX_TEST_ACM_RANDOM_H_
#define LIBVPX_TEST_ACM_RANDOM_H_
#ifndef TEST_ACM_RANDOM_H_
#define TEST_ACM_RANDOM_H_
#include "third_party/googletest/src/include/gtest/gtest.h"
@@ -59,4 +59,4 @@ class ACMRandom {
} // namespace libvpx_test
#endif // LIBVPX_TEST_ACM_RANDOM_H_
#endif // TEST_ACM_RANDOM_H_

View File

@@ -29,8 +29,8 @@ class BordersTest : public ::libvpx_test::EncoderTest,
virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
::libvpx_test::Encoder *encoder) {
if ( video->frame() == 1) {
encoder->Control(VP8E_SET_CPUUSED, 0);
if (video->frame() == 1) {
encoder->Control(VP8E_SET_CPUUSED, 1);
encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
encoder->Control(VP8E_SET_ARNR_STRENGTH, 5);

View File

@@ -10,7 +10,7 @@
#ifndef TEST_CLEAR_SYSTEM_STATE_H_
#define TEST_CLEAR_SYSTEM_STATE_H_
#include "vpx_config.h"
#include "./vpx_config.h"
extern "C" {
#if ARCH_X86 || ARCH_X86_64
# include "vpx_ports/x86.h"

View File

@@ -8,6 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include <string.h>
#include "test/acm_random.h"
#include "test/register_state_check.h"
#include "test/util.h"
@@ -187,7 +188,7 @@ class ConvolveTest : public PARAMS(int, int, const ConvolveFunctions*) {
protected:
static const int kDataAlignment = 16;
static const int kOuterBlockSize = 128;
static const int kOuterBlockSize = 256;
static const int kInputStride = kOuterBlockSize;
static const int kOutputStride = kOuterBlockSize;
static const int kMaxDimension = 64;
@@ -224,6 +225,10 @@ class ConvolveTest : public PARAMS(int, int, const ConvolveFunctions*) {
input_[i] = prng.Rand8Extremes();
}
void SetConstantInput(int value) {
memset(input_, value, kInputBufferSize);
}
void CheckGuardBlocks() {
for (int i = 0; i < kOutputBufferSize; ++i) {
if (IsIndexInBorder(i))
@@ -456,45 +461,86 @@ DECLARE_ALIGNED(256, const int16_t, kChangeFilters[16][8]) = {
{ 128}
};
/* This test exercises the horizontal and vertical filter functions. */
TEST_P(ConvolveTest, ChangeFilterWorks) {
uint8_t* const in = input();
uint8_t* const out = output();
/* Assume that the first input sample is at the 8/16th position. */
const int kInitialSubPelOffset = 8;
/* Filters are 8-tap, so the first filter tap will be applied to the pixel
* at position -3 with respect to the current filtering position. Since
* kInitialSubPelOffset is set to 8, we first select sub-pixel filter 8,
* which is non-zero only in the last tap. So, applying the filter at the
* current input position will result in an output equal to the pixel at
* offset +4 (-3 + 7) with respect to the current filtering position.
*/
const int kPixelSelected = 4;
/* Assume that each output pixel requires us to step on by 17/16th pixels in
* the input.
*/
const int kInputPixelStep = 17;
/* The filters are setup in such a way that the expected output produces
* sets of 8 identical output samples. As the filter position moves to the
* next 1/16th pixel position the only active (=128) filter tap moves one
* position to the left, resulting in the same input pixel being replicated
* in to the output for 8 consecutive samples. After each set of 8 positions
* the filters select a different input pixel. kFilterPeriodAdjust below
* computes which input pixel is written to the output for a specified
* x or y position.
*/
/* Test the horizontal filter. */
REGISTER_STATE_CHECK(UUT_->h8_(in, kInputStride, out, kOutputStride,
kChangeFilters[8], 17, kChangeFilters[4], 16,
Width(), Height()));
kChangeFilters[kInitialSubPelOffset],
kInputPixelStep, NULL, 0, Width(), Height()));
for (int x = 0; x < Width(); ++x) {
const int kQ4StepAdjust = x >> 4;
const int kFilterPeriodAdjust = (x >> 3) << 3;
const int ref_x = kQ4StepAdjust + kFilterPeriodAdjust + kPixelSelected;
ASSERT_EQ(in[ref_x], out[x]) << "x == " << x;
const int ref_x =
kPixelSelected + ((kInitialSubPelOffset
+ kFilterPeriodAdjust * kInputPixelStep)
>> SUBPEL_BITS);
ASSERT_EQ(in[ref_x], out[x]) << "x == " << x << "width = " << Width();
}
/* Test the vertical filter. */
REGISTER_STATE_CHECK(UUT_->v8_(in, kInputStride, out, kOutputStride,
kChangeFilters[4], 16, kChangeFilters[8], 17,
Width(), Height()));
NULL, 0, kChangeFilters[kInitialSubPelOffset],
kInputPixelStep, Width(), Height()));
for (int y = 0; y < Height(); ++y) {
const int kQ4StepAdjust = y >> 4;
const int kFilterPeriodAdjust = (y >> 3) << 3;
const int ref_y = kQ4StepAdjust + kFilterPeriodAdjust + kPixelSelected;
const int ref_y =
kPixelSelected + ((kInitialSubPelOffset
+ kFilterPeriodAdjust * kInputPixelStep)
>> SUBPEL_BITS);
ASSERT_EQ(in[ref_y * kInputStride], out[y * kInputStride]) << "y == " << y;
}
/* Test the horizontal and vertical filters in combination. */
REGISTER_STATE_CHECK(UUT_->hv8_(in, kInputStride, out, kOutputStride,
kChangeFilters[8], 17, kChangeFilters[8], 17,
kChangeFilters[kInitialSubPelOffset],
kInputPixelStep,
kChangeFilters[kInitialSubPelOffset],
kInputPixelStep,
Width(), Height()));
for (int y = 0; y < Height(); ++y) {
const int kQ4StepAdjustY = y >> 4;
const int kFilterPeriodAdjustY = (y >> 3) << 3;
const int ref_y = kQ4StepAdjustY + kFilterPeriodAdjustY + kPixelSelected;
const int ref_y =
kPixelSelected + ((kInitialSubPelOffset
+ kFilterPeriodAdjustY * kInputPixelStep)
>> SUBPEL_BITS);
for (int x = 0; x < Width(); ++x) {
const int kQ4StepAdjustX = x >> 4;
const int kFilterPeriodAdjustX = (x >> 3) << 3;
const int ref_x = kQ4StepAdjustX + kFilterPeriodAdjustX + kPixelSelected;
const int ref_x =
kPixelSelected + ((kInitialSubPelOffset
+ kFilterPeriodAdjustX * kInputPixelStep)
>> SUBPEL_BITS);
ASSERT_EQ(in[ref_y * kInputStride + ref_x], out[y * kOutputStride + x])
<< "x == " << x << ", y == " << y;
@@ -502,6 +548,34 @@ TEST_P(ConvolveTest, ChangeFilterWorks) {
}
}
/* This test exercises that enough rows and columns are filtered with every
possible initial fractional positions and scaling steps. */
TEST_P(ConvolveTest, CheckScalingFiltering) {
uint8_t* const in = input();
uint8_t* const out = output();
SetConstantInput(127);
for (int frac = 0; frac < 16; ++frac) {
for (int step = 1; step <= 32; ++step) {
/* Test the horizontal and vertical filters in combination. */
REGISTER_STATE_CHECK(UUT_->hv8_(in, kInputStride, out, kOutputStride,
vp9_sub_pel_filters_8[frac], step,
vp9_sub_pel_filters_8[frac], step,
Width(), Height()));
CheckGuardBlocks();
for (int y = 0; y < Height(); ++y) {
for (int x = 0; x < Width(); ++x) {
ASSERT_EQ(in[y * kInputStride + x], out[y * kOutputStride + x])
<< "x == " << x << ", y == " << y
<< ", frac == " << frac << ", step == " << step;
}
}
}
}
}
using std::tr1::make_tuple;

View File

@@ -108,5 +108,5 @@ using std::tr1::make_tuple;
VP9_INSTANTIATE_TEST_CASE(
CpuSpeedTest,
::testing::Values(::libvpx_test::kTwoPassGood),
::testing::Range(0, 3));
::testing::Range(0, 5));
} // namespace

View File

@@ -75,7 +75,7 @@ class DatarateTest : public ::libvpx_test::EncoderTest,
bits_in_buffer_model_ -= frame_size_in_bits;
// Update the running total of bits for end of test datarate checks.
bits_total_ += frame_size_in_bits ;
bits_total_ += frame_size_in_bits;
// If first drop not set and we have a drop set it to this time.
if (!first_drop_ && duration > 1)

View File

@@ -13,15 +13,16 @@
#include <string.h>
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "vpx_ports/mem.h"
#include "test/acm_random.h"
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
#include "test/util.h"
extern "C" {
#include "vp9/common/vp9_entropy.h"
#include "vp9_rtcd.h"
void vp9_short_idct16x16_add_c(short *input, uint8_t *output, int pitch);
#include "./vp9_rtcd.h"
void vp9_short_idct16x16_add_c(int16_t *input, uint8_t *output, int pitch);
}
#include "acm_random.h"
#include "vpx/vpx_integer.h"
using libvpx_test::ACMRandom;
@@ -31,12 +32,13 @@ namespace {
#ifdef _MSC_VER
static int round(double x) {
if (x < 0)
return (int)ceil(x - 0.5);
return static_cast<int>(ceil(x - 0.5));
else
return (int)floor(x + 0.5);
return static_cast<int>(floor(x + 0.5));
}
#endif
const int kNumCoeffs = 256;
const double PI = 3.1415926535898;
void reference2_16x16_idct_2d(double *input, double *output) {
double x;
@@ -45,7 +47,9 @@ void reference2_16x16_idct_2d(double *input, double *output) {
double s = 0;
for (int i = 0; i < 16; ++i) {
for (int j = 0; j < 16; ++j) {
x=cos(PI*j*(l+0.5)/16.0)*cos(PI*i*(k+0.5)/16.0)*input[i*16+j]/256;
x = cos(PI * j * (l + 0.5) / 16.0) *
cos(PI * i * (k + 0.5) / 16.0) *
input[i * 16 + j] / 256;
if (i != 0)
x *= sqrt(2.0);
if (j != 0)
@@ -59,23 +63,23 @@ void reference2_16x16_idct_2d(double *input, double *output) {
}
static const double C1 = 0.995184726672197;
static const double C2 = 0.98078528040323;
static const double C3 = 0.956940335732209;
static const double C4 = 0.923879532511287;
static const double C5 = 0.881921264348355;
static const double C6 = 0.831469612302545;
static const double C7 = 0.773010453362737;
static const double C8 = 0.707106781186548;
static const double C9 = 0.634393284163646;
static const double C10 = 0.555570233019602;
static const double C11 = 0.471396736825998;
static const double C12 = 0.38268343236509;
static const double C13 = 0.290284677254462;
static const double C14 = 0.195090322016128;
static const double C15 = 0.098017140329561;
const double C1 = 0.995184726672197;
const double C2 = 0.98078528040323;
const double C3 = 0.956940335732209;
const double C4 = 0.923879532511287;
const double C5 = 0.881921264348355;
const double C6 = 0.831469612302545;
const double C7 = 0.773010453362737;
const double C8 = 0.707106781186548;
const double C9 = 0.634393284163646;
const double C10 = 0.555570233019602;
const double C11 = 0.471396736825998;
const double C12 = 0.38268343236509;
const double C13 = 0.290284677254462;
const double C14 = 0.195090322016128;
const double C15 = 0.098017140329561;
static void butterfly_16x16_dct_1d(double input[16], double output[16]) {
void butterfly_16x16_dct_1d(double input[16], double output[16]) {
double step[16];
double intermediate[16];
double temp1, temp2;
@@ -108,36 +112,36 @@ static void butterfly_16x16_dct_1d(double input[16], double output[16]) {
output[6] = step[1] - step[6];
output[7] = step[0] - step[7];
temp1 = step[ 8]*C7;
temp2 = step[15]*C9;
temp1 = step[ 8] * C7;
temp2 = step[15] * C9;
output[ 8] = temp1 + temp2;
temp1 = step[ 9]*C11;
temp2 = step[14]*C5;
temp1 = step[ 9] * C11;
temp2 = step[14] * C5;
output[ 9] = temp1 - temp2;
temp1 = step[10]*C3;
temp2 = step[13]*C13;
temp1 = step[10] * C3;
temp2 = step[13] * C13;
output[10] = temp1 + temp2;
temp1 = step[11]*C15;
temp2 = step[12]*C1;
temp1 = step[11] * C15;
temp2 = step[12] * C1;
output[11] = temp1 - temp2;
temp1 = step[11]*C1;
temp2 = step[12]*C15;
temp1 = step[11] * C1;
temp2 = step[12] * C15;
output[12] = temp2 + temp1;
temp1 = step[10]*C13;
temp2 = step[13]*C3;
temp1 = step[10] * C13;
temp2 = step[13] * C3;
output[13] = temp2 - temp1;
temp1 = step[ 9]*C5;
temp2 = step[14]*C11;
temp1 = step[ 9] * C5;
temp2 = step[14] * C11;
output[14] = temp2 + temp1;
temp1 = step[ 8]*C9;
temp2 = step[15]*C7;
temp1 = step[ 8] * C9;
temp2 = step[15] * C7;
output[15] = temp2 - temp1;
// step 3
@@ -146,20 +150,20 @@ static void butterfly_16x16_dct_1d(double input[16], double output[16]) {
step[ 2] = output[1] - output[2];
step[ 3] = output[0] - output[3];
temp1 = output[4]*C14;
temp2 = output[7]*C2;
temp1 = output[4] * C14;
temp2 = output[7] * C2;
step[ 4] = temp1 + temp2;
temp1 = output[5]*C10;
temp2 = output[6]*C6;
temp1 = output[5] * C10;
temp2 = output[6] * C6;
step[ 5] = temp1 + temp2;
temp1 = output[5]*C6;
temp2 = output[6]*C10;
temp1 = output[5] * C6;
temp2 = output[6] * C10;
step[ 6] = temp2 - temp1;
temp1 = output[4]*C2;
temp2 = output[7]*C14;
temp1 = output[4] * C2;
temp2 = output[7] * C14;
step[ 7] = temp2 - temp1;
step[ 8] = output[ 8] + output[11];
@@ -176,18 +180,18 @@ static void butterfly_16x16_dct_1d(double input[16], double output[16]) {
output[ 0] = (step[ 0] + step[ 1]);
output[ 8] = (step[ 0] - step[ 1]);
temp1 = step[2]*C12;
temp2 = step[3]*C4;
temp1 = step[2] * C12;
temp2 = step[3] * C4;
temp1 = temp1 + temp2;
output[ 4] = 2*(temp1*C8);
output[ 4] = 2*(temp1 * C8);
temp1 = step[2]*C4;
temp2 = step[3]*C12;
temp1 = step[2] * C4;
temp2 = step[3] * C12;
temp1 = temp2 - temp1;
output[12] = 2*(temp1*C8);
output[12] = 2 * (temp1 * C8);
output[ 2] = 2*((step[4] + step[ 5])*C8);
output[14] = 2*((step[7] - step[ 6])*C8);
output[ 2] = 2 * ((step[4] + step[ 5]) * C8);
output[14] = 2 * ((step[7] - step[ 6]) * C8);
temp1 = step[4] - step[5];
temp2 = step[6] + step[7];
@@ -197,17 +201,17 @@ static void butterfly_16x16_dct_1d(double input[16], double output[16]) {
intermediate[8] = step[8] + step[14];
intermediate[9] = step[9] + step[15];
temp1 = intermediate[8]*C12;
temp2 = intermediate[9]*C4;
temp1 = intermediate[8] * C12;
temp2 = intermediate[9] * C4;
temp1 = temp1 - temp2;
output[3] = 2*(temp1*C8);
output[3] = 2 * (temp1 * C8);
temp1 = intermediate[8]*C4;
temp2 = intermediate[9]*C12;
temp1 = intermediate[8] * C4;
temp2 = intermediate[9] * C12;
temp1 = temp2 + temp1;
output[13] = 2*(temp1*C8);
output[13] = 2 * (temp1 * C8);
output[ 9] = 2*((step[10] + step[11])*C8);
output[ 9] = 2 * ((step[10] + step[11]) * C8);
intermediate[11] = step[10] - step[11];
intermediate[12] = step[12] + step[13];
@@ -218,207 +222,300 @@ static void butterfly_16x16_dct_1d(double input[16], double output[16]) {
output[15] = (intermediate[11] + intermediate[12]);
output[ 1] = -(intermediate[11] - intermediate[12]);
output[ 7] = 2*(intermediate[13]*C8);
output[ 7] = 2 * (intermediate[13] * C8);
temp1 = intermediate[14]*C12;
temp2 = intermediate[15]*C4;
temp1 = intermediate[14] * C12;
temp2 = intermediate[15] * C4;
temp1 = temp1 - temp2;
output[11] = -2*(temp1*C8);
output[11] = -2 * (temp1 * C8);
temp1 = intermediate[14]*C4;
temp2 = intermediate[15]*C12;
temp1 = intermediate[14] * C4;
temp2 = intermediate[15] * C12;
temp1 = temp2 + temp1;
output[ 5] = 2*(temp1*C8);
output[ 5] = 2 * (temp1 * C8);
}
static void reference_16x16_dct_1d(double in[16], double out[16]) {
const double kPi = 3.141592653589793238462643383279502884;
const double kInvSqrt2 = 0.707106781186547524400844362104;
for (int k = 0; k < 16; k++) {
out[k] = 0.0;
for (int n = 0; n < 16; n++)
out[k] += in[n]*cos(kPi*(2*n+1)*k/32.0);
if (k == 0)
out[k] = out[k]*kInvSqrt2;
}
}
void reference_16x16_dct_2d(int16_t input[16*16], double output[16*16]) {
void reference_16x16_dct_2d(int16_t input[256], double output[256]) {
// First transform columns
for (int i = 0; i < 16; ++i) {
double temp_in[16], temp_out[16];
for (int j = 0; j < 16; ++j)
temp_in[j] = input[j*16 + i];
temp_in[j] = input[j * 16 + i];
butterfly_16x16_dct_1d(temp_in, temp_out);
for (int j = 0; j < 16; ++j)
output[j*16 + i] = temp_out[j];
output[j * 16 + i] = temp_out[j];
}
// Then transform rows
for (int i = 0; i < 16; ++i) {
double temp_in[16], temp_out[16];
for (int j = 0; j < 16; ++j)
temp_in[j] = output[j + i*16];
temp_in[j] = output[j + i * 16];
butterfly_16x16_dct_1d(temp_in, temp_out);
// Scale by some magic number
for (int j = 0; j < 16; ++j)
output[j + i*16] = temp_out[j]/2;
output[j + i * 16] = temp_out[j]/2;
}
}
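Reading the 2-D reference back as matrix algebra (my interpretation, not stated in the source): with B the transform realized by butterfly_16x16_dct_1d, the column pass followed by the row pass computes
Y = \frac{1}{2}\, B\, X\, B^{\mathsf{T}},
where the final division by 2 is the scale the comment calls a "magic number", presumably folding the butterfly's internal 2x factors back to the range the integer transform targets.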
void fdct16x16(int16_t *in, int16_t *out, uint8_t* /*dst*/,
int stride, int /*tx_type*/) {
typedef void (*fdct_t)(int16_t *in, int16_t *out, int stride);
typedef void (*idct_t)(int16_t *in, uint8_t *out, int stride);
typedef void (*fht_t) (int16_t *in, int16_t *out, int stride, int tx_type);
typedef void (*iht_t) (int16_t *in, uint8_t *dst, int stride, int tx_type);
void fdct16x16_ref(int16_t *in, int16_t *out, int stride, int tx_type) {
vp9_short_fdct16x16_c(in, out, stride);
}
void idct16x16_add(int16_t* /*in*/, int16_t *out, uint8_t *dst,
int stride, int /*tx_type*/) {
vp9_short_idct16x16_add_c(out, dst, stride >> 1);
}
void fht16x16(int16_t *in, int16_t *out, uint8_t* /*dst*/,
int stride, int tx_type) {
// FIXME(jingning): need to test both SSE2 and c
#if HAVE_SSE2
vp9_short_fht16x16_sse2(in, out, stride >> 1, tx_type);
#else
vp9_short_fht16x16_c(in, out, stride >> 1, tx_type);
#endif
}
void iht16x16_add(int16_t* /*in*/, int16_t *out, uint8_t *dst,
int stride, int tx_type) {
vp9_short_iht16x16_add_c(out, dst, stride >> 1, tx_type);
void fht16x16_ref(int16_t *in, int16_t *out, int stride, int tx_type) {
vp9_short_fht16x16_c(in, out, stride, tx_type);
}
class FwdTrans16x16Test : public ::testing::TestWithParam<int> {
class Trans16x16TestBase {
public:
virtual ~FwdTrans16x16Test() {}
virtual void SetUp() {
tx_type_ = GetParam();
if (tx_type_ == 0) {
fwd_txfm = fdct16x16;
inv_txfm = idct16x16_add;
} else {
fwd_txfm = fht16x16;
inv_txfm = iht16x16_add;
}
}
virtual ~Trans16x16TestBase() {}
protected:
void RunFwdTxfm(int16_t *in, int16_t *out, uint8_t *dst,
int stride, int tx_type) {
(*fwd_txfm)(in, out, dst, stride, tx_type);
}
void RunInvTxfm(int16_t *in, int16_t *out, uint8_t *dst,
int stride, int tx_type) {
(*inv_txfm)(in, out, dst, stride, tx_type);
virtual void RunFwdTxfm(int16_t *in, int16_t *out, int stride) = 0;
virtual void RunInvTxfm(int16_t *out, uint8_t *dst, int stride) = 0;
void RunAccuracyCheck() {
ACMRandom rnd(ACMRandom::DeterministicSeed());
uint32_t max_error = 0;
int64_t total_error = 0;
const int count_test_block = 10000;
for (int i = 0; i < count_test_block; ++i) {
DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
// Initialize a test block with input range [-255, 255].
for (int j = 0; j < kNumCoeffs; ++j) {
src[j] = rnd.Rand8();
dst[j] = rnd.Rand8();
test_input_block[j] = src[j] - dst[j];
}
REGISTER_STATE_CHECK(RunFwdTxfm(test_input_block,
test_temp_block, pitch_));
REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_));
for (int j = 0; j < kNumCoeffs; ++j) {
const uint32_t diff = dst[j] - src[j];
const uint32_t error = diff * diff;
if (max_error < error)
max_error = error;
total_error += error;
}
}
EXPECT_GE(1u, max_error)
<< "Error: 16x16 FHT/IHT has an individual round trip error > 1";
EXPECT_GE(count_test_block , total_error)
<< "Error: 16x16 FHT/IHT has average round trip error > 1 per block";
}
void RunCoeffCheck() {
ACMRandom rnd(ACMRandom::DeterministicSeed());
const int count_test_block = 1000;
DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);
for (int i = 0; i < count_test_block; ++i) {
// Initialize a test block with input range [-255, 255].
for (int j = 0; j < kNumCoeffs; ++j)
input_block[j] = rnd.Rand8() - rnd.Rand8();
fwd_txfm_ref(input_block, output_ref_block, pitch_, tx_type_);
REGISTER_STATE_CHECK(RunFwdTxfm(input_block, output_block, pitch_));
// The minimum quant value is 4.
for (int j = 0; j < kNumCoeffs; ++j)
EXPECT_EQ(output_block[j], output_ref_block[j]);
}
}
void RunMemCheck() {
ACMRandom rnd(ACMRandom::DeterministicSeed());
const int count_test_block = 1000;
DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);
for (int i = 0; i < count_test_block; ++i) {
// Initialize a test block with input range [-255, 255].
for (int j = 0; j < kNumCoeffs; ++j) {
input_block[j] = rnd.Rand8() - rnd.Rand8();
input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
}
if (i == 0)
for (int j = 0; j < kNumCoeffs; ++j)
input_extreme_block[j] = 255;
if (i == 1)
for (int j = 0; j < kNumCoeffs; ++j)
input_extreme_block[j] = -255;
fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, tx_type_);
REGISTER_STATE_CHECK(RunFwdTxfm(input_extreme_block,
output_block, pitch_));
// The minimum quant value is 4.
for (int j = 0; j < kNumCoeffs; ++j) {
EXPECT_EQ(output_block[j], output_ref_block[j]);
EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_block[j]))
<< "Error: 16x16 FDCT has coefficient larger than 4*DCT_MAX_VALUE";
}
}
}
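One plausible reading of the 4 * DCT_MAX_VALUE bound checked above (an inference, not stated in the source): the smallest quantizer step is 4, so the check is equivalent to requiring that even at the finest quantizer every coefficient stays indexable, i.e.
\left|\frac{c_j}{q_{\min}}\right| \le \mathrm{DCT\_MAX\_VALUE} \quad \text{with } q_{\min} = 4.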
void RunInvAccuracyCheck() {
ACMRandom rnd(ACMRandom::DeterministicSeed());
const int count_test_block = 1000;
DECLARE_ALIGNED_ARRAY(16, int16_t, in, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, int16_t, coeff, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
for (int i = 0; i < count_test_block; ++i) {
double out_r[kNumCoeffs];
// Initialize a test block with input range [-255, 255].
for (int j = 0; j < kNumCoeffs; ++j) {
src[j] = rnd.Rand8();
dst[j] = rnd.Rand8();
in[j] = src[j] - dst[j];
}
reference_16x16_dct_2d(in, out_r);
for (int j = 0; j < kNumCoeffs; ++j)
coeff[j] = round(out_r[j]);
const int pitch = 32;
REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch));
for (int j = 0; j < kNumCoeffs; ++j) {
const uint32_t diff = dst[j] - src[j];
const uint32_t error = diff * diff;
EXPECT_GE(1u, error)
<< "Error: 16x16 IDCT has error " << error
<< " at index " << j;
}
}
}
int pitch_;
int tx_type_;
void (*fwd_txfm)(int16_t*, int16_t*, uint8_t*, int, int);
void (*inv_txfm)(int16_t*, int16_t*, uint8_t*, int, int);
fht_t fwd_txfm_ref;
};
TEST_P(FwdTrans16x16Test, AccuracyCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
int max_error = 0;
double total_error = 0;
const int count_test_block = 10000;
for (int i = 0; i < count_test_block; ++i) {
DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 256);
DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, 256);
DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 256);
DECLARE_ALIGNED_ARRAY(16, uint8_t, src, 256);
class Trans16x16DCT : public Trans16x16TestBase,
public PARAMS(fdct_t, idct_t, int) {
public:
virtual ~Trans16x16DCT() {}
for (int j = 0; j < 256; ++j) {
src[j] = rnd.Rand8();
dst[j] = rnd.Rand8();
// Initialize a test block with input range [-255, 255].
test_input_block[j] = src[j] - dst[j];
}
virtual void SetUp() {
fwd_txfm_ = GET_PARAM(0);
inv_txfm_ = GET_PARAM(1);
tx_type_ = GET_PARAM(2);
pitch_ = 32;
fwd_txfm_ref = fdct16x16_ref;
}
virtual void TearDown() { libvpx_test::ClearSystemState(); }
const int pitch = 32;
RunFwdTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
RunInvTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
for (int j = 0; j < 256; ++j) {
const int diff = dst[j] - src[j];
const int error = diff * diff;
if (max_error < error)
max_error = error;
total_error += error;
}
protected:
void RunFwdTxfm(int16_t *in, int16_t *out, int stride) {
fwd_txfm_(in, out, stride);
}
void RunInvTxfm(int16_t *out, uint8_t *dst, int stride) {
inv_txfm_(out, dst, stride >> 1);
}
EXPECT_GE(1, max_error)
<< "Error: 16x16 FHT/IHT has an individual round trip error > 1";
fdct_t fwd_txfm_;
idct_t inv_txfm_;
};
EXPECT_GE(count_test_block , total_error)
<< "Error: 16x16 FHT/IHT has average round trip error > 1 per block";
TEST_P(Trans16x16DCT, AccuracyCheck) {
RunAccuracyCheck();
}
TEST_P(FwdTrans16x16Test, CoeffSizeCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
const int count_test_block = 1000;
for (int i = 0; i < count_test_block; ++i) {
DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, 256);
DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, 256);
DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, 256);
DECLARE_ALIGNED_ARRAY(16, int16_t, output_extreme_block, 256);
DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 256);
TEST_P(Trans16x16DCT, CoeffCheck) {
RunCoeffCheck();
}
// Initialize a test block with input range [-255, 255].
for (int j = 0; j < 256; ++j) {
input_block[j] = rnd.Rand8() - rnd.Rand8();
input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
}
if (i == 0)
for (int j = 0; j < 256; ++j)
input_extreme_block[j] = 255;
TEST_P(Trans16x16DCT, MemCheck) {
RunMemCheck();
}
const int pitch = 32;
RunFwdTxfm(input_block, output_block, dst, pitch, tx_type_);
RunFwdTxfm(input_extreme_block, output_extreme_block, dst, pitch, tx_type_);
TEST_P(Trans16x16DCT, InvAccuracyCheck) {
RunInvAccuracyCheck();
}
// The minimum quant value is 4.
for (int j = 0; j < 256; ++j) {
EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_block[j]))
<< "Error: 16x16 FDCT has coefficient larger than 4*DCT_MAX_VALUE";
EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_extreme_block[j]))
<< "Error: 16x16 FDCT extreme has coefficient larger "
<< "than 4*DCT_MAX_VALUE";
}
class Trans16x16HT : public Trans16x16TestBase,
public PARAMS(fht_t, iht_t, int) {
public:
virtual ~Trans16x16HT() {}
virtual void SetUp() {
fwd_txfm_ = GET_PARAM(0);
inv_txfm_ = GET_PARAM(1);
tx_type_ = GET_PARAM(2);
pitch_ = 16;
fwd_txfm_ref = fht16x16_ref;
}
}
virtual void TearDown() { libvpx_test::ClearSystemState(); }
INSTANTIATE_TEST_CASE_P(VP9, FwdTrans16x16Test, ::testing::Range(0, 4));
TEST(VP9Idct16x16Test, AccuracyCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
const int count_test_block = 1000;
for (int i = 0; i < count_test_block; ++i) {
int16_t in[256], coeff[256];
uint8_t dst[256], src[256];
double out_r[256];
for (int j = 0; j < 256; ++j) {
src[j] = rnd.Rand8();
dst[j] = rnd.Rand8();
}
// Initialize a test block with input range [-255, 255].
for (int j = 0; j < 256; ++j)
in[j] = src[j] - dst[j];
reference_16x16_dct_2d(in, out_r);
for (int j = 0; j < 256; j++)
coeff[j] = round(out_r[j]);
vp9_short_idct16x16_add_c(coeff, dst, 16);
for (int j = 0; j < 256; ++j) {
const int diff = dst[j] - src[j];
const int error = diff * diff;
EXPECT_GE(1, error)
<< "Error: 16x16 IDCT has error " << error
<< " at index " << j;
}
protected:
void RunFwdTxfm(int16_t *in, int16_t *out, int stride) {
fwd_txfm_(in, out, stride, tx_type_);
}
void RunInvTxfm(int16_t *out, uint8_t *dst, int stride) {
inv_txfm_(out, dst, stride, tx_type_);
}
fht_t fwd_txfm_;
iht_t inv_txfm_;
};
TEST_P(Trans16x16HT, AccuracyCheck) {
RunAccuracyCheck();
}
TEST_P(Trans16x16HT, CoeffCheck) {
RunCoeffCheck();
}
TEST_P(Trans16x16HT, MemCheck) {
RunMemCheck();
}
using std::tr1::make_tuple;
INSTANTIATE_TEST_CASE_P(
C, Trans16x16DCT,
::testing::Values(
make_tuple(&vp9_short_fdct16x16_c, &vp9_short_idct16x16_add_c, 0)));
INSTANTIATE_TEST_CASE_P(
C, Trans16x16HT,
::testing::Values(
make_tuple(&vp9_short_fht16x16_c, &vp9_short_iht16x16_add_c, 0),
make_tuple(&vp9_short_fht16x16_c, &vp9_short_iht16x16_add_c, 1),
make_tuple(&vp9_short_fht16x16_c, &vp9_short_iht16x16_add_c, 2),
make_tuple(&vp9_short_fht16x16_c, &vp9_short_iht16x16_add_c, 3)));
#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P(
SSE2, Trans16x16DCT,
::testing::Values(
make_tuple(&vp9_short_fdct16x16_sse2, &vp9_short_idct16x16_add_c, 0)));
INSTANTIATE_TEST_CASE_P(
SSE2, Trans16x16HT,
::testing::Values(
make_tuple(&vp9_short_fht16x16_sse2, &vp9_short_iht16x16_add_sse2, 0),
make_tuple(&vp9_short_fht16x16_sse2, &vp9_short_iht16x16_add_sse2, 1),
make_tuple(&vp9_short_fht16x16_sse2, &vp9_short_iht16x16_add_sse2, 2),
make_tuple(&vp9_short_fht16x16_sse2, &vp9_short_iht16x16_add_sse2, 3)));
#endif
} // namespace
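Assuming the usual test_libvpx binary name, the refactored parameterized cases can be run selectively with a gtest filter built from the INSTANTIATE_TEST_CASE_P prefixes above, e.g.:
./test_libvpx --gtest_filter='C/Trans16x16DCT.*:SSE2/Trans16x16HT.*'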


@@ -13,15 +13,17 @@
#include <string.h>
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "test/acm_random.h"
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
#include "test/util.h"
extern "C" {
#include "./vpx_config.h"
#include "vp9/common/vp9_entropy.h"
#include "./vp9_rtcd.h"
void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch);
void vp9_short_idct32x32_add_c(short *input, uint8_t *output, int pitch);
}
#include "test/acm_random.h"
#include "vpx/vpx_integer.h"
using libvpx_test::ACMRandom;
@@ -30,35 +32,15 @@ namespace {
#ifdef _MSC_VER
static int round(double x) {
if (x < 0)
return (int)ceil(x - 0.5);
return static_cast<int>(ceil(x - 0.5));
else
return (int)floor(x + 0.5);
return static_cast<int>(floor(x + 0.5));
}
#endif
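The MSVC shim above rounds half away from zero, matching C99 round(); a few values worked through its two branches:
round(2.5)  = floor(2.5 + 0.5) = floor(3.0)  =  3
round(-2.5) = ceil(-2.5 - 0.5) = ceil(-3.0)  = -3
round(-0.4) = ceil(-0.4 - 0.5) = ceil(-0.9)  =  0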
static const double kPi = 3.141592653589793238462643383279502884;
static void reference2_32x32_idct_2d(double *input, double *output) {
double x;
for (int l = 0; l < 32; ++l) {
for (int k = 0; k < 32; ++k) {
double s = 0;
for (int i = 0; i < 32; ++i) {
for (int j = 0; j < 32; ++j) {
x = cos(kPi * j * (l + 0.5) / 32.0) *
cos(kPi * i * (k + 0.5) / 32.0) * input[i * 32 + j] / 1024;
if (i != 0)
x *= sqrt(2.0);
if (j != 0)
x *= sqrt(2.0);
s += x;
}
}
output[k * 32 + l] = s / 4;
}
}
}
static void reference_32x32_dct_1d(double in[32], double out[32], int stride) {
const int kNumCoeffs = 1024;
const double kPi = 3.141592653589793238462643383279502884;
void reference_32x32_dct_1d(const double in[32], double out[32], int stride) {
const double kInvSqrt2 = 0.707106781186547524400844362104;
for (int k = 0; k < 32; k++) {
out[k] = 0.0;
@@ -69,7 +51,8 @@ static void reference_32x32_dct_1d(double in[32], double out[32], int stride) {
}
}
static void reference_32x32_dct_2d(int16_t input[32*32], double output[32*32]) {
void reference_32x32_dct_2d(const int16_t input[kNumCoeffs],
double output[kNumCoeffs]) {
// First transform columns
for (int i = 0; i < 32; ++i) {
double temp_in[32], temp_out[32];
@@ -91,27 +74,165 @@ static void reference_32x32_dct_2d(int16_t input[32*32], double output[32*32]) {
}
}
TEST(VP9Idct32x32Test, AccuracyCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
const int count_test_block = 1000;
for (int i = 0; i < count_test_block; ++i) {
int16_t in[1024], coeff[1024];
uint8_t dst[1024], src[1024];
double out_r[1024];
typedef void (*fwd_txfm_t)(int16_t *in, int16_t *out, int stride);
typedef void (*inv_txfm_t)(int16_t *in, uint8_t *dst, int stride);
for (int j = 0; j < 1024; ++j) {
class Trans32x32Test : public PARAMS(fwd_txfm_t, inv_txfm_t, int) {
public:
virtual ~Trans32x32Test() {}
virtual void SetUp() {
fwd_txfm_ = GET_PARAM(0);
inv_txfm_ = GET_PARAM(1);
version_ = GET_PARAM(2); // 0: high precision forward transform
// 1: low precision version for rd loop
}
virtual void TearDown() { libvpx_test::ClearSystemState(); }
protected:
int version_;
fwd_txfm_t fwd_txfm_;
inv_txfm_t inv_txfm_;
};
TEST_P(Trans32x32Test, AccuracyCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
uint32_t max_error = 0;
int64_t total_error = 0;
const int count_test_block = 1000;
DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
for (int i = 0; i < count_test_block; ++i) {
// Initialize a test block with input range [-255, 255].
for (int j = 0; j < kNumCoeffs; ++j) {
src[j] = rnd.Rand8();
dst[j] = rnd.Rand8();
test_input_block[j] = src[j] - dst[j];
}
const int pitch = 64;
REGISTER_STATE_CHECK(fwd_txfm_(test_input_block, test_temp_block, pitch));
REGISTER_STATE_CHECK(inv_txfm_(test_temp_block, dst, 32));
for (int j = 0; j < kNumCoeffs; ++j) {
const uint32_t diff = dst[j] - src[j];
const uint32_t error = diff * diff;
if (max_error < error)
max_error = error;
total_error += error;
}
}
if (version_ == 1) {
max_error /= 2;
total_error /= 45;
}
EXPECT_GE(1u, max_error)
<< "Error: 32x32 FDCT/IDCT has an individual round-trip error > 1";
EXPECT_GE(count_test_block, total_error)
<< "Error: 32x32 FDCT/IDCT has average round-trip error > 1 per block";
}
TEST_P(Trans32x32Test, CoeffCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
const int count_test_block = 1000;
DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);
for (int i = 0; i < count_test_block; ++i) {
for (int j = 0; j < kNumCoeffs; ++j)
input_block[j] = rnd.Rand8() - rnd.Rand8();
const int pitch = 64;
vp9_short_fdct32x32_c(input_block, output_ref_block, pitch);
REGISTER_STATE_CHECK(fwd_txfm_(input_block, output_block, pitch));
if (version_ == 0) {
for (int j = 0; j < kNumCoeffs; ++j)
EXPECT_EQ(output_block[j], output_ref_block[j])
<< "Error: 32x32 FDCT versions have mismatched coefficients";
} else {
for (int j = 0; j < kNumCoeffs; ++j)
EXPECT_GE(6, abs(output_block[j] - output_ref_block[j]))
<< "Error: 32x32 FDCT rd has mismatched coefficients";
}
}
}
TEST_P(Trans32x32Test, MemCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
const int count_test_block = 2000;
DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);
for (int i = 0; i < count_test_block; ++i) {
// Initialize a test block with input range [-255, 255].
for (int j = 0; j < 1024; ++j)
for (int j = 0; j < kNumCoeffs; ++j) {
input_block[j] = rnd.Rand8() - rnd.Rand8();
input_extreme_block[j] = rnd.Rand8() & 1 ? 255 : -255;
}
if (i == 0)
for (int j = 0; j < kNumCoeffs; ++j)
input_extreme_block[j] = 255;
if (i == 1)
for (int j = 0; j < kNumCoeffs; ++j)
input_extreme_block[j] = -255;
const int pitch = 64;
vp9_short_fdct32x32_c(input_extreme_block, output_ref_block, pitch);
REGISTER_STATE_CHECK(fwd_txfm_(input_extreme_block, output_block, pitch));
// The minimum quant value is 4.
for (int j = 0; j < kNumCoeffs; ++j) {
if (version_ == 0) {
EXPECT_EQ(output_block[j], output_ref_block[j])
<< "Error: 32x32 FDCT versions have mismatched coefficients";
} else {
EXPECT_GE(6, abs(output_block[j] - output_ref_block[j]))
<< "Error: 32x32 FDCT rd has mismatched coefficients";
}
EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_ref_block[j]))
<< "Error: 32x32 FDCT C has coefficient larger than 4*DCT_MAX_VALUE";
EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_block[j]))
<< "Error: 32x32 FDCT has coefficient larger than "
<< "4*DCT_MAX_VALUE";
}
}
}
TEST_P(Trans32x32Test, InverseAccuracy) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
const int count_test_block = 1000;
DECLARE_ALIGNED_ARRAY(16, int16_t, in, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, int16_t, coeff, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
for (int i = 0; i < count_test_block; ++i) {
double out_r[kNumCoeffs];
// Initialize a test block with input range [-255, 255]
for (int j = 0; j < kNumCoeffs; ++j) {
src[j] = rnd.Rand8();
dst[j] = rnd.Rand8();
in[j] = src[j] - dst[j];
}
reference_32x32_dct_2d(in, out_r);
for (int j = 0; j < 1024; j++)
for (int j = 0; j < kNumCoeffs; ++j)
coeff[j] = round(out_r[j]);
vp9_short_idct32x32_add_c(coeff, dst, 32);
for (int j = 0; j < 1024; ++j) {
REGISTER_STATE_CHECK(inv_txfm_(coeff, dst, 32));
for (int j = 0; j < kNumCoeffs; ++j) {
const int diff = dst[j] - src[j];
const int error = diff * diff;
EXPECT_GE(1, error)
@@ -121,72 +242,21 @@ TEST(VP9Idct32x32Test, AccuracyCheck) {
}
}
TEST(VP9Fdct32x32Test, AccuracyCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
unsigned int max_error = 0;
int64_t total_error = 0;
const int count_test_block = 1000;
for (int i = 0; i < count_test_block; ++i) {
int16_t test_input_block[1024];
int16_t test_temp_block[1024];
uint8_t dst[1024], src[1024];
using std::tr1::make_tuple;
for (int j = 0; j < 1024; ++j) {
src[j] = rnd.Rand8();
dst[j] = rnd.Rand8();
}
// Initialize a test block with input range [-255, 255].
for (int j = 0; j < 1024; ++j)
test_input_block[j] = src[j] - dst[j];
INSTANTIATE_TEST_CASE_P(
C, Trans32x32Test,
::testing::Values(
make_tuple(&vp9_short_fdct32x32_c, &vp9_short_idct32x32_add_c, 0),
make_tuple(&vp9_short_fdct32x32_rd_c, &vp9_short_idct32x32_add_c, 1)));
const int pitch = 64;
vp9_short_fdct32x32_c(test_input_block, test_temp_block, pitch);
vp9_short_idct32x32_add_c(test_temp_block, dst, 32);
for (int j = 0; j < 1024; ++j) {
const unsigned diff = dst[j] - src[j];
const unsigned error = diff * diff;
if (max_error < error)
max_error = error;
total_error += error;
}
}
EXPECT_GE(1u, max_error)
<< "Error: 32x32 FDCT/IDCT has an individual roundtrip error > 1";
EXPECT_GE(count_test_block, total_error)
<< "Error: 32x32 FDCT/IDCT has average roundtrip error > 1 per block";
}
TEST(VP9Fdct32x32Test, CoeffSizeCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
const int count_test_block = 1000;
for (int i = 0; i < count_test_block; ++i) {
int16_t input_block[1024], input_extreme_block[1024];
int16_t output_block[1024], output_extreme_block[1024];
// Initialize a test block with input range [-255, 255].
for (int j = 0; j < 1024; ++j) {
input_block[j] = rnd.Rand8() - rnd.Rand8();
input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
}
if (i == 0)
for (int j = 0; j < 1024; ++j)
input_extreme_block[j] = 255;
const int pitch = 64;
vp9_short_fdct32x32_c(input_block, output_block, pitch);
vp9_short_fdct32x32_c(input_extreme_block, output_extreme_block, pitch);
// The minimum quant value is 4.
for (int j = 0; j < 1024; ++j) {
EXPECT_GE(4*DCT_MAX_VALUE, abs(output_block[j]))
<< "Error: 32x32 FDCT has coefficient larger than 4*DCT_MAX_VALUE";
EXPECT_GE(4*DCT_MAX_VALUE, abs(output_extreme_block[j]))
<< "Error: 32x32 FDCT extreme has coefficient larger than "
"4*DCT_MAX_VALUE";
}
}
}
#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P(
SSE2, Trans32x32Test,
::testing::Values(
make_tuple(&vp9_short_fdct32x32_sse2,
&vp9_short_idct32x32_add_sse2, 0),
make_tuple(&vp9_short_fdct32x32_rd_sse2,
&vp9_short_idct32x32_add_sse2, 1)));
#endif
} // namespace


@@ -12,7 +12,7 @@
#define TEST_DECODE_TEST_DRIVER_H_
#include <cstring>
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "vpx_config.h"
#include "./vpx_config.h"
#include "vpx/vpx_decoder.h"
namespace libvpx_test {
@@ -36,9 +36,8 @@ class DxDataIterator {
};
// Provides a simplified interface to manage one video decoding.
//
// TODO: similar to Encoder class, the exact services should be
// added as more tests are added.
// Similar to Encoder class, the exact services should be added
// as more tests are added.
class Decoder {
public:
Decoder(vpx_codec_dec_cfg_t cfg, unsigned long deadline)


@@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vpx_config.h"
#include "./vpx_config.h"
#include "test/codec_factory.h"
#include "test/encode_test_driver.h"
#include "test/decode_test_driver.h"
@@ -114,19 +114,19 @@ static bool compare_img(const vpx_image_t *img1,
const unsigned int height_y = img1->d_h;
unsigned int i;
for (i = 0; i < height_y; ++i)
match = ( memcmp(img1->planes[VPX_PLANE_Y] + i * img1->stride[VPX_PLANE_Y],
img2->planes[VPX_PLANE_Y] + i * img2->stride[VPX_PLANE_Y],
width_y) == 0) && match;
match = (memcmp(img1->planes[VPX_PLANE_Y] + i * img1->stride[VPX_PLANE_Y],
img2->planes[VPX_PLANE_Y] + i * img2->stride[VPX_PLANE_Y],
width_y) == 0) && match;
const unsigned int width_uv = (img1->d_w + 1) >> 1;
const unsigned int height_uv = (img1->d_h + 1) >> 1;
for (i = 0; i < height_uv; ++i)
match = ( memcmp(img1->planes[VPX_PLANE_U] + i * img1->stride[VPX_PLANE_U],
img2->planes[VPX_PLANE_U] + i * img2->stride[VPX_PLANE_U],
width_uv) == 0) && match;
match = (memcmp(img1->planes[VPX_PLANE_U] + i * img1->stride[VPX_PLANE_U],
img2->planes[VPX_PLANE_U] + i * img2->stride[VPX_PLANE_U],
width_uv) == 0) && match;
for (i = 0; i < height_uv; ++i)
match = ( memcmp(img1->planes[VPX_PLANE_V] + i * img1->stride[VPX_PLANE_V],
img2->planes[VPX_PLANE_V] + i * img2->stride[VPX_PLANE_V],
width_uv) == 0) && match;
match = (memcmp(img1->planes[VPX_PLANE_V] + i * img1->stride[VPX_PLANE_V],
img2->planes[VPX_PLANE_V] + i * img2->stride[VPX_PLANE_V],
width_uv) == 0) && match;
return match;
}
@@ -158,7 +158,7 @@ void EncoderTest::RunLoop(VideoSource *video) {
Decoder* const decoder = codec_->CreateDecoder(dec_cfg, 0);
bool again;
for (again = true, video->Begin(); again; video->Next()) {
again = video->img() != NULL;
again = (video->img() != NULL);
PreEncodeFrameHook(video);
PreEncodeFrameHook(video, encoder);


@@ -62,7 +62,7 @@ class ErrorResilienceTest : public ::libvpx_test::EncoderTest,
if (droppable_nframes_ > 0 &&
(cfg_.g_pass == VPX_RC_LAST_PASS || cfg_.g_pass == VPX_RC_ONE_PASS)) {
for (unsigned int i = 0; i < droppable_nframes_; ++i) {
if (droppable_frames_[i] == nframes_) {
if (droppable_frames_[i] == video->frame()) {
std::cout << " Encoding droppable frame: "
<< droppable_frames_[i] << "\n";
frame_flags_ |= (VP8_EFLAG_NO_UPD_LAST |
@@ -148,7 +148,7 @@ TEST_P(ErrorResilienceTest, OnVersusOff) {
const vpx_rational timebase = { 33333333, 1000000000 };
cfg_.g_timebase = timebase;
cfg_.rc_target_bitrate = 2000;
cfg_.g_lag_in_frames = 25;
cfg_.g_lag_in_frames = 10;
init_flags_ = VPX_CODEC_USE_PSNR;
@@ -179,6 +179,9 @@ TEST_P(ErrorResilienceTest, DropFramesWithoutRecovery) {
const vpx_rational timebase = { 33333333, 1000000000 };
cfg_.g_timebase = timebase;
cfg_.rc_target_bitrate = 500;
// FIXME(debargha): Fix this to work for any lag.
// Currently this test only works for lag = 0
cfg_.g_lag_in_frames = 0;
init_flags_ = VPX_CODEC_USE_PSNR;
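For context, the frame_flags_ assignment above is how the test makes a frame droppable through the public encoder flags; a minimal sketch (NO_UPD_LAST is visible in the truncated hunk, the GF/ARF companions are my assumption from the flag family in vpx/vp8cx.h):
#include "vpx/vp8cx.h"
// A droppable frame must not update any reference buffer, so no later
// frame can depend on it and the decoder may skip it safely.
const vpx_enc_frame_flags_t kDroppableFrameFlags =
    VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF;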


@@ -15,10 +15,10 @@
#include "third_party/googletest/src/include/gtest/gtest.h"
extern "C" {
#include "vp9_rtcd.h"
#include "./vp9_rtcd.h"
}
#include "acm_random.h"
#include "test/acm_random.h"
#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"
@@ -136,7 +136,7 @@ TEST_P(FwdTrans4x4Test, RoundTripErrorCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
int max_error = 0;
double total_error = 0;
int total_error = 0;
const int count_test_block = 1000000;
for (int i = 0; i < count_test_block; ++i) {
DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 16);
@@ -156,7 +156,7 @@ TEST_P(FwdTrans4x4Test, RoundTripErrorCheck) {
RunFwdTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
for (int j = 0; j < 16; ++j) {
if(test_temp_block[j] > 0) {
if (test_temp_block[j] > 0) {
test_temp_block[j] += 2;
test_temp_block[j] /= 4;
test_temp_block[j] *= 4;


@@ -13,14 +13,16 @@
#include <string.h>
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
#include "vpx_ports/mem.h"
extern "C" {
#include "vp9_rtcd.h"
void vp9_short_idct8x8_add_c(short *input, uint8_t *output, int pitch);
#include "./vp9_rtcd.h"
void vp9_short_idct8x8_add_c(int16_t *input, uint8_t *output, int pitch);
}
#include "acm_random.h"
#include "test/acm_random.h"
#include "vpx/vpx_integer.h"
using libvpx_test::ACMRandom;
@@ -62,6 +64,7 @@ class FwdTrans8x8Test : public ::testing::TestWithParam<int> {
inv_txfm = iht8x8_add;
}
}
virtual void TearDown() { libvpx_test::ClearSystemState(); }
protected:
void RunFwdTxfm(int16_t *in, int16_t *out, uint8_t *dst,
@@ -92,8 +95,9 @@ TEST_P(FwdTrans8x8Test, SignBiasCheck) {
// Initialize a test block with input range [-255, 255].
for (int j = 0; j < 64; ++j)
test_input_block[j] = rnd.Rand8() - rnd.Rand8();
RunFwdTxfm(test_input_block, test_output_block, NULL, pitch, tx_type_);
REGISTER_STATE_CHECK(
RunFwdTxfm(test_input_block, test_output_block,
NULL, pitch, tx_type_));
for (int j = 0; j < 64; ++j) {
if (test_output_block[j] < 0)
@@ -121,8 +125,9 @@ TEST_P(FwdTrans8x8Test, SignBiasCheck) {
// Initialize a test block with input range [-15, 15].
for (int j = 0; j < 64; ++j)
test_input_block[j] = (rnd.Rand8() >> 4) - (rnd.Rand8() >> 4);
RunFwdTxfm(test_input_block, test_output_block, NULL, pitch, tx_type_);
REGISTER_STATE_CHECK(
RunFwdTxfm(test_input_block, test_output_block,
NULL, pitch, tx_type_));
for (int j = 0; j < 64; ++j) {
if (test_output_block[j] < 0)
@@ -148,7 +153,7 @@ TEST_P(FwdTrans8x8Test, SignBiasCheck) {
TEST_P(FwdTrans8x8Test, RoundTripErrorCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
int max_error = 0;
double total_error = 0;
int total_error = 0;
const int count_test_block = 100000;
for (int i = 0; i < count_test_block; ++i) {
DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 64);
@@ -165,9 +170,11 @@ TEST_P(FwdTrans8x8Test, RoundTripErrorCheck) {
test_input_block[j] = src[j] - dst[j];
const int pitch = 16;
RunFwdTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
for (int j = 0; j < 64; ++j){
if(test_temp_block[j] > 0) {
REGISTER_STATE_CHECK(
RunFwdTxfm(test_input_block, test_temp_block,
dst, pitch, tx_type_));
for (int j = 0; j < 64; ++j) {
if (test_temp_block[j] > 0) {
test_temp_block[j] += 2;
test_temp_block[j] /= 4;
test_temp_block[j] *= 4;
@@ -177,7 +184,9 @@ TEST_P(FwdTrans8x8Test, RoundTripErrorCheck) {
test_temp_block[j] *= 4;
}
}
RunInvTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
REGISTER_STATE_CHECK(
RunInvTxfm(test_input_block, test_temp_block,
dst, pitch, tx_type_));
for (int j = 0; j < 64; ++j) {
const int diff = dst[j] - src[j];
@@ -199,7 +208,7 @@ TEST_P(FwdTrans8x8Test, RoundTripErrorCheck) {
TEST_P(FwdTrans8x8Test, ExtremalCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
int max_error = 0;
double total_error = 0;
int total_error = 0;
const int count_test_block = 100000;
for (int i = 0; i < count_test_block; ++i) {
DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 64);
@@ -216,8 +225,12 @@ TEST_P(FwdTrans8x8Test, ExtremalCheck) {
test_input_block[j] = src[j] - dst[j];
const int pitch = 16;
RunFwdTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
RunInvTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
REGISTER_STATE_CHECK(
RunFwdTxfm(test_input_block, test_temp_block,
dst, pitch, tx_type_));
REGISTER_STATE_CHECK(
RunInvTxfm(test_input_block, test_temp_block,
dst, pitch, tx_type_));
for (int j = 0; j < 64; ++j) {
const int diff = dst[j] - src[j];


@@ -11,6 +11,7 @@
#define TEST_I420_VIDEO_SOURCE_H_
#include <cstdio>
#include <cstdlib>
#include <string>
#include "test/video_source.h"
@@ -34,7 +35,6 @@ class I420VideoSource : public VideoSource {
height_(0),
framerate_numerator_(rate_numerator),
framerate_denominator_(rate_denominator) {
// This initializes raw_sz_, width_, height_ and allocates an img.
SetSize(width, height);
}


@@ -15,10 +15,10 @@
#include "third_party/googletest/src/include/gtest/gtest.h"
extern "C" {
#include "vp9_rtcd.h"
#include "./vp9_rtcd.h"
}
#include "acm_random.h"
#include "test/acm_random.h"
#include "vpx/vpx_integer.h"
using libvpx_test::ACMRandom;
@@ -27,10 +27,10 @@ namespace {
#ifdef _MSC_VER
static int round(double x) {
if(x < 0)
return (int)ceil(x - 0.5);
if (x < 0)
return static_cast<int>(ceil(x - 0.5));
else
return (int)floor(x + 0.5);
return static_cast<int>(floor(x + 0.5));
}
#endif


@@ -16,7 +16,9 @@ extern "C" {
#include "test/register_state_check.h"
#include "third_party/googletest/src/include/gtest/gtest.h"
typedef void (*idct_fn_t)(short *input, unsigned char *pred_ptr,
#include "vpx/vpx_integer.h"
typedef void (*idct_fn_t)(int16_t *input, unsigned char *pred_ptr,
int pred_stride, unsigned char *dst_ptr,
int dst_stride);
namespace {
@@ -34,7 +36,7 @@ class IDCTTest : public ::testing::TestWithParam<idct_fn_t> {
virtual void TearDown() { libvpx_test::ClearSystemState(); }
idct_fn_t UUT;
short input[16];
int16_t input[16];
unsigned char output[256];
unsigned char predict[256];
};


@@ -15,8 +15,8 @@
#include "test/register_state_check.h"
#include "third_party/googletest/src/include/gtest/gtest.h"
extern "C" {
#include "vpx_config.h"
#include "vp8_rtcd.h"
#include "./vpx_config.h"
#include "./vp8_rtcd.h"
#include "vp8/common/blockd.h"
#include "vpx_mem/vpx_mem.h"
}
@@ -106,9 +106,9 @@ class IntraPredBase {
for (int y = 0; y < block_size_; y++)
sum += data_ptr_[p][y * stride_ - 1];
expected = (sum + (1 << (shift - 1))) >> shift;
} else
} else {
expected = 0x80;
}
// check that all subsequent lines are equal to the first
for (int y = 1; y < block_size_; ++y)
ASSERT_EQ(0, memcmp(data_ptr_[p], &data_ptr_[p][y * stride_],


@@ -28,7 +28,7 @@ static unsigned int MemGetLe32(const uint8_t *mem) {
// so that we can do actual file decodes.
class IVFVideoSource : public CompressedVideoSource {
public:
IVFVideoSource(const std::string &file_name)
explicit IVFVideoSource(const std::string &file_name)
: file_name_(file_name),
input_file_(NULL),
compressed_frame_buf_(NULL),


@@ -132,7 +132,6 @@ TEST_P(KeyframeTest, TestAutoKeyframe) {
// Verify that keyframes match the file keyframes in the file.
for (std::vector<vpx_codec_pts_t>::const_iterator iter = kf_pts_list_.begin();
iter != kf_pts_list_.end(); ++iter) {
if (deadline_ == VPX_DL_REALTIME && *iter > 0)
EXPECT_EQ(0, (*iter - 1) % 30) << "Unexpected keyframe at frame "
<< *iter;


@@ -8,8 +8,8 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef LIBVPX_TEST_MD5_HELPER_H_
#define LIBVPX_TEST_MD5_HELPER_H_
#ifndef TEST_MD5_HELPER_H_
#define TEST_MD5_HELPER_H_
extern "C" {
#include "./md5_utils.h"
@@ -25,9 +25,15 @@ class MD5 {
void Add(const vpx_image_t *img) {
for (int plane = 0; plane < 3; ++plane) {
uint8_t *buf = img->planes[plane];
const int h = plane ? (img->d_h + 1) >> 1 : img->d_h;
const int w = plane ? (img->d_w + 1) >> 1 : img->d_w;
const uint8_t *buf = img->planes[plane];
// Calculate the width and height to do the md5 check. For the chroma
// plane, we never want to round down and thus skip a pixel so if
// we are shifting by 1 (chroma_shift) we add 1 before doing the shift.
// This works only for chroma_shift of 0 and 1.
const int h = plane ? (img->d_h + img->y_chroma_shift) >>
img->y_chroma_shift : img->d_h;
const int w = plane ? (img->d_w + img->x_chroma_shift) >>
img->x_chroma_shift : img->d_w;
for (int y = 0; y < h; ++y) {
MD5Update(&md5_, buf, w);
@@ -61,4 +67,4 @@ class MD5 {
} // namespace libvpx_test
#endif // LIBVPX_TEST_MD5_HELPER_H_
#endif // TEST_MD5_HELPER_H_
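A worked case for the chroma rounding comment above, taking d_w = 99:
4:2:0, x_chroma_shift = 1:  w = (99 + 1) >> 1 = 50   (odd final column still hashed)
4:4:4, x_chroma_shift = 0:  w = (99 + 0) >> 0 = 99
The old (d_w + 1) >> 1 form gave 50 in both cases, silently halving the chroma planes of 4:4:4 input such as the vp91-2-04-yv444.webm vector added in this series.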


@@ -11,8 +11,8 @@
#include "test/register_state_check.h"
#include "third_party/googletest/src/include/gtest/gtest.h"
extern "C" {
#include "vpx_config.h"
#include "vp8_rtcd.h"
#include "./vpx_config.h"
#include "./vp8_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_mem/vpx_mem.h"
}
@@ -63,7 +63,8 @@ TEST_P(Vp8PostProcessingFilterTest, FilterOutputCheck) {
// Pointers to top-left pixel of block in the input and output images.
uint8_t *const src_image_ptr = src_image + (input_stride << 1);
uint8_t *const dst_image_ptr = dst_image + 8;
uint8_t *const flimits = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_width));
uint8_t *const flimits =
reinterpret_cast<uint8_t *>(vpx_memalign(16, block_width));
(void)vpx_memset(flimits, 255, block_width);
// Initialize pixels in the input:


@@ -8,8 +8,8 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef LIBVPX_TEST_REGISTER_STATE_CHECK_H_
#define LIBVPX_TEST_REGISTER_STATE_CHECK_H_
#ifndef TEST_REGISTER_STATE_CHECK_H_
#define TEST_REGISTER_STATE_CHECK_H_
#ifdef _WIN64
@@ -92,4 +92,4 @@ class RegisterStateCheck {};
#endif // _WIN64
#endif // LIBVPX_TEST_REGISTER_STATE_CHECK_H_
#endif // TEST_REGISTER_STATE_CHECK_H_


@@ -16,8 +16,68 @@
#include "test/video_source.h"
#include "test/util.h"
// Enable(1) or Disable(0) writing of the compressed bitstream.
#define WRITE_COMPRESSED_STREAM 0
namespace {
#if WRITE_COMPRESSED_STREAM
static void mem_put_le16(char *const mem, const unsigned int val) {
mem[0] = val;
mem[1] = val >> 8;
}
static void mem_put_le32(char *const mem, const unsigned int val) {
mem[0] = val;
mem[1] = val >> 8;
mem[2] = val >> 16;
mem[3] = val >> 24;
}
static void write_ivf_file_header(const vpx_codec_enc_cfg_t *const cfg,
int frame_cnt, FILE *const outfile) {
char header[32];
header[0] = 'D';
header[1] = 'K';
header[2] = 'I';
header[3] = 'F';
mem_put_le16(header + 4, 0); /* version */
mem_put_le16(header + 6, 32); /* headersize */
mem_put_le32(header + 8, 0x30395056); /* fourcc (vp9) */
mem_put_le16(header + 12, cfg->g_w); /* width */
mem_put_le16(header + 14, cfg->g_h); /* height */
mem_put_le32(header + 16, cfg->g_timebase.den); /* rate */
mem_put_le32(header + 20, cfg->g_timebase.num); /* scale */
mem_put_le32(header + 24, frame_cnt); /* length */
mem_put_le32(header + 28, 0); /* unused */
(void)fwrite(header, 1, 32, outfile);
}
static void write_ivf_frame_size(FILE *const outfile, const size_t size) {
char header[4];
mem_put_le32(header, static_cast<unsigned int>(size));
(void)fwrite(header, 1, 4, outfile);
}
static void write_ivf_frame_header(const vpx_codec_cx_pkt_t *const pkt,
FILE *const outfile) {
char header[12];
vpx_codec_pts_t pts;
if (pkt->kind != VPX_CODEC_CX_FRAME_PKT)
return;
pts = pkt->data.frame.pts;
mem_put_le32(header, static_cast<unsigned int>(pkt->data.frame.sz));
mem_put_le32(header + 4, pts & 0xFFFFFFFF);
mem_put_le32(header + 8, pts >> 32);
(void)fwrite(header, 1, 12, outfile);
}
#endif // WRITE_COMPRESSED_STREAM
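Transcribed from the helpers above (all multi-byte fields little-endian), the emitted IVF container is:
file header, 32 bytes:  'D' 'K' 'I' 'F' | version = 0 (2B) | header size = 32 (2B) |
                        fourcc 'VP90' = 0x30395056 (4B) | width (2B) | height (2B) |
                        timebase.den (4B) | timebase.num (4B) | frame count (4B) | unused (4B)
frame header, 12 bytes: frame size (4B) | pts bits 0-31 (4B) | pts bits 32-63 (4B)
Note the file header is written twice: once with a zero frame count on the first packet, then rewritten with the real count from EndPassHook via the fseek below.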
const unsigned int kInitialWidth = 320;
const unsigned int kInitialHeight = 240;
@@ -42,6 +102,8 @@ class ResizingVideoSource : public ::libvpx_test::DummyVideoSource {
limit_ = 60;
}
virtual ~ResizingVideoSource() {}
protected:
virtual void Next() {
++frame_;
@@ -56,13 +118,15 @@ class ResizeTest : public ::libvpx_test::EncoderTest,
protected:
ResizeTest() : EncoderTest(GET_PARAM(0)) {}
virtual ~ResizeTest() {}
struct FrameInfo {
FrameInfo(vpx_codec_pts_t _pts, unsigned int _w, unsigned int _h)
: pts(_pts), w(_w), h(_h) {}
vpx_codec_pts_t pts;
unsigned int w;
unsigned int h;
unsigned int w;
unsigned int h;
};
virtual void SetUp() {
@@ -95,17 +159,47 @@ TEST_P(ResizeTest, TestExternalResizeWorks) {
}
}
const unsigned int kStepDownFrame = 3;
const unsigned int kStepUpFrame = 6;
class ResizeInternalTest : public ResizeTest {
protected:
#if WRITE_COMPRESSED_STREAM
ResizeInternalTest()
: ResizeTest(),
frame0_psnr_(0.0),
outfile_(NULL),
out_frames_(0) {}
#else
ResizeInternalTest() : ResizeTest(), frame0_psnr_(0.0) {}
#endif
virtual ~ResizeInternalTest() {}
virtual void BeginPassHook(unsigned int /*pass*/) {
#if WRITE_COMPRESSED_STREAM
outfile_ = fopen("vp90-2-05-resize.ivf", "wb");
#endif
}
virtual void EndPassHook() {
#if WRITE_COMPRESSED_STREAM
if (outfile_) {
if (!fseek(outfile_, 0, SEEK_SET))
write_ivf_file_header(&cfg_, out_frames_, outfile_);
fclose(outfile_);
outfile_ = NULL;
}
#endif
}
virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
libvpx_test::Encoder *encoder) {
if (video->frame() == 3) {
if (video->frame() == kStepDownFrame) {
struct vpx_scaling_mode mode = {VP8E_FOURFIVE, VP8E_THREEFIVE};
encoder->Control(VP8E_SET_SCALEMODE, &mode);
}
if (video->frame() == 6) {
if (video->frame() == kStepUpFrame) {
struct vpx_scaling_mode mode = {VP8E_NORMAL, VP8E_NORMAL};
encoder->Control(VP8E_SET_SCALEMODE, &mode);
}
@@ -117,21 +211,46 @@ class ResizeInternalTest : public ResizeTest {
EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 1.0);
}
virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
#if WRITE_COMPRESSED_STREAM
++out_frames_;
// Write initial file header if first frame.
if (pkt->data.frame.pts == 0)
write_ivf_file_header(&cfg_, 0, outfile_);
// Write frame header and data.
write_ivf_frame_header(pkt, outfile_);
(void)fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, outfile_);
#endif
}
double frame0_psnr_;
#if WRITE_COMPRESSED_STREAM
FILE *outfile_;
unsigned int out_frames_;
#endif
};
TEST_P(ResizeInternalTest, TestInternalResizeWorks) {
::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
30, 1, 0, 10);
init_flags_ = VPX_CODEC_USE_PSNR;
// q picked such that initial keyframe on this clip is ~30dB PSNR
cfg_.rc_min_quantizer = cfg_.rc_max_quantizer = 48;
// If the number of frames being encoded is smaller than g_lag_in_frames
// the encoded frame is unavailable using the current API. Comparing
// frames to detect mismatch would then not be possible. Set
// g_lag_in_frames = 0 to get around this.
cfg_.g_lag_in_frames = 0;
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
for (std::vector<FrameInfo>::iterator info = frame_info_list_.begin();
info != frame_info_list_.end(); ++info) {
const vpx_codec_pts_t pts = info->pts;
if (pts >= 3 && pts < 6) {
if (pts >= kStepDownFrame && pts < kStepUpFrame) {
ASSERT_EQ(282U, info->w) << "Frame " << pts << " had unexpected width";
ASSERT_EQ(173U, info->h) << "Frame " << pts << " had unexpected height";
} else {


@@ -17,7 +17,6 @@ extern "C" {
#include "./vpx_config.h"
#if CONFIG_VP8_ENCODER
#include "./vp8_rtcd.h"
//#include "vp8/common/blockd.h"
#endif
#if CONFIG_VP9_ENCODER
#include "./vp9_rtcd.h"


@@ -17,15 +17,19 @@
#include <sys/types.h>
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "test/acm_random.h"
#include "vpx/vpx_integer.h"
#include "vpx_mem/vpx_mem.h"
extern "C" {
#include "vp8/encoder/onyx_int.h"
}
using libvpx_test::ACMRandom;
namespace {
TEST(Vp8RoiMapTest, ParameterCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
int delta_q[MAX_MB_SEGMENTS] = { -2, -25, 0, 31 };
int delta_lf[MAX_MB_SEGMENTS] = { -2, -25, 0, 31 };
unsigned int threshold[MAX_MB_SEGMENTS] = { 0, 100, 200, 300 };
@@ -121,10 +125,10 @@ TEST(Vp8RoiMapTest, ParameterCheck) {
for (int i = 0; i < 1000; ++i) {
int rand_deltas[4];
int deltas_valid;
rand_deltas[0] = (rand() % 160) - 80;
rand_deltas[1] = (rand() % 160) - 80;
rand_deltas[2] = (rand() % 160) - 80;
rand_deltas[3] = (rand() % 160) - 80;
rand_deltas[0] = rnd(160) - 80;
rand_deltas[1] = rnd(160) - 80;
rand_deltas[2] = rnd(160) - 80;
rand_deltas[3] = rnd(160) - 80;
deltas_valid = ((abs(rand_deltas[0]) <= 63) &&
(abs(rand_deltas[1]) <= 63) &&

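The switch from rand() to the seeded ACMRandom keeps this test deterministic without changing the range: rnd(160) yields [0, 159], so each rand_deltas[i] lies in [-80, 79]; with |delta| <= 63 required, 33 of the 160 possible values per delta are invalid, so both the accept and reject paths of deltas_valid stay exercised.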

@@ -13,8 +13,8 @@
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
extern "C" {
#include "vpx_config.h"
#include "vp8_rtcd.h"
#include "./vpx_config.h"
#include "./vp8_rtcd.h"
#include "vp8/common/blockd.h"
#include "vp8/encoder/block.h"
#include "vpx_mem/vpx_mem.h"
@@ -51,7 +51,7 @@ TEST_P(SubtractBlockTest, SimpleSubtract) {
bd.predictor = reinterpret_cast<unsigned char*>(
vpx_memalign(16, kBlockHeight * kDiffPredStride * sizeof(*bd.predictor)));
for(int i = 0; kSrcStride[i] > 0; ++i) {
for (int i = 0; kSrcStride[i] > 0; ++i) {
// start at block0
be.src = 0;
be.base_src = &source;


@@ -520,3 +520,12 @@ d17bc08eedfc60c4c23d576a6c964a21bf854d1f vp90-2-03-size-226x202.webm
83c6d8f2969b759e10e5c6542baca1265c874c29 vp90-2-03-size-226x224.webm.md5
fe0af2ee47b1e5f6a66db369e2d7e9d870b38dce vp90-2-03-size-226x226.webm
94ad19b8b699cea105e2ff18f0df2afd7242bcf7 vp90-2-03-size-226x226.webm.md5
b6524e4084d15b5d0caaa3d3d1368db30cbee69c vp90-2-03-deltaq.webm
65f45ec9a55537aac76104818278e0978f94a678 vp90-2-03-deltaq.webm.md5
4dbb87494c7f565ffc266c98d17d0d8c7a5c5aba vp90-2-05-resize.ivf
7f6d8879336239a43dbb6c9f13178cb11cf7ed09 vp90-2-05-resize.ivf.md5
bf61ddc1f716eba58d4c9837d4e91031d9ce4ffe vp90-2-06-bilinear.webm
f6235f937552e11d8eb331ec55da6b3aa596b9ac vp90-2-06-bilinear.webm.md5
495256cfd123fe777b2c0406862ed8468a1f4677 vp91-2-04-yv444.webm
65e3a7ffef61ab340d9140f335ecc49125970c2c vp91-2-04-yv444.webm.md5


@@ -24,7 +24,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += error_resilience_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += i420_video_source.h
LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += keyframe_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += borders_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += resize_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += resize_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += cpu_speed_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_lossless_test.cc
@@ -629,3 +629,11 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x224.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x224.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x226.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x226.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-deltaq.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-deltaq.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-05-resize.ivf
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-05-resize.ivf.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-06-bilinear.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-06-bilinear.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yv444.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yv444.webm.md5


@@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include <string>
#include "vpx_config.h"
#include "./vpx_config.h"
extern "C" {
#if ARCH_X86 || ARCH_X86_64
#include "vpx_ports/x86.h"
@@ -48,7 +48,9 @@ int main(int argc, char **argv) {
#endif
#if !CONFIG_SHARED
/* Shared library builds don't support whitebox tests that exercise internal symbols. */
// Shared library builds don't support whitebox tests
// that exercise internal symbols.
#if CONFIG_VP8
vp8_rtcd();
#endif


@@ -159,7 +159,11 @@ const char *kVP9TestVectors[] = {
"vp90-2-03-size-226x198.webm", "vp90-2-03-size-226x200.webm",
"vp90-2-03-size-226x202.webm", "vp90-2-03-size-226x208.webm",
"vp90-2-03-size-226x210.webm", "vp90-2-03-size-226x224.webm",
"vp90-2-03-size-226x226.webm"
"vp90-2-03-size-226x226.webm", "vp90-2-03-deltaq.webm",
"vp90-2-05-resize.ivf", "vp90-2-06-bilinear.webm",
#if CONFIG_NON420
"vp91-2-04-yv444.webm"
#endif
};
#endif


@@ -16,16 +16,16 @@
#include "test/register_state_check.h"
#include "vpx/vpx_integer.h"
#include "vpx_config.h"
#include "./vpx_config.h"
extern "C" {
#include "vpx_mem/vpx_mem.h"
#if CONFIG_VP8_ENCODER
# include "vp8/common/variance.h"
# include "vp8_rtcd.h"
# include "./vp8_rtcd.h"
#endif
#if CONFIG_VP9_ENCODER
# include "vp9/encoder/vp9_variance.h"
# include "vp9_rtcd.h"
# include "./vp9_rtcd.h"
#endif
}
#include "test/acm_random.h"
@@ -107,8 +107,8 @@ static unsigned int subpel_avg_variance_ref(const uint8_t *ref,
}
template<typename VarianceFunctionType>
class VarianceTest :
public ::testing::TestWithParam<tuple<int, int, VarianceFunctionType> > {
class VarianceTest
: public ::testing::TestWithParam<tuple<int, int, VarianceFunctionType> > {
public:
virtual void SetUp() {
const tuple<int, int, VarianceFunctionType>& params = this->GetParam();
@@ -191,9 +191,9 @@ void VarianceTest<VarianceFunctionType>::OneQuarterTest() {
}
template<typename SubpelVarianceFunctionType>
class SubpelVarianceTest :
public ::testing::TestWithParam<tuple<int, int,
SubpelVarianceFunctionType> > {
class SubpelVarianceTest
: public ::testing::TestWithParam<tuple<int, int,
SubpelVarianceFunctionType> > {
public:
virtual void SetUp() {
const tuple<int, int, SubpelVarianceFunctionType>& params =


@@ -8,10 +8,6 @@
* be found in the AUTHORS file in the root of the source tree.
*/
extern "C" {
#include "vp8/encoder/boolhuff.h"
#include "vp8/decoder/dboolhuff.h"
}
#include <math.h>
#include <stddef.h>
@@ -24,6 +20,11 @@ extern "C" {
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "vpx/vpx_integer.h"
extern "C" {
#include "vp8/encoder/boolhuff.h"
#include "vp8/decoder/dboolhuff.h"
}
namespace {
const int num_tests = 10;
@@ -44,7 +45,7 @@ void encrypt_buffer(uint8_t *buffer, int size) {
void test_decrypt_cb(void *decrypt_state, const uint8_t *input,
uint8_t *output, int count) {
int offset = input - (uint8_t *)decrypt_state;
int offset = input - reinterpret_cast<uint8_t *>(decrypt_state);
for (int i = 0; i < count; i++) {
output[i] = input[i] ^ secret_key[(offset + i) & 15];
}
@@ -58,10 +59,10 @@ TEST(VP8, TestBitIO) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
for (int n = 0; n < num_tests; ++n) {
for (int method = 0; method <= 7; ++method) { // we generate various proba
const int bits_to_test = 1000;
uint8_t probas[bits_to_test];
const int kBitsToTest = 1000;
uint8_t probas[kBitsToTest];
for (int i = 0; i < bits_to_test; ++i) {
for (int i = 0; i < kBitsToTest; ++i) {
const int parity = i & 1;
probas[i] =
(method == 0) ? 0 : (method == 1) ? 255 :
@@ -76,14 +77,14 @@ TEST(VP8, TestBitIO) {
}
for (int bit_method = 0; bit_method <= 3; ++bit_method) {
const int random_seed = 6432;
const int buffer_size = 10000;
const int kBufferSize = 10000;
ACMRandom bit_rnd(random_seed);
BOOL_CODER bw;
uint8_t bw_buffer[buffer_size];
vp8_start_encode(&bw, bw_buffer, bw_buffer + buffer_size);
uint8_t bw_buffer[kBufferSize];
vp8_start_encode(&bw, bw_buffer, bw_buffer + kBufferSize);
int bit = (bit_method == 0) ? 0 : (bit_method == 1) ? 1 : 0;
for (int i = 0; i < bits_to_test; ++i) {
for (int i = 0; i < kBitsToTest; ++i) {
if (bit_method == 2) {
bit = (i & 1);
} else if (bit_method == 3) {
@@ -98,19 +99,20 @@ TEST(VP8, TestBitIO) {
#if CONFIG_DECRYPT
encrypt_buffer(bw_buffer, buffer_size);
vp8dx_start_decode(&br, bw_buffer, buffer_size,
test_decrypt_cb, (void *)bw_buffer);
test_decrypt_cb,
reinterpret_cast<void *>(bw_buffer));
#else
vp8dx_start_decode(&br, bw_buffer, buffer_size, NULL, NULL);
vp8dx_start_decode(&br, bw_buffer, kBufferSize, NULL, NULL);
#endif
bit_rnd.Reset(random_seed);
for (int i = 0; i < bits_to_test; ++i) {
for (int i = 0; i < kBitsToTest; ++i) {
if (bit_method == 2) {
bit = (i & 1);
} else if (bit_method == 3) {
bit = bit_rnd(2);
}
GTEST_ASSERT_EQ(vp8dx_decode_bool(&br, probas[i]), bit)
<< "pos: "<< i << " / " << bits_to_test
<< "pos: "<< i << " / " << kBitsToTest
<< " bit_method: " << bit_method
<< " method: " << method;
}


@@ -26,7 +26,8 @@ const uint8_t test_key[16] = {
0x89, 0x9a, 0xab, 0xbc, 0xcd, 0xde, 0xef, 0xf0
};
void encrypt_buffer(const uint8_t *src, uint8_t *dst, int size, int offset = 0) {
void encrypt_buffer(const uint8_t *src, uint8_t *dst,
int size, int offset = 0) {
for (int i = 0; i < size; ++i) {
dst[i] = src[i] ^ test_key[(offset + i) & 15];
}
@@ -34,10 +35,11 @@ void encrypt_buffer(const uint8_t *src, uint8_t *dst, int size, int offset = 0)
void test_decrypt_cb(void *decrypt_state, const uint8_t *input,
uint8_t *output, int count) {
encrypt_buffer(input, output, count, input - (uint8_t *)decrypt_state);
encrypt_buffer(input, output, count,
input - reinterpret_cast<uint8_t *>(decrypt_state));
}
} // namespace
} // namespace
namespace libvpx_test {

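test_decrypt_cb above can reuse encrypt_buffer because the XOR cipher is its own inverse at a given offset:
(src[i] ^ test_key[(offset + i) & 15]) ^ test_key[(offset + i) & 15] == src[i]
with the offset recovered as input - decrypt_state, i.e. the position of the requested bytes within the encrypted buffer.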

@@ -18,7 +18,7 @@
extern "C" {
#include "vp8_rtcd.h"
#include "./vp8_rtcd.h"
}
#include "test/acm_random.h"


@@ -19,7 +19,7 @@ extern "C" {
#include "vp9/decoder/vp9_dboolhuff.h"
}
#include "acm_random.h"
#include "test/acm_random.h"
#include "vpx/vpx_integer.h"
using libvpx_test::ACMRandom;
@@ -32,10 +32,10 @@ TEST(VP9, TestBitIO) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
for (int n = 0; n < num_tests; ++n) {
for (int method = 0; method <= 7; ++method) { // we generate various proba
const int bits_to_test = 1000;
uint8_t probas[bits_to_test];
const int kBitsToTest = 1000;
uint8_t probas[kBitsToTest];
for (int i = 0; i < bits_to_test; ++i) {
for (int i = 0; i < kBitsToTest; ++i) {
const int parity = i & 1;
probas[i] =
(method == 0) ? 0 : (method == 1) ? 255 :
@@ -50,14 +50,14 @@ TEST(VP9, TestBitIO) {
}
for (int bit_method = 0; bit_method <= 3; ++bit_method) {
const int random_seed = 6432;
const int buffer_size = 10000;
const int kBufferSize = 10000;
ACMRandom bit_rnd(random_seed);
vp9_writer bw;
uint8_t bw_buffer[buffer_size];
uint8_t bw_buffer[kBufferSize];
vp9_start_encode(&bw, bw_buffer);
int bit = (bit_method == 0) ? 0 : (bit_method == 1) ? 1 : 0;
for (int i = 0; i < bits_to_test; ++i) {
for (int i = 0; i < kBitsToTest; ++i) {
if (bit_method == 2) {
bit = (i & 1);
} else if (bit_method == 3) {
@@ -72,16 +72,16 @@ TEST(VP9, TestBitIO) {
GTEST_ASSERT_EQ(bw_buffer[0] & 0x80, 0);
vp9_reader br;
vp9_reader_init(&br, bw_buffer, buffer_size);
vp9_reader_init(&br, bw_buffer, kBufferSize);
bit_rnd.Reset(random_seed);
for (int i = 0; i < bits_to_test; ++i) {
for (int i = 0; i < kBitsToTest; ++i) {
if (bit_method == 2) {
bit = (i & 1);
} else if (bit_method == 3) {
bit = bit_rnd(2);
}
GTEST_ASSERT_EQ(vp9_read(&br, probas[i]), bit)
<< "pos: " << i << " / " << bits_to_test
<< "pos: " << i << " / " << kBitsToTest
<< " bit_method: " << bit_method
<< " method: " << method;
}


@@ -39,8 +39,8 @@ TEST_P(VP9SubtractBlockTest, SimpleSubtract) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
// FIXME(rbultje) split in its own file
for (BLOCK_SIZE_TYPE bsize = BLOCK_4X4; bsize < BLOCK_SIZE_TYPES;
bsize = static_cast<BLOCK_SIZE_TYPE>(static_cast<int>(bsize) + 1)) {
for (BLOCK_SIZE bsize = BLOCK_4X4; bsize < BLOCK_SIZES;
bsize = static_cast<BLOCK_SIZE>(static_cast<int>(bsize) + 1)) {
const int block_width = 4 << b_width_log2(bsize);
const int block_height = 4 << b_height_log2(bsize);
int16_t *diff = reinterpret_cast<int16_t *>(


@@ -41,7 +41,8 @@ extern "C"
{
USAGE_STREAM_FROM_SERVER = 0x0,
USAGE_LOCAL_FILE_PLAYBACK = 0x1,
USAGE_CONSTRAINED_QUALITY = 0x2
USAGE_CONSTRAINED_QUALITY = 0x2,
USAGE_CONSTANT_QUALITY = 0x3
} END_USAGE;


@@ -313,7 +313,7 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
/* Get baseline error score */
/* Copy the unfiltered / processed recon buffer to the new buffer */
vp8_yv12_copy_y(saved_frame, cm->frame_to_show);
vpx_yv12_copy_y(saved_frame, cm->frame_to_show);
vp8cx_set_alt_lf_level(cpi, filt_mid);
vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_mid);
@@ -339,7 +339,7 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
if(ss_err[filt_low] == 0)
{
/* Get Low filter error score */
vp8_yv12_copy_y(saved_frame, cm->frame_to_show);
vpx_yv12_copy_y(saved_frame, cm->frame_to_show);
vp8cx_set_alt_lf_level(cpi, filt_low);
vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_low);
@@ -367,7 +367,7 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
{
if(ss_err[filt_high] == 0)
{
vp8_yv12_copy_y(saved_frame, cm->frame_to_show);
vpx_yv12_copy_y(saved_frame, cm->frame_to_show);
vp8cx_set_alt_lf_level(cpi, filt_high);
vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_high);


@@ -153,7 +153,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
#else
RANGE_CHECK_HI(cfg, g_lag_in_frames, 25);
#endif
RANGE_CHECK(cfg, rc_end_usage, VPX_VBR, VPX_CQ);
RANGE_CHECK(cfg, rc_end_usage, VPX_VBR, VPX_Q);
RANGE_CHECK_HI(cfg, rc_undershoot_pct, 1000);
RANGE_CHECK_HI(cfg, rc_overshoot_pct, 1000);
RANGE_CHECK_HI(cfg, rc_2pass_vbr_bias_pct, 100);
@@ -204,7 +204,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
RANGE_CHECK_HI(vp8_cfg, arnr_strength, 6);
RANGE_CHECK(vp8_cfg, arnr_type, 1, 3);
RANGE_CHECK(vp8_cfg, cq_level, 0, 63);
if(finalize && cfg->rc_end_usage == VPX_CQ)
if (finalize && (cfg->rc_end_usage == VPX_CQ || cfg->rc_end_usage == VPX_Q))
RANGE_CHECK(vp8_cfg, cq_level,
cfg->rc_min_quantizer, cfg->rc_max_quantizer);
@@ -327,17 +327,14 @@ static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf,
oxcf->resample_up_water_mark = cfg.rc_resize_up_thresh;
oxcf->resample_down_water_mark = cfg.rc_resize_down_thresh;
if (cfg.rc_end_usage == VPX_VBR)
{
oxcf->end_usage = USAGE_LOCAL_FILE_PLAYBACK;
}
else if (cfg.rc_end_usage == VPX_CBR)
{
oxcf->end_usage = USAGE_STREAM_FROM_SERVER;
}
else if (cfg.rc_end_usage == VPX_CQ)
{
oxcf->end_usage = USAGE_CONSTRAINED_QUALITY;
if (cfg.rc_end_usage == VPX_VBR) {
oxcf->end_usage = USAGE_LOCAL_FILE_PLAYBACK;
} else if (cfg.rc_end_usage == VPX_CBR) {
oxcf->end_usage = USAGE_STREAM_FROM_SERVER;
} else if (cfg.rc_end_usage == VPX_CQ) {
oxcf->end_usage = USAGE_CONSTRAINED_QUALITY;
} else if (cfg.rc_end_usage == VPX_Q) {
oxcf->end_usage = USAGE_CONSTANT_QUALITY;
}
oxcf->target_bandwidth = cfg.rc_target_bitrate;
@@ -1272,7 +1269,7 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] =
1, /* g_delete_first_pass_file */
"vp8.fpf" /* first pass filename */
#endif
VPX_SS_DEFAULT_LAYERS, /* ss_number_layers */
1, /* ts_number_layers */
{0}, /* ts_target_bitrate */
{0}, /* ts_rate_decimator */

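A minimal sketch of requesting the new constant-quality mode through the public API (reusing VP8E_SET_CQ_LEVEL for VPX_Q is my inference from the shared cq_level range check above):
#include "vpx/vpx_encoder.h"
#include "vpx/vp8cx.h"
static vpx_codec_err_t init_constant_quality(vpx_codec_ctx_t *codec) {
  vpx_codec_enc_cfg_t cfg;
  vpx_codec_err_t res =
      vpx_codec_enc_config_default(vpx_codec_vp8_cx(), &cfg, 0);
  if (res != VPX_CODEC_OK) return res;
  cfg.rc_end_usage = VPX_Q;  // maps to USAGE_CONSTANT_QUALITY above
  res = vpx_codec_enc_init(codec, vpx_codec_vp8_cx(), &cfg, 0);
  if (res != VPX_CODEC_OK) return res;
  // cq_level must lie within [rc_min_quantizer, rc_max_quantizer],
  // mirroring the finalize-time RANGE_CHECK.
  return vpx_codec_control(codec, VP8E_SET_CQ_LEVEL, 20);
}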

@@ -0,0 +1,116 @@
;
; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp9_convolve_avg_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
|vp9_convolve_avg_neon| PROC
push {r4-r6, lr}
ldrd r4, r5, [sp, #32]
mov r6, r2
cmp r4, #32
bgt avg64
beq avg32
cmp r4, #8
bgt avg16
beq avg8
b avg4
avg64
sub lr, r1, #32
sub r4, r3, #32
avg64_h
pld [r0, r1, lsl #1]
vld1.8 {q0-q1}, [r0]!
vld1.8 {q2-q3}, [r0], lr
pld [r2, r3]
vld1.8 {q8-q9}, [r6@128]!
vld1.8 {q10-q11}, [r6@128], r4
vrhadd.u8 q0, q0, q8
vrhadd.u8 q1, q1, q9
vrhadd.u8 q2, q2, q10
vrhadd.u8 q3, q3, q11
vst1.8 {q0-q1}, [r2@128]!
vst1.8 {q2-q3}, [r2@128], r4
subs r5, r5, #1
bgt avg64_h
pop {r4-r6, pc}
avg32
vld1.8 {q0-q1}, [r0], r1
vld1.8 {q2-q3}, [r0], r1
vld1.8 {q8-q9}, [r6@128], r3
vld1.8 {q10-q11}, [r6@128], r3
pld [r0]
vrhadd.u8 q0, q0, q8
pld [r0, r1]
vrhadd.u8 q1, q1, q9
pld [r6]
vrhadd.u8 q2, q2, q10
pld [r6, r3]
vrhadd.u8 q3, q3, q11
vst1.8 {q0-q1}, [r2@128], r3
vst1.8 {q2-q3}, [r2@128], r3
subs r5, r5, #2
bgt avg32
pop {r4-r6, pc}
avg16
vld1.8 {q0}, [r0], r1
vld1.8 {q1}, [r0], r1
vld1.8 {q2}, [r6@128], r3
vld1.8 {q3}, [r6@128], r3
pld [r0]
pld [r0, r1]
vrhadd.u8 q0, q0, q2
pld [r6]
pld [r6, r3]
vrhadd.u8 q1, q1, q3
vst1.8 {q0}, [r2@128], r3
vst1.8 {q1}, [r2@128], r3
subs r5, r5, #2
bgt avg16
pop {r4-r6, pc}
avg8
vld1.8 {d0}, [r0], r1
vld1.8 {d1}, [r0], r1
vld1.8 {d2}, [r6@64], r3
vld1.8 {d3}, [r6@64], r3
pld [r0]
pld [r0, r1]
vrhadd.u8 q0, q0, q1
pld [r6]
pld [r6, r3]
vst1.8 {d0}, [r2@64], r3
vst1.8 {d1}, [r2@64], r3
subs r5, r5, #2
bgt avg8
pop {r4-r6, pc}
avg4
vld1.32 {d0[0]}, [r0], r1
vld1.32 {d0[1]}, [r0], r1
vld1.32 {d2[0]}, [r6@32], r3
vld1.32 {d2[1]}, [r6@32], r3
vrhadd.u8 d0, d0, d2
vst1.32 {d0[0]}, [r2@32], r3
vst1.32 {d0[1]}, [r2@32], r3
subs r5, r5, #2
bgt avg4
pop {r4-r6, pc}
ENDP
END
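Each vrhadd.u8 lane above computes the unsigned rounding average (a + b + 1) >> 1, so the whole routine is a strided average of a source block into the destination. A scalar sketch of the equivalent (hypothetical function name):

#include <stdint.h>
#include <stddef.h>

/* Sketch of the scalar math vp9_convolve_avg_neon vectorizes: every
 * destination pixel becomes the rounding average of itself and the
 * corresponding source pixel. */
static void convolve_avg_ref(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             int w, int h) {
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x)
      dst[x] = (uint8_t)((dst[x] + src[x] + 1) >> 1);  /* vrhadd.u8 */
    src += src_stride;
    dst += dst_stride;
  }
}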


@@ -66,46 +66,64 @@
vld1.s16 {q0}, [r5] ; filter_x
add r8, r1, r1, lsl #1 ; src_stride * 3
add r8, r8, #4 ; src_stride * 3 + 4
rsb r8, r8, #0 ; reset for src
sub r8, r1, r1, lsl #2 ; -src_stride * 3
add r8, r8, #4 ; -src_stride * 3 + 4
add r4, r3, r3, lsl #1 ; dst_stride * 3
sub r4, r4, #4 ; dst_stride * 3 - 4
rsb r4, r4, #0 ; reset for dst
sub r4, r3, r3, lsl #2 ; -dst_stride * 3
add r4, r4, #4 ; -dst_stride * 3 + 4
sub r9, r1, #8 ; post increment for src load
rsb r1, r6, r1, lsl #2 ; reset src for outer loop
rsb r9, r6, r1, lsl #2 ; reset src for outer loop
sub r9, r9, #7
rsb r12, r6, r3, lsl #2 ; reset dst for outer loop
mov r10, r6 ; w loop counter
loop_horiz
vld1.8 {d24}, [r0]!
vld3.u8 {d28[0], d29[0], d30[0]}, [r0], r9
vld1.8 {d25}, [r0]!
vld3.u8 {d28[1], d29[1], d30[1]}, [r0], r9
vld1.8 {d26}, [r0]!
vld3.u8 {d28[2], d29[2], d30[2]}, [r0], r9
vld1.8 {d27}, [r0]!
vld3.u8 {d28[3], d29[3], d30[3]}, [r0], r8
loop_horiz_v
vld1.8 {d24}, [r0], r1
vld1.8 {d25}, [r0], r1
vld1.8 {d26}, [r0], r1
vld1.8 {d27}, [r0], r8
vtrn.16 q12, q13
vtrn.8 d24, d25
vtrn.8 d26, d27
; extract to s16
pld [r0, r1, lsl #2]
vmovl.u8 q8, d24
vmovl.u8 q9, d25
vmovl.u8 q10, d26
vmovl.u8 q11, d27
vtrn.32 d28, d29 ; only the first half is populated
; save a few instructions in the inner loop
vswp d17, d18
vmov d23, d21
add r0, r0, #3
loop_horiz
add r5, r0, #64
vld1.32 {d28[]}, [r0], r1
vld1.32 {d29[]}, [r0], r1
vld1.32 {d31[]}, [r0], r1
vld1.32 {d30[]}, [r0], r8
pld [r5]
vtrn.16 d28, d31
vtrn.16 d29, d30
vtrn.8 d28, d29
vtrn.8 d31, d30
pld [r5, r1]
; extract to s16
vtrn.32 q14, q15
vmovl.u8 q12, d28
vmovl.u8 q13, d30
vmovl.u8 q13, d29
pld [r5, r1, lsl #1]
; slightly out of order load to match the existing data
vld1.u32 {d6[0]}, [r2], r3
@@ -116,10 +134,12 @@ loop_horiz
sub r2, r2, r3, lsl #2 ; reset for store
; src[] * filter_x
MULTIPLY_BY_Q0 q1, d16, d18, d20, d22, d17, d19, d21, d23
MULTIPLY_BY_Q0 q2, d18, d20, d22, d17, d19, d21, d23, d24
MULTIPLY_BY_Q0 q14, d20, d22, d17, d19, d21, d23, d24, d25
MULTIPLY_BY_Q0 q15, d22, d17, d19, d21, d23, d24, d25, d26
MULTIPLY_BY_Q0 q1, d16, d17, d20, d22, d18, d19, d23, d24
MULTIPLY_BY_Q0 q2, d17, d20, d22, d18, d19, d23, d24, d26
MULTIPLY_BY_Q0 q14, d20, d22, d18, d19, d23, d24, d26, d27
MULTIPLY_BY_Q0 q15, d22, d18, d19, d23, d24, d26, d27, d25
pld [r5, -r8]
; += 64 >> 7
vqrshrun.s32 d2, q1, #7
@@ -135,24 +155,29 @@ loop_horiz
vtrn.16 d2, d3
vtrn.32 d2, d3
vtrn.8 d2, d3
; average the new value and the dst value
vrhadd.u8 q1, q1, q3
vst1.u32 {d2[0]}, [r2], r3
vst1.u32 {d3[0]}, [r2], r3
vst1.u32 {d2[1]}, [r2], r3
vst1.u32 {d3[1]}, [r2], r4
vst1.u32 {d2[0]}, [r2@32], r3
vst1.u32 {d3[0]}, [r2@32], r3
vst1.u32 {d2[1]}, [r2@32], r3
vst1.u32 {d3[1]}, [r2@32], r4
vmov q8, q9
vmov d20, d23
vmov q11, q12
vmov q9, q13
subs r6, r6, #4 ; w -= 4
bgt loop_horiz
; outer loop
mov r6, r10 ; restore w counter
add r0, r0, r1 ; src += src_stride * 4 - w
add r0, r0, r9 ; src += src_stride * 4 - w
add r2, r2, r12 ; dst += dst_stride * 4 - w
subs r7, r7, #4 ; h -= 4
bgt loop_horiz
bgt loop_horiz_v
pop {r4-r10, pc}
@@ -163,66 +188,77 @@ loop_horiz
cmp r12, #16
bne vp9_convolve8_avg_vert_c
push {r4-r10, lr}
push {r4-r8, lr}
; adjust for taps
sub r0, r0, r1
sub r0, r0, r1, lsl #1
ldr r7, [sp, #40] ; filter_y
ldr r8, [sp, #48] ; w
ldr r9, [sp, #52] ; h
ldr r4, [sp, #32] ; filter_y
ldr r6, [sp, #40] ; w
ldr lr, [sp, #44] ; h
vld1.s16 {q0}, [r7] ; filter_y
vld1.s16 {q0}, [r4] ; filter_y
mov r5, r1, lsl #1 ; src_stride * 2
add r5, r5, r1, lsl #3 ; src_stride * 10
sub r5, r5, #4 ; src_stride * 10 + 4
rsb r5, r5, #0 ; reset for src
lsl r1, r1, #1
lsl r3, r3, #1
add r6, r3, r3, lsl #1 ; dst_stride * 3
sub r6, r6, #4 ; dst_stride * 3 - 4
rsb r6, r6, #0 ; reset for dst
loop_vert_h
mov r4, r0
add r7, r0, r1, asr #1
mov r5, r2
add r8, r2, r3, asr #1
mov r12, lr ; h loop counter
rsb r7, r8, r1, lsl #2 ; reset src for outer loop
rsb r12, r8, r3, lsl #2 ; reset dst for outer loop
vld1.u32 {d16[0]}, [r4], r1
vld1.u32 {d16[1]}, [r7], r1
vld1.u32 {d18[0]}, [r4], r1
vld1.u32 {d18[1]}, [r7], r1
vld1.u32 {d20[0]}, [r4], r1
vld1.u32 {d20[1]}, [r7], r1
vld1.u32 {d22[0]}, [r4], r1
mov r10, r8 ; w loop counter
loop_vert
; always process a 4x4 block at a time
vld1.u32 {d16[0]}, [r0], r1
vld1.u32 {d16[1]}, [r0], r1
vld1.u32 {d18[0]}, [r0], r1
vld1.u32 {d18[1]}, [r0], r1
vld1.u32 {d20[0]}, [r0], r1
vld1.u32 {d20[1]}, [r0], r1
vld1.u32 {d22[0]}, [r0], r1
vld1.u32 {d22[1]}, [r0], r1
vld1.u32 {d24[0]}, [r0], r1
vld1.u32 {d24[1]}, [r0], r1
vld1.u32 {d26[0]}, [r0], r5
; extract to s16
vmovl.u8 q8, d16
vmovl.u8 q9, d18
vmovl.u8 q10, d20
vmovl.u8 q11, d22
loop_vert
; always process a 4x4 block at a time
vld1.u32 {d24[0]}, [r7], r1
vld1.u32 {d26[0]}, [r4], r1
vld1.u32 {d26[1]}, [r7], r1
vld1.u32 {d24[1]}, [r4], r1
; extract to s16
vmovl.u8 q12, d24
vmovl.u8 q13, d26
vld1.u32 {d6[0]}, [r2], r3
vld1.u32 {d6[1]}, [r2], r3
vld1.u32 {d7[0]}, [r2], r3
vld1.u32 {d7[1]}, [r2], r3
vld1.u32 {d6[0]}, [r5@32], r3
vld1.u32 {d6[1]}, [r8@32], r3
vld1.u32 {d7[0]}, [r5@32], r3
vld1.u32 {d7[1]}, [r8@32], r3
sub r2, r2, r3, lsl #2 ; reset for store
pld [r7]
pld [r4]
; src[] * filter_y
MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d23
MULTIPLY_BY_Q0 q2, d17, d18, d19, d20, d21, d22, d23, d24
MULTIPLY_BY_Q0 q14, d18, d19, d20, d21, d22, d23, d24, d25
MULTIPLY_BY_Q0 q15, d19, d20, d21, d22, d23, d24, d25, d26
MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d24
pld [r7, r1]
pld [r4, r1]
MULTIPLY_BY_Q0 q2, d17, d18, d19, d20, d21, d22, d24, d26
pld [r5]
pld [r8]
MULTIPLY_BY_Q0 q14, d18, d19, d20, d21, d22, d24, d26, d27
pld [r5, r3]
pld [r8, r3]
MULTIPLY_BY_Q0 q15, d19, d20, d21, d22, d24, d26, d27, d25
; += 64 >> 7
vqrshrun.s32 d2, q1, #7
@@ -237,22 +273,30 @@ loop_vert
; average the new value and the dst value
vrhadd.u8 q1, q1, q3
vst1.u32 {d2[0]}, [r2], r3
vst1.u32 {d2[1]}, [r2], r3
vst1.u32 {d3[0]}, [r2], r3
vst1.u32 {d3[1]}, [r2], r6
sub r5, r5, r3, lsl #1 ; reset for store
sub r8, r8, r3, lsl #1
subs r8, r8, #4 ; w -= 4
vst1.u32 {d2[0]}, [r5@32], r3
vst1.u32 {d2[1]}, [r8@32], r3
vst1.u32 {d3[0]}, [r5@32], r3
vst1.u32 {d3[1]}, [r8@32], r3
vmov q8, q10
vmov d18, d22
vmov d19, d24
vmov q10, q13
vmov d22, d25
subs r12, r12, #4 ; h -= 4
bgt loop_vert
; outer loop
mov r8, r10 ; restore w counter
add r0, r0, r7 ; src += 4 * src_stride - w
add r2, r2, r12 ; dst += 4 * dst_stride - w
subs r9, r9, #4 ; h -= 4
bgt loop_vert
add r0, r0, #4
add r2, r2, #4
subs r6, r6, #4 ; w -= 4
bgt loop_vert_h
pop {r4-r10, pc}
pop {r4-r8, pc}
ENDP
END
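The MULTIPLY_BY_Q0 / vqrshrun #7 pairing in both loops is the vector form of an 8-tap FIR evaluated at FILTER_BITS = 7; the _avg variant then folds in the destination with vrhadd.u8. A scalar sketch of the per-pixel math (hypothetical name):

#include <stdint.h>

/* Sketch of one output pixel of the 8-tap convolution above: multiply-
 * accumulate eight taps, round-shift by 7 (the "+= 64 >> 7" comment),
 * and saturate to 8 bits as vqrshrun.s32/vqmovn.u16 do. */
static uint8_t convolve8_pixel(const uint8_t *src, const int16_t *filter) {
  int sum = 0;
  for (int k = 0; k < 8; ++k)
    sum += src[k] * filter[k];   /* src[] * filter_x (or filter_y) */
  sum = (sum + 64) >> 7;         /* round, then >> FILTER_BITS */
  if (sum < 0) sum = 0;          /* unsigned saturation */
  if (sum > 255) sum = 255;
  return (uint8_t)sum;
}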


@@ -66,52 +66,72 @@
vld1.s16 {q0}, [r5] ; filter_x
add r8, r1, r1, lsl #1 ; src_stride * 3
add r8, r8, #4 ; src_stride * 3 + 4
rsb r8, r8, #0 ; reset for src
sub r8, r1, r1, lsl #2 ; -src_stride * 3
add r8, r8, #4 ; -src_stride * 3 + 4
add r4, r3, r3, lsl #1 ; dst_stride * 3
sub r4, r4, #4 ; dst_stride * 3 - 4
rsb r4, r4, #0 ; reset for dst
sub r4, r3, r3, lsl #2 ; -dst_stride * 3
add r4, r4, #4 ; -dst_stride * 3 + 4
sub r9, r1, #8 ; post increment for src load
rsb r1, r6, r1, lsl #2 ; reset src for outer loop
rsb r9, r6, r1, lsl #2 ; reset src for outer loop
sub r9, r9, #7
rsb r12, r6, r3, lsl #2 ; reset dst for outer loop
mov r10, r6 ; w loop counter
loop_horiz
vld1.8 {d24}, [r0]!
vld3.u8 {d28[0], d29[0], d30[0]}, [r0], r9
vld1.8 {d25}, [r0]!
vld3.u8 {d28[1], d29[1], d30[1]}, [r0], r9
vld1.8 {d26}, [r0]!
vld3.u8 {d28[2], d29[2], d30[2]}, [r0], r9
vld1.8 {d27}, [r0]!
vld3.u8 {d28[3], d29[3], d30[3]}, [r0], r8
loop_horiz_v
vld1.8 {d24}, [r0], r1
vld1.8 {d25}, [r0], r1
vld1.8 {d26}, [r0], r1
vld1.8 {d27}, [r0], r8
vtrn.16 q12, q13
vtrn.8 d24, d25
vtrn.8 d26, d27
; extract to s16
pld [r0, r1, lsl #2]
vmovl.u8 q8, d24
vmovl.u8 q9, d25
vmovl.u8 q10, d26
vmovl.u8 q11, d27
vtrn.32 d28, d29 ; only the first half is populated
; save a few instructions in the inner loop
vswp d17, d18
vmov d23, d21
add r0, r0, #3
loop_horiz
add r5, r0, #64
vld1.32 {d28[]}, [r0], r1
vld1.32 {d29[]}, [r0], r1
vld1.32 {d31[]}, [r0], r1
vld1.32 {d30[]}, [r0], r8
pld [r5]
vtrn.16 d28, d31
vtrn.16 d29, d30
vtrn.8 d28, d29
vtrn.8 d31, d30
pld [r5, r1]
; extract to s16
vtrn.32 q14, q15
vmovl.u8 q12, d28
vmovl.u8 q13, d30
vmovl.u8 q13, d29
pld [r5, r1, lsl #1]
; src[] * filter_x
MULTIPLY_BY_Q0 q1, d16, d18, d20, d22, d17, d19, d21, d23
MULTIPLY_BY_Q0 q2, d18, d20, d22, d17, d19, d21, d23, d24
MULTIPLY_BY_Q0 q14, d20, d22, d17, d19, d21, d23, d24, d25
MULTIPLY_BY_Q0 q15, d22, d17, d19, d21, d23, d24, d25, d26
MULTIPLY_BY_Q0 q1, d16, d17, d20, d22, d18, d19, d23, d24
MULTIPLY_BY_Q0 q2, d17, d20, d22, d18, d19, d23, d24, d26
MULTIPLY_BY_Q0 q14, d20, d22, d18, d19, d23, d24, d26, d27
MULTIPLY_BY_Q0 q15, d22, d18, d19, d23, d24, d26, d27, d25
pld [r5, -r8]
; += 64 >> 7
vqrshrun.s32 d2, q1, #7
@@ -128,20 +148,25 @@ loop_horiz
vtrn.32 d2, d3
vtrn.8 d2, d3
vst1.u32 {d2[0]}, [r2], r3
vst1.u32 {d3[0]}, [r2], r3
vst1.u32 {d2[1]}, [r2], r3
vst1.u32 {d3[1]}, [r2], r4
vst1.u32 {d2[0]}, [r2@32], r3
vst1.u32 {d3[0]}, [r2@32], r3
vst1.u32 {d2[1]}, [r2@32], r3
vst1.u32 {d3[1]}, [r2@32], r4
vmov q8, q9
vmov d20, d23
vmov q11, q12
vmov q9, q13
subs r6, r6, #4 ; w -= 4
bgt loop_horiz
; outer loop
mov r6, r10 ; restore w counter
add r0, r0, r1 ; src += src_stride * 4 - w
add r0, r0, r9 ; src += src_stride * 4 - w
add r2, r2, r12 ; dst += dst_stride * 4 - w
subs r7, r7, #4 ; h -= 4
bgt loop_horiz
bgt loop_horiz_v
pop {r4-r10, pc}
@@ -152,59 +177,72 @@ loop_horiz
cmp r12, #16
bne vp9_convolve8_vert_c
push {r4-r10, lr}
push {r4-r8, lr}
; adjust for taps
sub r0, r0, r1
sub r0, r0, r1, lsl #1
ldr r7, [sp, #40] ; filter_y
ldr r8, [sp, #48] ; w
ldr r9, [sp, #52] ; h
ldr r4, [sp, #32] ; filter_y
ldr r6, [sp, #40] ; w
ldr lr, [sp, #44] ; h
vld1.s16 {q0}, [r7] ; filter_y
vld1.s16 {q0}, [r4] ; filter_y
mov r5, r1, lsl #1 ; src_stride * 2
add r5, r5, r1, lsl #3 ; src_stride * 10
sub r5, r5, #4 ; src_stride * 10 + 4
rsb r5, r5, #0 ; reset for src
lsl r1, r1, #1
lsl r3, r3, #1
add r6, r3, r3, lsl #1 ; dst_stride * 3
sub r6, r6, #4 ; dst_stride * 3 - 4
rsb r6, r6, #0 ; reset for dst
loop_vert_h
mov r4, r0
add r7, r0, r1, asr #1
mov r5, r2
add r8, r2, r3, asr #1
mov r12, lr ; h loop counter
rsb r7, r8, r1, lsl #2 ; reset src for outer loop
rsb r12, r8, r3, lsl #2 ; reset dst for outer loop
vld1.u32 {d16[0]}, [r4], r1
vld1.u32 {d16[1]}, [r7], r1
vld1.u32 {d18[0]}, [r4], r1
vld1.u32 {d18[1]}, [r7], r1
vld1.u32 {d20[0]}, [r4], r1
vld1.u32 {d20[1]}, [r7], r1
vld1.u32 {d22[0]}, [r4], r1
mov r10, r8 ; w loop counter
loop_vert
; always process a 4x4 block at a time
vld1.u32 {d16[0]}, [r0], r1
vld1.u32 {d16[1]}, [r0], r1
vld1.u32 {d18[0]}, [r0], r1
vld1.u32 {d18[1]}, [r0], r1
vld1.u32 {d20[0]}, [r0], r1
vld1.u32 {d20[1]}, [r0], r1
vld1.u32 {d22[0]}, [r0], r1
vld1.u32 {d22[1]}, [r0], r1
vld1.u32 {d24[0]}, [r0], r1
vld1.u32 {d24[1]}, [r0], r1
vld1.u32 {d26[0]}, [r0], r5
; extract to s16
vmovl.u8 q8, d16
vmovl.u8 q9, d18
vmovl.u8 q10, d20
vmovl.u8 q11, d22
loop_vert
; always process a 4x4 block at a time
vld1.u32 {d24[0]}, [r7], r1
vld1.u32 {d26[0]}, [r4], r1
vld1.u32 {d26[1]}, [r7], r1
vld1.u32 {d24[1]}, [r4], r1
; extract to s16
vmovl.u8 q12, d24
vmovl.u8 q13, d26
pld [r5]
pld [r8]
; src[] * filter_y
MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d23
MULTIPLY_BY_Q0 q2, d17, d18, d19, d20, d21, d22, d23, d24
MULTIPLY_BY_Q0 q14, d18, d19, d20, d21, d22, d23, d24, d25
MULTIPLY_BY_Q0 q15, d19, d20, d21, d22, d23, d24, d25, d26
MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d24
pld [r5, r3]
pld [r8, r3]
MULTIPLY_BY_Q0 q2, d17, d18, d19, d20, d21, d22, d24, d26
pld [r7]
pld [r4]
MULTIPLY_BY_Q0 q14, d18, d19, d20, d21, d22, d24, d26, d27
pld [r7, r1]
pld [r4, r1]
MULTIPLY_BY_Q0 q15, d19, d20, d21, d22, d24, d26, d27, d25
; += 64 >> 7
vqrshrun.s32 d2, q1, #7
@@ -216,22 +254,27 @@ loop_vert
vqmovn.u16 d2, q1
vqmovn.u16 d3, q2
vst1.u32 {d2[0]}, [r2], r3
vst1.u32 {d2[1]}, [r2], r3
vst1.u32 {d3[0]}, [r2], r3
vst1.u32 {d3[1]}, [r2], r6
vst1.u32 {d2[0]}, [r5@32], r3
vst1.u32 {d2[1]}, [r8@32], r3
vst1.u32 {d3[0]}, [r5@32], r3
vst1.u32 {d3[1]}, [r8@32], r3
subs r8, r8, #4 ; w -= 4
vmov q8, q10
vmov d18, d22
vmov d19, d24
vmov q10, q13
vmov d22, d25
subs r12, r12, #4 ; h -= 4
bgt loop_vert
; outer loop
mov r8, r10 ; restore w counter
add r0, r0, r7 ; src += 4 * src_stride - w
add r2, r2, r12 ; dst += 4 * dst_stride - w
subs r9, r9, #4 ; h -= 4
bgt loop_vert
add r0, r0, #4
add r2, r2, #4
subs r6, r6, #4 ; w -= 4
bgt loop_vert_h
pop {r4-r10, pc}
pop {r4-r8, pc}
ENDP
END


@@ -10,6 +10,7 @@
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_common.h"
#include "vpx_ports/mem.h"
void vp9_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
@@ -19,7 +20,7 @@ void vp9_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride,
/* Given our constraints: w <= 64, h <= 64, taps == 8, we can reduce the
* maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4).
*/
uint8_t temp[64 * 72];
DECLARE_ALIGNED_ARRAY(8, uint8_t, temp, 64 * 72);
// Account for the vertical phase needing 3 lines prior and 4 lines post
int intermediate_height = h + 7;
@@ -53,7 +54,7 @@ void vp9_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
uint8_t temp[64 * 72];
DECLARE_ALIGNED_ARRAY(8, uint8_t, temp, 64 * 72);
int intermediate_height = h + 7;
if (x_step_q4 != 16 || y_step_q4 != 16)
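Replacing the plain arrays with DECLARE_ALIGNED_ARRAY gives the intermediate buffer a guaranteed alignment, which the aligned @64/@128 address qualifiers used by the NEON loads and stores rely on. A rough sketch of what such a macro can expand to (an assumption for illustration, not libvpx's exact definition):

#include <stdint.h>

/* Sketch: carve an aligned view out of an over-allocated raw buffer.
 * DECLARE_ALIGNED_ARRAY(8, uint8_t, temp, 64 * 72) has the same effect:
 * `temp` is usable as an array whose first element sits on an 8-byte
 * boundary. */
#define ALIGNED_ARRAY(align, type, name, size)                          \
  type name##_raw[(size) + (align) - 1];                                \
  type *const name = (type *)(((uintptr_t)(name##_raw) + (align) - 1) & \
                              ~(uintptr_t)((align) - 1))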


@@ -0,0 +1,84 @@
;
; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp9_convolve_copy_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
|vp9_convolve_copy_neon| PROC
push {r4-r5, lr}
ldrd r4, r5, [sp, #28]
cmp r4, #32
bgt copy64
beq copy32
cmp r4, #8
bgt copy16
beq copy8
b copy4
copy64
sub lr, r1, #32
sub r3, r3, #32
copy64_h
pld [r0, r1, lsl #1]
vld1.8 {q0-q1}, [r0]!
vld1.8 {q2-q3}, [r0], lr
vst1.8 {q0-q1}, [r2@128]!
vst1.8 {q2-q3}, [r2@128], r3
subs r5, r5, #1
bgt copy64_h
pop {r4-r5, pc}
copy32
pld [r0, r1, lsl #1]
vld1.8 {q0-q1}, [r0], r1
pld [r0, r1, lsl #1]
vld1.8 {q2-q3}, [r0], r1
vst1.8 {q0-q1}, [r2@128], r3
vst1.8 {q2-q3}, [r2@128], r3
subs r5, r5, #2
bgt copy32
pop {r4-r5, pc}
copy16
pld [r0, r1, lsl #1]
vld1.8 {q0}, [r0], r1
pld [r0, r1, lsl #1]
vld1.8 {q1}, [r0], r1
vst1.8 {q0}, [r2@128], r3
vst1.8 {q1}, [r2@128], r3
subs r5, r5, #2
bgt copy16
pop {r4-r5, pc}
copy8
pld [r0, r1, lsl #1]
vld1.8 {d0}, [r0], r1
pld [r0, r1, lsl #1]
vld1.8 {d2}, [r0], r1
vst1.8 {d0}, [r2@64], r3
vst1.8 {d2}, [r2@64], r3
subs r5, r5, #2
bgt copy8
pop {r4-r5, pc}
copy4
ldr r12, [r0], r1
str r12, [r2], r3
subs r5, r5, #1
bgt copy4
pop {r4-r5, pc}
ENDP
END
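The copy kernel above is simply a strided block copy specialized by width; the scalar equivalent is a row-wise memcpy (hypothetical name):

#include <stdint.h>
#include <stddef.h>
#include <string.h>

/* Sketch of what vp9_convolve_copy_neon implements: copy a w-by-h block
 * between strided buffers. The NEON version moves 4 to 64 bytes per
 * iteration with q-register loads and pld cache preloads. */
static void convolve_copy_ref(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              int w, int h) {
  for (int y = 0; y < h; ++y) {
    memcpy(dst, src, (size_t)w);
    src += src_stride;
    dst += dst_stride;
  }
}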


@@ -0,0 +1,169 @@
/*
* Copyright (c) 2013 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_common.h"
extern void vp9_short_idct16x16_add_neon_pass1(int16_t *input,
int16_t *output,
int output_stride);
extern void vp9_short_idct16x16_add_neon_pass2(int16_t *src,
int16_t *output,
int16_t *pass1Output,
int16_t skip_adding,
uint8_t *dest,
int dest_stride);
extern void vp9_short_idct10_16x16_add_neon_pass1(int16_t *input,
int16_t *output,
int output_stride);
extern void vp9_short_idct10_16x16_add_neon_pass2(int16_t *src,
int16_t *output,
int16_t *pass1Output,
int16_t skip_adding,
uint8_t *dest,
int dest_stride);
extern void save_neon_registers();
extern void restore_neon_registers();
void vp9_short_idct16x16_add_neon(int16_t *input,
uint8_t *dest, int dest_stride) {
int16_t pass1_output[16*16] = {0};
int16_t row_idct_output[16*16] = {0};
// save d8-d15 register values.
save_neon_registers();
/* Parallel idct on the upper 8 rows */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the
// stage 6 result in pass1_output.
vp9_short_idct16x16_add_neon_pass1(input, pass1_output, 8);
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7
// which will be saved into row_idct_output.
vp9_short_idct16x16_add_neon_pass2(input+1,
row_idct_output,
pass1_output,
0,
dest,
dest_stride);
/* Parallel idct on the lower 8 rows */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the
// stage 6 result in pass1_output.
vp9_short_idct16x16_add_neon_pass1(input+8*16, pass1_output, 8);
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7
// which will be saved into row_idct_output.
vp9_short_idct16x16_add_neon_pass2(input+8*16+1,
row_idct_output+8,
pass1_output,
0,
dest,
dest_stride);
/* Parallel idct on the left 8 columns */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the
// stage 6 result in pass1_output.
vp9_short_idct16x16_add_neon_pass1(row_idct_output, pass1_output, 8);
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7.
// Then add the result to the destination data.
vp9_short_idct16x16_add_neon_pass2(row_idct_output+1,
row_idct_output,
pass1_output,
1,
dest,
dest_stride);
/* Parallel idct on the right 8 columns */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the
// stage 6 result in pass1_output.
vp9_short_idct16x16_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7.
// Then add the result to the destination data.
vp9_short_idct16x16_add_neon_pass2(row_idct_output+8*16+1,
row_idct_output+8,
pass1_output,
1,
dest+8,
dest_stride);
// restore d8-d15 register values.
restore_neon_registers();
return;
}
void vp9_short_idct10_16x16_add_neon(int16_t *input,
uint8_t *dest, int dest_stride) {
int16_t pass1_output[16*16] = {0};
int16_t row_idct_output[16*16] = {0};
// save d8-d15 register values.
save_neon_registers();
/* Parallel idct on the upper 8 rows */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the
// stage 6 result in pass1_output.
vp9_short_idct10_16x16_add_neon_pass1(input, pass1_output, 8);
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7
// which will be saved into row_idct_output.
vp9_short_idct10_16x16_add_neon_pass2(input+1,
row_idct_output,
pass1_output,
0,
dest,
dest_stride);
/* Skip Parallel idct on the lower 8 rows as they are all 0s */
/* Parallel idct on the left 8 columns */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the
// stage 6 result in pass1_output.
vp9_short_idct16x16_add_neon_pass1(row_idct_output, pass1_output, 8);
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7.
// Then add the result to the destination data.
vp9_short_idct16x16_add_neon_pass2(row_idct_output+1,
row_idct_output,
pass1_output,
1,
dest,
dest_stride);
/* Parallel idct on the right 8 columns */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the
// stage 6 result in pass1_output.
vp9_short_idct16x16_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7.
// Then add the result to the destination data.
vp9_short_idct16x16_add_neon_pass2(row_idct_output+8*16+1,
row_idct_output+8,
pass1_output,
1,
dest+8,
dest_stride);
// restore d8-d15 register values.
restore_neon_registers();
return;
}


@@ -0,0 +1,47 @@
/*
* Copyright (c) 2013 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vp9/common/vp9_common.h"
// defined in vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm
extern void idct32_transpose_and_transform(int16_t *transpose_buffer,
int16_t *output, int16_t *input);
extern void idct32_combine_add(uint8_t *dest, int16_t *out, int dest_stride);
// defined in vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm
extern void save_neon_registers();
extern void restore_neon_registers();
void vp9_short_idct32x32_add_neon(int16_t *input, uint8_t *dest,
int dest_stride) {
// TODO(cd): move the creation of these buffers within the ASM file
// internal buffer used to transpose 8 lines into before transforming them
int16_t transpose_buffer[32 * 8];
// results of the first pass (transpose and transform rows)
int16_t pass1[32 * 32];
// results of the second pass (transpose and transform columns)
int16_t pass2[32 * 32];
// save register we need to preserve
save_neon_registers();
// process rows
idct32_transpose_and_transform(transpose_buffer, pass1, input);
// process columns
// TODO(cd): do these two steps/passes within the ASM file
idct32_transpose_and_transform(transpose_buffer, pass2, pass1);
// combine and add to dest
// TODO(cd): integrate this within the last storage step of the second pass
idct32_combine_add(dest, pass2, dest_stride);
// restore register we need to preserve
restore_neon_registers();
}
// TODO(cd): Eliminate this file altogether when everything is in ASM file


@@ -361,8 +361,6 @@ v_end
vand d16, d20, d19 ; flat && mask
vmov r5, r6, d16
orrs r5, r5, r6 ; Check for 0
orreq r7, r7, #1 ; Only do filter branch
; flatmask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7)
vabd.u8 d22, d3, d7 ; abs(p4 - p0)
@@ -388,10 +386,11 @@ v_end
vmov.u8 d22, #0x80
orrs r5, r5, r6 ; Check for 0
orreq r7, r7, #1 ; Only do filter branch
vand d17, d18, d16 ; flat2 && flat && mask
vmov r5, r6, d17
orrs r5, r5, r6 ; Check for 0
orreq r7, r7, #2 ; Only do mbfilter branch
; mbfilter() function
@@ -405,15 +404,10 @@ v_end
vmov.u8 d27, #3
vsub.s8 d28, d23, d24 ; ( qs0 - ps0)
vqsub.s8 d29, d25, d26 ; filter = clamp(ps1-qs1)
vmull.s8 q15, d28, d27 ; 3 * ( qs0 - ps0)
vand d29, d29, d21 ; filter &= hev
vaddw.s8 q15, q15, d29 ; filter + 3 * (qs0 - ps0)
vmov.u8 d29, #4
; filter = clamp(filter + 3 * ( qs0 - ps0))
@@ -452,37 +446,37 @@ v_end
vaddl.u8 q15, d7, d8 ; op2 = p0 + q0
vmlal.u8 q15, d4, d27 ; op2 = p0 + q0 + p3 * 3
vmlal.u8 q15, d5, d29 ; op2 = p0 + q0 + p3 * 3 + p2 * 2
vaddl.u8 q10, d4, d5
vaddw.u8 q15, d6 ; op2=p1 + p0 + q0 + p3 * 3 + p2 *2
vaddl.u8 q14, d6, d9
vqrshrn.u16 d18, q15, #3 ; r_op2
vsubw.u8 q15, d4 ; op1 = op2 - p3
vsubw.u8 q15, d5 ; op1 -= p2
vaddw.u8 q15, d6 ; op1 += p1
vaddw.u8 q15, d9 ; op1 += q1
vsub.i16 q15, q10
vaddl.u8 q10, d4, d6
vadd.i16 q15, q14
vaddl.u8 q14, d7, d10
vqrshrn.u16 d19, q15, #3 ; r_op1
vsubw.u8 q15, d4 ; op0 = op1 - p3
vsubw.u8 q15, d6 ; op0 -= p1
vaddw.u8 q15, d7 ; op0 += p0
vaddw.u8 q15, d10 ; op0 += q2
vsub.i16 q15, q10
vadd.i16 q15, q14
vaddl.u8 q14, d8, d11
vqrshrn.u16 d20, q15, #3 ; r_op0
vsubw.u8 q15, d4 ; oq0 = op0 - p3
vsubw.u8 q15, d7 ; oq0 -= p0
vaddw.u8 q15, d8 ; oq0 += q0
vaddw.u8 q15, d11 ; oq0 += q3
vadd.i16 q15, q14
vaddl.u8 q14, d9, d11
vqrshrn.u16 d21, q15, #3 ; r_oq0
vsubw.u8 q15, d5 ; oq1 = oq0 - p2
vsubw.u8 q15, d8 ; oq1 -= q0
vaddw.u8 q15, d9 ; oq1 += q1
vaddw.u8 q15, d11 ; oq1 += q3
vadd.i16 q15, q14
vaddl.u8 q14, d10, d11
vqrshrn.u16 d22, q15, #3 ; r_oq1
vsubw.u8 q15, d6 ; oq2 = oq0 - p1
vsubw.u8 q15, d9 ; oq2 -= q1
vaddw.u8 q15, d10 ; oq2 += q2
vaddw.u8 q15, d11 ; oq2 += q3
vadd.i16 q15, q14
vqrshrn.u16 d27, q15, #3 ; r_oq2
; Filter does not set op2 or oq2, so use p2 and q2.
@@ -501,113 +495,104 @@ v_end
; wide_mbfilter flat2 && flat && mask branch
vmov.u8 d16, #7
vaddl.u8 q15, d7, d8 ; op6 = p0 + q0
vaddl.u8 q12, d2, d3
vaddl.u8 q13, d4, d5
vaddl.u8 q14, d1, d6
vmlal.u8 q15, d0, d16 ; op6 += p7 * 3
vmlal.u8 q15, d1, d29 ; op6 += p6 * 2
vaddw.u8 q15, d2 ; op6 += p5
vaddw.u8 q15, d3 ; op6 += p4
vaddw.u8 q15, d4 ; op6 += p3
vaddw.u8 q15, d5 ; op6 += p2
vaddw.u8 q15, d6 ; op6 += p1
vadd.i16 q12, q13
vadd.i16 q15, q14
vaddl.u8 q14, d2, d9
vadd.i16 q15, q12
vaddl.u8 q12, d0, d1
vaddw.u8 q15, d1
vaddl.u8 q13, d0, d2
vadd.i16 q14, q15, q14
vqrshrn.u16 d16, q15, #4 ; w_op6
vsubw.u8 q15, d0 ; op5 = op6 - p7
vsubw.u8 q15, d1 ; op5 -= p6
vaddw.u8 q15, d2 ; op5 += p5
vaddw.u8 q15, d9 ; op5 += q1
vsub.i16 q15, q14, q12
vaddl.u8 q14, d3, d10
vqrshrn.u16 d24, q15, #4 ; w_op5
vsubw.u8 q15, d0 ; op4 = op5 - p7
vsubw.u8 q15, d2 ; op4 -= p5
vaddw.u8 q15, d3 ; op4 += p4
vaddw.u8 q15, d10 ; op4 += q2
vsub.i16 q15, q13
vaddl.u8 q13, d0, d3
vadd.i16 q15, q14
vaddl.u8 q14, d4, d11
vqrshrn.u16 d25, q15, #4 ; w_op4
vsubw.u8 q15, d0 ; op3 = op4 - p7
vsubw.u8 q15, d3 ; op3 -= p4
vaddw.u8 q15, d4 ; op3 += p3
vaddw.u8 q15, d11 ; op3 += q3
vadd.i16 q15, q14
vaddl.u8 q14, d0, d4
vsub.i16 q15, q13
vsub.i16 q14, q15, q14
vqrshrn.u16 d26, q15, #4 ; w_op3
vsubw.u8 q15, d0 ; op2 = op3 - p7
vsubw.u8 q15, d4 ; op2 -= p3
vaddw.u8 q15, d5 ; op2 += p2
vaddw.u8 q15, q14, d5 ; op2 += p2
vaddl.u8 q14, d0, d5
vaddw.u8 q15, d12 ; op2 += q4
vbif d26, d4, d17 ; op3 |= p3 & ~(f2 & f & m)
vqrshrn.u16 d27, q15, #4 ; w_op2
vbif d27, d18, d17 ; op2 |= t_op2 & ~(f2 & f & m)
vsubw.u8 q15, d0 ; op1 = op2 - p7
vsubw.u8 q15, d5 ; op1 -= p2
vsub.i16 q15, q14
vaddl.u8 q14, d0, d6
vaddw.u8 q15, d6 ; op1 += p1
vaddw.u8 q15, d13 ; op1 += q5
vbif d27, d18, d17 ; op2 |= t_op2 & ~(f2 & f & m)
vqrshrn.u16 d18, q15, #4 ; w_op1
vbif d18, d19, d17 ; op1 |= t_op1 & ~(f2 & f & m)
vsubw.u8 q15, d0 ; op0 = op1 - p7
vsubw.u8 q15, d6 ; op0 -= p1
vsub.i16 q15, q14
vaddl.u8 q14, d0, d7
vaddw.u8 q15, d7 ; op0 += p0
vaddw.u8 q15, d14 ; op0 += q6
vbif d18, d19, d17 ; op1 |= t_op1 & ~(f2 & f & m)
vqrshrn.u16 d19, q15, #4 ; w_op0
vbif d19, d20, d17 ; op0 |= t_op0 & ~(f2 & f & m)
vsubw.u8 q15, d0 ; oq0 = op0 - p7
vsubw.u8 q15, d7 ; oq0 -= p0
vsub.i16 q15, q14
vaddl.u8 q14, d1, d8
vaddw.u8 q15, d8 ; oq0 += q0
vaddw.u8 q15, d15 ; oq0 += q7
vbif d19, d20, d17 ; op0 |= t_op0 & ~(f2 & f & m)
vqrshrn.u16 d20, q15, #4 ; w_oq0
vbif d20, d21, d17 ; oq0 |= t_oq0 & ~(f2 & f & m)
vsubw.u8 q15, d1 ; oq1 = oq0 - p6
vsubw.u8 q15, d8 ; oq1 -= q0
vsub.i16 q15, q14
vaddl.u8 q14, d2, d9
vaddw.u8 q15, d9 ; oq1 += q1
vaddl.u8 q4, d10, d15
vaddw.u8 q15, d15 ; oq1 += q7
vbif d20, d21, d17 ; oq0 |= t_oq0 & ~(f2 & f & m)
vqrshrn.u16 d21, q15, #4 ; w_oq1
vsub.i16 q15, q14
vaddl.u8 q14, d3, d10
vadd.i16 q15, q4
vaddl.u8 q4, d11, d15
vbif d21, d22, d17 ; oq1 |= t_oq1 & ~(f2 & f & m)
vsubw.u8 q15, d2 ; oq2 = oq1 - p5
vsubw.u8 q15, d9 ; oq2 -= q1
vaddw.u8 q15, d10 ; oq2 += q2
vaddw.u8 q15, d15 ; oq2 += q7
vqrshrn.u16 d22, q15, #4 ; w_oq2
vsub.i16 q15, q14
vaddl.u8 q14, d4, d11
vadd.i16 q15, q4
vaddl.u8 q4, d12, d15
vbif d22, d23, d17 ; oq2 |= t_oq2 & ~(f2 & f & m)
vsubw.u8 q15, d3 ; oq3 = oq2 - p4
vsubw.u8 q15, d10 ; oq3 -= q2
vaddw.u8 q15, d11 ; oq3 += q3
vaddw.u8 q15, d15 ; oq3 += q7
vqrshrn.u16 d23, q15, #4 ; w_oq3
vsub.i16 q15, q14
vaddl.u8 q14, d5, d12
vadd.i16 q15, q4
vaddl.u8 q4, d13, d15
vbif d16, d1, d17 ; op6 |= p6 & ~(f2 & f & m)
vsubw.u8 q15, d4 ; oq4 = oq3 - p3
vsubw.u8 q15, d11 ; oq4 -= q3
vaddw.u8 q15, d12 ; oq4 += q4
vaddw.u8 q15, d15 ; oq4 += q7
vqrshrn.u16 d1, q15, #4 ; w_oq4
vsub.i16 q15, q14
vaddl.u8 q14, d6, d13
vadd.i16 q15, q4
vaddl.u8 q4, d14, d15
vbif d24, d2, d17 ; op5 |= p5 & ~(f2 & f & m)
vsubw.u8 q15, d5 ; oq5 = oq4 - p2
vsubw.u8 q15, d12 ; oq5 -= q4
vaddw.u8 q15, d13 ; oq5 += q5
vaddw.u8 q15, d15 ; oq5 += q7
vqrshrn.u16 d2, q15, #4 ; w_oq5
vsub.i16 q15, q14
vbif d25, d3, d17 ; op4 |= p4 & ~(f2 & f & m)
vsubw.u8 q15, d6 ; oq6 = oq5 - p1
vsubw.u8 q15, d13 ; oq6 -= q5
vaddw.u8 q15, d14 ; oq6 += q6
vaddw.u8 q15, d15 ; oq6 += q7
vqrshrn.u16 d3, q15, #4 ; w_oq6
vbif d26, d4, d17 ; op3 |= p3 & ~(f2 & f & m)
vadd.i16 q15, q4
vbif d23, d11, d17 ; oq3 |= q3 & ~(f2 & f & m)
vqrshrn.u16 d3, q15, #4 ; w_oq6
vbif d1, d12, d17 ; oq4 |= q4 & ~(f2 & f & m)
vbif d2, d13, d17 ; oq5 |= q5 & ~(f2 & f & m)
vbif d3, d14, d17 ; oq6 |= q6 & ~(f2 & f & m)
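The reshuffled arithmetic above exploits the sliding-window structure of the wide (flat2) filter: adjacent outputs share most of their 15-tap sum, so each step retires one pair of taps and admits another, and the new code precomputes those pairs with vaddl.u8 into spare registers instead of chaining vsubw/vaddw on q15. A scalar sketch of the first few steps (hypothetical name; the recurrence follows the asm comments):

/* Sketch of the flat2 wide-filter recurrence. Only the first outputs are
 * shown; the pattern continues down to oq6 with clamping at q7. */
static void wide_filter_sketch(int p7, int p6, int p5, int p4, int p3,
                               int p2, int p1, int p0, int q0, int q1,
                               int q2, int *op6, int *op5, int *op4) {
  int sum = 7 * p7 + 2 * p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0;
  *op6 = (sum + 8) >> 4;          /* w_op6 */
  sum += p5 + q1 - p7 - p6;       /* slide the window one pixel */
  *op5 = (sum + 8) >> 4;          /* w_op5 */
  sum += p4 + q2 - p7 - p5;
  *op4 = (sum + 8) >> 4;          /* w_op4 */
}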


@@ -0,0 +1,198 @@
;
; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license and patent
; grant that can be found in the LICENSE file in the root of the source
; tree. All contributing project authors may be found in the AUTHORS
; file in the root of the source tree.
;
EXPORT |vp9_short_idct16x16_1_add_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;void vp9_short_idct16x16_1_add_neon(int16_t *input, uint8_t *dest,
; int dest_stride)
;
; r0 int16_t input
; r1 uint8_t *dest
; r2 int dest_stride)
|vp9_short_idct16x16_1_add_neon| PROC
ldrsh r0, [r0]
; generate cospi_16_64 = 11585
mov r12, #0x2d00
add r12, #0x41
; out = dct_const_round_shift(input[0] * cospi_16_64)
mul r0, r0, r12 ; input[0] * cospi_16_64
add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
asr r0, r0, #14 ; >> DCT_CONST_BITS
; out = dct_const_round_shift(out * cospi_16_64)
mul r0, r0, r12 ; out * cospi_16_64
mov r12, r1 ; save dest
add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
asr r0, r0, #14 ; >> DCT_CONST_BITS
; a1 = ROUND_POWER_OF_TWO(out, 6)
add r0, r0, #32 ; + (1 <<((6) - 1))
asr r0, r0, #6 ; >> 6
vdup.s16 q0, r0 ; duplicate a1
mov r0, #8
sub r2, #8
; load destination data row0 - row3
vld1.64 {d2}, [r1], r0
vld1.64 {d3}, [r1], r2
vld1.64 {d4}, [r1], r0
vld1.64 {d5}, [r1], r2
vld1.64 {d6}, [r1], r0
vld1.64 {d7}, [r1], r2
vld1.64 {d16}, [r1], r0
vld1.64 {d17}, [r1], r2
vaddw.u8 q9, q0, d2 ; dest[x] + a1
vaddw.u8 q10, q0, d3 ; dest[x] + a1
vaddw.u8 q11, q0, d4 ; dest[x] + a1
vaddw.u8 q12, q0, d5 ; dest[x] + a1
vqmovun.s16 d2, q9 ; clip_pixel
vqmovun.s16 d3, q10 ; clip_pixel
vqmovun.s16 d30, q11 ; clip_pixel
vqmovun.s16 d31, q12 ; clip_pixel
vst1.64 {d2}, [r12], r0
vst1.64 {d3}, [r12], r2
vst1.64 {d30}, [r12], r0
vst1.64 {d31}, [r12], r2
vaddw.u8 q9, q0, d6 ; dest[x] + a1
vaddw.u8 q10, q0, d7 ; dest[x] + a1
vaddw.u8 q11, q0, d16 ; dest[x] + a1
vaddw.u8 q12, q0, d17 ; dest[x] + a1
vqmovun.s16 d2, q9 ; clip_pixel
vqmovun.s16 d3, q10 ; clip_pixel
vqmovun.s16 d30, q11 ; clip_pixel
vqmovun.s16 d31, q12 ; clip_pixel
vst1.64 {d2}, [r12], r0
vst1.64 {d3}, [r12], r2
vst1.64 {d30}, [r12], r0
vst1.64 {d31}, [r12], r2
; load destination data row4 - row7
vld1.64 {d2}, [r1], r0
vld1.64 {d3}, [r1], r2
vld1.64 {d4}, [r1], r0
vld1.64 {d5}, [r1], r2
vld1.64 {d6}, [r1], r0
vld1.64 {d7}, [r1], r2
vld1.64 {d16}, [r1], r0
vld1.64 {d17}, [r1], r2
vaddw.u8 q9, q0, d2 ; dest[x] + a1
vaddw.u8 q10, q0, d3 ; dest[x] + a1
vaddw.u8 q11, q0, d4 ; dest[x] + a1
vaddw.u8 q12, q0, d5 ; dest[x] + a1
vqmovun.s16 d2, q9 ; clip_pixel
vqmovun.s16 d3, q10 ; clip_pixel
vqmovun.s16 d30, q11 ; clip_pixel
vqmovun.s16 d31, q12 ; clip_pixel
vst1.64 {d2}, [r12], r0
vst1.64 {d3}, [r12], r2
vst1.64 {d30}, [r12], r0
vst1.64 {d31}, [r12], r2
vaddw.u8 q9, q0, d6 ; dest[x] + a1
vaddw.u8 q10, q0, d7 ; dest[x] + a1
vaddw.u8 q11, q0, d16 ; dest[x] + a1
vaddw.u8 q12, q0, d17 ; dest[x] + a1
vqmovun.s16 d2, q9 ; clip_pixel
vqmovun.s16 d3, q10 ; clip_pixel
vqmovun.s16 d30, q11 ; clip_pixel
vqmovun.s16 d31, q12 ; clip_pixel
vst1.64 {d2}, [r12], r0
vst1.64 {d3}, [r12], r2
vst1.64 {d30}, [r12], r0
vst1.64 {d31}, [r12], r2
; load destination data row8 - row11
vld1.64 {d2}, [r1], r0
vld1.64 {d3}, [r1], r2
vld1.64 {d4}, [r1], r0
vld1.64 {d5}, [r1], r2
vld1.64 {d6}, [r1], r0
vld1.64 {d7}, [r1], r2
vld1.64 {d16}, [r1], r0
vld1.64 {d17}, [r1], r2
vaddw.u8 q9, q0, d2 ; dest[x] + a1
vaddw.u8 q10, q0, d3 ; dest[x] + a1
vaddw.u8 q11, q0, d4 ; dest[x] + a1
vaddw.u8 q12, q0, d5 ; dest[x] + a1
vqmovun.s16 d2, q9 ; clip_pixel
vqmovun.s16 d3, q10 ; clip_pixel
vqmovun.s16 d30, q11 ; clip_pixel
vqmovun.s16 d31, q12 ; clip_pixel
vst1.64 {d2}, [r12], r0
vst1.64 {d3}, [r12], r2
vst1.64 {d30}, [r12], r0
vst1.64 {d31}, [r12], r2
vaddw.u8 q9, q0, d6 ; dest[x] + a1
vaddw.u8 q10, q0, d7 ; dest[x] + a1
vaddw.u8 q11, q0, d16 ; dest[x] + a1
vaddw.u8 q12, q0, d17 ; dest[x] + a1
vqmovun.s16 d2, q9 ; clip_pixel
vqmovun.s16 d3, q10 ; clip_pixel
vqmovun.s16 d30, q11 ; clip_pixel
vqmovun.s16 d31, q12 ; clip_pixel
vst1.64 {d2}, [r12], r0
vst1.64 {d3}, [r12], r2
vst1.64 {d30}, [r12], r0
vst1.64 {d31}, [r12], r2
; load destination data row12 - row15
vld1.64 {d2}, [r1], r0
vld1.64 {d3}, [r1], r2
vld1.64 {d4}, [r1], r0
vld1.64 {d5}, [r1], r2
vld1.64 {d6}, [r1], r0
vld1.64 {d7}, [r1], r2
vld1.64 {d16}, [r1], r0
vld1.64 {d17}, [r1], r2
vaddw.u8 q9, q0, d2 ; dest[x] + a1
vaddw.u8 q10, q0, d3 ; dest[x] + a1
vaddw.u8 q11, q0, d4 ; dest[x] + a1
vaddw.u8 q12, q0, d5 ; dest[x] + a1
vqmovun.s16 d2, q9 ; clip_pixel
vqmovun.s16 d3, q10 ; clip_pixel
vqmovun.s16 d30, q11 ; clip_pixel
vqmovun.s16 d31, q12 ; clip_pixel
vst1.64 {d2}, [r12], r0
vst1.64 {d3}, [r12], r2
vst1.64 {d30}, [r12], r0
vst1.64 {d31}, [r12], r2
vaddw.u8 q9, q0, d6 ; dest[x] + a1
vaddw.u8 q10, q0, d7 ; dest[x] + a1
vaddw.u8 q11, q0, d16 ; dest[x] + a1
vaddw.u8 q12, q0, d17 ; dest[x] + a1
vqmovun.s16 d2, q9 ; clip_pixel
vqmovun.s16 d3, q10 ; clip_pixel
vqmovun.s16 d30, q11 ; clip_pixel
vqmovun.s16 d31, q12 ; clip_pixel
vst1.64 {d2}, [r12], r0
vst1.64 {d3}, [r12], r2
vst1.64 {d30}, [r12], r0
vst1.64 {d31}, [r12], r2
bx lr
ENDP ; |vp9_short_idct16x16_1_add_neon|
END
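A scalar sketch of the DC-only path this routine vectorizes (hypothetical name; the constants and shifts mirror the comments above):

#include <stdint.h>

/* Sketch: two dct_const_round_shift multiplies by cospi_16_64 turn the
 * DC coefficient into a single bias a1, which is then added to every
 * destination pixel with clamping. */
static void idct16x16_1_add_ref(const int16_t *input, uint8_t *dest,
                                int stride) {
  const int cospi_16_64 = 11585;
  int out = (input[0] * cospi_16_64 + (1 << 13)) >> 14;  /* DCT_CONST_BITS */
  out = (out * cospi_16_64 + (1 << 13)) >> 14;
  const int a1 = (out + 32) >> 6;  /* ROUND_POWER_OF_TWO(out, 6) */
  for (int r = 0; r < 16; ++r, dest += stride)
    for (int c = 0; c < 16; ++c) {
      const int v = dest[c] + a1;
      dest[c] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));  /* clip_pixel */
    }
}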

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -0,0 +1,68 @@
;
; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license and patent
; grant that can be found in the LICENSE file in the root of the source
; tree. All contributing project authors may be found in the AUTHORS
; file in the root of the source tree.
;
EXPORT |vp9_short_idct4x4_1_add_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;void vp9_short_idct4x4_1_add_neon(int16_t *input, uint8_t *dest,
; int dest_stride)
;
; r0 int16_t input
; r1 uint8_t *dest
; r2 int dest_stride)
|vp9_short_idct4x4_1_add_neon| PROC
ldrsh r0, [r0]
; generate cospi_16_64 = 11585
mov r12, #0x2d00
add r12, #0x41
; out = dct_const_round_shift(input[0] * cospi_16_64)
mul r0, r0, r12 ; input[0] * cospi_16_64
add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
asr r0, r0, #14 ; >> DCT_CONST_BITS
; out = dct_const_round_shift(out * cospi_16_64)
mul r0, r0, r12 ; out * cospi_16_64
mov r12, r1 ; save dest
add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
asr r0, r0, #14 ; >> DCT_CONST_BITS
; a1 = ROUND_POWER_OF_TWO(out, 4)
add r0, r0, #8 ; + (1 <<((4) - 1))
asr r0, r0, #4 ; >> 4
vdup.s16 q0, r0 ; duplicate a1
vld1.32 {d2[0]}, [r1], r2
vld1.32 {d2[1]}, [r1], r2
vld1.32 {d4[0]}, [r1], r2
vld1.32 {d4[1]}, [r1]
vaddw.u8 q8, q0, d2 ; dest[x] + a1
vaddw.u8 q9, q0, d4
vqmovun.s16 d6, q8 ; clip_pixel
vqmovun.s16 d7, q9
vst1.32 {d6[0]}, [r12], r2
vst1.32 {d6[1]}, [r12], r2
vst1.32 {d7[0]}, [r12], r2
vst1.32 {d7[1]}, [r12]
bx lr
ENDP ; |vp9_short_idct4x4_1_add_neon|
END


@@ -0,0 +1,190 @@
;
; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp9_short_idct4x4_add_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
AREA Block, CODE, READONLY ; name this block of code
;void vp9_short_idct4x4_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
;
; r0 int16_t input
; r1 uint8_t *dest
; r2 int dest_stride)
|vp9_short_idct4x4_add_neon| PROC
; The 2D transform is done with two passes which are actually pretty
; similar. We first transform the rows. This is done by transposing
; the inputs, doing an SIMD column transform (the columns are the
; transposed rows) and then transpose the results (so that it goes back
; in normal/row positions). Then, we transform the columns by doing
; another SIMD column transform.
; So, two passes of a transpose followed by a column transform.
; load the inputs into q8-q9, d16-d19
vld1.s16 {q8,q9}, [r0]!
; generate scalar constants
; cospi_8_64 = 15137 = 0x3b21
mov r0, #0x3b00
add r0, #0x21
; cospi_16_64 = 11585 = 0x2d41
mov r3, #0x2d00
add r3, #0x41
; cospi_24_64 = 6270 = 0x187e
mov r12, #0x1800
add r12, #0x7e
; transpose the input data
; 00 01 02 03 d16
; 10 11 12 13 d17
; 20 21 22 23 d18
; 30 31 32 33 d19
vtrn.16 d16, d17
vtrn.16 d18, d19
; generate constant vectors
vdup.16 d20, r0 ; replicate cospi_8_64
vdup.16 d21, r3 ; replicate cospi_16_64
; 00 10 02 12 d16
; 01 11 03 13 d17
; 20 30 22 32 d18
; 21 31 23 33 d19
vtrn.32 q8, q9
; 00 10 20 30 d16
; 01 11 21 31 d17
; 02 12 22 32 d18
; 03 13 23 33 d19
vdup.16 d22, r12 ; replicate cospi_24_64
; do the transform on transposed rows
; stage 1
vadd.s16 d23, d16, d18 ; (input[0] + input[2])
vsub.s16 d24, d16, d18 ; (input[0] - input[2])
vmull.s16 q15, d17, d22 ; input[1] * cospi_24_64
vmull.s16 q1, d17, d20 ; input[1] * cospi_8_64
; (input[0] + input[2]) * cospi_16_64;
; (input[0] - input[2]) * cospi_16_64;
vmull.s16 q13, d23, d21
vmull.s16 q14, d24, d21
; input[1] * cospi_24_64 - input[3] * cospi_8_64;
; input[1] * cospi_8_64 + input[3] * cospi_24_64;
vmlsl.s16 q15, d19, d20
vmlal.s16 q1, d19, d22
; dct_const_round_shift
vqrshrn.s32 d26, q13, #14
vqrshrn.s32 d27, q14, #14
vqrshrn.s32 d29, q15, #14
vqrshrn.s32 d28, q1, #14
; stage 2
; output[0] = step[0] + step[3];
; output[1] = step[1] + step[2];
; output[3] = step[0] - step[3];
; output[2] = step[1] - step[2];
vadd.s16 q8, q13, q14
vsub.s16 q9, q13, q14
vswp d18, d19
; transpose the results
; 00 01 02 03 d16
; 10 11 12 13 d17
; 20 21 22 23 d18
; 30 31 32 33 d19
vtrn.16 d16, d17
vtrn.16 d18, d19
; 00 10 02 12 d16
; 01 11 03 13 d17
; 20 30 22 32 d18
; 21 31 23 33 d19
vtrn.32 q8, q9
; 00 10 20 30 d16
; 01 11 21 31 d17
; 02 12 22 32 d18
; 03 13 23 33 d19
; do the transform on columns
; stage 1
vadd.s16 d23, d16, d18 ; (input[0] + input[2])
vsub.s16 d24, d16, d18 ; (input[0] - input[2])
vmull.s16 q15, d17, d22 ; input[1] * cospi_24_64
vmull.s16 q1, d17, d20 ; input[1] * cospi_8_64
; (input[0] + input[2]) * cospi_16_64;
; (input[0] - input[2]) * cospi_16_64;
vmull.s16 q13, d23, d21
vmull.s16 q14, d24, d21
; input[1] * cospi_24_64 - input[3] * cospi_8_64;
; input[1] * cospi_8_64 + input[3] * cospi_24_64;
vmlsl.s16 q15, d19, d20
vmlal.s16 q1, d19, d22
; dct_const_round_shift
vqrshrn.s32 d26, q13, #14
vqrshrn.s32 d27, q14, #14
vqrshrn.s32 d29, q15, #14
vqrshrn.s32 d28, q1, #14
; stage 2
; output[0] = step[0] + step[3];
; output[1] = step[1] + step[2];
; output[3] = step[0] - step[3];
; output[2] = step[1] - step[2];
vadd.s16 q8, q13, q14
vsub.s16 q9, q13, q14
; The results are in two registers, one of them being swapped. This will
; be taken care of by loading the 'dest' value in a swapped fashion and
; also storing them in the same swapped fashion.
; temp_out[0, 1] = d16, d17 = q8
; temp_out[2, 3] = d19, d18 = q9 swapped
; ROUND_POWER_OF_TWO(temp_out[j], 4)
vrshr.s16 q8, q8, #4
vrshr.s16 q9, q9, #4
vld1.32 {d26[0]}, [r1], r2
vld1.32 {d26[1]}, [r1], r2
vld1.32 {d27[1]}, [r1], r2
vld1.32 {d27[0]}, [r1] ; no post-increment
; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i]
vaddw.u8 q8, q8, d26
vaddw.u8 q9, q9, d27
; clip_pixel
vqmovun.s16 d26, q8
vqmovun.s16 d27, q9
; do the stores in reverse order with negative post-increment, by changing
; the sign of the stride
rsb r2, r2, #0
vst1.32 {d27[0]}, [r1], r2
vst1.32 {d27[1]}, [r1], r2
vst1.32 {d26[1]}, [r1], r2
vst1.32 {d26[0]}, [r1] ; no post-increment
bx lr
ENDP ; |vp9_short_idct4x4_add_neon|
END
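A scalar sketch of the two-pass structure described at the top of the routine (hypothetical names; the butterflies follow the stage 1 / stage 2 comments):

#include <stdint.h>

/* One 1-D 4-point IDCT, in place on four values. */
static void idct4(int16_t v[4]) {
  const int cospi_8_64 = 15137, cospi_16_64 = 11585, cospi_24_64 = 6270;
  const int s0 = ((v[0] + v[2]) * cospi_16_64 + (1 << 13)) >> 14; /* step[0] */
  const int s1 = ((v[0] - v[2]) * cospi_16_64 + (1 << 13)) >> 14; /* step[1] */
  const int s2 = (v[1] * cospi_24_64 - v[3] * cospi_8_64 + (1 << 13)) >> 14;
  const int s3 = (v[1] * cospi_8_64 + v[3] * cospi_24_64 + (1 << 13)) >> 14;
  v[0] = (int16_t)(s0 + s3);  /* output[0] = step[0] + step[3] */
  v[1] = (int16_t)(s1 + s2);  /* output[1] = step[1] + step[2] */
  v[2] = (int16_t)(s1 - s2);  /* output[2] = step[1] - step[2] */
  v[3] = (int16_t)(s0 - s3);  /* output[3] = step[0] - step[3] */
}

/* 2-D transform = rows pass, then columns pass; the asm realizes the
 * rows pass as transpose + column transform + transpose. */
static void idct4x4_2d_sketch(int16_t b[16]) {
  for (int r = 0; r < 4; ++r) idct4(b + 4 * r);  /* pass 1: rows */
  for (int c = 0; c < 4; ++c) {                  /* pass 2: columns */
    int16_t col[4];
    for (int r = 0; r < 4; ++r) col[r] = b[4 * r + c];
    idct4(col);
    for (int r = 0; r < 4; ++r) b[4 * r + c] = col[r];
  }
}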


@@ -0,0 +1,88 @@
;
; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license and patent
; grant that can be found in the LICENSE file in the root of the source
; tree. All contributing project authors may be found in the AUTHORS
; file in the root of the source tree.
;
EXPORT |vp9_short_idct8x8_1_add_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;void vp9_short_idct8x8_1_add_neon(int16_t *input, uint8_t *dest,
; int dest_stride)
;
; r0 int16_t input
; r1 uint8_t *dest
; r2 int dest_stride)
|vp9_short_idct8x8_1_add_neon| PROC
ldrsh r0, [r0]
; generate cospi_16_64 = 11585
mov r12, #0x2d00
add r12, #0x41
; out = dct_const_round_shift(input[0] * cospi_16_64)
mul r0, r0, r12 ; input[0] * cospi_16_64
add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
asr r0, r0, #14 ; >> DCT_CONST_BITS
; out = dct_const_round_shift(out * cospi_16_64)
mul r0, r0, r12 ; out * cospi_16_64
mov r12, r1 ; save dest
add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
asr r0, r0, #14 ; >> DCT_CONST_BITS
; a1 = ROUND_POWER_OF_TWO(out, 5)
add r0, r0, #16 ; + (1 <<((5) - 1))
asr r0, r0, #5 ; >> 5
vdup.s16 q0, r0 ; duplicate a1
; load destination data
vld1.64 {d2}, [r1], r2
vld1.64 {d3}, [r1], r2
vld1.64 {d4}, [r1], r2
vld1.64 {d5}, [r1], r2
vld1.64 {d6}, [r1], r2
vld1.64 {d7}, [r1], r2
vld1.64 {d16}, [r1], r2
vld1.64 {d17}, [r1]
vaddw.u8 q9, q0, d2 ; dest[x] + a1
vaddw.u8 q10, q0, d3 ; dest[x] + a1
vaddw.u8 q11, q0, d4 ; dest[x] + a1
vaddw.u8 q12, q0, d5 ; dest[x] + a1
vqmovun.s16 d2, q9 ; clip_pixel
vqmovun.s16 d3, q10 ; clip_pixel
vqmovun.s16 d30, q11 ; clip_pixel
vqmovun.s16 d31, q12 ; clip_pixel
vst1.64 {d2}, [r12], r2
vst1.64 {d3}, [r12], r2
vst1.64 {d30}, [r12], r2
vst1.64 {d31}, [r12], r2
vaddw.u8 q9, q0, d6 ; dest[x] + a1
vaddw.u8 q10, q0, d7 ; dest[x] + a1
vaddw.u8 q11, q0, d16 ; dest[x] + a1
vaddw.u8 q12, q0, d17 ; dest[x] + a1
vqmovun.s16 d2, q9 ; clip_pixel
vqmovun.s16 d3, q10 ; clip_pixel
vqmovun.s16 d30, q11 ; clip_pixel
vqmovun.s16 d31, q12 ; clip_pixel
vst1.64 {d2}, [r12], r2
vst1.64 {d3}, [r12], r2
vst1.64 {d30}, [r12], r2
vst1.64 {d31}, [r12], r2
bx lr
ENDP ; |vp9_short_idct8x8_1_add_neon|
END


@@ -9,6 +9,7 @@
;
EXPORT |vp9_short_idct8x8_add_neon|
EXPORT |vp9_short_idct10_8x8_add_neon|
ARM
REQUIRE8
PRESERVE8
@@ -24,191 +25,149 @@
; stage 1
vdup.16 d0, r3 ; duplicate cospi_28_64
vdup.16 d1, r4 ; duplicate cospi_4_64
vdup.16 d2, r5 ; duplicate cospi_12_64
vdup.16 d3, r6 ; duplicate cospi_20_64
; input[1] * cospi_28_64
vmull.s16 q2, d18, d0
vmull.s16 q3, d19, d0
; input[7] * cospi_4_64
vmull.s16 q4, d30, d1
vmull.s16 q5, d31, d1
; input[5] * cospi_12_64
vmull.s16 q5, d26, d2
vmull.s16 q6, d27, d2
; input[1]*cospi_28_64-input[7]*cospi_4_64
vsub.s32 q6, q2, q4
vsub.s32 q7, q3, q5
vmlsl.s16 q2, d30, d1
vmlsl.s16 q3, d31, d1
; input[5] * cospi_12_64 - input[3] * cospi_20_64
vmlsl.s16 q5, d22, d3
vmlsl.s16 q6, d23, d3
; dct_const_round_shift(input_dc * cospi_16_64)
vqrshrn.s32 d8, q6, #14 ; >> 14
vqrshrn.s32 d9, q7, #14 ; >> 14
vqrshrn.s32 d8, q2, #14 ; >> 14
vqrshrn.s32 d9, q3, #14 ; >> 14
; dct_const_round_shift(input_dc * cospi_16_64)
vqrshrn.s32 d10, q5, #14 ; >> 14
vqrshrn.s32 d11, q6, #14 ; >> 14
; input[1] * cospi_4_64
vmull.s16 q2, d18, d1
vmull.s16 q3, d19, d1
; input[7] * cospi_28_64
vmull.s16 q1, d30, d0
vmull.s16 q5, d31, d0
; input[5] * cospi_20_64
vmull.s16 q9, d26, d3
vmull.s16 q13, d27, d3
; input[1]*cospi_4_64+input[7]*cospi_28_64
vadd.s32 q2, q2, q1
vadd.s32 q3, q3, q5
vmlal.s16 q2, d30, d0
vmlal.s16 q3, d31, d0
; input[5] * cospi_20_64 + input[3] * cospi_12_64
vmlal.s16 q9, d22, d2
vmlal.s16 q13, d23, d2
; dct_const_round_shift(input_dc * cospi_16_64)
vqrshrn.s32 d14, q2, #14 ; >> 14
vqrshrn.s32 d15, q3, #14 ; >> 14
vdup.16 d0, r5 ; duplicate cospi_12_64
vdup.16 d1, r6 ; duplicate cospi_20_64
; input[5] * cospi_12_64
vmull.s16 q2, d26, d0
vmull.s16 q3, d27, d0
; input[3] * cospi_20_64
vmull.s16 q5, d22, d1
vmull.s16 q6, d23, d1
; input[5] * cospi_12_64 - input[3] * cospi_20_64
vsub.s32 q2, q2, q5
vsub.s32 q3, q3, q6
; dct_const_round_shift(input_dc * cospi_16_64)
vqrshrn.s32 d10, q2, #14 ; >> 14
vqrshrn.s32 d11, q3, #14 ; >> 14
; input[5] * cospi_20_64
vmull.s16 q2, d26, d1
vmull.s16 q3, d27, d1
; input[3] * cospi_12_64
vmull.s16 q9, d22, d0
vmull.s16 q15, d23, d0
; input[5] * cospi_20_64 + input[3] * cospi_12_64
vadd.s32 q0, q2, q9
vadd.s32 q1, q3, q15
; dct_const_round_shift(input_dc * cospi_16_64)
vqrshrn.s32 d12, q0, #14 ; >> 14
vqrshrn.s32 d13, q1, #14 ; >> 14
; stage 2 & stage 3 - even half
vdup.16 d0, r7 ; duplicate cospi_16_64
; dct_const_round_shift(input_dc * cospi_16_64)
vqrshrn.s32 d12, q9, #14 ; >> 14
vqrshrn.s32 d13, q13, #14 ; >> 14
; input[0] * cospi_16_64
vmull.s16 q2, d16, d0
vmull.s16 q3, d17, d0
; input[2] * cospi_16_64
vmull.s16 q9, d24, d0
vmull.s16 q11, d25, d0
; input[0] * cospi_16_64
vmull.s16 q13, d16, d0
vmull.s16 q15, d17, d0
; (input[0] + input[2]) * cospi_16_64
vadd.s32 q9, q2, q9
vadd.s32 q11, q3, q11
; dct_const_round_shift(input_dc * cospi_16_64)
vqrshrn.s32 d18, q9, #14 ; >> 14
vqrshrn.s32 d19, q11, #14 ; >> 14
; input[0] * cospi_16_64
vmull.s16 q2, d16, d0
vmull.s16 q3, d17, d0
; input[2] * cospi_16_64
vmull.s16 q13, d24, d0
vmull.s16 q15, d25, d0
vmlal.s16 q2, d24, d0
vmlal.s16 q3, d25, d0
; (input[0] - input[2]) * cospi_16_64
vsub.s32 q2, q2, q13
vsub.s32 q3, q3, q15
vmlsl.s16 q13, d24, d0
vmlsl.s16 q15, d25, d0
; dct_const_round_shift(input_dc * cospi_16_64)
vqrshrn.s32 d22, q2, #14 ; >> 14
vqrshrn.s32 d23, q3, #14 ; >> 14
; input[1] * cospi_24_64 - input[3] * cospi_8_64
vdup.16 d0, r8 ; duplicate cospi_24_64
vdup.16 d1, r9 ; duplicate cospi_8_64
; dct_const_round_shift(input_dc * cospi_16_64)
vqrshrn.s32 d18, q2, #14 ; >> 14
vqrshrn.s32 d19, q3, #14 ; >> 14
; dct_const_round_shift(input_dc * cospi_16_64)
vqrshrn.s32 d22, q13, #14 ; >> 14
vqrshrn.s32 d23, q15, #14 ; >> 14
; input[1] * cospi_24_64 - input[3] * cospi_8_64
; input[1] * cospi_24_64
vmull.s16 q2, d20, d0
vmull.s16 q3, d21, d0
; input[3] * cospi_8_64
vmull.s16 q13, d28, d1
vmull.s16 q15, d29, d1
; input[1] * cospi_8_64
vmull.s16 q8, d20, d1
vmull.s16 q12, d21, d1
; input[1] * cospi_24_64 - input[3] * cospi_8_64
vsub.s32 q2, q2, q13
vsub.s32 q3, q3, q15
vmlsl.s16 q2, d28, d1
vmlsl.s16 q3, d29, d1
; input[1] * cospi_8_64 + input[3] * cospi_24_64
vmlal.s16 q8, d28, d0
vmlal.s16 q12, d29, d0
; dct_const_round_shift(input_dc * cospi_16_64)
vqrshrn.s32 d26, q2, #14 ; >> 14
vqrshrn.s32 d27, q3, #14 ; >> 14
; input[1] * cospi_8_64
vmull.s16 q2, d20, d1
vmull.s16 q3, d21, d1
; input[3] * cospi_24_64
vmull.s16 q8, d28, d0
vmull.s16 q10, d29, d0
; input[1] * cospi_8_64 + input[3] * cospi_24_64
vadd.s32 q0, q2, q8
vadd.s32 q1, q3, q10
; dct_const_round_shift(input_dc * cospi_16_64)
vqrshrn.s32 d30, q0, #14 ; >> 14
vqrshrn.s32 d31, q1, #14 ; >> 14
vqrshrn.s32 d30, q8, #14 ; >> 14
vqrshrn.s32 d31, q12, #14 ; >> 14
vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3]
vadd.s16 q1, q11, q13 ; output[1] = step[1] + step[2]
vsub.s16 q2, q11, q13 ; output[2] = step[1] - step[2]
vsub.s16 q3, q9, q15 ; output[3] = step[0] - step[3]
; stage 3 -odd half
vdup.16 d16, r7 ; duplicate cospi_16_64
; stage 2 - odd half
vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5]
vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5]
vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7]
vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7]
; stage 3 -odd half
vdup.16 d16, r7 ; duplicate cospi_16_64
; step2[6] * cospi_16_64
vmull.s16 q9, d28, d16
vmull.s16 q10, d29, d16
; step2[5] * cospi_16_64
vmull.s16 q11, d26, d16
vmull.s16 q12, d27, d16
; step2[6] * cospi_16_64
vmull.s16 q11, d28, d16
vmull.s16 q12, d29, d16
; (step2[6] - step2[5]) * cospi_16_64
vsub.s32 q9, q9, q11
vsub.s32 q10, q10, q12
vmlsl.s16 q9, d26, d16
vmlsl.s16 q10, d27, d16
; (step2[5] + step2[6]) * cospi_16_64
vmlal.s16 q11, d26, d16
vmlal.s16 q12, d27, d16
; dct_const_round_shift(input_dc * cospi_16_64)
vqrshrn.s32 d10, q9, #14 ; >> 14
vqrshrn.s32 d11, q10, #14 ; >> 14
; step2[6] * cospi_16_64
vmull.s16 q9, d28, d16
vmull.s16 q10, d29, d16
; step2[5] * cospi_16_64
vmull.s16 q11, d26, d16
vmull.s16 q12, d27, d16
; (step2[5] + step2[6]) * cospi_16_64
vadd.s32 q9, q9, q11
vadd.s32 q10, q10, q12
; dct_const_round_shift(input_dc * cospi_16_64)
vqrshrn.s32 d12, q9, #14 ; >> 14
vqrshrn.s32 d13, q10, #14 ; >> 14
vqrshrn.s32 d12, q11, #14 ; >> 14
vqrshrn.s32 d13, q12, #14 ; >> 14
; stage 4
vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7];
@@ -247,14 +206,11 @@
|vp9_short_idct8x8_add_neon| PROC
push {r4-r9}
vld1.s16 {q8}, [r0]!
vld1.s16 {q9}, [r0]!
vld1.s16 {q10}, [r0]!
vld1.s16 {q11}, [r0]!
vld1.s16 {q12}, [r0]!
vld1.s16 {q13}, [r0]!
vld1.s16 {q14}, [r0]!
vld1.s16 {q15}, [r0]!
vpush {d8-d15}
vld1.s16 {q8,q9}, [r0]!
vld1.s16 {q10,q11}, [r0]!
vld1.s16 {q12,q13}, [r0]!
vld1.s16 {q14,q15}, [r0]!
; transpose the input data
TRANSPOSE8X8
@@ -349,8 +305,215 @@
vst1.64 {d6}, [r0], r2
vst1.64 {d7}, [r0], r2
vpop {d8-d15}
pop {r4-r9}
bx lr
ENDP ; |vp9_short_idct8x8_add_neon|
;void vp9_short_idct10_8x8_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
;
; r0 int16_t input
; r1 uint8_t *dest
; r2 int dest_stride)
|vp9_short_idct10_8x8_add_neon| PROC
push {r4-r9}
vpush {d8-d15}
vld1.s16 {q8,q9}, [r0]!
vld1.s16 {q10,q11}, [r0]!
vld1.s16 {q12,q13}, [r0]!
vld1.s16 {q14,q15}, [r0]!
; transpose the input data
TRANSPOSE8X8
; generate cospi_28_64 = 3196
mov r3, #0x0c00
add r3, #0x7c
; generate cospi_4_64 = 16069
mov r4, #0x3e00
add r4, #0xc5
; generate cospi_12_64 = 13623
mov r5, #0x3500
add r5, #0x37
; generate cospi_20_64 = 9102
mov r6, #0x2300
add r6, #0x8e
; generate cospi_16_64 = 11585
mov r7, #0x2d00
add r7, #0x41
; generate cospi_24_64 = 6270
mov r8, #0x1800
add r8, #0x7e
; generate cospi_8_64 = 15137
mov r9, #0x3b00
add r9, #0x21
; First transform rows
; stage 1
; The following instructions use vqrdmulh to do the
; dct_const_round_shift(input[1] * cospi_28_64). vqrdmulh does a doubling
; multiply and shifts the result by 16 bits instead of 14, so the
; constants are doubled before multiplying to compensate.
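; Concretely: vqrdmulh(x, 2 * c) = (2 * x * (2 * c) + (1 << 15)) >> 16
;                                = (x * c + (1 << 13)) >> 14,
; which matches dct_const_round_shift(x * c) with DCT_CONST_BITS = 14.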
mov r12, r3, lsl #1
vdup.16 q0, r12 ; duplicate cospi_28_64*2
mov r12, r4, lsl #1
vdup.16 q1, r12 ; duplicate cospi_4_64*2
; dct_const_round_shift(input[1] * cospi_28_64)
vqrdmulh.s16 q4, q9, q0
mov r12, r6, lsl #1
rsb r12, #0
vdup.16 q0, r12 ; duplicate -cospi_20_64*2
; dct_const_round_shift(input[1] * cospi_4_64)
vqrdmulh.s16 q7, q9, q1
mov r12, r5, lsl #1
vdup.16 q1, r12 ; duplicate cospi_12_64*2
; dct_const_round_shift(- input[3] * cospi_20_64)
vqrdmulh.s16 q5, q11, q0
mov r12, r7, lsl #1
vdup.16 q0, r12 ; duplicate cospi_16_64*2
; dct_const_round_shift(input[3] * cospi_12_64)
vqrdmulh.s16 q6, q11, q1
; stage 2 & stage 3 - even half
mov r12, r8, lsl #1
vdup.16 q1, r12 ; duplicate cospi_24_64*2
; dct_const_round_shift(input_dc * cospi_16_64)
vqrdmulh.s16 q9, q8, q0
mov r12, r9, lsl #1
vdup.16 q0, r12 ; duplicate cospi_8_64*2
; dct_const_round_shift(input[1] * cospi_24_64)
vqrdmulh.s16 q13, q10, q1
; dct_const_round_shift(input[1] * cospi_8_64)
vqrdmulh.s16 q15, q10, q0
; stage 3 -odd half
vdup.16 d16, r7 ; duplicate cospi_16_64
vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3]
vadd.s16 q1, q9, q13 ; output[1] = step[1] + step[2]
vsub.s16 q2, q9, q13 ; output[2] = step[1] - step[2]
vsub.s16 q3, q9, q15 ; output[3] = step[0] - step[3]
; stage 2 - odd half
vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5]
vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5]
vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7]
vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7]
; step2[6] * cospi_16_64
vmull.s16 q9, d28, d16
vmull.s16 q10, d29, d16
; step2[6] * cospi_16_64
vmull.s16 q11, d28, d16
vmull.s16 q12, d29, d16
; (step2[6] - step2[5]) * cospi_16_64
vmlsl.s16 q9, d26, d16
vmlsl.s16 q10, d27, d16
; (step2[5] + step2[6]) * cospi_16_64
vmlal.s16 q11, d26, d16
vmlal.s16 q12, d27, d16
; dct_const_round_shift(input_dc * cospi_16_64)
vqrshrn.s32 d10, q9, #14 ; >> 14
vqrshrn.s32 d11, q10, #14 ; >> 14
; dct_const_round_shift(input_dc * cospi_16_64)
vqrshrn.s32 d12, q11, #14 ; >> 14
vqrshrn.s32 d13, q12, #14 ; >> 14
; stage 4
vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7];
vadd.s16 q9, q1, q6 ; output[1] = step1[1] + step1[6];
vadd.s16 q10, q2, q5 ; output[2] = step1[2] + step1[5];
vadd.s16 q11, q3, q4 ; output[3] = step1[3] + step1[4];
vsub.s16 q12, q3, q4 ; output[4] = step1[3] - step1[4];
vsub.s16 q13, q2, q5 ; output[5] = step1[2] - step1[5];
vsub.s16 q14, q1, q6 ; output[6] = step1[1] - step1[6];
vsub.s16 q15, q0, q7 ; output[7] = step1[0] - step1[7];
; Transpose the matrix
TRANSPOSE8X8
; Then transform columns
IDCT8x8_1D
; ROUND_POWER_OF_TWO(temp_out[j], 5)
vrshr.s16 q8, q8, #5
vrshr.s16 q9, q9, #5
vrshr.s16 q10, q10, #5
vrshr.s16 q11, q11, #5
vrshr.s16 q12, q12, #5
vrshr.s16 q13, q13, #5
vrshr.s16 q14, q14, #5
vrshr.s16 q15, q15, #5
; save dest pointer
mov r0, r1
; load destination data
vld1.64 {d0}, [r1], r2
vld1.64 {d1}, [r1], r2
vld1.64 {d2}, [r1], r2
vld1.64 {d3}, [r1], r2
vld1.64 {d4}, [r1], r2
vld1.64 {d5}, [r1], r2
vld1.64 {d6}, [r1], r2
vld1.64 {d7}, [r1]
; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]
vaddw.u8 q8, q8, d0
vaddw.u8 q9, q9, d1
vaddw.u8 q10, q10, d2
vaddw.u8 q11, q11, d3
vaddw.u8 q12, q12, d4
vaddw.u8 q13, q13, d5
vaddw.u8 q14, q14, d6
vaddw.u8 q15, q15, d7
; clip_pixel
vqmovun.s16 d0, q8
vqmovun.s16 d1, q9
vqmovun.s16 d2, q10
vqmovun.s16 d3, q11
vqmovun.s16 d4, q12
vqmovun.s16 d5, q13
vqmovun.s16 d6, q14
vqmovun.s16 d7, q15
; store the data
vst1.64 {d0}, [r0], r2
vst1.64 {d1}, [r0], r2
vst1.64 {d2}, [r0], r2
vst1.64 {d3}, [r0], r2
vst1.64 {d4}, [r0], r2
vst1.64 {d5}, [r0], r2
vst1.64 {d6}, [r0], r2
vst1.64 {d7}, [r0], r2
vpop {d8-d15}
pop {r4-r9}
bx lr
ENDP ; |vp9_short_idct10_8x8_add_neon|
END


@@ -0,0 +1,237 @@
;
; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp9_short_iht4x4_add_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; Parallel 1D IDCT on all the columns of a 4x4 16-bit data matrix which are
; loaded in d16-d19. d0 must contain cospi_8_64. d1 must contain
; cospi_16_64. d2 must contain cospi_24_64. The output will be stored back
; into d16-d19 registers. This macro will touch q10-q15 registers and use
; them as scratch buffers during calculation.
MACRO
IDCT4x4_1D
; stage 1
vadd.s16 d23, d16, d18 ; (input[0] + input[2])
vsub.s16 d24, d16, d18 ; (input[0] - input[2])
vmull.s16 q15, d17, d2 ; input[1] * cospi_24_64
vmull.s16 q10, d17, d0 ; input[1] * cospi_8_64
vmull.s16 q13, d23, d1 ; (input[0] + input[2]) * cospi_16_64
vmull.s16 q14, d24, d1 ; (input[0] - input[2]) * cospi_16_64
vmlsl.s16 q15, d19, d0 ; input[1] * cospi_24_64 - input[3] * cospi_8_64
vmlal.s16 q10, d19, d2 ; input[1] * cospi_8_64 + input[3] * cospi_24_64
; dct_const_round_shift
vqrshrn.s32 d26, q13, #14
vqrshrn.s32 d27, q14, #14
vqrshrn.s32 d29, q15, #14
vqrshrn.s32 d28, q10, #14
; stage 2
; output[0] = step[0] + step[3];
; output[1] = step[1] + step[2];
; output[3] = step[0] - step[3];
; output[2] = step[1] - step[2];
vadd.s16 q8, q13, q14
vsub.s16 q9, q13, q14
vswp d18, d19
MEND
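For reference, a scalar C sketch of the 4-point IDCT this macro vectorizes, reconstructed from the comments above (the helper and constants are repeated so the sketch stands alone; idct4_1d_sketch is an illustrative name, not the libvpx reference):

#include <stdint.h>
#define ROUND_POWER_OF_TWO(v, n) (((v) + (1 << ((n) - 1))) >> (n))
static const int cospi_8_64 = 15137, cospi_16_64 = 11585, cospi_24_64 = 6270;

static void idct4_1d_sketch(const int16_t *input, int16_t *output) {
  int16_t step[4];
  /* stage 1: two rotations on the even and odd inputs */
  step[0] = ROUND_POWER_OF_TWO((input[0] + input[2]) * cospi_16_64, 14);
  step[1] = ROUND_POWER_OF_TWO((input[0] - input[2]) * cospi_16_64, 14);
  step[2] = ROUND_POWER_OF_TWO(input[1] * cospi_24_64 -
                               input[3] * cospi_8_64, 14);
  step[3] = ROUND_POWER_OF_TWO(input[1] * cospi_8_64 +
                               input[3] * cospi_24_64, 14);
  /* stage 2: butterflies, exactly as listed in the comments above */
  output[0] = step[0] + step[3];
  output[1] = step[1] + step[2];
  output[2] = step[1] - step[2];
  output[3] = step[0] - step[3];
}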
; Parallel 1D IADST on all the columns of a 4x4 16-bit data matrix which is
; loaded in d16-d19. d3 must contain sinpi_1_9. d4 must contain sinpi_2_9.
; d5 must contain sinpi_4_9. d6 must contain sinpi_3_9, and r0 must also
; hold sinpi_3_9 for the 32-bit multiply. The output will be stored back
; into d16-d19 registers. This macro will touch q11-q15 registers and use
; them as buffers during the calculation.
MACRO
IADST4x4_1D
vmull.s16 q10, d3, d16 ; s0 = sinpi_1_9 * x0
vmull.s16 q11, d4, d16 ; s1 = sinpi_2_9 * x0
vmull.s16 q12, d6, d17 ; s2 = sinpi_3_9 * x1
vmull.s16 q13, d5, d18 ; s3 = sinpi_4_9 * x2
vmull.s16 q14, d3, d18 ; s4 = sinpi_1_9 * x2
vmovl.s16 q15, d16 ; expand x0 from 16 bit to 32 bit
vaddw.s16 q15, q15, d19 ; x0 + x3
vmull.s16 q8, d4, d19 ; s5 = sinpi_2_9 * x3
vsubw.s16 q15, q15, d18 ; s7 = x0 + x3 - x2
vmull.s16 q9, d5, d19 ; s6 = sinpi_4_9 * x3
vadd.s32 q10, q10, q13 ; x0 = s0 + s3 + s5
vadd.s32 q10, q10, q8
vsub.s32 q11, q11, q14 ; x1 = s1 - s4 - s6
vdup.32 q8, r0 ; duplicate sinpi_3_9
vsub.s32 q11, q11, q9
vmul.s32 q15, q15, q8 ; x2 = sinpi_3_9 * s7
vadd.s32 q13, q10, q12 ; s0 = x0 + x3
vadd.s32 q10, q10, q11 ; x0 + x1
vadd.s32 q14, q11, q12 ; s1 = x1 + x3
vsub.s32 q10, q10, q12 ; s3 = x0 + x1 - x3
; dct_const_round_shift
vqrshrn.s32 d16, q13, #14
vqrshrn.s32 d17, q14, #14
vqrshrn.s32 d18, q15, #14
vqrshrn.s32 d19, q10, #14
MEND
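The corresponding scalar sketch of the 4-point IADST, following the s0..s7 names in the comments (a hedged reconstruction, not the verbatim libvpx source; iadst4_1d_sketch is an illustrative name):

#include <stdint.h>
#define ROUND_POWER_OF_TWO(v, n) (((v) + (1 << ((n) - 1))) >> (n))
static const int sinpi_1_9 = 5283, sinpi_2_9 = 9929,
                 sinpi_3_9 = 13377, sinpi_4_9 = 15212;

static void iadst4_1d_sketch(const int16_t *input, int16_t *output) {
  const int x0 = input[0], x1 = input[1], x2 = input[2], x3 = input[3];
  const int s0 = sinpi_1_9 * x0, s1 = sinpi_2_9 * x0;
  const int s2 = sinpi_3_9 * x1;
  const int s3 = sinpi_4_9 * x2, s4 = sinpi_1_9 * x2;
  const int s5 = sinpi_2_9 * x3, s6 = sinpi_4_9 * x3;
  const int s7 = x0 + x3 - x2;     /* kept at full precision, like q15 above */
  const int t0 = s0 + s3 + s5;     /* the comments' "x0" */
  const int t1 = s1 - s4 - s6;     /* "x1" */
  const int t2 = sinpi_3_9 * s7;   /* "x2" */
  const int t3 = s2;               /* "x3" */
  output[0] = (int16_t)ROUND_POWER_OF_TWO(t0 + t3, 14);
  output[1] = (int16_t)ROUND_POWER_OF_TWO(t1 + t3, 14);
  output[2] = (int16_t)ROUND_POWER_OF_TWO(t2, 14);
  output[3] = (int16_t)ROUND_POWER_OF_TWO(t0 + t1 - t3, 14);
}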
; Generate the cosine constants in d0 - d2 for the IDCT
MACRO
GENERATE_COSINE_CONSTANTS
; cospi_8_64 = 15137 = 0x3b21
mov r0, #0x3b00
add r0, #0x21
; cospi_16_64 = 11585 = 0x2d41
mov r3, #0x2d00
add r3, #0x41
; cospi_24_64 = 6270 = 0x187e
mov r12, #0x1800
add r12, #0x7e
; generate constant vectors
vdup.16 d0, r0 ; duplicate cospi_8_64
vdup.16 d1, r3 ; duplicate cospi_16_64
vdup.16 d2, r12 ; duplicate cospi_24_64
MEND
; Generate the sine constants in d3 - d7 for the IADST (sinpi_3_9 also
; remains in r0).
MACRO
GENERATE_SINE_CONSTANTS
; sinpi_1_9 = 5283 = 0x14A3
mov r0, #0x1400
add r0, #0xa3
; sinpi_2_9 = 9929 = 0x26C9
mov r3, #0x2600
add r3, #0xc9
; sinpi_4_9 = 15212 = 0x3B6C
mov r12, #0x3b00
add r12, #0x6c
; generate constant vectors
vdup.16 d3, r0 ; duplicate sinpi_1_9
; sinpi_3_9 = 13377 = 0x3441
mov r0, #0x3400
add r0, #0x41
vdup.16 d4, r3 ; duplicate sinpi_2_9
vdup.16 d5, r12 ; duplicate sinpi_4_9
vdup.16 q3, r0 ; duplicate sinpi_3_9
MEND
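Two things are worth noting in these generator macros. The mov/add pairs exist because an A32 data-processing immediate is only an 8-bit value with an even rotation, so a 16-bit constant such as 0x3b21 has to be materialized in two instructions. The values themselves are Q14 fixed-point trig coefficients; the sketch below reconstructs them (the formulas are inferred from the values, but they reproduce every constant used in these files, e.g. cospi_16_64 = 11585 and sinpi_3_9 = 13377):

#include <math.h>
#include <stdio.h>

int main(void) {
  const double pi = 3.14159265358979323846;
  int k;
  /* cospi_k_64 = round(cos(k * pi / 64) * 2^14) */
  for (k = 1; k < 32; ++k)
    printf("cospi_%d_64 = %d\n", k,
           (int)floor(cos(k * pi / 64) * 16384.0 + 0.5));
  /* sinpi_k_9 = round(sin(k * pi / 9) * 2 * sqrt(2) / 3 * 2^14) */
  for (k = 1; k <= 4; ++k)
    printf("sinpi_%d_9 = %d\n", k,
           (int)floor(sin(k * pi / 9) * 2.0 * sqrt(2.0) / 3.0 * 16384.0 + 0.5));
  return 0;
}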
; Transpose a 4x4 16-bit data matrix. Data is loaded in d16-d19.
MACRO
TRANSPOSE4X4
vtrn.16 d16, d17
vtrn.16 d18, d19
vtrn.32 q8, q9
MEND
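vtrn.16 exchanges the odd 16-bit lanes between its two operands and vtrn.32 does the same at 32-bit granularity, so the three instructions together swap every element m[r][c] with m[c][r]. The scalar equivalent, for reference:

#include <stdint.h>

static void transpose4x4_sketch(int16_t m[4][4]) {
  int r, c;
  for (r = 0; r < 4; ++r)
    for (c = r + 1; c < 4; ++c) {
      const int16_t t = m[r][c];
      m[r][c] = m[c][r];   /* mirror across the main diagonal */
      m[c][r] = t;
    }
}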
AREA Block, CODE, READONLY ; name this block of code
;void vp9_short_iht4x4_add_neon(int16_t *input, uint8_t *dest,
; int dest_stride, int tx_type)
;
; r0 int16_t *input
; r1 uint8_t *dest
; r2 int dest_stride
; r3 int tx_type
; This function handles only tx_type values 1, 2 and 3.
|vp9_short_iht4x4_add_neon| PROC
; load the inputs into d16-d19
vld1.s16 {q8,q9}, [r0]!
; transpose the input data
TRANSPOSE4X4
; decide the type of transform
cmp r3, #2
beq idct_iadst
cmp r3, #3
beq iadst_iadst
iadst_idct
; generate constants
GENERATE_COSINE_CONSTANTS
GENERATE_SINE_CONSTANTS
; first transform rows
IDCT4x4_1D
; transpose the matrix
TRANSPOSE4X4
; then transform columns
IADST4x4_1D
b end_vp9_short_iht4x4_add_neon
idct_iadst
; generate constants
GENERATE_COSINE_CONSTANTS
GENERATE_SINE_CONSTANTS
; first transform rows
IADST4x4_1D
; transpose the matrix
TRANSPOSE4X4
; then transform columns
IDCT4x4_1D
b end_vp9_short_iht4x4_add_neon
iadst_iadst
; generate constants
GENERATE_SINE_CONSTANTS
; first transform rows
IADST4x4_1D
; transpose the matrix
TRANSPOSE4X4
; then transform columns
IADST4x4_1D
end_vp9_short_iht4x4_add_neon
; ROUND_POWER_OF_TWO(temp_out[j], 4)
vrshr.s16 q8, q8, #4
vrshr.s16 q9, q9, #4
vld1.32 {d26[0]}, [r1], r2
vld1.32 {d26[1]}, [r1], r2
vld1.32 {d27[0]}, [r1], r2
vld1.32 {d27[1]}, [r1]
; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i]
vaddw.u8 q8, q8, d26
vaddw.u8 q9, q9, d27
; clip_pixel
vqmovun.s16 d26, q8
vqmovun.s16 d27, q9
; do the stores in reverse order with negative post-increment, by changing
; the sign of the stride
rsb r2, r2, #0
vst1.32 {d27[1]}, [r1], r2
vst1.32 {d27[0]}, [r1], r2
vst1.32 {d26[1]}, [r1], r2
vst1.32 {d26[0]}, [r1] ; no post-increment
bx lr
ENDP ; |vp9_short_iht4x4_add_neon|
END
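The epilogue shared by these routines (vrshr, vaddw.u8, vqmovun.s16) is the standard reconstruction step: round the inverse-transform residual, add it to the prediction already in dest, and saturate to 8 bits. A hedged C sketch, with shift = 4 for the 4x4 transforms and shift = 5 for the 8x8 ones (illustrative names, not the libvpx source):

#include <stdint.h>
#define ROUND_POWER_OF_TWO(v, n) (((v) + (1 << ((n) - 1))) >> (n))

static uint8_t clip_pixel(int val) {        /* vqmovun.s16 */
  return (uint8_t)(val < 0 ? 0 : (val > 255 ? 255 : val));
}

static void add_residual_sketch(const int16_t *residual, uint8_t *dest,
                                int dest_stride, int size, int shift) {
  int r, c;
  for (r = 0; r < size; ++r)
    for (c = 0; c < size; ++c)              /* vrshr then vaddw.u8 */
      dest[r * dest_stride + c] = clip_pixel(
          dest[r * dest_stride + c] +
          ROUND_POWER_OF_TWO(residual[r * size + c], shift));
}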

View File

@@ -0,0 +1,696 @@
;
; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp9_short_iht8x8_add_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; Generate IADST constants in r0 - r10 and r12 for the IADST.
MACRO
GENERATE_IADST_CONSTANTS
; generate cospi_2_64 = 16305
mov r0, #0x3f00
add r0, #0xb1
; generate cospi_30_64 = 1606
mov r1, #0x600
add r1, #0x46
; generate cospi_10_64 = 14449
mov r2, #0x3800
add r2, #0x71
; generate cospi_22_64 = 7723
mov r3, #0x1e00
add r3, #0x2b
; generate cospi_18_64 = 10394
mov r4, #0x2800
add r4, #0x9a
; generate cospi_14_64 = 12665
mov r5, #0x3100
add r5, #0x79
; generate cospi_26_64 = 4756
mov r6, #0x1200
add r6, #0x94
; generate cospi_6_64 = 15679
mov r7, #0x3d00
add r7, #0x3f
; generate cospi_8_64 = 15137
mov r8, #0x3b00
add r8, #0x21
; generate cospi_24_64 = 6270
mov r9, #0x1800
add r9, #0x7e
; generate 0
mov r10, #0
; generate cospi_16_64 = 11585
mov r12, #0x2d00
add r12, #0x41
MEND
; Generate IDCT constants in r3 - r9 for the IDCT.
MACRO
GENERATE_IDCT_CONSTANTS
; generate cospi_28_64 = 3196
mov r3, #0x0c00
add r3, #0x7c
; generate cospi_4_64 = 16069
mov r4, #0x3e00
add r4, #0xc5
; generate cospi_12_64 = 13623
mov r5, #0x3500
add r5, #0x37
; generate cospi_20_64 = 9102
mov r6, #0x2300
add r6, #0x8e
; generate cospi_16_64 = 11585
mov r7, #0x2d00
add r7, #0x41
; generate cospi_24_64 = 6270
mov r8, #0x1800
add r8, #0x7e
; generate cospi_8_64 = 15137
mov r9, #0x3b00
add r9, #0x21
MEND
; Transpose an 8x8 16-bit data matrix. Data is loaded in q8-q15.
MACRO
TRANSPOSE8X8
vswp d17, d24
vswp d23, d30
vswp d21, d28
vswp d19, d26
vtrn.32 q8, q10
vtrn.32 q9, q11
vtrn.32 q12, q14
vtrn.32 q13, q15
vtrn.16 q8, q9
vtrn.16 q10, q11
vtrn.16 q12, q13
vtrn.16 q14, q15
MEND
; Parallel 1D IDCT on all the columns of an 8x8 16-bit data matrix which is
; loaded in q8-q15. The IDCT constants are loaded in r3 - r9. The output
; will be stored back into the q8-q15 registers. This macro will touch the
; q0-q7 registers and use them as buffers during the calculation.
MACRO
IDCT8x8_1D
; stage 1
vdup.16 d0, r3 ; duplicate cospi_28_64
vdup.16 d1, r4 ; duplicate cospi_4_64
vdup.16 d2, r5 ; duplicate cospi_12_64
vdup.16 d3, r6 ; duplicate cospi_20_64
; input[1] * cospi_28_64
vmull.s16 q2, d18, d0
vmull.s16 q3, d19, d0
; input[5] * cospi_12_64
vmull.s16 q5, d26, d2
vmull.s16 q6, d27, d2
; input[1]*cospi_28_64-input[7]*cospi_4_64
vmlsl.s16 q2, d30, d1
vmlsl.s16 q3, d31, d1
; input[5] * cospi_12_64 - input[3] * cospi_20_64
vmlsl.s16 q5, d22, d3
vmlsl.s16 q6, d23, d3
; dct_const_round_shift(input_dc * cospi_16_64)
vqrshrn.s32 d8, q2, #14 ; >> 14
vqrshrn.s32 d9, q3, #14 ; >> 14
; dct_const_round_shift(input_dc * cospi_16_64)
vqrshrn.s32 d10, q5, #14 ; >> 14
vqrshrn.s32 d11, q6, #14 ; >> 14
; input[1] * cospi_4_64
vmull.s16 q2, d18, d1
vmull.s16 q3, d19, d1
; input[5] * cospi_20_64
vmull.s16 q9, d26, d3
vmull.s16 q13, d27, d3
; input[1]*cospi_4_64+input[7]*cospi_28_64
vmlal.s16 q2, d30, d0
vmlal.s16 q3, d31, d0
; input[5] * cospi_20_64 + input[3] * cospi_12_64
vmlal.s16 q9, d22, d2
vmlal.s16 q13, d23, d2
; dct_const_round_shift(input_dc * cospi_16_64)
vqrshrn.s32 d14, q2, #14 ; >> 14
vqrshrn.s32 d15, q3, #14 ; >> 14
; stage 2 & stage 3 - even half
vdup.16 d0, r7 ; duplicate cospi_16_64
; dct_const_round_shift(input_dc * cospi_16_64)
vqrshrn.s32 d12, q9, #14 ; >> 14
vqrshrn.s32 d13, q13, #14 ; >> 14
; input[0] * cospi_16_64
vmull.s16 q2, d16, d0
vmull.s16 q3, d17, d0
; input[0] * cospi_16_64
vmull.s16 q13, d16, d0
vmull.s16 q15, d17, d0
; (input[0] + input[2]) * cospi_16_64
vmlal.s16 q2, d24, d0
vmlal.s16 q3, d25, d0
; (input[0] - input[2]) * cospi_16_64
vmlsl.s16 q13, d24, d0
vmlsl.s16 q15, d25, d0
vdup.16 d0, r8 ; duplicate cospi_24_64
vdup.16 d1, r9 ; duplicate cospi_8_64
; dct_const_round_shift(input_dc * cospi_16_64)
vqrshrn.s32 d18, q2, #14 ; >> 14
vqrshrn.s32 d19, q3, #14 ; >> 14
; dct_const_round_shift(input_dc * cospi_16_64)
vqrshrn.s32 d22, q13, #14 ; >> 14
vqrshrn.s32 d23, q15, #14 ; >> 14
; input[1] * cospi_24_64
vmull.s16 q2, d20, d0
vmull.s16 q3, d21, d0
; input[1] * cospi_8_64
vmull.s16 q8, d20, d1
vmull.s16 q12, d21, d1
; input[1] * cospi_24_64 - input[3] * cospi_8_64
vmlsl.s16 q2, d28, d1
vmlsl.s16 q3, d29, d1
; input[1] * cospi_8_64 + input[3] * cospi_24_64
vmlal.s16 q8, d28, d0
vmlal.s16 q12, d29, d0
; dct_const_round_shift(input_dc * cospi_16_64)
vqrshrn.s32 d26, q2, #14 ; >> 14
vqrshrn.s32 d27, q3, #14 ; >> 14
; dct_const_round_shift(input_dc * cospi_16_64)
vqrshrn.s32 d30, q8, #14 ; >> 14
vqrshrn.s32 d31, q12, #14 ; >> 14
vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3]
vadd.s16 q1, q11, q13 ; output[1] = step[1] + step[2]
vsub.s16 q2, q11, q13 ; output[2] = step[1] - step[2]
vsub.s16 q3, q9, q15 ; output[3] = step[0] - step[3]
; stage 3 - odd half
vdup.16 d16, r7 ; duplicate cospi_16_64
; stage 2 - odd half
vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5]
vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5]
vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7]
vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7]
; step2[6] * cospi_16_64
vmull.s16 q9, d28, d16
vmull.s16 q10, d29, d16
; step2[6] * cospi_16_64
vmull.s16 q11, d28, d16
vmull.s16 q12, d29, d16
; (step2[6] - step2[5]) * cospi_16_64
vmlsl.s16 q9, d26, d16
vmlsl.s16 q10, d27, d16
; (step2[5] + step2[6]) * cospi_16_64
vmlal.s16 q11, d26, d16
vmlal.s16 q12, d27, d16
; dct_const_round_shift(input_dc * cospi_16_64)
vqrshrn.s32 d10, q9, #14 ; >> 14
vqrshrn.s32 d11, q10, #14 ; >> 14
; dct_const_round_shift(input_dc * cospi_16_64)
vqrshrn.s32 d12, q11, #14 ; >> 14
vqrshrn.s32 d13, q12, #14 ; >> 14
; stage 4
vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7];
vadd.s16 q9, q1, q6 ; output[1] = step1[1] + step1[6];
vadd.s16 q10, q2, q5 ; output[2] = step1[2] + step1[5];
vadd.s16 q11, q3, q4 ; output[3] = step1[3] + step1[4];
vsub.s16 q12, q3, q4 ; output[4] = step1[3] - step1[4];
vsub.s16 q13, q2, q5 ; output[5] = step1[2] - step1[5];
vsub.s16 q14, q1, q6 ; output[6] = step1[1] - step1[6];
vsub.s16 q15, q0, q7 ; output[7] = step1[0] - step1[7];
MEND
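A scalar sketch of the 8-point IDCT that this macro interleaves, with the stages as named in the comments (a reconstruction for readability, not the verbatim libvpx reference; note that the macro's even-half comments renumber the even rows, so its "input[2]" corresponds to row 4 here):

#include <stdint.h>
#define ROUND_POWER_OF_TWO(v, n) (((v) + (1 << ((n) - 1))) >> (n))
#define DCT_RS(x) ((int16_t)ROUND_POWER_OF_TWO(x, 14))
static const int cospi_4_64 = 16069, cospi_8_64 = 15137, cospi_12_64 = 13623,
                 cospi_16_64 = 11585, cospi_20_64 = 9102, cospi_24_64 = 6270,
                 cospi_28_64 = 3196;

static void idct8_1d_sketch(const int16_t *in, int16_t *out) {
  int16_t step1[8], step2[8];
  int i;
  /* stage 1: odd half, from input rows 1, 3, 5, 7 */
  step1[4] = DCT_RS(in[1] * cospi_28_64 - in[7] * cospi_4_64);
  step1[7] = DCT_RS(in[1] * cospi_4_64 + in[7] * cospi_28_64);
  step1[5] = DCT_RS(in[5] * cospi_12_64 - in[3] * cospi_20_64);
  step1[6] = DCT_RS(in[5] * cospi_20_64 + in[3] * cospi_12_64);
  /* stage 2: even half, from input rows 0, 2, 4, 6 */
  step2[0] = DCT_RS((in[0] + in[4]) * cospi_16_64);
  step2[1] = DCT_RS((in[0] - in[4]) * cospi_16_64);
  step2[2] = DCT_RS(in[2] * cospi_24_64 - in[6] * cospi_8_64);
  step2[3] = DCT_RS(in[2] * cospi_8_64 + in[6] * cospi_24_64);
  /* stage 2: odd half butterflies */
  step2[4] = step1[4] + step1[5];
  step2[5] = step1[4] - step1[5];
  step2[6] = -step1[6] + step1[7];
  step2[7] = step1[6] + step1[7];
  /* stage 3: even half butterflies */
  step1[0] = step2[0] + step2[3];
  step1[1] = step2[1] + step2[2];
  step1[2] = step2[1] - step2[2];
  step1[3] = step2[0] - step2[3];
  /* stage 3: odd half, rotation by cospi_16_64 */
  step1[4] = step2[4];
  step1[5] = DCT_RS((step2[6] - step2[5]) * cospi_16_64);
  step1[6] = DCT_RS((step2[5] + step2[6]) * cospi_16_64);
  step1[7] = step2[7];
  /* stage 4: final butterflies */
  for (i = 0; i < 4; ++i) {
    out[i] = step1[i] + step1[7 - i];
    out[7 - i] = step1[i] - step1[7 - i];
  }
}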
; Parallel 1D IADST on all the columns of an 8x8 16-bit data matrix which is
; loaded in q8-q15. The IADST constants are loaded in the r0 - r12
; registers. The output will be stored back into the q8-q15 registers. This
; macro will touch the q0-q7 registers and use them as buffers during the
; calculation.
MACRO
IADST8X8_1D
vdup.16 d14, r0 ; duplicate cospi_2_64
vdup.16 d15, r1 ; duplicate cospi_30_64
; cospi_2_64 * x0
vmull.s16 q1, d30, d14
vmull.s16 q2, d31, d14
; cospi_30_64 * x0
vmull.s16 q3, d30, d15
vmull.s16 q4, d31, d15
vdup.16 d30, r4 ; duplicate cospi_18_64
vdup.16 d31, r5 ; duplicate cospi_14_64
; s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
vmlal.s16 q1, d16, d15
vmlal.s16 q2, d17, d15
; s1 = cospi_30_64 * x0 - cospi_2_64 * x1
vmlsl.s16 q3, d16, d14
vmlsl.s16 q4, d17, d14
; cospi_18_64 * x4
vmull.s16 q5, d22, d30
vmull.s16 q6, d23, d30
; cospi_14_64 * x4
vmull.s16 q7, d22, d31
vmull.s16 q8, d23, d31
; s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
vmlal.s16 q5, d24, d31
vmlal.s16 q6, d25, d31
; s5 = cospi_14_64 * x4 - cospi_18_64 * x5
vmlsl.s16 q7, d24, d30
vmlsl.s16 q8, d25, d30
; (s0 + s4)
vadd.s32 q11, q1, q5
vadd.s32 q12, q2, q6
vdup.16 d0, r2 ; duplicate cospi_10_64
vdup.16 d1, r3 ; duplicate cospi_22_64
; (s0 - s4)
vsub.s32 q1, q1, q5
vsub.s32 q2, q2, q6
; x0 = dct_const_round_shift(s0 + s4);
vqrshrn.s32 d22, q11, #14 ; >> 14
vqrshrn.s32 d23, q12, #14 ; >> 14
; (s1 + s5)
vadd.s32 q12, q3, q7
vadd.s32 q15, q4, q8
; (s1 - s5)
vsub.s32 q3, q3, q7
vsub.s32 q4, q4, q8
; x4 = dct_const_round_shift(s0 - s4);
vqrshrn.s32 d2, q1, #14 ; >> 14
vqrshrn.s32 d3, q2, #14 ; >> 14
; x1 = dct_const_round_shift(s1 + s5);
vqrshrn.s32 d24, q12, #14 ; >> 14
vqrshrn.s32 d25, q15, #14 ; >> 14
; x5 = dct_const_round_shift(s1 - s5);
vqrshrn.s32 d6, q3, #14 ; >> 14
vqrshrn.s32 d7, q4, #14 ; >> 14
; cospi_10_64 * x2
vmull.s16 q4, d26, d0
vmull.s16 q5, d27, d0
; cospi_22_64 * x2
vmull.s16 q2, d26, d1
vmull.s16 q6, d27, d1
vdup.16 d30, r6 ; duplicate cospi_26_64
vdup.16 d31, r7 ; duplicate cospi_6_64
; s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
vmlal.s16 q4, d20, d1
vmlal.s16 q5, d21, d1
; s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
vmlsl.s16 q2, d20, d0
vmlsl.s16 q6, d21, d0
; cospi_26_64 * x6
vmull.s16 q0, d18, d30
vmull.s16 q13, d19, d30
; s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
vmlal.s16 q0, d28, d31
vmlal.s16 q13, d29, d31
; cospi_6_64 * x6
vmull.s16 q10, d18, d31
vmull.s16 q9, d19, d31
; s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
vmlsl.s16 q10, d28, d30
vmlsl.s16 q9, d29, d30
; (s3 + s7)
vadd.s32 q14, q2, q10
vadd.s32 q15, q6, q9
; (s3 - s7)
vsub.s32 q2, q2, q10
vsub.s32 q6, q6, q9
; x3 = dct_const_round_shift(s3 + s7);
vqrshrn.s32 d28, q14, #14 ; >> 14
vqrshrn.s32 d29, q15, #14 ; >> 14
; x7 = dct_const_round_shift(s3 - s7);
vqrshrn.s32 d4, q2, #14 ; >> 14
vqrshrn.s32 d5, q6, #14 ; >> 14
; (s2 + s6)
vadd.s32 q9, q4, q0
vadd.s32 q10, q5, q13
; (s2 - s6)
vsub.s32 q4, q4, q0
vsub.s32 q5, q5, q13
vdup.16 d30, r8 ; duplicate cospi_8_64
vdup.16 d31, r9 ; duplicate cospi_24_64
; x2 = dct_const_round_shift(s2 + s6);
vqrshrn.s32 d18, q9, #14 ; >> 14
vqrshrn.s32 d19, q10, #14 ; >> 14
; x6 = dct_const_round_shift(s2 - s6);
vqrshrn.s32 d8, q4, #14 ; >> 14
vqrshrn.s32 d9, q5, #14 ; >> 14
; cospi_8_64 * x4
vmull.s16 q5, d2, d30
vmull.s16 q6, d3, d30
; cospi_24_64 * x4
vmull.s16 q7, d2, d31
vmull.s16 q0, d3, d31
; s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
vmlal.s16 q5, d6, d31
vmlal.s16 q6, d7, d31
; s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
vmlsl.s16 q7, d6, d30
vmlsl.s16 q0, d7, d30
; cospi_8_64 * x7
vmull.s16 q1, d4, d30
vmull.s16 q3, d5, d30
; cospi_24_64 * x7
vmull.s16 q10, d4, d31
vmull.s16 q2, d5, d31
; s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
vmlsl.s16 q1, d8, d31
vmlsl.s16 q3, d9, d31
; s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
vmlal.s16 q10, d8, d30
vmlal.s16 q2, d9, d30
vadd.s16 q8, q11, q9 ; x0 = s0 + s2;
vsub.s16 q11, q11, q9 ; x2 = s0 - s2;
vadd.s16 q4, q12, q14 ; x1 = s1 + s3;
vsub.s16 q12, q12, q14 ; x3 = s1 - s3;
; (s4 + s6)
vadd.s32 q14, q5, q1
vadd.s32 q15, q6, q3
; (s4 - s6)
vsub.s32 q5, q5, q1
vsub.s32 q6, q6, q3
; x4 = dct_const_round_shift(s4 + s6);
vqrshrn.s32 d18, q14, #14 ; >> 14
vqrshrn.s32 d19, q15, #14 ; >> 14
; x6 = dct_const_round_shift(s4 - s6);
vqrshrn.s32 d10, q5, #14 ; >> 14
vqrshrn.s32 d11, q6, #14 ; >> 14
; (s5 + s7)
vadd.s32 q1, q7, q10
vadd.s32 q3, q0, q2
; (s5 - s7)
vsub.s32 q7, q7, q10
vsub.s32 q0, q0, q2
; x5 = dct_const_round_shift(s5 + s7);
vqrshrn.s32 d28, q1, #14 ; >> 14
vqrshrn.s32 d29, q3, #14 ; >> 14
; x7 = dct_const_round_shift(s5 - s7);
vqrshrn.s32 d14, q7, #14 ; >> 14
vqrshrn.s32 d15, q0, #14 ; >> 14
vdup.16 d30, r12 ; duplicate cospi_16_64
; cospi_16_64 * x2
vmull.s16 q2, d22, d30
vmull.s16 q3, d23, d30
; cospi_16_64 * x2
vmull.s16 q13, d22, d30
vmull.s16 q1, d23, d30
; cospi_16_64 * x2 + cospi_16_64 * x3;
vmlal.s16 q2, d24, d30
vmlal.s16 q3, d25, d30
; cospi_16_64 * x2 - cospi_16_64 * x3;
vmlsl.s16 q13, d24, d30
vmlsl.s16 q1, d25, d30
; x2 = dct_const_round_shift(s2);
vqrshrn.s32 d4, q2, #14 ; >> 14
vqrshrn.s32 d5, q3, #14 ; >> 14
; x3 = dct_const_round_shift(s3);
vqrshrn.s32 d24, q13, #14 ; >> 14
vqrshrn.s32 d25, q1, #14 ; >> 14
; cospi_16_64 * x6
vmull.s16 q13, d10, d30
vmull.s16 q1, d11, d30
; cospi_16_64 * x6
vmull.s16 q11, d10, d30
vmull.s16 q0, d11, d30
; cospi_16_64 * x6 + cospi_16_64 * x7;
vmlal.s16 q13, d14, d30
vmlal.s16 q1, d15, d30
; cospi_16_64 * x6 - cospi_16_64 * x7;
vmlsl.s16 q11, d14, d30
vmlsl.s16 q0, d15, d30
; x6 = dct_const_round_shift(s6);
vqrshrn.s32 d20, q13, #14 ; >> 14
vqrshrn.s32 d21, q1, #14 ; >> 14
; x7 = dct_const_round_shift(s7);
vqrshrn.s32 d12, q11, #14 ; >> 14
vqrshrn.s32 d13, q0, #14 ; >> 14
vdup.16 q5, r10 ; duplicate 0
vsub.s16 q9, q5, q9 ; output[1] = -x4;
vsub.s16 q11, q5, q2 ; output[3] = -x2;
vsub.s16 q13, q5, q6 ; output[5] = -x7;
vsub.s16 q15, q5, q4 ; output[7] = -x1;
MEND
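Unlike the IDCT, the 8-point IADST ends with a sign-and-permutation stage rather than butterflies; the vsub-from-zero instructions above implement the negations. In scalar form (x names as in the macro comments; a sketch of just this final stage):

#include <stdint.h>

static void iadst8_output_stage_sketch(const int16_t x[8], int16_t out[8]) {
  out[0] = x[0];
  out[1] = (int16_t)-x[4];
  out[2] = x[6];
  out[3] = (int16_t)-x[2];
  out[4] = x[3];
  out[5] = (int16_t)-x[7];
  out[6] = x[5];
  out[7] = (int16_t)-x[1];
}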
AREA Block, CODE, READONLY ; name this block of code
;void vp9_short_iht8x8_add_neon(int16_t *input, uint8_t *dest,
; int dest_stride, int tx_type)
;
; r0 int16_t *input
; r1 uint8_t *dest
; r2 int dest_stride
; r3 int tx_type
; This function handles only tx_type values 1, 2 and 3.
|vp9_short_iht8x8_add_neon| PROC
; load the inputs into q8-q15
vld1.s16 {q8,q9}, [r0]!
vld1.s16 {q10,q11}, [r0]!
vld1.s16 {q12,q13}, [r0]!
vld1.s16 {q14,q15}, [r0]!
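; save r0-r10: the constant generator macros below clobber r0-r10, and the
; dest/stride arguments in r1/r2 are still needed by the epilogue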
push {r0-r10}
; transpose the input data
TRANSPOSE8X8
; decide the type of transform
cmp r3, #2
beq idct_iadst
cmp r3, #3
beq iadst_iadst
iadst_idct
; generate IDCT constants
GENERATE_IDCT_CONSTANTS
; first transform rows
IDCT8x8_1D
; transpose the matrix
TRANSPOSE8X8
; generate IADST constants
GENERATE_IADST_CONSTANTS
; then transform columns
IADST8X8_1D
b end_vp9_short_iht8x8_add_neon
idct_iadst
; generate IADST constants
GENERATE_IADST_CONSTANTS
; first transform rows
IADST8X8_1D
; transpose the matrix
TRANSPOSE8X8
; generate IDCT constants
GENERATE_IDCT_CONSTANTS
; then transform columns
IDCT8x8_1D
b end_vp9_short_iht8x8_add_neon
iadst_iadst
; generate IADST constants
GENERATE_IADST_CONSTANTS
; first transform rows
IADST8X8_1D
; transpose the matrix
TRANSPOSE8X8
; then transform columns
IADST8X8_1D
end_vp9_short_iht8x8_add_neon
pop {r0-r10}
; ROUND_POWER_OF_TWO(temp_out[j], 5)
vrshr.s16 q8, q8, #5
vrshr.s16 q9, q9, #5
vrshr.s16 q10, q10, #5
vrshr.s16 q11, q11, #5
vrshr.s16 q12, q12, #5
vrshr.s16 q13, q13, #5
vrshr.s16 q14, q14, #5
vrshr.s16 q15, q15, #5
; save dest pointer
mov r0, r1
; load destination data
vld1.64 {d0}, [r1], r2
vld1.64 {d1}, [r1], r2
vld1.64 {d2}, [r1], r2
vld1.64 {d3}, [r1], r2
vld1.64 {d4}, [r1], r2
vld1.64 {d5}, [r1], r2
vld1.64 {d6}, [r1], r2
vld1.64 {d7}, [r1]
; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]
vaddw.u8 q8, q8, d0
vaddw.u8 q9, q9, d1
vaddw.u8 q10, q10, d2
vaddw.u8 q11, q11, d3
vaddw.u8 q12, q12, d4
vaddw.u8 q13, q13, d5
vaddw.u8 q14, q14, d6
vaddw.u8 q15, q15, d7
; clip_pixel
vqmovun.s16 d0, q8
vqmovun.s16 d1, q9
vqmovun.s16 d2, q10
vqmovun.s16 d3, q11
vqmovun.s16 d4, q12
vqmovun.s16 d5, q13
vqmovun.s16 d6, q14
vqmovun.s16 d7, q15
; store the data
vst1.64 {d0}, [r0], r2
vst1.64 {d1}, [r0], r2
vst1.64 {d2}, [r0], r2
vst1.64 {d3}, [r0], r2
vst1.64 {d4}, [r0], r2
vst1.64 {d5}, [r0], r2
vst1.64 {d6}, [r0], r2
vst1.64 {d7}, [r0], r2
bx lr
ENDP ; |vp9_short_iht8x8_add_neon|
END

View File

@@ -13,6 +13,7 @@
#include "vp9_rtcd.h"
#include "vp9/common/vp9_onyxc_int.h"
void vp9_machine_specific_config(VP9_COMMON *ctx) {
void vp9_machine_specific_config(VP9_COMMON *cm) {
(void)cm;
vp9_rtcd();
}

View File

@@ -31,40 +31,30 @@ void vp9_update_mode_info_border(VP9_COMMON *cm, MODE_INFO *mi) {
vpx_memset(&mi[i * stride], 0, sizeof(MODE_INFO));
}
void vp9_update_mode_info_in_image(VP9_COMMON *cm, MODE_INFO *mi) {
int i, j;
// For each in image mode_info element set the in image flag to 1
for (i = 0; i < cm->mi_rows; i++) {
MODE_INFO *ptr = mi;
for (j = 0; j < cm->mi_cols; j++) {
ptr->mbmi.mb_in_image = 1;
ptr++; // Next element in the row
}
// Step over border element at start of next row
mi += cm->mode_info_stride;
}
}
void vp9_free_frame_buffers(VP9_COMMON *oci) {
void vp9_free_frame_buffers(VP9_COMMON *cm) {
int i;
for (i = 0; i < NUM_YV12_BUFFERS; i++)
vp9_free_frame_buffer(&oci->yv12_fb[i]);
vp9_free_frame_buffer(&cm->yv12_fb[i]);
vp9_free_frame_buffer(&oci->post_proc_buffer);
vp9_free_frame_buffer(&cm->post_proc_buffer);
vpx_free(oci->mip);
vpx_free(oci->prev_mip);
vpx_free(oci->above_seg_context);
vpx_free(cm->mip);
vpx_free(cm->prev_mip);
vpx_free(cm->above_seg_context);
vpx_free(cm->last_frame_seg_map);
vpx_free(cm->mi_grid_base);
vpx_free(cm->prev_mi_grid_base);
vpx_free(oci->above_context[0]);
vpx_free(cm->above_context[0]);
for (i = 0; i < MAX_MB_PLANE; i++)
oci->above_context[i] = 0;
oci->mip = NULL;
oci->prev_mip = NULL;
oci->above_seg_context = NULL;
cm->above_context[i] = 0;
cm->mip = NULL;
cm->prev_mip = NULL;
cm->above_seg_context = NULL;
cm->last_frame_seg_map = NULL;
cm->mi_grid_base = NULL;
cm->prev_mi_grid_base = NULL;
}
static void set_mb_mi(VP9_COMMON *cm, int aligned_width, int aligned_height) {
@@ -72,112 +62,125 @@ static void set_mb_mi(VP9_COMMON *cm, int aligned_width, int aligned_height) {
cm->mb_rows = (aligned_height + 8) >> 4;
cm->MBs = cm->mb_rows * cm->mb_cols;
cm->mi_cols = aligned_width >> LOG2_MI_SIZE;
cm->mi_rows = aligned_height >> LOG2_MI_SIZE;
cm->mi_cols = aligned_width >> MI_SIZE_LOG2;
cm->mi_rows = aligned_height >> MI_SIZE_LOG2;
cm->mode_info_stride = cm->mi_cols + MI_BLOCK_SIZE;
}
static void setup_mi(VP9_COMMON *cm) {
cm->mi = cm->mip + cm->mode_info_stride + 1;
cm->prev_mi = cm->prev_mip + cm->mode_info_stride + 1;
cm->mi_grid_visible = cm->mi_grid_base + cm->mode_info_stride + 1;
cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mode_info_stride + 1;
vpx_memset(cm->mip, 0,
cm->mode_info_stride * (cm->mi_rows + 1) * sizeof(MODE_INFO));
vp9_update_mode_info_border(cm, cm->mip);
vp9_update_mode_info_in_image(cm, cm->mi);
vpx_memset(cm->mi_grid_base, 0,
cm->mode_info_stride * (cm->mi_rows + 1) *
sizeof(*cm->mi_grid_base));
vp9_update_mode_info_border(cm, cm->mip);
vp9_update_mode_info_border(cm, cm->prev_mip);
vp9_update_mode_info_in_image(cm, cm->prev_mi);
}
int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height) {
int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) {
int i, mi_cols;
const int aligned_width = ALIGN_POWER_OF_TWO(width, LOG2_MI_SIZE);
const int aligned_height = ALIGN_POWER_OF_TWO(height, LOG2_MI_SIZE);
const int ss_x = oci->subsampling_x;
const int ss_y = oci->subsampling_y;
const int aligned_width = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2);
const int aligned_height = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2);
const int ss_x = cm->subsampling_x;
const int ss_y = cm->subsampling_y;
int mi_size;
vp9_free_frame_buffers(oci);
vp9_free_frame_buffers(cm);
for (i = 0; i < NUM_YV12_BUFFERS; i++) {
oci->fb_idx_ref_cnt[i] = 0;
if (vp9_alloc_frame_buffer(&oci->yv12_fb[i], width, height, ss_x, ss_y,
cm->fb_idx_ref_cnt[i] = 0;
if (vp9_alloc_frame_buffer(&cm->yv12_fb[i], width, height, ss_x, ss_y,
VP9BORDERINPIXELS) < 0)
goto fail;
}
oci->new_fb_idx = NUM_YV12_BUFFERS - 1;
oci->fb_idx_ref_cnt[oci->new_fb_idx] = 1;
cm->new_fb_idx = NUM_YV12_BUFFERS - 1;
cm->fb_idx_ref_cnt[cm->new_fb_idx] = 1;
for (i = 0; i < ALLOWED_REFS_PER_FRAME; i++)
oci->active_ref_idx[i] = i;
cm->active_ref_idx[i] = i;
for (i = 0; i < NUM_REF_FRAMES; i++) {
oci->ref_frame_map[i] = i;
oci->fb_idx_ref_cnt[i] = 1;
cm->ref_frame_map[i] = i;
cm->fb_idx_ref_cnt[i] = 1;
}
if (vp9_alloc_frame_buffer(&oci->post_proc_buffer, width, height, ss_x, ss_y,
if (vp9_alloc_frame_buffer(&cm->post_proc_buffer, width, height, ss_x, ss_y,
VP9BORDERINPIXELS) < 0)
goto fail;
set_mb_mi(oci, aligned_width, aligned_height);
set_mb_mi(cm, aligned_width, aligned_height);
// Allocation
mi_size = oci->mode_info_stride * (oci->mi_rows + MI_BLOCK_SIZE);
mi_size = cm->mode_info_stride * (cm->mi_rows + MI_BLOCK_SIZE);
oci->mip = vpx_calloc(mi_size, sizeof(MODE_INFO));
if (!oci->mip)
cm->mip = vpx_calloc(mi_size, sizeof(MODE_INFO));
if (!cm->mip)
goto fail;
oci->prev_mip = vpx_calloc(mi_size, sizeof(MODE_INFO));
if (!oci->prev_mip)
cm->prev_mip = vpx_calloc(mi_size, sizeof(MODE_INFO));
if (!cm->prev_mip)
goto fail;
setup_mi(oci);
cm->mi_grid_base = vpx_calloc(mi_size, sizeof(*cm->mi_grid_base));
if (!cm->mi_grid_base)
goto fail;
cm->prev_mi_grid_base = vpx_calloc(mi_size, sizeof(*cm->prev_mi_grid_base));
if (!cm->prev_mi_grid_base)
goto fail;
setup_mi(cm);
// FIXME(jkoleszar): allocate subsampled arrays for U/V once subsampling
// information is exposed at this level
mi_cols = mi_cols_aligned_to_sb(oci->mi_cols);
mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
// 2 contexts per 'mi unit', so that we have one context per 4x4 txfm
// block where mi unit size is 8x8.
#if CONFIG_ALPHA
oci->above_context[0] = vpx_calloc(sizeof(ENTROPY_CONTEXT) * 8 * mi_cols, 1);
#else
oci->above_context[0] = vpx_calloc(sizeof(ENTROPY_CONTEXT) * 6 * mi_cols, 1);
#endif
if (!oci->above_context[0])
cm->above_context[0] = vpx_calloc(sizeof(ENTROPY_CONTEXT) * MAX_MB_PLANE *
(2 * mi_cols), 1);
if (!cm->above_context[0])
goto fail;
oci->above_seg_context = vpx_calloc(sizeof(PARTITION_CONTEXT) * mi_cols, 1);
if (!oci->above_seg_context)
cm->above_seg_context = vpx_calloc(sizeof(PARTITION_CONTEXT) * mi_cols, 1);
if (!cm->above_seg_context)
goto fail;
// Create the segmentation map structure and set to 0.
cm->last_frame_seg_map = vpx_calloc(cm->mi_rows * cm->mi_cols, 1);
if (!cm->last_frame_seg_map)
goto fail;
return 0;
fail:
vp9_free_frame_buffers(oci);
vp9_free_frame_buffers(cm);
return 1;
}
void vp9_create_common(VP9_COMMON *oci) {
vp9_machine_specific_config(oci);
void vp9_create_common(VP9_COMMON *cm) {
vp9_machine_specific_config(cm);
vp9_init_mbmode_probs(oci);
vp9_init_mbmode_probs(cm);
oci->tx_mode = ONLY_4X4;
oci->comp_pred_mode = HYBRID_PREDICTION;
cm->tx_mode = ONLY_4X4;
cm->comp_pred_mode = HYBRID_PREDICTION;
// Initialize reference frame sign bias structure to defaults
vpx_memset(oci->ref_frame_sign_bias, 0, sizeof(oci->ref_frame_sign_bias));
vpx_memset(cm->ref_frame_sign_bias, 0, sizeof(cm->ref_frame_sign_bias));
}
void vp9_remove_common(VP9_COMMON *oci) {
vp9_free_frame_buffers(oci);
void vp9_remove_common(VP9_COMMON *cm) {
vp9_free_frame_buffers(cm);
}
void vp9_initialize_common() {
@@ -188,8 +191,8 @@ void vp9_initialize_common() {
void vp9_update_frame_size(VP9_COMMON *cm) {
int i, mi_cols;
const int aligned_width = ALIGN_POWER_OF_TWO(cm->width, LOG2_MI_SIZE);
const int aligned_height = ALIGN_POWER_OF_TWO(cm->height, LOG2_MI_SIZE);
const int aligned_width = ALIGN_POWER_OF_TWO(cm->width, MI_SIZE_LOG2);
const int aligned_height = ALIGN_POWER_OF_TWO(cm->height, MI_SIZE_LOG2);
set_mb_mi(cm, aligned_width, aligned_height);
setup_mi(cm);
@@ -198,4 +201,8 @@ void vp9_update_frame_size(VP9_COMMON *cm) {
for (i = 1; i < MAX_MB_PLANE; i++)
cm->above_context[i] =
cm->above_context[0] + i * sizeof(ENTROPY_CONTEXT) * 2 * mi_cols;
// Initialize the previous frame segment map to 0.
if (cm->last_frame_seg_map)
vpx_memset(cm->last_frame_seg_map, 0, cm->mi_rows * cm->mi_cols);
}

View File

@@ -16,14 +16,13 @@
void vp9_initialize_common();
void vp9_update_mode_info_border(VP9_COMMON *cpi, MODE_INFO *mi);
void vp9_update_mode_info_in_image(VP9_COMMON *cpi, MODE_INFO *mi);
void vp9_update_mode_info_border(VP9_COMMON *cm, MODE_INFO *mi);
void vp9_create_common(VP9_COMMON *oci);
void vp9_remove_common(VP9_COMMON *oci);
void vp9_create_common(VP9_COMMON *cm);
void vp9_remove_common(VP9_COMMON *cm);
int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height);
void vp9_free_frame_buffers(VP9_COMMON *oci);
int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height);
void vp9_free_frame_buffers(VP9_COMMON *cm);
void vp9_update_frame_size(VP9_COMMON *cm);

View File

@@ -19,9 +19,9 @@
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_common_data.h"
#include "vp9/common/vp9_convolve.h"
#include "vp9/common/vp9_enums.h"
#include "vp9/common/vp9_mv.h"
#include "vp9/common/vp9_scale.h"
#include "vp9/common/vp9_seg_common.h"
#include "vp9/common/vp9_treecoder.h"
@@ -71,7 +71,7 @@ typedef enum {
D135_PRED, // Directional 135 deg = 180 - 45
D117_PRED, // Directional 117 deg = 180 - 63
D153_PRED, // Directional 153 deg = 180 - 27
D27_PRED, // Directional 27 deg = round(arctan(1/2) * 180/pi)
D207_PRED, // Directional 207 deg = 180 + 27
D63_PRED, // Directional 63 deg = round(arctan(2/1) * 180/pi)
TM_PRED, // True-motion
NEARESTMV,
@@ -89,9 +89,9 @@ static INLINE int is_inter_mode(MB_PREDICTION_MODE mode) {
return mode >= NEARESTMV && mode <= NEWMV;
}
#define VP9_INTRA_MODES (TM_PRED + 1)
#define INTRA_MODES (TM_PRED + 1)
#define VP9_INTER_MODES (1 + NEWMV - NEARESTMV)
#define INTER_MODES (1 + NEWMV - NEARESTMV)
static INLINE int inter_mode_offset(MB_PREDICTION_MODE mode) {
return (mode - NEARESTMV);
@@ -115,45 +115,41 @@ typedef enum {
MAX_REF_FRAMES = 4
} MV_REFERENCE_FRAME;
static INLINE int b_width_log2(BLOCK_SIZE_TYPE sb_type) {
static INLINE int b_width_log2(BLOCK_SIZE sb_type) {
return b_width_log2_lookup[sb_type];
}
static INLINE int b_height_log2(BLOCK_SIZE_TYPE sb_type) {
static INLINE int b_height_log2(BLOCK_SIZE sb_type) {
return b_height_log2_lookup[sb_type];
}
static INLINE int mi_width_log2(BLOCK_SIZE_TYPE sb_type) {
static INLINE int mi_width_log2(BLOCK_SIZE sb_type) {
return mi_width_log2_lookup[sb_type];
}
static INLINE int mi_height_log2(BLOCK_SIZE_TYPE sb_type) {
static INLINE int mi_height_log2(BLOCK_SIZE sb_type) {
return mi_height_log2_lookup[sb_type];
}
// This structure now relates to 8x8 block regions.
typedef struct {
MB_PREDICTION_MODE mode, uv_mode;
MV_REFERENCE_FRAME ref_frame[2];
TX_SIZE txfm_size;
int_mv mv[2]; // for each reference frame used
TX_SIZE tx_size;
int_mv mv[2]; // for each reference frame used
int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
int_mv best_mv, best_second_mv;
uint8_t mb_mode_context[MAX_REF_FRAMES];
uint8_t mode_context[MAX_REF_FRAMES];
unsigned char mb_skip_coeff; /* does this mb has coefficients at all, 1=no coefficients, 0=need decode tokens */
unsigned char segment_id; // Segment id for current frame
unsigned char skip_coeff; // 0=need to decode coeffs, 1=no coefficients
unsigned char segment_id; // Segment id for this block.
// Flags used for prediction status of various bistream signals
// Flags used for prediction status of various bit-stream signals
unsigned char seg_id_predicted;
// Indicates if the mb is part of the image (1) vs border (0)
// This can be useful in determining whether the MB provides
// a valid predictor
unsigned char mb_in_image;
INTERPOLATIONFILTERTYPE interp_filter;
BLOCK_SIZE_TYPE sb_type;
BLOCK_SIZE sb_type;
} MB_MODE_INFO;
typedef struct {
@@ -161,36 +157,19 @@ typedef struct {
union b_mode_info bmi[4];
} MODE_INFO;
static int is_inter_block(const MB_MODE_INFO *mbmi) {
static INLINE int is_inter_block(const MB_MODE_INFO *mbmi) {
return mbmi->ref_frame[0] > INTRA_FRAME;
}
static INLINE int has_second_ref(const MB_MODE_INFO *mbmi) {
return mbmi->ref_frame[1] > INTRA_FRAME;
}
enum mv_precision {
MV_PRECISION_Q3,
MV_PRECISION_Q4
};
#define VP9_REF_SCALE_SHIFT 14
#define VP9_REF_NO_SCALE (1 << VP9_REF_SCALE_SHIFT)
struct scale_factors {
int x_scale_fp; // horizontal fixed point scale factor
int y_scale_fp; // vertical fixed point scale factor
int x_offset_q4;
int x_step_q4;
int y_offset_q4;
int y_step_q4;
int (*scale_value_x)(int val, const struct scale_factors *scale);
int (*scale_value_y)(int val, const struct scale_factors *scale);
void (*set_scaled_offsets)(struct scale_factors *scale, int row, int col);
MV32 (*scale_mv_q3_to_q4)(const MV *mv, const struct scale_factors *scale);
MV32 (*scale_mv_q4)(const MV *mv, const struct scale_factors *scale);
convolve_fn_t predict[2][2][2]; // horiz, vert, avg
};
#if CONFIG_ALPHA
enum { MAX_MB_PLANE = 4 };
#else
@@ -216,45 +195,27 @@ struct macroblockd_plane {
ENTROPY_CONTEXT *left_context;
};
#define BLOCK_OFFSET(x, i, n) ((x) + (i) * (n))
#define MAX_REF_LF_DELTAS 4
#define MAX_MODE_LF_DELTAS 2
struct loopfilter {
int filter_level;
int sharpness_level;
int last_sharpness_level;
uint8_t mode_ref_delta_enabled;
uint8_t mode_ref_delta_update;
// 0 = Intra, Last, GF, ARF
signed char ref_deltas[MAX_REF_LF_DELTAS];
signed char last_ref_deltas[MAX_REF_LF_DELTAS];
// 0 = ZERO_MV, MV
signed char mode_deltas[MAX_MODE_LF_DELTAS];
signed char last_mode_deltas[MAX_MODE_LF_DELTAS];
};
#define BLOCK_OFFSET(x, i) ((x) + (i) * 16)
typedef struct macroblockd {
struct macroblockd_plane plane[MAX_MB_PLANE];
struct scale_factors scale_factor[2];
MODE_INFO *prev_mode_info_context;
MODE_INFO *mode_info_context;
MODE_INFO *last_mi;
MODE_INFO *this_mi;
int mode_info_stride;
MODE_INFO *mic_stream_ptr;
// A NULL indicates that the 8x8 is not part of the image
MODE_INFO **mi_8x8;
MODE_INFO **prev_mi_8x8;
int up_available;
int left_available;
int right_available;
struct segmentation seg;
struct loopfilter lf;
// partition contexts
PARTITION_CONTEXT *above_seg_context;
PARTITION_CONTEXT *left_seg_context;
@@ -286,7 +247,7 @@ typedef struct macroblockd {
} MACROBLOCKD;
static INLINE unsigned char *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE_TYPE subsize) {
static INLINE unsigned char *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE subsize) {
switch (subsize) {
case BLOCK_64X64:
case BLOCK_64X32:
@@ -311,9 +272,8 @@ static INLINE unsigned char *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE_TYPE subsi
}
}
static INLINE void update_partition_context(MACROBLOCKD *xd,
BLOCK_SIZE_TYPE sb_type,
BLOCK_SIZE_TYPE sb_size) {
static INLINE void update_partition_context(MACROBLOCKD *xd, BLOCK_SIZE sb_type,
BLOCK_SIZE sb_size) {
const int bsl = b_width_log2(sb_size), bs = (1 << bsl) / 2;
const int bwl = b_width_log2(sb_type);
const int bhl = b_height_log2(sb_type);
@@ -331,8 +291,7 @@ static INLINE void update_partition_context(MACROBLOCKD *xd,
vpx_memset(xd->left_seg_context, pcvalue[bhl == bsl], bs);
}
static INLINE int partition_plane_context(MACROBLOCKD *xd,
BLOCK_SIZE_TYPE sb_type) {
static INLINE int partition_plane_context(MACROBLOCKD *xd, BLOCK_SIZE sb_type) {
int bsl = mi_width_log2(sb_type), bs = 1 << bsl;
int above = 0, left = 0, i;
int boffset = mi_width_log2(BLOCK_64X64) - bsl;
@@ -352,10 +311,9 @@ static INLINE int partition_plane_context(MACROBLOCKD *xd,
return (left * 2 + above) + bsl * PARTITION_PLOFFSET;
}
static BLOCK_SIZE_TYPE get_subsize(BLOCK_SIZE_TYPE bsize,
PARTITION_TYPE partition) {
BLOCK_SIZE_TYPE subsize = subsize_lookup[partition][bsize];
assert(subsize != BLOCK_SIZE_TYPES);
static BLOCK_SIZE get_subsize(BLOCK_SIZE bsize, PARTITION_TYPE partition) {
const BLOCK_SIZE subsize = subsize_lookup[partition][bsize];
assert(subsize < BLOCK_SIZES);
return subsize;
}
@@ -363,7 +321,7 @@ extern const TX_TYPE mode2txfm_map[MB_MODE_COUNT];
static INLINE TX_TYPE get_tx_type_4x4(PLANE_TYPE plane_type,
const MACROBLOCKD *xd, int ib) {
const MODE_INFO *const mi = xd->mode_info_context;
const MODE_INFO *const mi = xd->this_mi;
const MB_MODE_INFO *const mbmi = &mi->mbmi;
if (plane_type != PLANE_TYPE_Y_WITH_DC ||
@@ -378,13 +336,13 @@ static INLINE TX_TYPE get_tx_type_4x4(PLANE_TYPE plane_type,
static INLINE TX_TYPE get_tx_type_8x8(PLANE_TYPE plane_type,
const MACROBLOCKD *xd) {
return plane_type == PLANE_TYPE_Y_WITH_DC ?
mode2txfm_map[xd->mode_info_context->mbmi.mode] : DCT_DCT;
mode2txfm_map[xd->this_mi->mbmi.mode] : DCT_DCT;
}
static INLINE TX_TYPE get_tx_type_16x16(PLANE_TYPE plane_type,
const MACROBLOCKD *xd) {
return plane_type == PLANE_TYPE_Y_WITH_DC ?
mode2txfm_map[xd->mode_info_context->mbmi.mode] : DCT_DCT;
mode2txfm_map[xd->this_mi->mbmi.mode] : DCT_DCT;
}
static void setup_block_dptrs(MACROBLOCKD *xd, int ss_x, int ss_y) {
@@ -404,259 +362,147 @@ static void setup_block_dptrs(MACROBLOCKD *xd, int ss_x, int ss_y) {
static INLINE TX_SIZE get_uv_tx_size(const MB_MODE_INFO *mbmi) {
return MIN(mbmi->txfm_size, max_uv_txsize_lookup[mbmi->sb_type]);
return MIN(mbmi->tx_size, max_uv_txsize_lookup[mbmi->sb_type]);
}
struct plane_block_idx {
int plane;
int block;
};
// TODO(jkoleszar): returning a struct so it can be used in a const context,
// expect to refactor this further later.
static INLINE struct plane_block_idx plane_block_idx(int y_blocks,
int b_idx) {
const int v_offset = y_blocks * 5 / 4;
struct plane_block_idx res;
if (b_idx < y_blocks) {
res.plane = 0;
res.block = b_idx;
} else if (b_idx < v_offset) {
res.plane = 1;
res.block = b_idx - y_blocks;
} else {
assert(b_idx < y_blocks * 3 / 2);
res.plane = 2;
res.block = b_idx - v_offset;
}
return res;
static BLOCK_SIZE get_plane_block_size(BLOCK_SIZE bsize,
const struct macroblockd_plane *pd) {
BLOCK_SIZE bs = ss_size_lookup[bsize][pd->subsampling_x][pd->subsampling_y];
assert(bs < BLOCK_SIZES);
return bs;
}
static INLINE int plane_block_width(BLOCK_SIZE_TYPE bsize,
static INLINE int plane_block_width(BLOCK_SIZE bsize,
const struct macroblockd_plane* plane) {
return 4 << (b_width_log2(bsize) - plane->subsampling_x);
}
static INLINE int plane_block_height(BLOCK_SIZE_TYPE bsize,
static INLINE int plane_block_height(BLOCK_SIZE bsize,
const struct macroblockd_plane* plane) {
return 4 << (b_height_log2(bsize) - plane->subsampling_y);
}
static INLINE int plane_block_width_log2by4(
BLOCK_SIZE_TYPE bsize, const struct macroblockd_plane* plane) {
return (b_width_log2(bsize) - plane->subsampling_x);
}
static INLINE int plane_block_height_log2by4(
BLOCK_SIZE_TYPE bsize, const struct macroblockd_plane* plane) {
return (b_height_log2(bsize) - plane->subsampling_y);
}
typedef void (*foreach_transformed_block_visitor)(int plane, int block,
BLOCK_SIZE_TYPE bsize,
int ss_txfrm_size,
BLOCK_SIZE plane_bsize,
TX_SIZE tx_size,
void *arg);
static INLINE void foreach_transformed_block_in_plane(
const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize, int plane,
const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane,
foreach_transformed_block_visitor visit, void *arg) {
const int bw = b_width_log2(bsize), bh = b_height_log2(bsize);
const struct macroblockd_plane *const pd = &xd->plane[plane];
const MB_MODE_INFO* mbmi = &xd->this_mi->mbmi;
// block and transform sizes, in number of 4x4 blocks log 2 ("*_b")
// 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8
// transform size varies per plane, look it up in a common way.
const MB_MODE_INFO* mbmi = &xd->mode_info_context->mbmi;
const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi)
: mbmi->txfm_size;
const int block_size_b = bw + bh;
const int txfrm_size_b = tx_size * 2;
// subsampled size of the block
const int ss_sum = xd->plane[plane].subsampling_x
+ xd->plane[plane].subsampling_y;
const int ss_block_size = block_size_b - ss_sum;
const int step = 1 << txfrm_size_b;
: mbmi->tx_size;
const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
const int step = 1 << (tx_size << 1);
int i;
assert(txfrm_size_b <= block_size_b);
assert(txfrm_size_b <= ss_block_size);
// If mb_to_right_edge is < 0 we are in a situation in which
// the current block size extends into the UMV and we won't
// visit the sub blocks that are wholly within the UMV.
if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) {
int r, c;
const int sw = bw - xd->plane[plane].subsampling_x;
const int sh = bh - xd->plane[plane].subsampling_y;
int max_blocks_wide = 1 << sw;
int max_blocks_high = 1 << sh;
int max_blocks_wide = num_4x4_w;
int max_blocks_high = num_4x4_h;
// xd->mb_to_right_edge is in units of pixels * 8. This converts
// it to 4x4 block sizes.
if (xd->mb_to_right_edge < 0)
max_blocks_wide +=
(xd->mb_to_right_edge >> (5 + xd->plane[plane].subsampling_x));
max_blocks_wide += (xd->mb_to_right_edge >> (5 + pd->subsampling_x));
if (xd->mb_to_bottom_edge < 0)
max_blocks_high +=
(xd->mb_to_bottom_edge >> (5 + xd->plane[plane].subsampling_y));
max_blocks_high += (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
i = 0;
// Unlike the normal case - in here we have to keep track of the
// row and column of the blocks we use so that we know if we are in
// the unrestricted motion border.
for (r = 0; r < (1 << sh); r += (1 << tx_size)) {
for (c = 0; c < (1 << sw); c += (1 << tx_size)) {
for (r = 0; r < num_4x4_h; r += (1 << tx_size)) {
for (c = 0; c < num_4x4_w; c += (1 << tx_size)) {
if (r < max_blocks_high && c < max_blocks_wide)
visit(plane, i, bsize, txfrm_size_b, arg);
visit(plane, i, plane_bsize, tx_size, arg);
i += step;
}
}
} else {
for (i = 0; i < (1 << ss_block_size); i += step) {
visit(plane, i, bsize, txfrm_size_b, arg);
}
for (i = 0; i < num_4x4_w * num_4x4_h; i += step)
visit(plane, i, plane_bsize, tx_size, arg);
}
}
static INLINE void foreach_transformed_block(
const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize,
const MACROBLOCKD* const xd, BLOCK_SIZE bsize,
foreach_transformed_block_visitor visit, void *arg) {
int plane;
for (plane = 0; plane < MAX_MB_PLANE; plane++) {
foreach_transformed_block_in_plane(xd, bsize, plane,
visit, arg);
}
for (plane = 0; plane < MAX_MB_PLANE; plane++)
foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg);
}
static INLINE void foreach_transformed_block_uv(
const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize,
const MACROBLOCKD* const xd, BLOCK_SIZE bsize,
foreach_transformed_block_visitor visit, void *arg) {
int plane;
for (plane = 1; plane < MAX_MB_PLANE; plane++) {
foreach_transformed_block_in_plane(xd, bsize, plane,
visit, arg);
}
for (plane = 1; plane < MAX_MB_PLANE; plane++)
foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg);
}
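A hypothetical usage sketch of the visitor interface above (not part of this change): a visitor receives the plane, the block index in 4x4 units, the subsampled plane block size and the transform size, plus an opaque argument.

/* Count the transform blocks in every plane of a coding block. */
static void count_tx_blocks(int plane, int block, BLOCK_SIZE plane_bsize,
                            TX_SIZE tx_size, void *arg) {
  (void)plane; (void)block; (void)plane_bsize; (void)tx_size;
  ++*(int *)arg;
}

static int num_tx_blocks(const MACROBLOCKD *xd, BLOCK_SIZE bsize) {
  int n = 0;
  foreach_transformed_block(xd, bsize, count_tx_blocks, &n);
  return n;
}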
// TODO(jkoleszar): In principle, pred_w, pred_h are unnecessary, as we could
// calculate the subsampled BLOCK_SIZE_TYPE, but that type isn't defined for
// sizes smaller than 16x16 yet.
typedef void (*foreach_predicted_block_visitor)(int plane, int block,
BLOCK_SIZE_TYPE bsize,
int pred_w, int pred_h,
void *arg);
static INLINE void foreach_predicted_block_in_plane(
const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize, int plane,
foreach_predicted_block_visitor visit, void *arg) {
int i, x, y;
// block sizes in number of 4x4 blocks log 2 ("*_b")
// 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8
// subsampled size of the block
const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y;
// size of the predictor to use.
int pred_w, pred_h;
if (xd->mode_info_context->mbmi.sb_type < BLOCK_8X8) {
assert(bsize == BLOCK_8X8);
pred_w = 0;
pred_h = 0;
} else {
pred_w = bwl;
pred_h = bhl;
}
assert(pred_w <= bwl);
assert(pred_h <= bhl);
// visit each subblock in raster order
i = 0;
for (y = 0; y < 1 << bhl; y += 1 << pred_h) {
for (x = 0; x < 1 << bwl; x += 1 << pred_w) {
visit(plane, i, bsize, pred_w, pred_h, arg);
i += 1 << pred_w;
}
i += (1 << (bwl + pred_h)) - (1 << bwl);
}
}
static INLINE void foreach_predicted_block(
const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize,
foreach_predicted_block_visitor visit, void *arg) {
int plane;
for (plane = 0; plane < MAX_MB_PLANE; plane++) {
foreach_predicted_block_in_plane(xd, bsize, plane, visit, arg);
}
}
static INLINE void foreach_predicted_block_uv(
const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize,
foreach_predicted_block_visitor visit, void *arg) {
int plane;
for (plane = 1; plane < MAX_MB_PLANE; plane++) {
foreach_predicted_block_in_plane(xd, bsize, plane, visit, arg);
}
}
static int raster_block_offset(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize,
int plane, int block, int stride) {
const int bw = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
const int y = 4 * (block >> bw), x = 4 * (block & ((1 << bw) - 1));
static int raster_block_offset(BLOCK_SIZE plane_bsize,
int raster_block, int stride) {
const int bw = b_width_log2(plane_bsize);
const int y = 4 * (raster_block >> bw);
const int x = 4 * (raster_block & ((1 << bw) - 1));
return y * stride + x;
}
static int16_t* raster_block_offset_int16(MACROBLOCKD *xd,
BLOCK_SIZE_TYPE bsize,
int plane, int block, int16_t *base) {
const int stride = plane_block_width(bsize, &xd->plane[plane]);
return base + raster_block_offset(xd, bsize, plane, block, stride);
static int16_t* raster_block_offset_int16(BLOCK_SIZE plane_bsize,
int raster_block, int16_t *base) {
const int stride = 4 << b_width_log2(plane_bsize);
return base + raster_block_offset(plane_bsize, raster_block, stride);
}
static uint8_t* raster_block_offset_uint8(MACROBLOCKD *xd,
BLOCK_SIZE_TYPE bsize,
int plane, int block,
uint8_t *base, int stride) {
return base + raster_block_offset(xd, bsize, plane, block, stride);
static uint8_t* raster_block_offset_uint8(BLOCK_SIZE plane_bsize,
int raster_block, uint8_t *base,
int stride) {
return base + raster_block_offset(plane_bsize, raster_block, stride);
}
static int txfrm_block_to_raster_block(MACROBLOCKD *xd,
BLOCK_SIZE_TYPE bsize,
int plane, int block,
int ss_txfrm_size) {
const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
const int txwl = ss_txfrm_size / 2;
const int tx_cols_log2 = bwl - txwl;
static int txfrm_block_to_raster_block(BLOCK_SIZE plane_bsize,
TX_SIZE tx_size, int block) {
const int bwl = b_width_log2(plane_bsize);
const int tx_cols_log2 = bwl - tx_size;
const int tx_cols = 1 << tx_cols_log2;
const int raster_mb = block >> ss_txfrm_size;
const int x = (raster_mb & (tx_cols - 1)) << (txwl);
const int y = raster_mb >> tx_cols_log2 << (txwl);
const int raster_mb = block >> (tx_size << 1);
const int x = (raster_mb & (tx_cols - 1)) << tx_size;
const int y = (raster_mb >> tx_cols_log2) << tx_size;
return x + (y << bwl);
}
static void txfrm_block_to_raster_xy(MACROBLOCKD *xd,
BLOCK_SIZE_TYPE bsize,
int plane, int block,
int ss_txfrm_size,
static void txfrm_block_to_raster_xy(BLOCK_SIZE plane_bsize,
TX_SIZE tx_size, int block,
int *x, int *y) {
const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
const int txwl = ss_txfrm_size / 2;
const int tx_cols_log2 = bwl - txwl;
const int bwl = b_width_log2(plane_bsize);
const int tx_cols_log2 = bwl - tx_size;
const int tx_cols = 1 << tx_cols_log2;
const int raster_mb = block >> ss_txfrm_size;
*x = (raster_mb & (tx_cols - 1)) << (txwl);
*y = raster_mb >> tx_cols_log2 << (txwl);
const int raster_mb = block >> (tx_size << 1);
*x = (raster_mb & (tx_cols - 1)) << tx_size;
*y = (raster_mb >> tx_cols_log2) << tx_size;
}
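A worked example of this mapping (a hypothetical check, not libvpx code): block indices are in 4x4 units, so with BLOCK_16X16 (four 4x4 columns, bwl = 2) and TX_8X8 (tx_size = 1) the four 8x8 transform blocks start at block indices 0, 4, 8 and 12, which map to raster 4x4 offsets 0, 2, 8 and 10.

#include <assert.h>

void check_raster_mapping(void) {
  assert(txfrm_block_to_raster_block(BLOCK_16X16, TX_8X8, 0) == 0);
  assert(txfrm_block_to_raster_block(BLOCK_16X16, TX_8X8, 4) == 2);
  assert(txfrm_block_to_raster_block(BLOCK_16X16, TX_8X8, 8) == 8);
  assert(txfrm_block_to_raster_block(BLOCK_16X16, TX_8X8, 12) == 10);
}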
static void extend_for_intra(MACROBLOCKD* const xd, int plane, int block,
BLOCK_SIZE_TYPE bsize, int ss_txfrm_size) {
const int bw = plane_block_width(bsize, &xd->plane[plane]);
const int bh = plane_block_height(bsize, &xd->plane[plane]);
static void extend_for_intra(MACROBLOCKD* const xd, BLOCK_SIZE plane_bsize,
int plane, int block, TX_SIZE tx_size) {
struct macroblockd_plane *const pd = &xd->plane[plane];
uint8_t *const buf = pd->dst.buf;
const int stride = pd->dst.stride;
int x, y;
txfrm_block_to_raster_xy(xd, bsize, plane, block, ss_txfrm_size, &x, &y);
txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y);
x = x * 4 - 1;
y = y * 4 - 1;
// Copy a pixel into the umv if we are in a situation where the block size
@@ -664,41 +510,38 @@ static void extend_for_intra(MACROBLOCKD* const xd, int plane, int block,
// TODO(JBB): Should be able to do the full extend in place so we don't have
// to do this multiple times.
if (xd->mb_to_right_edge < 0) {
int umv_border_start = bw
+ (xd->mb_to_right_edge >> (3 + xd->plane[plane].subsampling_x));
const int bw = 4 << b_width_log2(plane_bsize);
const int umv_border_start = bw + (xd->mb_to_right_edge >>
(3 + pd->subsampling_x));
if (x + bw > umv_border_start)
vpx_memset(
xd->plane[plane].dst.buf + y * xd->plane[plane].dst.stride
+ umv_border_start,
*(xd->plane[plane].dst.buf + y * xd->plane[plane].dst.stride
+ umv_border_start - 1),
bw);
vpx_memset(&buf[y * stride + umv_border_start],
buf[y * stride + umv_border_start - 1], bw);
}
if (xd->mb_to_bottom_edge < 0) {
int umv_border_start = bh
+ (xd->mb_to_bottom_edge >> (3 + xd->plane[plane].subsampling_y));
int i;
uint8_t c = *(xd->plane[plane].dst.buf
+ (umv_border_start - 1) * xd->plane[plane].dst.stride + x);
uint8_t *d = xd->plane[plane].dst.buf
+ umv_border_start * xd->plane[plane].dst.stride + x;
if (xd->mb_to_bottom_edge < 0) {
const int bh = 4 << b_height_log2(plane_bsize);
const int umv_border_start = bh + (xd->mb_to_bottom_edge >>
(3 + pd->subsampling_y));
int i;
const uint8_t c = buf[(umv_border_start - 1) * stride + x];
uint8_t *d = &buf[umv_border_start * stride + x];
if (y + bh > umv_border_start)
for (i = 0; i < bh; i++, d += xd->plane[plane].dst.stride)
for (i = 0; i < bh; ++i, d += stride)
*d = c;
}
}
static void set_contexts_on_border(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize,
int plane, int tx_size_in_blocks,
int eob, int aoff, int loff,
static void set_contexts_on_border(MACROBLOCKD *xd,
struct macroblockd_plane *pd,
BLOCK_SIZE plane_bsize,
int tx_size_in_blocks, int has_eob,
int aoff, int loff,
ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L) {
struct macroblockd_plane *pd = &xd->plane[plane];
int mi_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize];
int mi_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
int above_contexts = tx_size_in_blocks;
int left_contexts = tx_size_in_blocks;
int mi_blocks_wide = 1 << plane_block_width_log2by4(bsize, pd);
int mi_blocks_high = 1 << plane_block_height_log2by4(bsize, pd);
int pt;
// xd->mb_to_right_edge is in units of pixels * 8. This converts
@@ -706,26 +549,47 @@ static void set_contexts_on_border(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize,
if (xd->mb_to_right_edge < 0)
mi_blocks_wide += (xd->mb_to_right_edge >> (5 + pd->subsampling_x));
if (xd->mb_to_bottom_edge < 0)
mi_blocks_high += (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
// this code attempts to avoid copying into contexts that are outside
// our border. Any blocks that do are set to 0...
if (above_contexts + aoff > mi_blocks_wide)
above_contexts = mi_blocks_wide - aoff;
if (xd->mb_to_bottom_edge < 0)
mi_blocks_high += (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
if (left_contexts + loff > mi_blocks_high)
left_contexts = mi_blocks_high - loff;
for (pt = 0; pt < above_contexts; pt++)
A[pt] = eob > 0;
A[pt] = has_eob;
for (pt = above_contexts; pt < tx_size_in_blocks; pt++)
A[pt] = 0;
for (pt = 0; pt < left_contexts; pt++)
L[pt] = eob > 0;
L[pt] = has_eob;
for (pt = left_contexts; pt < tx_size_in_blocks; pt++)
L[pt] = 0;
}
static void set_contexts(MACROBLOCKD *xd, struct macroblockd_plane *pd,
BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
int has_eob, int aoff, int loff) {
ENTROPY_CONTEXT *const A = pd->above_context + aoff;
ENTROPY_CONTEXT *const L = pd->left_context + loff;
const int tx_size_in_blocks = 1 << tx_size;
if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) {
set_contexts_on_border(xd, pd, plane_bsize, tx_size_in_blocks, has_eob,
aoff, loff, A, L);
} else {
vpx_memset(A, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
vpx_memset(L, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
}
}
static int get_tx_eob(struct segmentation *seg, int segment_id,
TX_SIZE tx_size) {
const int eob_max = 16 << (tx_size << 1);
return vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max;
}
#endif // VP9_COMMON_VP9_BLOCKD_H_

View File

@@ -13,33 +13,33 @@
#include "vp9/common/vp9_common_data.h"
// Log 2 conversion lookup tables for block width and height
const int b_width_log2_lookup[BLOCK_SIZE_TYPES] =
const int b_width_log2_lookup[BLOCK_SIZES] =
{0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4};
const int b_height_log2_lookup[BLOCK_SIZE_TYPES] =
const int b_height_log2_lookup[BLOCK_SIZES] =
{0, 1, 0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4};
const int num_4x4_blocks_wide_lookup[BLOCK_SIZE_TYPES] =
const int num_4x4_blocks_wide_lookup[BLOCK_SIZES] =
{1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 8, 16, 16};
const int num_4x4_blocks_high_lookup[BLOCK_SIZE_TYPES] =
const int num_4x4_blocks_high_lookup[BLOCK_SIZES] =
{1, 2, 1, 2, 4, 2, 4, 8, 4, 8, 16, 8, 16};
// Log 2 conversion lookup tables for modeinfo width and height
const int mi_width_log2_lookup[BLOCK_SIZE_TYPES] =
const int mi_width_log2_lookup[BLOCK_SIZES] =
{0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3};
const int num_8x8_blocks_wide_lookup[BLOCK_SIZE_TYPES] =
const int num_8x8_blocks_wide_lookup[BLOCK_SIZES] =
{1, 1, 1, 1, 1, 2, 2, 2, 4, 4, 4, 8, 8};
const int mi_height_log2_lookup[BLOCK_SIZE_TYPES] =
const int mi_height_log2_lookup[BLOCK_SIZES] =
{0, 0, 0, 0, 1, 0, 1, 2, 1, 2, 3, 2, 3};
const int num_8x8_blocks_high_lookup[BLOCK_SIZE_TYPES] =
const int num_8x8_blocks_high_lookup[BLOCK_SIZES] =
{1, 1, 1, 1, 2, 1, 2, 4, 2, 4, 8, 4, 8};
// MIN(3, MIN(b_width_log2(bsize), b_height_log2(bsize)))
const int size_group_lookup[BLOCK_SIZE_TYPES] =
const int size_group_lookup[BLOCK_SIZES] =
{0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3};
const int num_pels_log2_lookup[BLOCK_SIZE_TYPES] =
const int num_pels_log2_lookup[BLOCK_SIZES] =
{4, 5, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12};
const PARTITION_TYPE partition_lookup[][BLOCK_SIZE_TYPES] = {
const PARTITION_TYPE partition_lookup[][BLOCK_SIZES] = {
{ // 4X4
// 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
PARTITION_NONE, PARTITION_INVALID, PARTITION_INVALID,
@@ -74,51 +74,62 @@ const PARTITION_TYPE partition_lookup[][BLOCK_SIZE_TYPES] = {
}
};
const BLOCK_SIZE_TYPE subsize_lookup[PARTITION_TYPES][BLOCK_SIZE_TYPES] = {
const BLOCK_SIZE subsize_lookup[PARTITION_TYPES][BLOCK_SIZES] = {
{ // PARTITION_NONE
BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
BLOCK_8X8, BLOCK_8X16, BLOCK_16X8,
BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
BLOCK_8X8, BLOCK_8X16, BLOCK_16X8,
BLOCK_16X16, BLOCK_16X32, BLOCK_32X16,
BLOCK_32X32, BLOCK_32X64, BLOCK_64X32,
BLOCK_64X64,
}, { // PARTITION_HORZ
BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
BLOCK_8X4, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
BLOCK_16X8, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
BLOCK_32X16, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
BLOCK_8X4, BLOCK_INVALID, BLOCK_INVALID,
BLOCK_16X8, BLOCK_INVALID, BLOCK_INVALID,
BLOCK_32X16, BLOCK_INVALID, BLOCK_INVALID,
BLOCK_64X32,
}, { // PARTITION_VERT
BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
BLOCK_4X8, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
BLOCK_8X16, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
BLOCK_16X32, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
BLOCK_4X8, BLOCK_INVALID, BLOCK_INVALID,
BLOCK_8X16, BLOCK_INVALID, BLOCK_INVALID,
BLOCK_16X32, BLOCK_INVALID, BLOCK_INVALID,
BLOCK_32X64,
}, { // PARTITION_SPLIT
BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
BLOCK_4X4, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
BLOCK_8X8, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
BLOCK_16X16, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
BLOCK_4X4, BLOCK_INVALID, BLOCK_INVALID,
BLOCK_8X8, BLOCK_INVALID, BLOCK_INVALID,
BLOCK_16X16, BLOCK_INVALID, BLOCK_INVALID,
BLOCK_32X32,
}
};
const TX_SIZE max_txsize_lookup[BLOCK_SIZE_TYPES] = {
TX_4X4, TX_4X4, TX_4X4,
TX_8X8, TX_8X8, TX_8X8,
const TX_SIZE max_txsize_lookup[BLOCK_SIZES] = {
TX_4X4, TX_4X4, TX_4X4,
TX_8X8, TX_8X8, TX_8X8,
TX_16X16, TX_16X16, TX_16X16,
TX_32X32, TX_32X32, TX_32X32, TX_32X32
};
const TX_SIZE max_uv_txsize_lookup[BLOCK_SIZE_TYPES] = {
TX_4X4, TX_4X4, TX_4X4,
TX_4X4, TX_4X4, TX_4X4,
TX_8X8, TX_8X8, TX_8X8,
const TX_SIZE max_uv_txsize_lookup[BLOCK_SIZES] = {
TX_4X4, TX_4X4, TX_4X4,
TX_4X4, TX_4X4, TX_4X4,
TX_8X8, TX_8X8, TX_8X8,
TX_16X16, TX_16X16, TX_16X16, TX_32X32
};
const BLOCK_SIZE_TYPE bsize_from_dim_lookup[5][5] = {
{ BLOCK_4X4, BLOCK_4X8, BLOCK_4X8, BLOCK_4X8, BLOCK_4X8 },
{ BLOCK_8X4, BLOCK_8X8, BLOCK_8X16, BLOCK_8X16, BLOCK_8X16 },
{ BLOCK_16X8, BLOCK_16X8, BLOCK_16X16, BLOCK_16X32, BLOCK_16X32 },
{ BLOCK_32X16, BLOCK_32X16, BLOCK_32X16, BLOCK_32X32, BLOCK_32X64 },
{ BLOCK_64X32, BLOCK_64X32, BLOCK_64X32, BLOCK_64X32, BLOCK_64X64 }
const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2] = {
// ss_x == 0 ss_x == 0 ss_x == 1 ss_x == 1
// ss_y == 0 ss_y == 1 ss_y == 0 ss_y == 1
{{BLOCK_4X4, BLOCK_INVALID}, {BLOCK_INVALID, BLOCK_INVALID}},
{{BLOCK_4X8, BLOCK_4X4}, {BLOCK_INVALID, BLOCK_INVALID}},
{{BLOCK_8X4, BLOCK_INVALID}, {BLOCK_4X4, BLOCK_INVALID}},
{{BLOCK_8X8, BLOCK_8X4}, {BLOCK_4X8, BLOCK_4X4}},
{{BLOCK_8X16, BLOCK_8X8}, {BLOCK_INVALID, BLOCK_4X8}},
{{BLOCK_16X8, BLOCK_INVALID}, {BLOCK_8X8, BLOCK_8X4}},
{{BLOCK_16X16, BLOCK_16X8}, {BLOCK_8X16, BLOCK_8X8}},
{{BLOCK_16X32, BLOCK_16X16}, {BLOCK_INVALID, BLOCK_8X16}},
{{BLOCK_32X16, BLOCK_INVALID}, {BLOCK_16X16, BLOCK_16X8}},
{{BLOCK_32X32, BLOCK_32X16}, {BLOCK_16X32, BLOCK_16X16}},
{{BLOCK_32X64, BLOCK_32X32}, {BLOCK_INVALID, BLOCK_16X32}},
{{BLOCK_64X32, BLOCK_INVALID}, {BLOCK_32X32, BLOCK_32X16}},
{{BLOCK_64X64, BLOCK_64X32}, {BLOCK_32X64, BLOCK_32X32}},
};

View File

@@ -13,20 +13,20 @@
#include "vp9/common/vp9_enums.h"
extern const int b_width_log2_lookup[BLOCK_SIZE_TYPES];
extern const int b_height_log2_lookup[BLOCK_SIZE_TYPES];
extern const int mi_width_log2_lookup[BLOCK_SIZE_TYPES];
extern const int mi_height_log2_lookup[BLOCK_SIZE_TYPES];
extern const int num_8x8_blocks_wide_lookup[BLOCK_SIZE_TYPES];
extern const int num_8x8_blocks_high_lookup[BLOCK_SIZE_TYPES];
extern const int num_4x4_blocks_high_lookup[BLOCK_SIZE_TYPES];
extern const int num_4x4_blocks_wide_lookup[BLOCK_SIZE_TYPES];
extern const int size_group_lookup[BLOCK_SIZE_TYPES];
extern const int num_pels_log2_lookup[BLOCK_SIZE_TYPES];
extern const PARTITION_TYPE partition_lookup[][BLOCK_SIZE_TYPES];
extern const BLOCK_SIZE_TYPE subsize_lookup[PARTITION_TYPES][BLOCK_SIZE_TYPES];
extern const TX_SIZE max_txsize_lookup[BLOCK_SIZE_TYPES];
extern const TX_SIZE max_uv_txsize_lookup[BLOCK_SIZE_TYPES];
extern const BLOCK_SIZE_TYPE bsize_from_dim_lookup[5][5];
extern const int b_width_log2_lookup[BLOCK_SIZES];
extern const int b_height_log2_lookup[BLOCK_SIZES];
extern const int mi_width_log2_lookup[BLOCK_SIZES];
extern const int mi_height_log2_lookup[BLOCK_SIZES];
extern const int num_8x8_blocks_wide_lookup[BLOCK_SIZES];
extern const int num_8x8_blocks_high_lookup[BLOCK_SIZES];
extern const int num_4x4_blocks_high_lookup[BLOCK_SIZES];
extern const int num_4x4_blocks_wide_lookup[BLOCK_SIZES];
extern const int size_group_lookup[BLOCK_SIZES];
extern const int num_pels_log2_lookup[BLOCK_SIZES];
extern const PARTITION_TYPE partition_lookup[][BLOCK_SIZES];
extern const BLOCK_SIZE subsize_lookup[PARTITION_TYPES][BLOCK_SIZES];
extern const TX_SIZE max_txsize_lookup[BLOCK_SIZES];
extern const TX_SIZE max_uv_txsize_lookup[BLOCK_SIZES];
extern const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2];
#endif // VP9_COMMON_VP9_COMMON_DATA_H

View File

@@ -14,66 +14,45 @@
#include "./vpx_config.h"
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_filter.h"
#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"
#define VP9_FILTER_WEIGHT 128
#define VP9_FILTER_SHIFT 7
/* Assume a bank of 16 filters to choose from. There are two implementations
* for filter wrapping behavior, since we want to be able to pick which filter
* to start with. We could either:
*
* 1) make filter_ a pointer to the base of the filter array, and then add an
* additional offset parameter, to choose the starting filter.
* 2) use a pointer to 2 periods worth of filters, so that even if the original
* phase offset is at 15/16, we'll have valid data to read. The filter
* tables become [32][8], and the second half is duplicated.
* 3) fix the alignment of the filter tables, so that we know the 0/16 is
* always 256 byte aligned.
*
* Implementations 2 and 3 are likely preferable, as they avoid an extra 2
* parameters, and switching between them is trivial, with the
* ALIGN_FILTERS_256 macro, below.
*/
#define ALIGN_FILTERS_256 1
static void convolve_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x0, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h, int taps) {
int x, y, k, sum;
const int16_t *filter_x_base = filter_x0;
int x, y, k;
#if ALIGN_FILTERS_256
filter_x_base = (const int16_t *)(((intptr_t)filter_x0) & ~(intptr_t)0xff);
#endif
/* NOTE: This assumes that the filter table is 256-byte aligned. */
/* TODO(agrange) Modify to make independent of table alignment. */
const int16_t *const filter_x_base =
(const int16_t *)(((intptr_t)filter_x0) & ~(intptr_t)0xff);
/* Adjust base pointer address for this source line */
src -= taps / 2 - 1;
for (y = 0; y < h; ++y) {
/* Pointer to filter to use */
const int16_t *filter_x = filter_x0;
/* Initial phase offset */
int x0_q4 = (filter_x - filter_x_base) / taps;
int x_q4 = x0_q4;
int x_q4 = (filter_x0 - filter_x_base) / taps;
for (x = 0; x < w; ++x) {
/* Per-pixel src offset */
int src_x = (x_q4 - x0_q4) >> 4;
const int src_x = x_q4 >> SUBPEL_BITS;
int sum = 0;
for (sum = 0, k = 0; k < taps; ++k) {
/* Pointer to filter to use */
const int16_t *const filter_x = filter_x_base +
(x_q4 & SUBPEL_MASK) * taps;
for (k = 0; k < taps; ++k)
sum += src[src_x + k] * filter_x[k];
}
sum += (VP9_FILTER_WEIGHT >> 1);
dst[x] = clip_pixel(sum >> VP9_FILTER_SHIFT);
/* Adjust source and filter to use for the next pixel */
dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
/* Move to the next source pixel */
x_q4 += x_step_q4;
filter_x = filter_x_base + (x_q4 & 0xf) * taps;
}
src += src_stride;
dst += dst_stride;
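Here x_q4 is a fixed-point position with SUBPEL_BITS of fraction: the high bits select the source pixel, the low four bits select one of the 16 filter phases, and each output pixel is rounded as ROUND_POWER_OF_TWO(sum, FILTER_BITS), i.e. (sum + 64) >> 7. A self-contained sketch of the phase walk (the step value is chosen only for illustration):

/* Sketch of the q4 fixed-point walk used above. x_step_q4 == 16 means one
 * full source pixel per output pixel (no scaling); 24 scales by 2/3. */
#include <stdio.h>

#define SUBPEL_BITS 4
#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)

int main(void) {
  int x_q4 = 0;
  const int x_step_q4 = 24;  /* 1.5 source pixels per output pixel */
  for (int x = 0; x < 4; ++x) {
    printf("out %d: src pixel %d, filter phase %d/16\n",
           x, x_q4 >> SUBPEL_BITS, x_q4 & SUBPEL_MASK);
    x_q4 += x_step_q4;
  }
  return 0;
}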
@@ -85,37 +64,37 @@ static void convolve_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_x0, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h, int taps) {
int x, y, k, sum;
const int16_t *filter_x_base = filter_x0;
int x, y, k;
#if ALIGN_FILTERS_256
filter_x_base = (const int16_t *)(((intptr_t)filter_x0) & ~(intptr_t)0xff);
#endif
/* NOTE: This assumes that the filter table is 256-byte aligned. */
/* TODO(agrange) Modify to make independent of table alignment. */
const int16_t *const filter_x_base =
(const int16_t *)(((intptr_t)filter_x0) & ~(intptr_t)0xff);
/* Adjust base pointer address for this source line */
src -= taps / 2 - 1;
for (y = 0; y < h; ++y) {
/* Pointer to filter to use */
const int16_t *filter_x = filter_x0;
/* Initial phase offset */
int x0_q4 = (filter_x - filter_x_base) / taps;
int x_q4 = x0_q4;
int x_q4 = (filter_x0 - filter_x_base) / taps;
for (x = 0; x < w; ++x) {
/* Per-pixel src offset */
int src_x = (x_q4 - x0_q4) >> 4;
const int src_x = x_q4 >> SUBPEL_BITS;
int sum = 0;
for (sum = 0, k = 0; k < taps; ++k) {
/* Pointer to filter to use */
const int16_t *const filter_x = filter_x_base +
(x_q4 & SUBPEL_MASK) * taps;
for (k = 0; k < taps; ++k)
sum += src[src_x + k] * filter_x[k];
}
sum += (VP9_FILTER_WEIGHT >> 1);
dst[x] = (dst[x] + clip_pixel(sum >> VP9_FILTER_SHIFT) + 1) >> 1;
/* Adjust source and filter to use for the next pixel */
dst[x] = ROUND_POWER_OF_TWO(dst[x] +
clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
/* Move to the next source pixel */
x_q4 += x_step_q4;
filter_x = filter_x_base + (x_q4 & 0xf) * taps;
}
src += src_stride;
dst += dst_stride;
@@ -127,37 +106,37 @@ static void convolve_vert_c(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y0, int y_step_q4,
int w, int h, int taps) {
int x, y, k, sum;
int x, y, k;
const int16_t *filter_y_base = filter_y0;
#if ALIGN_FILTERS_256
filter_y_base = (const int16_t *)(((intptr_t)filter_y0) & ~(intptr_t)0xff);
#endif
/* NOTE: This assumes that the filter table is 256-byte aligned. */
/* TODO(agrange) Modify to make independent of table alignment. */
const int16_t *const filter_y_base =
(const int16_t *)(((intptr_t)filter_y0) & ~(intptr_t)0xff);
/* Adjust base pointer address for this source column */
src -= src_stride * (taps / 2 - 1);
for (x = 0; x < w; ++x) {
/* Pointer to filter to use */
const int16_t *filter_y = filter_y0;
for (x = 0; x < w; ++x) {
/* Initial phase offset */
int y0_q4 = (filter_y - filter_y_base) / taps;
int y_q4 = y0_q4;
int y_q4 = (filter_y0 - filter_y_base) / taps;
for (y = 0; y < h; ++y) {
/* Per-pixel src offset */
int src_y = (y_q4 - y0_q4) >> 4;
const int src_y = y_q4 >> SUBPEL_BITS;
int sum = 0;
for (sum = 0, k = 0; k < taps; ++k) {
/* Pointer to filter to use */
const int16_t *const filter_y = filter_y_base +
(y_q4 & SUBPEL_MASK) * taps;
for (k = 0; k < taps; ++k)
sum += src[(src_y + k) * src_stride] * filter_y[k];
}
sum += (VP9_FILTER_WEIGHT >> 1);
dst[y * dst_stride] = clip_pixel(sum >> VP9_FILTER_SHIFT);
/* Adjust source and filter to use for the next pixel */
dst[y * dst_stride] =
clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
/* Move to the next source pixel */
y_q4 += y_step_q4;
filter_y = filter_y_base + (y_q4 & 0xf) * taps;
}
++src;
++dst;
@@ -169,38 +148,37 @@ static void convolve_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y0, int y_step_q4,
int w, int h, int taps) {
int x, y, k, sum;
int x, y, k;
const int16_t *filter_y_base = filter_y0;
#if ALIGN_FILTERS_256
filter_y_base = (const int16_t *)(((intptr_t)filter_y0) & ~(intptr_t)0xff);
#endif
/* NOTE: This assumes that the filter table is 256-byte aligned. */
/* TODO(agrange) Modify to make independent of table alignment. */
const int16_t *const filter_y_base =
(const int16_t *)(((intptr_t)filter_y0) & ~(intptr_t)0xff);
/* Adjust base pointer address for this source column */
src -= src_stride * (taps / 2 - 1);
for (x = 0; x < w; ++x) {
/* Pointer to filter to use */
const int16_t *filter_y = filter_y0;
for (x = 0; x < w; ++x) {
/* Initial phase offset */
int y0_q4 = (filter_y - filter_y_base) / taps;
int y_q4 = y0_q4;
int y_q4 = (filter_y0 - filter_y_base) / taps;
for (y = 0; y < h; ++y) {
/* Per-pixel src offset */
int src_y = (y_q4 - y0_q4) >> 4;
const int src_y = y_q4 >> SUBPEL_BITS;
int sum = 0;
for (sum = 0, k = 0; k < taps; ++k) {
/* Pointer to filter to use */
const int16_t *const filter_y = filter_y_base +
(y_q4 & SUBPEL_MASK) * taps;
for (k = 0; k < taps; ++k)
sum += src[(src_y + k) * src_stride] * filter_y[k];
}
sum += (VP9_FILTER_WEIGHT >> 1);
dst[y * dst_stride] =
(dst[y * dst_stride] + clip_pixel(sum >> VP9_FILTER_SHIFT) + 1) >> 1;
/* Adjust source and filter to use for the next pixel */
dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] +
clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
/* Move to the next source pixel */
y_q4 += y_step_q4;
filter_y = filter_y_base + (y_q4 & 0xf) * taps;
}
++src;
++dst;
@@ -213,58 +191,27 @@ static void convolve_c(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y, int y_step_q4,
int w, int h, int taps) {
/* Fixed size intermediate buffer places limits on parameters.
* Maximum intermediate_height is 135, for y_step_q4 == 32,
* Maximum intermediate_height is 324, for y_step_q4 == 80,
* h == 64, taps == 8.
* y_step_q4 of 80 allows for 1/10 scale for 5 layer svc
*/
uint8_t temp[64 * 135];
int intermediate_height = MAX(((h * y_step_q4) >> 4), 1) + taps - 1;
uint8_t temp[64 * 324];
int intermediate_height = (((h - 1) * y_step_q4 + 15) >> 4) + taps;
assert(w <= 64);
assert(h <= 64);
assert(taps <= 8);
assert(y_step_q4 <= 32);
assert(x_step_q4 <= 32);
assert(y_step_q4 <= 80);
assert(x_step_q4 <= 80);
if (intermediate_height < h)
intermediate_height = h;
convolve_horiz_c(src - src_stride * (taps / 2 - 1), src_stride,
temp, 64,
filter_x, x_step_q4, filter_y, y_step_q4,
w, intermediate_height, taps);
convolve_vert_c(temp + 64 * (taps / 2 - 1), 64, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h, taps);
}
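As a sanity check on the new bound: with h == 64, y_step_q4 == 80 and taps == 8 the expression gives (((64 - 1) * 80 + 15) >> 4) + 8 = 315 + 8 = 323 rows, so the 64 * 324 buffer is sufficient; the old 64 * 135 buffer matched the previous limit of MAX((64 * 32) >> 4, 1) + 8 - 1 = 135 rows at y_step_q4 == 32.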
static void convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h, int taps) {
/* Fixed size intermediate buffer places limits on parameters.
* Maximum intermediate_height is 135, for y_step_q4 == 32,
* h == 64, taps == 8.
*/
uint8_t temp[64 * 135];
int intermediate_height = MAX(((h * y_step_q4) >> 4), 1) + taps - 1;
assert(w <= 64);
assert(h <= 64);
assert(taps <= 8);
assert(y_step_q4 <= 32);
assert(x_step_q4 <= 32);
if (intermediate_height < h)
intermediate_height = h;
convolve_horiz_c(src - src_stride * (taps / 2 - 1), src_stride,
temp, 64,
filter_x, x_step_q4, filter_y, y_step_q4,
w, intermediate_height, taps);
convolve_avg_vert_c(temp + 64 * (taps / 2 - 1), 64, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h, taps);
convolve_horiz_c(src - src_stride * (taps / 2 - 1), src_stride, temp, 64,
filter_x, x_step_q4, filter_y, y_step_q4, w,
intermediate_height, taps);
convolve_vert_c(temp + 64 * (taps / 2 - 1), 64, dst, dst_stride, filter_x,
x_step_q4, filter_y, y_step_q4, w, h, taps);
}
void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
@@ -273,8 +220,7 @@ void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
convolve_horiz_c(src, src_stride, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h, 8);
filter_x, x_step_q4, filter_y, y_step_q4, w, h, 8);
}
void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
@@ -283,8 +229,7 @@ void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
convolve_avg_horiz_c(src, src_stride, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h, 8);
filter_x, x_step_q4, filter_y, y_step_q4, w, h, 8);
}
void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
@@ -293,8 +238,7 @@ void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
convolve_vert_c(src, src_stride, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h, 8);
filter_x, x_step_q4, filter_y, y_step_q4, w, h, 8);
}
void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
@@ -303,8 +247,7 @@ void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
convolve_avg_vert_c(src, src_stride, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h, 8);
filter_x, x_step_q4, filter_y, y_step_q4, w, h, 8);
}
void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
@@ -313,8 +256,7 @@ void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
convolve_c(src, src_stride, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h, 8);
filter_x, x_step_q4, filter_y, y_step_q4, w, h, 8);
}
void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
@@ -327,16 +269,9 @@ void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
assert(w <= 64);
assert(h <= 64);
vp9_convolve8(src, src_stride,
temp, 64,
filter_x, x_step_q4,
filter_y, y_step_q4,
w, h);
vp9_convolve_avg(temp, 64,
dst, dst_stride,
NULL, 0, /* These unused parameters should be removed! */
NULL, 0, /* These unused parameters should be removed! */
w, h);
vp9_convolve8(src, src_stride, temp, 64,
filter_x, x_step_q4, filter_y, y_step_q4, w, h);
vp9_convolve_avg(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h);
}
void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride,
@@ -361,9 +296,9 @@ void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride,
int x, y;
for (y = 0; y < h; ++y) {
for (x = 0; x < w; ++x) {
dst[x] = (dst[x] + src[x] + 1) >> 1;
}
for (x = 0; x < w; ++x)
dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1);
src += src_stride;
dst += dst_stride;
}

View File

@@ -13,6 +13,8 @@
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
#define FILTER_BITS 7
typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,

View File

@@ -22,23 +22,24 @@ static void log_frame_info(VP9_COMMON *cm, const char *str, FILE *f) {
* and uses the passed in member offset to print out the value of an integer
* for each mbmi member value in the mi structure.
*/
static void print_mi_data(VP9_COMMON *common, FILE *file, char *descriptor,
static void print_mi_data(VP9_COMMON *cm, FILE *file, char *descriptor,
size_t member_offset) {
int mi_row;
int mi_col;
int mi_index = 0;
MODE_INFO *mi = common->mi;
int rows = common->mi_rows;
int cols = common->mi_cols;
MODE_INFO **mi_8x8 = cm->mi_grid_visible;
int rows = cm->mi_rows;
int cols = cm->mi_cols;
char prefix = descriptor[0];
log_frame_info(common, descriptor, file);
log_frame_info(cm, descriptor, file);
mi_index = 0;
for (mi_row = 0; mi_row < rows; mi_row++) {
fprintf(file, "%c ", prefix);
for (mi_col = 0; mi_col < cols; mi_col++) {
fprintf(file, "%2d ",
*((int*) ((char *) (&mi[mi_index].mbmi) + member_offset)));
*((int*) ((char *) (&mi_8x8[mi_index]->mbmi) +
member_offset)));
mi_index++;
}
fprintf(file, "\n");
@@ -51,23 +52,23 @@ void vp9_print_modes_and_motion_vectors(VP9_COMMON *cm, char *file) {
int mi_col;
int mi_index = 0;
FILE *mvs = fopen(file, "a");
MODE_INFO *mi = cm->mi;
MODE_INFO **mi_8x8 = cm->mi_grid_visible;
int rows = cm->mi_rows;
int cols = cm->mi_cols;
print_mi_data(cm, mvs, "Partitions:", offsetof(MB_MODE_INFO, sb_type));
print_mi_data(cm, mvs, "Modes:", offsetof(MB_MODE_INFO, mode));
print_mi_data(cm, mvs, "Skips:", offsetof(MB_MODE_INFO, mb_skip_coeff));
print_mi_data(cm, mvs, "Skips:", offsetof(MB_MODE_INFO, skip_coeff));
print_mi_data(cm, mvs, "Ref frame:", offsetof(MB_MODE_INFO, ref_frame[0]));
print_mi_data(cm, mvs, "Transform:", offsetof(MB_MODE_INFO, txfm_size));
print_mi_data(cm, mvs, "Transform:", offsetof(MB_MODE_INFO, tx_size));
print_mi_data(cm, mvs, "UV Modes:", offsetof(MB_MODE_INFO, uv_mode));
log_frame_info(cm, "Vectors ",mvs);
for (mi_row = 0; mi_row < rows; mi_row++) {
fprintf(mvs,"V ");
for (mi_col = 0; mi_col < cols; mi_col++) {
fprintf(mvs, "%4d:%4d ", mi[mi_index].mbmi.mv[0].as_mv.row,
mi[mi_index].mbmi.mv[0].as_mv.col);
fprintf(mvs, "%4d:%4d ", mi_8x8[mi_index]->mbmi.mv[0].as_mv.row,
mi_8x8[mi_index]->mbmi.mv[0].as_mv.col);
mi_index++;
}
fprintf(mvs, "\n");

View File

@@ -377,7 +377,7 @@ static const vp9_prob modelcoefprobs_pareto8[COEFPROB_MODELS][MODEL_NODES] = {
static void extend_model_to_full_distribution(vp9_prob p,
vp9_prob *tree_probs) {
const int l = ((p - 1) / 2);
const int l = (p - 1) / 2;
const vp9_prob (*model)[MODEL_NODES] = modelcoefprobs_pareto8;
if (p & 1) {
vpx_memcpy(tree_probs + UNCONSTRAINED_NODES,
@@ -436,11 +436,11 @@ const vp9_extra_bit vp9_extra_bits[12] = {
#include "vp9/common/vp9_default_coef_probs.h"
void vp9_default_coef_probs(VP9_COMMON *pc) {
vp9_copy(pc->fc.coef_probs[TX_4X4], default_coef_probs_4x4);
vp9_copy(pc->fc.coef_probs[TX_8X8], default_coef_probs_8x8);
vp9_copy(pc->fc.coef_probs[TX_16X16], default_coef_probs_16x16);
vp9_copy(pc->fc.coef_probs[TX_32X32], default_coef_probs_32x32);
void vp9_default_coef_probs(VP9_COMMON *cm) {
vp9_copy(cm->fc.coef_probs[TX_4X4], default_coef_probs_4x4);
vp9_copy(cm->fc.coef_probs[TX_8X8], default_coef_probs_8x8);
vp9_copy(cm->fc.coef_probs[TX_16X16], default_coef_probs_16x16);
vp9_copy(cm->fc.coef_probs[TX_32X32], default_coef_probs_32x32);
}
// Neighborhood 5-tuples for various scans and blocksizes,
@@ -622,7 +622,6 @@ static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE tx_size,
int t, i, j, k, l;
unsigned int branch_ct[UNCONSTRAINED_NODES][2];
vp9_prob coef_probs[UNCONSTRAINED_NODES];
int entropy_nodes_adapt = UNCONSTRAINED_NODES;
for (i = 0; i < BLOCK_TYPES; ++i)
for (j = 0; j < REF_TYPES; ++j)
@@ -635,7 +634,7 @@ static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE tx_size,
0);
branch_ct[0][1] = eob_branch_count[i][j][k][l] - branch_ct[0][0];
coef_probs[0] = get_binary_prob(branch_ct[0][0], branch_ct[0][1]);
for (t = 0; t < entropy_nodes_adapt; ++t)
for (t = 0; t < UNCONSTRAINED_NODES; ++t)
dst_coef_probs[i][j][k][l][t] = merge_probs(
pre_coef_probs[i][j][k][l][t], coef_probs[t],
branch_ct[t], count_sat, update_factor);
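merge_probs() moves the previous frame's probability toward the newly measured one, with a weight that saturates as the sample count grows. A sketch of that update rule, consistent with the count_sat / update_factor arguments above (exact libvpx rounding may differ slightly):

#include <stdint.h>

typedef uint8_t vp9_prob;

/* Sketch of the saturating probability update behind adapt_coef_probs():
 * the more counts observed (capped at count_sat), the further the result
 * moves from pre_prob toward the newly measured prob. */
static vp9_prob merge_probs_sketch(vp9_prob pre_prob, vp9_prob prob,
                                   const unsigned int ct[2],
                                   unsigned int count_sat,
                                   unsigned int max_update_factor) {
  const unsigned int n = ct[0] + ct[1];
  const unsigned int count = n < count_sat ? n : count_sat;
  const unsigned int factor = max_update_factor * count / count_sat;
  /* weighted average, rounded: (pre*(256 - factor) + new*factor) / 256 */
  return (vp9_prob)((pre_prob * (256 - factor) + prob * factor + 128) >> 8);
}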

View File

@@ -95,7 +95,7 @@ typedef vp9_prob vp9_coeff_probs[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS]
#define MODULUS_PARAM 13 /* Modulus parameter */
struct VP9Common;
void vp9_default_coef_probs(struct VP9Common *);
void vp9_default_coef_probs(struct VP9Common *cm);
extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_4x4[16]);
extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_4x4[16]);
@@ -154,19 +154,17 @@ extern DECLARE_ALIGNED(16, int16_t,
vp9_default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]);
void vp9_coef_tree_initialize(void);
void vp9_adapt_coef_probs(struct VP9Common *);
void vp9_adapt_coef_probs(struct VP9Common *cm);
static INLINE void vp9_reset_sb_tokens_context(MACROBLOCKD* const xd,
BLOCK_SIZE_TYPE bsize) {
/* Clear entropy contexts */
const int bw = 1 << b_width_log2(bsize);
const int bh = 1 << b_height_log2(bsize);
static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE bsize) {
int i;
for (i = 0; i < MAX_MB_PLANE; i++) {
vpx_memset(xd->plane[i].above_context, 0,
sizeof(ENTROPY_CONTEXT) * bw >> xd->plane[i].subsampling_x);
vpx_memset(xd->plane[i].left_context, 0,
sizeof(ENTROPY_CONTEXT) * bh >> xd->plane[i].subsampling_y);
struct macroblockd_plane *const pd = &xd->plane[i];
const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
vpx_memset(pd->above_context, 0, sizeof(ENTROPY_CONTEXT) *
num_4x4_blocks_wide_lookup[plane_bsize]);
vpx_memset(pd->left_context, 0, sizeof(ENTROPY_CONTEXT) *
num_4x4_blocks_high_lookup[plane_bsize]);
}
}
@@ -338,6 +336,45 @@ static INLINE const int16_t* get_iscan_16x16(TX_TYPE tx_type) {
}
}
static int get_entropy_context(const MACROBLOCKD *xd, TX_SIZE tx_size,
PLANE_TYPE type, int block_idx,
ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L,
const int16_t **scan,
const uint8_t **band_translate) {
ENTROPY_CONTEXT above_ec = 0, left_ec = 0;
switch (tx_size) {
case TX_4X4:
*scan = get_scan_4x4(get_tx_type_4x4(type, xd, block_idx));
*band_translate = vp9_coefband_trans_4x4;
above_ec = A[0] != 0;
left_ec = L[0] != 0;
break;
case TX_8X8:
*scan = get_scan_8x8(get_tx_type_8x8(type, xd));
*band_translate = vp9_coefband_trans_8x8plus;
above_ec = !!*(uint16_t *)A;
left_ec = !!*(uint16_t *)L;
break;
case TX_16X16:
*scan = get_scan_16x16(get_tx_type_16x16(type, xd));
*band_translate = vp9_coefband_trans_8x8plus;
above_ec = !!*(uint32_t *)A;
left_ec = !!*(uint32_t *)L;
break;
case TX_32X32:
*scan = vp9_default_scan_32x32;
*band_translate = vp9_coefband_trans_8x8plus;
above_ec = !!*(uint64_t *)A;
left_ec = !!*(uint64_t *)L;
break;
default:
assert(!"Invalid transform size.");
}
return combine_entropy_contexts(above_ec, left_ec);
}
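The above/left flags are gathered at the width matching the transform -- 1, 2, 4 or 8 context bytes for TX_4X4 through TX_32X32 -- because one ENTROPY_CONTEXT byte is kept per 4x4 column. The final combine step is simply the count of non-zero neighbors; a sketch consistent with how vp9 defines it:

typedef char ENTROPY_CONTEXT;

/* Sketch: fold the above/left non-zero flags into a context in {0, 1, 2},
 * which indexes the coefficient probability model. */
static int combine_entropy_contexts(ENTROPY_CONTEXT a, ENTROPY_CONTEXT b) {
  return (a != 0) + (b != 0);
}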
enum { VP9_COEF_UPDATE_PROB = 252 };
#endif // VP9_COMMON_VP9_ENTROPY_H_

View File

@@ -14,8 +14,8 @@
#include "vp9/common/vp9_onyxc_int.h"
#include "vp9/common/vp9_seg_common.h"
const vp9_prob vp9_kf_uv_mode_prob[VP9_INTRA_MODES]
[VP9_INTRA_MODES - 1] = {
const vp9_prob vp9_kf_uv_mode_prob[INTRA_MODES]
[INTRA_MODES - 1] = {
{ 144, 11, 54, 157, 195, 130, 46, 58, 108 } /* y = dc */,
{ 118, 15, 123, 148, 131, 101, 44, 93, 131 } /* y = v */,
{ 113, 12, 23, 188, 226, 142, 26, 32, 125 } /* y = h */,
@@ -23,21 +23,21 @@ const vp9_prob vp9_kf_uv_mode_prob[VP9_INTRA_MODES]
{ 113, 9, 36, 155, 111, 157, 32, 44, 161 } /* y = d135 */,
{ 116, 9, 55, 176, 76, 96, 37, 61, 149 } /* y = d117 */,
{ 115, 9, 28, 141, 161, 167, 21, 25, 193 } /* y = d153 */,
{ 120, 12, 32, 145, 195, 142, 32, 38, 86 } /* y = d27 */,
{ 120, 12, 32, 145, 195, 142, 32, 38, 86 } /* y = d207 */,
{ 116, 12, 64, 120, 140, 125, 49, 115, 121 } /* y = d63 */,
{ 102, 19, 66, 162, 182, 122, 35, 59, 128 } /* y = tm */
};
static const vp9_prob default_if_y_probs[BLOCK_SIZE_GROUPS]
[VP9_INTRA_MODES - 1] = {
[INTRA_MODES - 1] = {
{ 65, 32, 18, 144, 162, 194, 41, 51, 98 } /* block_size < 8x8 */,
{ 132, 68, 18, 165, 217, 196, 45, 40, 78 } /* block_size < 16x16 */,
{ 173, 80, 19, 176, 240, 193, 64, 35, 46 } /* block_size < 32x32 */,
{ 221, 135, 38, 194, 248, 121, 96, 85, 29 } /* block_size >= 32x32 */
};
static const vp9_prob default_if_uv_probs[VP9_INTRA_MODES]
[VP9_INTRA_MODES - 1] = {
static const vp9_prob default_if_uv_probs[INTRA_MODES]
[INTRA_MODES - 1] = {
{ 120, 7, 76, 176, 208, 126, 28, 54, 103 } /* y = dc */,
{ 48, 12, 154, 155, 139, 90, 34, 117, 119 } /* y = v */,
{ 67, 6, 25, 204, 243, 158, 13, 21, 96 } /* y = h */,
@@ -45,7 +45,7 @@ static const vp9_prob default_if_uv_probs[VP9_INTRA_MODES]
{ 83, 5, 42, 156, 111, 152, 26, 49, 152 } /* y = d135 */,
{ 80, 5, 58, 178, 74, 83, 33, 62, 145 } /* y = d117 */,
{ 86, 5, 32, 154, 192, 168, 14, 22, 163 } /* y = d153 */,
{ 85, 5, 32, 156, 216, 148, 19, 29, 73 } /* y = d27 */,
{ 85, 5, 32, 156, 216, 148, 19, 29, 73 } /* y = d207 */,
{ 77, 7, 64, 116, 132, 122, 37, 126, 120 } /* y = d63 */,
{ 101, 21, 107, 181, 192, 103, 19, 67, 125 } /* y = tm */
};
@@ -98,9 +98,9 @@ static const vp9_prob default_partition_probs[NUM_FRAME_TYPES]
}
};
const vp9_prob vp9_kf_y_mode_prob[VP9_INTRA_MODES]
[VP9_INTRA_MODES]
[VP9_INTRA_MODES - 1] = {
const vp9_prob vp9_kf_y_mode_prob[INTRA_MODES]
[INTRA_MODES]
[INTRA_MODES - 1] = {
{ /* above = dc */
{ 137, 30, 42, 148, 151, 207, 70, 52, 91 } /* left = dc */,
{ 92, 45, 102, 136, 116, 180, 74, 90, 100 } /* left = v */,
@@ -109,7 +109,7 @@ const vp9_prob vp9_kf_y_mode_prob[VP9_INTRA_MODES]
{ 72, 35, 36, 149, 68, 206, 68, 63, 105 } /* left = d135 */,
{ 73, 31, 28, 138, 57, 124, 55, 122, 151 } /* left = d117 */,
{ 67, 23, 21, 140, 126, 197, 40, 37, 171 } /* left = d153 */,
{ 86, 27, 28, 128, 154, 212, 45, 43, 53 } /* left = d27 */,
{ 86, 27, 28, 128, 154, 212, 45, 43, 53 } /* left = d207 */,
{ 74, 32, 27, 107, 86, 160, 63, 134, 102 } /* left = d63 */,
{ 59, 67, 44, 140, 161, 202, 78, 67, 119 } /* left = tm */
}, { /* above = v */
@@ -120,7 +120,7 @@ const vp9_prob vp9_kf_y_mode_prob[VP9_INTRA_MODES]
{ 46, 41, 76, 140, 63, 184, 69, 112, 57 } /* left = d135 */,
{ 38, 32, 85, 140, 46, 112, 54, 151, 133 } /* left = d117 */,
{ 39, 27, 61, 131, 110, 175, 44, 75, 136 } /* left = d153 */,
{ 52, 30, 74, 113, 130, 175, 51, 64, 58 } /* left = d27 */,
{ 52, 30, 74, 113, 130, 175, 51, 64, 58 } /* left = d207 */,
{ 47, 35, 80, 100, 74, 143, 64, 163, 74 } /* left = d63 */,
{ 36, 61, 116, 114, 128, 162, 80, 125, 82 } /* left = tm */
}, { /* above = h */
@@ -131,7 +131,7 @@ const vp9_prob vp9_kf_y_mode_prob[VP9_INTRA_MODES]
{ 58, 50, 25, 139, 115, 232, 39, 52, 118 } /* left = d135 */,
{ 50, 35, 33, 153, 104, 162, 64, 59, 131 } /* left = d117 */,
{ 44, 24, 16, 150, 177, 202, 33, 19, 156 } /* left = d153 */,
{ 55, 27, 12, 153, 203, 218, 26, 27, 49 } /* left = d27 */,
{ 55, 27, 12, 153, 203, 218, 26, 27, 49 } /* left = d207 */,
{ 53, 49, 21, 110, 116, 168, 59, 80, 76 } /* left = d63 */,
{ 38, 72, 19, 168, 203, 212, 50, 50, 107 } /* left = tm */
}, { /* above = d45 */
@@ -142,7 +142,7 @@ const vp9_prob vp9_kf_y_mode_prob[VP9_INTRA_MODES]
{ 60, 32, 33, 112, 71, 220, 64, 89, 104 } /* left = d135 */,
{ 53, 26, 34, 130, 56, 149, 84, 120, 103 } /* left = d117 */,
{ 53, 21, 23, 133, 109, 210, 56, 77, 172 } /* left = d153 */,
{ 77, 19, 29, 112, 142, 228, 55, 66, 36 } /* left = d27 */,
{ 77, 19, 29, 112, 142, 228, 55, 66, 36 } /* left = d207 */,
{ 61, 29, 29, 93, 97, 165, 83, 175, 162 } /* left = d63 */,
{ 47, 47, 43, 114, 137, 181, 100, 99, 95 } /* left = tm */
}, { /* above = d135 */
@@ -153,7 +153,7 @@ const vp9_prob vp9_kf_y_mode_prob[VP9_INTRA_MODES]
{ 52, 31, 22, 158, 40, 209, 58, 62, 89 } /* left = d135 */,
{ 44, 31, 29, 147, 46, 158, 56, 102, 198 } /* left = d117 */,
{ 35, 19, 12, 135, 87, 209, 41, 45, 167 } /* left = d153 */,
{ 55, 25, 21, 118, 95, 215, 38, 39, 66 } /* left = d27 */,
{ 55, 25, 21, 118, 95, 215, 38, 39, 66 } /* left = d207 */,
{ 51, 38, 25, 113, 58, 164, 70, 93, 97 } /* left = d63 */,
{ 47, 54, 34, 146, 108, 203, 72, 103, 151 } /* left = tm */
}, { /* above = d117 */
@@ -164,7 +164,7 @@ const vp9_prob vp9_kf_y_mode_prob[VP9_INTRA_MODES]
{ 40, 26, 35, 154, 40, 185, 51, 97, 123 } /* left = d135 */,
{ 35, 19, 34, 179, 19, 97, 48, 129, 124 } /* left = d117 */,
{ 36, 20, 26, 136, 62, 164, 33, 77, 154 } /* left = d153 */,
{ 45, 18, 32, 130, 90, 157, 40, 79, 91 } /* left = d27 */,
{ 45, 18, 32, 130, 90, 157, 40, 79, 91 } /* left = d207 */,
{ 45, 26, 28, 129, 45, 129, 49, 147, 123 } /* left = d63 */,
{ 38, 44, 51, 136, 74, 162, 57, 97, 121 } /* left = tm */
}, { /* above = d153 */
@@ -175,10 +175,10 @@ const vp9_prob vp9_kf_y_mode_prob[VP9_INTRA_MODES]
{ 47, 29, 17, 153, 64, 220, 59, 51, 114 } /* left = d135 */,
{ 46, 16, 24, 136, 76, 147, 41, 64, 172 } /* left = d117 */,
{ 34, 17, 11, 108, 152, 187, 13, 15, 209 } /* left = d153 */,
{ 51, 24, 14, 115, 133, 209, 32, 26, 104 } /* left = d27 */,
{ 51, 24, 14, 115, 133, 209, 32, 26, 104 } /* left = d207 */,
{ 55, 30, 18, 122, 79, 179, 44, 88, 116 } /* left = d63 */,
{ 37, 49, 25, 129, 168, 164, 41, 54, 148 } /* left = tm */
}, { /* above = d27 */
}, { /* above = d207 */
{ 82, 22, 32, 127, 143, 213, 39, 41, 70 } /* left = dc */,
{ 62, 44, 61, 123, 105, 189, 48, 57, 64 } /* left = v */,
{ 47, 25, 17, 175, 222, 220, 24, 30, 86 } /* left = h */,
@@ -186,7 +186,7 @@ const vp9_prob vp9_kf_y_mode_prob[VP9_INTRA_MODES]
{ 57, 39, 23, 151, 68, 216, 55, 63, 58 } /* left = d135 */,
{ 49, 30, 35, 141, 70, 168, 82, 40, 115 } /* left = d117 */,
{ 51, 25, 15, 136, 129, 202, 38, 35, 139 } /* left = d153 */,
{ 68, 26, 16, 111, 141, 215, 29, 28, 28 } /* left = d27 */,
{ 68, 26, 16, 111, 141, 215, 29, 28, 28 } /* left = d207 */,
{ 59, 39, 19, 114, 75, 180, 77, 104, 42 } /* left = d63 */,
{ 40, 61, 26, 126, 152, 206, 61, 59, 93 } /* left = tm */
}, { /* above = d63 */
@@ -197,7 +197,7 @@ const vp9_prob vp9_kf_y_mode_prob[VP9_INTRA_MODES]
{ 48, 31, 27, 114, 63, 183, 82, 116, 56 } /* left = d135 */,
{ 43, 28, 37, 121, 63, 123, 61, 192, 169 } /* left = d117 */,
{ 42, 17, 24, 109, 97, 177, 56, 76, 122 } /* left = d153 */,
{ 58, 18, 28, 105, 139, 182, 70, 92, 63 } /* left = d27 */,
{ 58, 18, 28, 105, 139, 182, 70, 92, 63 } /* left = d207 */,
{ 46, 23, 32, 74, 86, 150, 67, 183, 88 } /* left = d63 */,
{ 36, 38, 48, 92, 122, 165, 88, 137, 91 } /* left = tm */
}, { /* above = tm */
@@ -208,14 +208,14 @@ const vp9_prob vp9_kf_y_mode_prob[VP9_INTRA_MODES]
{ 49, 50, 35, 144, 95, 205, 63, 78, 59 } /* left = d135 */,
{ 41, 53, 52, 148, 71, 142, 65, 128, 51 } /* left = d117 */,
{ 40, 36, 28, 143, 143, 202, 40, 55, 137 } /* left = d153 */,
{ 52, 34, 29, 129, 183, 227, 42, 35, 43 } /* left = d27 */,
{ 52, 34, 29, 129, 183, 227, 42, 35, 43 } /* left = d207 */,
{ 42, 44, 44, 104, 105, 164, 64, 130, 80 } /* left = d63 */,
{ 43, 81, 53, 140, 169, 204, 68, 84, 72 } /* left = tm */
}
};
static const vp9_prob default_inter_mode_probs[INTER_MODE_CONTEXTS]
[VP9_INTER_MODES - 1] = {
[INTER_MODES - 1] = {
{2, 173, 34}, // 0 = both zero mv
{7, 145, 85}, // 1 = one zero mv + one a predicted mv
{7, 166, 63}, // 2 = two predicted mvs
@@ -226,7 +226,7 @@ static const vp9_prob default_inter_mode_probs[INTER_MODE_CONTEXTS]
};
/* Array indices are identical to previously-existing INTRAMODECONTEXTNODES. */
const vp9_tree_index vp9_intra_mode_tree[VP9_INTRA_MODES * 2 - 2] = {
const vp9_tree_index vp9_intra_mode_tree[INTRA_MODES * 2 - 2] = {
-DC_PRED, 2, /* 0 = DC_NODE */
-TM_PRED, 4, /* 1 = TM_NODE */
-V_PRED, 6, /* 2 = V_NODE */
@@ -235,7 +235,7 @@ const vp9_tree_index vp9_intra_mode_tree[VP9_INTRA_MODES * 2 - 2] = {
-D135_PRED, -D117_PRED, /* 5 = D135_NODE */
-D45_PRED, 14, /* 6 = D45_NODE */
-D63_PRED, 16, /* 7 = D63_NODE */
-D153_PRED, -D27_PRED /* 8 = D153_NODE */
-D153_PRED, -D207_PRED /* 8 = D153_NODE */
};
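A vp9_tree_index array encodes a binary tree two entries per node: a decoded 0 bit selects tree[i], a 1 bit selects tree[i + 1], and a negative entry is a leaf holding the negated symbol. A hypothetical walk over such a tree (read_bit() stands in for the boolean arithmetic decoder):

#include <stdint.h>

typedef int8_t vp9_tree_index;

/* Sketch: decode one symbol from a tree such as vp9_intra_mode_tree. */
static int read_tree_sketch(const vp9_tree_index *tree,
                            int (*read_bit)(void)) {
  vp9_tree_index i = 0;
  while ((i = tree[i + read_bit()]) > 0)
    continue;
  return -i;  /* leaf: the negated symbol, e.g. DC_PRED */
}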
const vp9_tree_index vp9_inter_mode_tree[6] = {
@@ -250,8 +250,8 @@ const vp9_tree_index vp9_partition_tree[6] = {
-PARTITION_VERT, -PARTITION_SPLIT
};
struct vp9_token vp9_intra_mode_encodings[VP9_INTRA_MODES];
struct vp9_token vp9_inter_mode_encodings[VP9_INTER_MODES];
struct vp9_token vp9_intra_mode_encodings[INTRA_MODES];
struct vp9_token vp9_inter_mode_encodings[INTER_MODES];
struct vp9_token vp9_partition_encodings[PARTITION_TYPES];
@@ -317,8 +317,8 @@ static const vp9_prob default_mbskip_probs[MBSKIP_CONTEXTS] = {
192, 128, 64
};
static const vp9_prob default_switchable_interp_prob[VP9_SWITCHABLE_FILTERS+1]
[VP9_SWITCHABLE_FILTERS-1] = {
static const vp9_prob default_switchable_interp_prob[SWITCHABLE_FILTERS+1]
[SWITCHABLE_FILTERS-1] = {
{ 235, 162, },
{ 36, 255, },
{ 34, 3, },
@@ -338,11 +338,11 @@ void vp9_init_mbmode_probs(VP9_COMMON *cm) {
vp9_copy(cm->fc.mbskip_probs, default_mbskip_probs);
}
const vp9_tree_index vp9_switchable_interp_tree[VP9_SWITCHABLE_FILTERS*2-2] = {
const vp9_tree_index vp9_switchable_interp_tree[SWITCHABLE_FILTERS*2-2] = {
-EIGHTTAP, 2,
-EIGHTTAP_SMOOTH, -EIGHTTAP_SHARP
};
struct vp9_token vp9_switchable_interp_encodings[VP9_SWITCHABLE_FILTERS];
struct vp9_token vp9_switchable_interp_encodings[SWITCHABLE_FILTERS];
void vp9_entropy_mode_init() {
vp9_tokens_from_tree(vp9_intra_mode_encodings, vp9_intra_mode_tree);
@@ -400,17 +400,17 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) {
counts->single_ref[i][j]);
for (i = 0; i < INTER_MODE_CONTEXTS; i++)
update_mode_probs(VP9_INTER_MODES, vp9_inter_mode_tree,
update_mode_probs(INTER_MODES, vp9_inter_mode_tree,
counts->inter_mode[i], pre_fc->inter_mode_probs[i],
fc->inter_mode_probs[i], NEARESTMV);
for (i = 0; i < BLOCK_SIZE_GROUPS; i++)
update_mode_probs(VP9_INTRA_MODES, vp9_intra_mode_tree,
update_mode_probs(INTRA_MODES, vp9_intra_mode_tree,
counts->y_mode[i], pre_fc->y_mode_prob[i],
fc->y_mode_prob[i], 0);
for (i = 0; i < VP9_INTRA_MODES; ++i)
update_mode_probs(VP9_INTRA_MODES, vp9_intra_mode_tree,
for (i = 0; i < INTRA_MODES; ++i)
update_mode_probs(INTRA_MODES, vp9_intra_mode_tree,
counts->uv_mode[i], pre_fc->uv_mode_prob[i],
fc->uv_mode_prob[i], 0);
@@ -421,8 +421,8 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) {
fc->partition_prob[INTER_FRAME][i], 0);
if (cm->mcomp_filter_type == SWITCHABLE) {
for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++)
update_mode_probs(VP9_SWITCHABLE_FILTERS, vp9_switchable_interp_tree,
for (i = 0; i <= SWITCHABLE_FILTERS; i++)
update_mode_probs(SWITCHABLE_FILTERS, vp9_switchable_interp_tree,
counts->switchable_interp[i],
pre_fc->switchable_interp_prob[i],
fc->switchable_interp_prob[i], 0);
@@ -440,14 +440,12 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) {
fc->tx_probs.p8x8[i][j] = update_ct2(pre_fc->tx_probs.p8x8[i][j],
branch_ct_8x8p[j]);
tx_counts_to_branch_counts_16x16(counts->tx.p16x16[i],
branch_ct_16x16p);
tx_counts_to_branch_counts_16x16(counts->tx.p16x16[i], branch_ct_16x16p);
for (j = 0; j < TX_SIZES - 2; ++j)
fc->tx_probs.p16x16[i][j] = update_ct2(pre_fc->tx_probs.p16x16[i][j],
branch_ct_16x16p[j]);
tx_counts_to_branch_counts_32x32(counts->tx.p32x32[i],
branch_ct_32x32p);
tx_counts_to_branch_counts_32x32(counts->tx.p32x32[i], branch_ct_32x32p);
for (j = 0; j < TX_SIZES - 1; ++j)
fc->tx_probs.p32x32[i][j] = update_ct2(pre_fc->tx_probs.p32x32[i][j],
branch_ct_32x32p[j]);
@@ -472,14 +470,14 @@ static void set_default_lf_deltas(struct loopfilter *lf) {
lf->mode_deltas[1] = 0;
}
void vp9_setup_past_independence(VP9_COMMON *cm, MACROBLOCKD *xd) {
void vp9_setup_past_independence(VP9_COMMON *cm) {
// Reset the segment feature data to the default stats:
// Features disabled, 0, with delta coding (Default state).
struct loopfilter *const lf = &xd->lf;
struct loopfilter *const lf = &cm->lf;
int i;
vp9_clearall_segfeatures(&xd->seg);
xd->seg.abs_delta = SEGMENT_DELTADATA;
vp9_clearall_segfeatures(&cm->seg);
cm->seg.abs_delta = SEGMENT_DELTADATA;
if (cm->last_frame_seg_map)
vpx_memset(cm->last_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols));
@@ -512,10 +510,7 @@ void vp9_setup_past_independence(VP9_COMMON *cm, MACROBLOCKD *xd) {
cm->mode_info_stride * (cm->mi_rows + 1) * sizeof(MODE_INFO));
vp9_update_mode_info_border(cm, cm->mip);
vp9_update_mode_info_in_image(cm, cm->mi);
vp9_update_mode_info_border(cm, cm->prev_mip);
vp9_update_mode_info_in_image(cm, cm->prev_mi);
vp9_zero(cm->ref_frame_sign_bias);

View File

@@ -16,8 +16,8 @@
#define SUBMVREF_COUNT 5
#define TX_SIZE_CONTEXTS 2
#define VP9_MODE_UPDATE_PROB 252
#define VP9_SWITCHABLE_FILTERS 3 // number of switchable filters
#define MODE_UPDATE_PROB 252
#define SWITCHABLE_FILTERS 3 // number of switchable filters
// #define MODE_STATS
@@ -35,32 +35,32 @@ struct tx_counts {
unsigned int p8x8[TX_SIZE_CONTEXTS][TX_SIZES - 2];
};
extern const vp9_prob vp9_kf_uv_mode_prob[VP9_INTRA_MODES][VP9_INTRA_MODES - 1];
extern const vp9_prob vp9_kf_y_mode_prob[VP9_INTRA_MODES][VP9_INTRA_MODES]
[VP9_INTRA_MODES - 1];
extern const vp9_prob vp9_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
extern const vp9_prob vp9_kf_y_mode_prob[INTRA_MODES][INTRA_MODES]
[INTRA_MODES - 1];
extern const vp9_tree_index vp9_intra_mode_tree[];
extern const vp9_tree_index vp9_inter_mode_tree[];
extern struct vp9_token vp9_intra_mode_encodings[VP9_INTRA_MODES];
extern struct vp9_token vp9_inter_mode_encodings[VP9_INTER_MODES];
extern struct vp9_token vp9_intra_mode_encodings[INTRA_MODES];
extern struct vp9_token vp9_inter_mode_encodings[INTER_MODES];
// probability models for partition information
extern const vp9_tree_index vp9_partition_tree[];
extern struct vp9_token vp9_partition_encodings[PARTITION_TYPES];
extern const vp9_tree_index vp9_switchable_interp_tree
[2 * (VP9_SWITCHABLE_FILTERS - 1)];
[2 * (SWITCHABLE_FILTERS - 1)];
extern struct vp9_token vp9_switchable_interp_encodings[VP9_SWITCHABLE_FILTERS];
extern struct vp9_token vp9_switchable_interp_encodings[SWITCHABLE_FILTERS];
void vp9_entropy_mode_init();
void vp9_setup_past_independence(struct VP9Common *cm, MACROBLOCKD *xd);
void vp9_setup_past_independence(struct VP9Common *cm);
void vp9_init_mbmode_probs(struct VP9Common *x);
void vp9_init_mbmode_probs(struct VP9Common *cm);
void vp9_adapt_mode_probs(struct VP9Common *);
void vp9_adapt_mode_probs(struct VP9Common *cm);
void tx_counts_to_branch_counts_32x32(unsigned int *tx_count_32x32p,
unsigned int (*ct_32x32p)[2]);

View File

@@ -79,20 +79,59 @@ static const nmv_context default_nmv_context = {
#define mv_class_base(c) ((c) ? (CLASS0_SIZE << (c + 2)) : 0)
static const uint8_t log_in_base_2[] = {
0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10
};
MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset) {
MV_CLASS_TYPE c = MV_CLASS_0;
if (z < CLASS0_SIZE * 8) c = MV_CLASS_0;
else if (z < CLASS0_SIZE * 16) c = MV_CLASS_1;
else if (z < CLASS0_SIZE * 32) c = MV_CLASS_2;
else if (z < CLASS0_SIZE * 64) c = MV_CLASS_3;
else if (z < CLASS0_SIZE * 128) c = MV_CLASS_4;
else if (z < CLASS0_SIZE * 256) c = MV_CLASS_5;
else if (z < CLASS0_SIZE * 512) c = MV_CLASS_6;
else if (z < CLASS0_SIZE * 1024) c = MV_CLASS_7;
else if (z < CLASS0_SIZE * 2048) c = MV_CLASS_8;
else if (z < CLASS0_SIZE * 4096) c = MV_CLASS_9;
else if (z < CLASS0_SIZE * 8192) c = MV_CLASS_10;
else assert(0);
if (z >= CLASS0_SIZE * 4096)
c = MV_CLASS_10;
else
c = log_in_base_2[z >> 3];
if (offset)
*offset = z - mv_class_base(c);
return c;
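The table simply replaces the cascaded comparisons: class c (for c >= 1) covers magnitudes z in [16 << (c - 1), 16 << c), so c is floor(log2(z >> 3)), clamped to MV_CLASS_10 for z >= CLASS0_SIZE * 4096. For example, z = 100 gives 100 >> 3 = 12, log_in_base_2[12] = 3, hence MV_CLASS_3 with base 2 << 5 = 64 and offset 100 - 64 = 36.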
@@ -110,8 +149,6 @@ int vp9_get_mv_mag(MV_CLASS_TYPE c, int offset) {
static void inc_mv_component(int v, nmv_component_counts *comp_counts,
int incr, int usehp) {
int s, z, c, o, d, e, f;
if (!incr)
return;
assert(v != 0); /* should not be zero */
s = v < 0;
comp_counts->sign[s] += incr;
@@ -123,61 +160,39 @@ static void inc_mv_component(int v, nmv_component_counts *comp_counts,
d = (o >> 3); /* int mv data */
f = (o >> 1) & 3; /* fractional pel mv data */
e = (o & 1); /* high precision mv data */
if (c == MV_CLASS_0) {
comp_counts->class0[d] += incr;
comp_counts->class0_fp[d][f] += incr;
comp_counts->class0_hp[e] += usehp * incr;
} else {
int i;
int b = c + CLASS0_BITS - 1; // number of bits
for (i = 0; i < b; ++i)
comp_counts->bits[i][((d >> i) & 1)] += incr;
}
/* Code the fractional pel bits */
if (c == MV_CLASS_0) {
comp_counts->class0_fp[d][f] += incr;
} else {
comp_counts->fp[f] += incr;
}
/* Code the high precision bit */
if (usehp) {
if (c == MV_CLASS_0) {
comp_counts->class0_hp[e] += incr;
} else {
comp_counts->hp[e] += incr;
}
comp_counts->hp[e] += usehp * incr;
}
}
static void counts_to_context(nmv_component_counts *mvcomp, int usehp) {
int v;
vpx_memset(mvcomp->sign, 0, sizeof(nmv_component_counts) - sizeof(mvcomp->mvcount));
for (v = 1; v <= MV_MAX; v++) {
inc_mv_component(-v, mvcomp, mvcomp->mvcount[MV_MAX - v], usehp);
inc_mv_component( v, mvcomp, mvcomp->mvcount[MV_MAX + v], usehp);
}
}
void vp9_inc_mv(const MV *mv, nmv_context_counts *counts) {
const MV_JOINT_TYPE j = vp9_get_mv_joint(mv);
++counts->joints[j];
if (mv_joint_vertical(j))
++counts->comps[0].mvcount[MV_MAX + mv->row];
if (mv_joint_vertical(j)) {
inc_mv_component(mv->row, &counts->comps[0], 1, 1);
}
if (mv_joint_horizontal(j))
++counts->comps[1].mvcount[MV_MAX + mv->col];
if (mv_joint_horizontal(j)) {
inc_mv_component(mv->col, &counts->comps[1], 1, 1);
}
}
static vp9_prob adapt_prob(vp9_prob prep, const unsigned int ct[2]) {
return merge_probs2(prep, ct, MV_COUNT_SAT, MV_MAX_UPDATE_FACTOR);
}
void vp9_counts_process(nmv_context_counts *nmv_count, int usehp) {
counts_to_context(&nmv_count->comps[0], usehp);
counts_to_context(&nmv_count->comps[1], usehp);
}
static unsigned int adapt_probs(unsigned int i,
vp9_tree tree,
vp9_prob this_probs[],
@@ -207,8 +222,6 @@ void vp9_adapt_mv_probs(VP9_COMMON *cm, int allow_hp) {
nmv_context *pre_ctx = &pre_fc->nmvc;
nmv_context_counts *cts = &cm->counts.mv;
vp9_counts_process(cts, allow_hp);
adapt_probs(0, vp9_mv_joint_tree, ctx->joints, pre_ctx->joints, cts->joints);
for (i = 0; i < 2; ++i) {

View File

@@ -24,7 +24,7 @@ void vp9_init_mv_probs(struct VP9Common *cm);
void vp9_adapt_mv_probs(struct VP9Common *cm, int usehp);
int vp9_use_mv_hp(const MV *ref);
#define VP9_NMV_UPDATE_PROB 252
#define NMV_UPDATE_PROB 252
/* Symbols for coding which components are zero jointly */
#define MV_JOINTS 4
@@ -126,6 +126,4 @@ typedef struct {
void vp9_inc_mv(const MV *mv, nmv_context_counts *mvctx);
void vp9_counts_process(nmv_context_counts *NMVcount, int usehp);
#endif // VP9_COMMON_VP9_ENTROPYMV_H_

View File

@@ -13,15 +13,16 @@
#include "./vpx_config.h"
#define LOG2_MI_SIZE 3
#define LOG2_MI_BLOCK_SIZE (6 - LOG2_MI_SIZE) // 64 = 2^6
#define MI_SIZE_LOG2 3
#define MI_BLOCK_SIZE_LOG2 (6 - MI_SIZE_LOG2) // 64 = 2^6
#define MI_SIZE (1 << LOG2_MI_SIZE) // pixels per mi-unit
#define MI_BLOCK_SIZE (1 << LOG2_MI_BLOCK_SIZE) // mi-units per max block
#define MI_SIZE (1 << MI_SIZE_LOG2) // pixels per mi-unit
#define MI_BLOCK_SIZE (1 << MI_BLOCK_SIZE_LOG2) // mi-units per max block
#define MI_MASK (MI_BLOCK_SIZE - 1)
typedef enum BLOCK_SIZE_TYPE {
typedef enum BLOCK_SIZE {
BLOCK_4X4,
BLOCK_4X8,
BLOCK_8X4,
@@ -35,15 +36,17 @@ typedef enum BLOCK_SIZE_TYPE {
BLOCK_32X64,
BLOCK_64X32,
BLOCK_64X64,
BLOCK_SIZE_TYPES
} BLOCK_SIZE_TYPE;
BLOCK_SIZES,
BLOCK_INVALID = BLOCK_SIZES
} BLOCK_SIZE;
typedef enum PARTITION_TYPE {
PARTITION_NONE,
PARTITION_HORZ,
PARTITION_VERT,
PARTITION_SPLIT,
PARTITION_TYPES, PARTITION_INVALID = PARTITION_TYPES
PARTITION_TYPES,
PARTITION_INVALID = PARTITION_TYPES
} PARTITION_TYPE;
#define PARTITION_PLOFFSET 4 // number of probability models per block size
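In these units, one mi-unit is MI_SIZE = 8 pixels and a superblock is MI_BLOCK_SIZE = 8 mi-units (64 pixels), so converting a pixel coordinate to mi-units is a right shift by MI_SIZE_LOG2. For example, a 1920x1080 frame spans 1920 >> 3 = 240 mi columns and ceil(1080 / 8) = 135 mi rows.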

View File

@@ -57,15 +57,23 @@ static void copy_and_extend_plane(const uint8_t *src, int src_pitch,
void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src,
YV12_BUFFER_CONFIG *dst) {
const int et_y = dst->border;
const int el_y = dst->border;
const int eb_y = dst->border + dst->y_height - src->y_height;
const int er_y = dst->border + dst->y_width - src->y_width;
const int et_uv = dst->border >> (dst->uv_height != dst->y_height);
const int el_uv = dst->border >> (dst->uv_width != dst->y_width);
const int eb_uv = et_uv + dst->uv_height - src->uv_height;
const int er_uv = el_uv + dst->uv_width - src->uv_width;
// Extend src frame in buffer
// Altref filtering assumes 16 pixel extension
const int et_y = 16;
const int el_y = 16;
// Motion estimation may use src block variance with the block size up
// to 64x64, so the right and bottom need to be extended to a multiple
// of 64, or by at least 16 pixels, whichever is greater.
const int eb_y = MAX(ALIGN_POWER_OF_TWO(src->y_height, 6) - src->y_height,
16);
const int er_y = MAX(ALIGN_POWER_OF_TWO(src->y_width, 6) - src->y_width,
16);
const int uv_width_subsampling = (src->uv_width != src->y_width);
const int uv_height_subsampling = (src->uv_height != src->y_height);
const int et_uv = et_y >> uv_height_subsampling;
const int el_uv = el_y >> uv_width_subsampling;
const int eb_uv = eb_y >> uv_height_subsampling;
const int er_uv = er_y >> uv_width_subsampling;
#if CONFIG_ALPHA
const int et_a = dst->border >> (dst->alpha_height != dst->y_height);

View File

@@ -8,14 +8,12 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include <stdlib.h>
#include "vp9/common/vp9_filter.h"
#include "vpx_ports/mem.h"
#include "vp9_rtcd.h"
#include "vp9/common/vp9_common.h"
DECLARE_ALIGNED(256, const int16_t, vp9_bilinear_filters[SUBPEL_SHIFTS][8]) = {
#include "vp9/common/vp9_filter.h"
DECLARE_ALIGNED(256, const int16_t,
vp9_bilinear_filters[SUBPEL_SHIFTS][SUBPEL_TAPS]) = {
{ 0, 0, 0, 128, 0, 0, 0, 0 },
{ 0, 0, 0, 120, 8, 0, 0, 0 },
{ 0, 0, 0, 112, 16, 0, 0, 0 },
@@ -34,8 +32,9 @@ DECLARE_ALIGNED(256, const int16_t, vp9_bilinear_filters[SUBPEL_SHIFTS][8]) = {
{ 0, 0, 0, 8, 120, 0, 0, 0 }
};
DECLARE_ALIGNED(256, const int16_t, vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8]) = {
/* Lagrangian interpolation filter */
// Lagrangian interpolation filter
DECLARE_ALIGNED(256, const int16_t,
vp9_sub_pel_filters_8[SUBPEL_SHIFTS][SUBPEL_TAPS]) = {
{ 0, 0, 0, 128, 0, 0, 0, 0},
{ 0, 1, -5, 126, 8, -3, 1, 0},
{ -1, 3, -10, 122, 18, -6, 2, 0},
@@ -54,9 +53,9 @@ DECLARE_ALIGNED(256, const int16_t, vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8]) = {
{ 0, 1, -3, 8, 126, -5, 1, 0}
};
DECLARE_ALIGNED(256, const int16_t, vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8])
= {
/* dct based filter */
// DCT based filter
DECLARE_ALIGNED(256, const int16_t,
vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][SUBPEL_TAPS]) = {
{0, 0, 0, 128, 0, 0, 0, 0},
{-1, 3, -7, 127, 8, -3, 1, 0},
{-2, 5, -13, 125, 17, -6, 3, -1},
@@ -75,9 +74,9 @@ DECLARE_ALIGNED(256, const int16_t, vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8])
{0, 1, -3, 8, 127, -7, 3, -1}
};
// freqmultiplier = 0.5
DECLARE_ALIGNED(256, const int16_t,
vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS][8]) = {
/* freqmultiplier = 0.5 */
vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS][SUBPEL_TAPS]) = {
{ 0, 0, 0, 128, 0, 0, 0, 0},
{-3, -1, 32, 64, 38, 1, -3, 0},
{-2, -2, 29, 63, 41, 2, -3, 0},

View File

@@ -12,26 +12,22 @@
#define VP9_COMMON_VP9_FILTER_H_
#include "vpx_config.h"
#include "vpx_scale/yv12config.h"
#include "vpx/vpx_integer.h"
#define BLOCK_HEIGHT_WIDTH 4
#define VP9_FILTER_WEIGHT 128
#define VP9_FILTER_SHIFT 7
#define SUBPEL_BITS 4
#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
#define SUBPEL_SHIFTS (1 << SUBPEL_BITS)
#define SUBPEL_TAPS 8
#define SUBPEL_SHIFTS 16
extern const int16_t vp9_bilinear_filters[SUBPEL_SHIFTS][8];
extern const int16_t vp9_sub_pel_filters_6[SUBPEL_SHIFTS][8];
extern const int16_t vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8];
extern const int16_t vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8];
extern const int16_t vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS][8];
extern const int16_t vp9_bilinear_filters[SUBPEL_SHIFTS][SUBPEL_TAPS];
extern const int16_t vp9_sub_pel_filters_6[SUBPEL_SHIFTS][SUBPEL_TAPS];
extern const int16_t vp9_sub_pel_filters_8[SUBPEL_SHIFTS][SUBPEL_TAPS];
extern const int16_t vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][SUBPEL_TAPS];
extern const int16_t vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS][SUBPEL_TAPS];
// The VP9_BILINEAR_FILTERS_2TAP macro returns a pointer to the bilinear
// filter kernel as a 2 tap filter.
#define BF_LENGTH (sizeof(vp9_bilinear_filters[0]) / \
sizeof(vp9_bilinear_filters[0][0]))
#define BF_OFFSET (BF_LENGTH / 2 - 1)
#define VP9_BILINEAR_FILTERS_2TAP(x) (vp9_bilinear_filters[x] + BF_OFFSET)
#define BILINEAR_FILTERS_2TAP(x) \
(vp9_bilinear_filters[(x)] + SUBPEL_TAPS/2 - 1)
#endif // VP9_COMMON_VP9_FILTER_H_
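Every bilinear kernel row keeps its two nonzero taps at positions 3 and 4, so BILINEAR_FILTERS_2TAP(x) just offsets the row pointer by SUBPEL_TAPS/2 - 1 = 3 to expose them as a 2-tap filter; for phase x the pair is {128 - 8x, 8x}. A small sketch assuming this header is available (the helper name is hypothetical):

#include "vp9/common/vp9_filter.h"

/* Phase 5 kernel is { 0, 0, 0, 88, 40, 0, 0, 0 }; the 2-tap view skips the
 * leading zeros, and the two weights always sum to 128. */
static int bilinear_2tap_sample(const uint8_t *src, int phase) {
  const int16_t *f = BILINEAR_FILTERS_2TAP(phase);
  return (src[0] * f[0] + src[1] * f[1] + 64) >> 7;  /* round, then >> 7 */
}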

View File

@@ -8,11 +8,8 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include <limits.h>
#include "vp9/common/vp9_findnearmv.h"
#include "vp9/common/vp9_mvref_common.h"
#include "vp9/common/vp9_sadmxn.h"
static void lower_mv_precision(MV *mv, int allow_hp) {
const int use_hp = allow_hp && vp9_use_mv_hp(mv);
@@ -46,17 +43,14 @@ void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
int mi_row, int mi_col) {
int_mv dst_list[MAX_MV_REF_CANDIDATES];
int_mv mv_list[MAX_MV_REF_CANDIDATES];
MODE_INFO *mi = xd->mode_info_context;
MB_MODE_INFO *const mbmi = &mi->mbmi;
MODE_INFO *const mi = xd->this_mi;
assert(ref_idx == 0 || ref_idx == 1);
assert(MAX_MV_REF_CANDIDATES == 2); // makes code here slightly easier
vp9_find_mv_refs_idx(cm, xd, xd->mode_info_context,
xd->prev_mode_info_context,
mbmi->ref_frame[ref_idx],
mv_list, cm->ref_frame_sign_bias, block_idx,
mi_row, mi_col);
vp9_find_mv_refs_idx(cm, xd, mi, xd->last_mi,
mi->mbmi.ref_frame[ref_idx],
mv_list, block_idx, mi_row, mi_col);
dst_list[1].as_int = 0;
if (block_idx == 0) {

View File

@@ -36,48 +36,57 @@ static void clamp_mv2(MV *mv, const MACROBLOCKD *xd) {
xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
}
void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *pc,
void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm,
MACROBLOCKD *xd,
int_mv *dst_nearest,
int_mv *dst_near,
int block_idx, int ref_idx,
int mi_row, int mi_col);
static MB_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, int b) {
static MB_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb,
const MODE_INFO *left_mb, int b) {
// FIXME(rbultje, jingning): temporary hack because jenkins doesn't
// understand this condition. This will go away soon.
const MODE_INFO *mi = cur_mb;
if (b == 0 || b == 2) {
/* On L edge, get from MB to left of us */
--cur_mb;
if (is_inter_block(&cur_mb->mbmi)) {
mi = left_mb;
if (!mi)
return DC_PRED;
} else if (cur_mb->mbmi.sb_type < BLOCK_8X8) {
return (cur_mb->bmi + 1 + b)->as_mode;
if (mi->mbmi.ref_frame[0] != INTRA_FRAME) {
return DC_PRED;
} else if (mi->mbmi.sb_type < BLOCK_8X8) {
return ((mi->bmi + 1 + b)->as_mode);
} else {
return cur_mb->mbmi.mode;
return mi->mbmi.mode;
}
}
assert(b == 1 || b == 3);
return (cur_mb->bmi + b - 1)->as_mode;
return (mi->bmi + b - 1)->as_mode;
}
static MB_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb,
int b, int mi_stride) {
const MODE_INFO *above_mb, int b) {
const MODE_INFO *mi = cur_mb;
if (!(b >> 1)) {
/* On top edge, get from MB above us */
cur_mb -= mi_stride;
if (is_inter_block(&cur_mb->mbmi)) {
mi = above_mb;
if (!mi)
return DC_PRED;
} else if (cur_mb->mbmi.sb_type < BLOCK_8X8) {
return (cur_mb->bmi + 2 + b)->as_mode;
if (mi->mbmi.ref_frame[0] != INTRA_FRAME) {
return DC_PRED;
} else if (mi->mbmi.sb_type < BLOCK_8X8) {
return ((mi->bmi + 2 + b)->as_mode);
} else {
return cur_mb->mbmi.mode;
return mi->mbmi.mode;
}
}
return (cur_mb->bmi + b - 2)->as_mode;
return (mi->bmi + b - 2)->as_mode;
}
#endif // VP9_COMMON_VP9_FINDNEARMV_H_

View File

@@ -27,6 +27,9 @@
#define pair_set_epi16(a, b) \
_mm_set1_epi32(((uint16_t)(a)) + (((uint16_t)(b)) << 16))
#define pair_set_epi32(a, b) \
_mm_set_epi32(b, a, b, a)
// Constants:
// for (int i = 1; i < 32; ++i)
// printf("static const int cospi_%d_64 = %.0f;\n", i,

View File

@@ -22,13 +22,217 @@ struct loop_filter_info {
const uint8_t *hev_thr;
};
// This structure holds bit masks for all 8x8 blocks in a 64x64 region.
// Each 1 bit represents a position in which we want to apply the loop filter.
// Left_ entries refer to whether we apply a filter on the border to the
// left of the block. Above_ entries refer to whether or not to apply a
// filter on the above border. Int_ entries refer to whether or not to
// apply borders on the 4x4 edges within the 8x8 block that each bit
// represents.
// Since each transform is accompanied by a potentially different type of
// loop filter there is a different entry in the array for each transform size.
typedef struct {
uint64_t left_y[TX_SIZES];
uint64_t above_y[TX_SIZES];
uint64_t int_4x4_y;
uint16_t left_uv[TX_SIZES];
uint16_t above_uv[TX_SIZES];
uint16_t int_4x4_uv;
} LOOP_FILTER_MASK;
// 64 bit masks for left transform size. Each 1 represents a position where
// we should apply a loop filter across the left border of an 8x8 block
// boundary.
//
// In the case of TX_16X16 (low-order byte first) we end up with
// a mask that looks like this:
//
// 10101010
// 10101010
// 10101010
// 10101010
// 10101010
// 10101010
// 10101010
// 10101010
//
// A loopfilter should be applied to every other 8x8 column horizontally.
static const uint64_t left_64x64_txform_mask[TX_SIZES]= {
0xffffffffffffffff, // TX_4X4
0xffffffffffffffff, // TX_8x8
0x5555555555555555, // TX_16x16
0x1111111111111111, // TX_32x32
};
// 64 bit masks for above transform size. Each 1 represents a position where
// we should apply a loop filter across the top border of an 8x8 block
// boundary.
//
// In the case of TX_32X32 (low-order byte first) we end up with
// a mask that looks like this:
//
// 11111111
// 00000000
// 00000000
// 00000000
// 11111111
// 00000000
// 00000000
// 00000000
//
// A loopfilter should be applied once every four 8x8 rows vertically.
static const uint64_t above_64x64_txform_mask[TX_SIZES]= {
0xffffffffffffffff, // TX_4X4
0xffffffffffffffff, // TX_8x8
0x00ff00ff00ff00ff, // TX_16x16
0x000000ff000000ff, // TX_32x32
};
// 64 bit masks for prediction sizes (left). Each 1 represents a position
// at the left border of an 8x8 block. These are aligned to the rightmost
// appropriate bit, and then shifted into place.
//
// In the case of BLOCK_16X32 (low-order byte first) we end up with
// a mask that looks like this:
//
// 10000000
// 10000000
// 10000000
// 10000000
// 00000000
// 00000000
// 00000000
// 00000000
static const uint64_t left_prediction_mask[BLOCK_SIZES] = {
0x0000000000000001, // BLOCK_4X4,
0x0000000000000001, // BLOCK_4X8,
0x0000000000000001, // BLOCK_8X4,
0x0000000000000001, // BLOCK_8X8,
0x0000000000000101, // BLOCK_8X16,
0x0000000000000001, // BLOCK_16X8,
0x0000000000000101, // BLOCK_16X16,
0x0000000001010101, // BLOCK_16X32,
0x0000000000000101, // BLOCK_32X16,
0x0000000001010101, // BLOCK_32X32,
0x0101010101010101, // BLOCK_32X64,
0x0000000001010101, // BLOCK_64X32,
0x0101010101010101, // BLOCK_64X64
};
// 64 bit mask to shift and set for each prediction size.
static const uint64_t above_prediction_mask[BLOCK_SIZES] = {
0x0000000000000001, // BLOCK_4X4
0x0000000000000001, // BLOCK_4X8
0x0000000000000001, // BLOCK_8X4
0x0000000000000001, // BLOCK_8X8
0x0000000000000001, // BLOCK_8X16,
0x0000000000000003, // BLOCK_16X8
0x0000000000000003, // BLOCK_16X16
0x0000000000000003, // BLOCK_16X32,
0x000000000000000f, // BLOCK_32X16,
0x000000000000000f, // BLOCK_32X32,
0x000000000000000f, // BLOCK_32X64,
0x00000000000000ff, // BLOCK_64X32,
0x00000000000000ff, // BLOCK_64X64
};
// 64 bit mask to shift and set for each prediction size. A bit is set for
// each 8x8 block covered by a block of the given size placed at the
// top-left of the 64x64 area.
static const uint64_t size_mask[BLOCK_SIZES] = {
0x0000000000000001, // BLOCK_4X4
0x0000000000000001, // BLOCK_4X8
0x0000000000000001, // BLOCK_8X4
0x0000000000000001, // BLOCK_8X8
0x0000000000000101, // BLOCK_8X16,
0x0000000000000003, // BLOCK_16X8
0x0000000000000303, // BLOCK_16X16
0x0000000003030303, // BLOCK_16X32,
0x0000000000000f0f, // BLOCK_32X16,
0x000000000f0f0f0f, // BLOCK_32X32,
0x0f0f0f0f0f0f0f0f, // BLOCK_32X64,
0x00000000ffffffff, // BLOCK_64X32,
0xffffffffffffffff, // BLOCK_64X64
};
// These are used for masking the left and above borders.
static const uint64_t left_border = 0x1111111111111111;
static const uint64_t above_border = 0x000000ff000000ff;
// 16 bit masks for uv transform sizes.
static const uint16_t left_64x64_txform_mask_uv[TX_SIZES] = {
0xffff, // TX_4X4
0xffff, // TX_8x8
0x5555, // TX_16x16
0x1111, // TX_32x32
};
static const uint16_t above_64x64_txform_mask_uv[TX_SIZES] = {
0xffff, // TX_4X4
0xffff, // TX_8x8
0x0f0f, // TX_16x16
0x000f, // TX_32x32
};
// 16 bit left mask to shift and set for each uv prediction size.
static const uint16_t left_prediction_mask_uv[BLOCK_SIZES] = {
0x0001, // BLOCK_4X4,
0x0001, // BLOCK_4X8,
0x0001, // BLOCK_8X4,
0x0001, // BLOCK_8X8,
0x0001, // BLOCK_8X16,
0x0001, // BLOCK_16X8,
0x0001, // BLOCK_16X16,
0x0011, // BLOCK_16X32,
0x0001, // BLOCK_32X16,
0x0011, // BLOCK_32X32,
0x1111, // BLOCK_32X64
0x0011, // BLOCK_64X32,
0x1111, // BLOCK_64X64
};
// 16 bit above mask to shift and set for uv each prediction size.
static const uint16_t above_prediction_mask_uv[BLOCK_SIZES] = {
0x0001, // BLOCK_4X4
0x0001, // BLOCK_4X8
0x0001, // BLOCK_8X4
0x0001, // BLOCK_8X8
0x0001, // BLOCK_8X16,
0x0001, // BLOCK_16X8
0x0001, // BLOCK_16X16
0x0001, // BLOCK_16X32,
0x0003, // BLOCK_32X16,
0x0003, // BLOCK_32X32,
0x0003, // BLOCK_32X64,
0x000f, // BLOCK_64X32,
0x000f, // BLOCK_64X64
};
// 16 bit mask to shift and set for each uv prediction size.
static const uint16_t size_mask_uv[BLOCK_SIZES] = {
0x0001, // BLOCK_4X4
0x0001, // BLOCK_4X8
0x0001, // BLOCK_8X4
0x0001, // BLOCK_8X8
0x0001, // BLOCK_8X16,
0x0001, // BLOCK_16X8
0x0001, // BLOCK_16X16
0x0011, // BLOCK_16X32,
0x0003, // BLOCK_32X16,
0x0033, // BLOCK_32X32,
0x3333, // BLOCK_32X64,
0x00ff, // BLOCK_64X32,
0xffff, // BLOCK_64X64
};
static const uint16_t left_border_uv = 0x1111;
static const uint16_t above_border_uv = 0x000f;
static void lf_init_lut(loop_filter_info_n *lfi) {
lfi->mode_lf_lut[DC_PRED] = 0;
lfi->mode_lf_lut[D45_PRED] = 0;
lfi->mode_lf_lut[D135_PRED] = 0;
lfi->mode_lf_lut[D117_PRED] = 0;
lfi->mode_lf_lut[D153_PRED] = 0;
lfi->mode_lf_lut[D207_PRED] = 0;
lfi->mode_lf_lut[D63_PRED] = 0;
lfi->mode_lf_lut[V_PRED] = 0;
lfi->mode_lf_lut[H_PRED] = 0;
@@ -39,7 +243,7 @@ static void lf_init_lut(loop_filter_info_n *lfi) {
lfi->mode_lf_lut[NEWMV] = 1;
}
static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) {
int lvl;
// For each possible value for the loop filter fill out limits
@@ -61,8 +265,9 @@ static void update_sharpness(loop_filter_info_n *const lfi, int sharpness_lvl) {
}
}
void vp9_loop_filter_init(VP9_COMMON *cm) {
loop_filter_info_n *lfi = &cm->lf_info;
struct loopfilter *lf = &cm->lf;
int i;
// init limits for given sharpness
@@ -77,16 +282,15 @@ void vp9_loop_filter_init(VP9_COMMON *cm, struct loopfilter *lf) {
vpx_memset(lfi->hev_thr[i], i, SIMD_WIDTH);
}
void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) {
int seg_id;
// n_shift is the multiplier for lf_deltas
// the multiplier is 1 when filter_lvl is between 0 and 31;
// 2 when filter_lvl is between 32 and 63
const int n_shift = default_filt_lvl >> 5;
loop_filter_info_n *const lfi = &cm->lf_info;
struct loopfilter *const lf = &cm->lf;
struct segmentation *const seg = &cm->seg;
// update limits if sharpness has changed
if (lf->last_sharpness_level != lf->sharpness_level) {
@@ -98,7 +302,7 @@ void vp9_loop_filter_frame_init(VP9_COMMON *const cm, MACROBLOCKD *const xd,
int lvl_seg = default_filt_lvl, ref, mode, intra_lvl;
// Set the baseline filter values for each segment
if (vp9_segfeature_active(seg, seg_id, SEG_LVL_ALT_LF)) {
const int data = vp9_get_segdata(seg, seg_id, SEG_LVL_ALT_LF);
lvl_seg = seg->abs_delta == SEGMENT_ABSDATA
? data
@@ -108,7 +312,7 @@ void vp9_loop_filter_frame_init(VP9_COMMON *const cm, MACROBLOCKD *const xd,
if (!lf->mode_ref_delta_enabled) {
// we could get rid of this if we assume that deltas are set to
// zero when not in use; encoder always uses deltas
vpx_memset(lfi->lvl[seg_id], lvl_seg, sizeof(lfi->lvl[seg_id]));
continue;
}
@@ -124,9 +328,9 @@ void vp9_loop_filter_frame_init(VP9_COMMON *const cm, MACROBLOCKD *const xd,
}
}
static int build_lfi(const loop_filter_info_n *lfi_n,
const MB_MODE_INFO *mbmi,
struct loop_filter_info *lfi) {
const int seg = mbmi->segment_id;
const int ref = mbmi->ref_frame[0];
const int mode = lfi_n->mode_lf_lut[mbmi->mode];
@@ -236,10 +440,360 @@ static void filter_selectively_horiz(uint8_t *s, int pitch,
}
}
// This function ORs into the current lfm structure the positions where loop
// filters should be applied for the specific mi we are looking at. It uses
// information including the block size (32x16, 32x32, etc.), the transform
// size, whether there were any coefficients encoded, and the loop filter
// strength of the block we are currently looking at. Shift is used to
// position the 1s we produce.
// TODO(JBB) Need another function for different resolution color..
static void build_masks(const loop_filter_info_n *const lfi_n,
const MODE_INFO *mi, const int shift_y,
const int shift_uv,
LOOP_FILTER_MASK *lfm) {
const BLOCK_SIZE block_size = mi->mbmi.sb_type;
const TX_SIZE tx_size_y = mi->mbmi.tx_size;
const TX_SIZE tx_size_uv = get_uv_tx_size(&mi->mbmi);
const int skip = mi->mbmi.skip_coeff;
const int seg = mi->mbmi.segment_id;
const int ref = mi->mbmi.ref_frame[0];
const int mode = lfi_n->mode_lf_lut[mi->mbmi.mode];
const int filter_level = lfi_n->lvl[seg][ref][mode];
uint64_t *left_y = &lfm->left_y[tx_size_y];
uint64_t *above_y = &lfm->above_y[tx_size_y];
uint64_t *int_4x4_y = &lfm->int_4x4_y;
uint16_t *left_uv = &lfm->left_uv[tx_size_uv];
uint16_t *above_uv = &lfm->above_uv[tx_size_uv];
uint16_t *int_4x4_uv = &lfm->int_4x4_uv;
// If filter level is 0 we don't loop filter.
if (!filter_level)
return;
// These set 1 in the current block size for the block size edges.
// For instance if the block size is 32x16, we'll set:
// above =   1111
//           0000
// and
// left  =   1000
//           1000
// NOTE: In this example the low bit is the left-most bit, so (1000) is
// stored as 1, not 8...
//
// U and V set things on a 16 bit scale.
//
*above_y |= above_prediction_mask[block_size] << shift_y;
*above_uv |= above_prediction_mask_uv[block_size] << shift_uv;
*left_y |= left_prediction_mask[block_size] << shift_y;
*left_uv |= left_prediction_mask_uv[block_size] << shift_uv;
// If the block has no coefficients and is not intra we skip applying
// the loop filter on block edges.
if (skip && ref > INTRA_FRAME)
return;
// Here we are adding a mask for the transform size. The transform
// size mask is set to be correct for a 64x64 prediction block size. We
// mask it to match the size of the block we are working on and then shift
// it into place.
*above_y |= (size_mask[block_size] &
above_64x64_txform_mask[tx_size_y]) << shift_y;
*above_uv |= (size_mask_uv[block_size] &
above_64x64_txform_mask_uv[tx_size_uv]) << shift_uv;
*left_y |= (size_mask[block_size] &
left_64x64_txform_mask[tx_size_y]) << shift_y;
*left_uv |= (size_mask_uv[block_size] &
left_64x64_txform_mask_uv[tx_size_uv]) << shift_uv;
// Here we are trying to determine what to do with the internal 4x4 block
// boundaries. These differ from the 4x4 boundaries on the outside edge of
// an 8x8 in that the internal ones can be skipped and don't depend on
// the prediction block size.
if (tx_size_y == TX_4X4) {
*int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffff) << shift_y;
}
if (tx_size_uv == TX_4X4) {
*int_4x4_uv |= (size_mask_uv[block_size] & 0xffff) << shift_uv;
}
}
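A worked sketch of the composition (values read off the tables above; illustrative, not code from the patch) for a BLOCK_32X16 prediction using TX_8X8 at the top-left of the superblock, i.e. shift_y == shift_uv == 0:
// Illustrative only: how the table lookups combine for this case.
static void sketch_32x16_tx8x8_masks(void) {
  uint64_t above = 0, left = 0;
  above |= above_prediction_mask[BLOCK_32X16];        // 0x000f: top edge
  left  |= left_prediction_mask[BLOCK_32X16];         // 0x0101: left edge
  // TX_8X8 filters every 8x8 edge, so the size mask passes through unchanged:
  above |= size_mask[BLOCK_32X16] &
           above_64x64_txform_mask[TX_8X8];           // above == 0x0f0f
  left  |= size_mask[BLOCK_32X16] &
           left_64x64_txform_mask[TX_8X8];            // left  == 0x0f0f
}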
// This function does the same thing as the one above with the exception that
// it only affects the y masks. It exists because for blocks < 16x16 in size,
// we only update u and v masks on the first block.
static void build_y_mask(const loop_filter_info_n *const lfi_n,
const MODE_INFO *mi, const int shift_y,
LOOP_FILTER_MASK *lfm) {
const BLOCK_SIZE block_size = mi->mbmi.sb_type;
const TX_SIZE tx_size_y = mi->mbmi.tx_size;
const int skip = mi->mbmi.skip_coeff;
const int seg = mi->mbmi.segment_id;
const int ref = mi->mbmi.ref_frame[0];
const int mode = lfi_n->mode_lf_lut[mi->mbmi.mode];
const int filter_level = lfi_n->lvl[seg][ref][mode];
uint64_t *left_y = &lfm->left_y[tx_size_y];
uint64_t *above_y = &lfm->above_y[tx_size_y];
uint64_t *int_4x4_y = &lfm->int_4x4_y;
if (!filter_level)
return;
*above_y |= above_prediction_mask[block_size] << shift_y;
*left_y |= left_prediction_mask[block_size] << shift_y;
if (skip && ref > INTRA_FRAME)
return;
*above_y |= (size_mask[block_size] &
above_64x64_txform_mask[tx_size_y]) << shift_y;
*left_y |= (size_mask[block_size] &
left_64x64_txform_mask[tx_size_y]) << shift_y;
if (tx_size_y == TX_4X4) {
*int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffff) << shift_y;
}
}
// This function sets up the bit masks for the entire 64x64 region represented
// by mi_row, mi_col.
// TODO(JBB): This function only works for yv12.
static void setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
MODE_INFO **mi_8x8, const int mode_info_stride,
LOOP_FILTER_MASK *lfm) {
int idx_32, idx_16, idx_8;
const loop_filter_info_n *const lfi_n = &cm->lf_info;
MODE_INFO **mip = mi_8x8;
MODE_INFO **mip2 = mi_8x8;
// These are offsets to the next mi in the 64x64 block. It is what gets
// added to the mi ptr as we go through each loop. It helps us avoid
// setting up special row and column counters for each index. The last
// step brings us back to the starting position.
const int offset_32[] = {4, (mode_info_stride << 2) - 4, 4,
-(mode_info_stride << 2) - 4};
const int offset_16[] = {2, (mode_info_stride << 1) - 2, 2,
-(mode_info_stride << 1) - 2};
const int offset[] = {1, mode_info_stride - 1, 1, -mode_info_stride - 1};
// Following variables represent shifts to position the current block
// mask over the appropriate block. A shift of 36 to the left will move
// the bits for the final 32 by 32 block in the 64x64 down 4 rows and
// right 4 columns to the appropriate spot.
const int shift_32_y[] = {0, 4, 32, 36};
const int shift_16_y[] = {0, 2, 16, 18};
const int shift_8_y[] = {0, 1, 8, 9};
const int shift_32_uv[] = {0, 2, 8, 10};
const int shift_16_uv[] = {0, 1, 4, 5};
int i;
const int max_rows = (mi_row + MI_BLOCK_SIZE > cm->mi_rows ?
cm->mi_rows - mi_row : MI_BLOCK_SIZE);
const int max_cols = (mi_col + MI_BLOCK_SIZE > cm->mi_cols ?
cm->mi_cols - mi_col : MI_BLOCK_SIZE);
vp9_zero(*lfm);
// TODO(jimbankoski): Try moving most of the following code into decode
// loop and storing lfm in the mbmi structure so that we don't have to go
// through the recursive loop structure multiple times.
switch (mip[0]->mbmi.sb_type) {
case BLOCK_64X64:
build_masks(lfi_n, mip[0] , 0, 0, lfm);
break;
case BLOCK_64X32:
build_masks(lfi_n, mip[0], 0, 0, lfm);
mip2 = mip + mode_info_stride * 4;
if (4 >= max_rows)
break;
build_masks(lfi_n, mip2[0], 32, 8, lfm);
break;
case BLOCK_32X64:
build_masks(lfi_n, mip[0], 0, 0, lfm);
mip2 = mip + 4;
if (4 >= max_cols)
break;
build_masks(lfi_n, mip2[0], 4, 2, lfm);
break;
default:
for (idx_32 = 0; idx_32 < 4; mip += offset_32[idx_32], ++idx_32) {
const int shift_y = shift_32_y[idx_32];
const int shift_uv = shift_32_uv[idx_32];
const int mi_32_col_offset = ((idx_32 & 1) << 2);
const int mi_32_row_offset = ((idx_32 >> 1) << 2);
if (mi_32_col_offset >= max_cols || mi_32_row_offset >= max_rows)
continue;
switch (mip[0]->mbmi.sb_type) {
case BLOCK_32X32:
build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
break;
case BLOCK_32X16:
build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
if (mi_32_row_offset + 2 >= max_rows)
continue;
mip2 = mip + mode_info_stride * 2;
build_masks(lfi_n, mip2[0], shift_y + 16, shift_uv + 4, lfm);
break;
case BLOCK_16X32:
build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
if (mi_32_col_offset + 2 >= max_cols)
continue;
mip2 = mip + 2;
build_masks(lfi_n, mip2[0], shift_y + 2, shift_uv + 1, lfm);
break;
default:
for (idx_16 = 0; idx_16 < 4; mip += offset_16[idx_16], ++idx_16) {
const int shift_y = shift_32_y[idx_32] + shift_16_y[idx_16];
const int shift_uv = shift_32_uv[idx_32] + shift_16_uv[idx_16];
const int mi_16_col_offset = mi_32_col_offset +
((idx_16 & 1) << 1);
const int mi_16_row_offset = mi_32_row_offset +
((idx_16 >> 1) << 1);
if (mi_16_col_offset >= max_cols || mi_16_row_offset >= max_rows)
continue;
switch (mip[0]->mbmi.sb_type) {
case BLOCK_16X16:
build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
break;
case BLOCK_16X8:
build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
if (mi_16_row_offset + 1 >= max_rows)
continue;
mip2 = mip + mode_info_stride;
build_y_mask(lfi_n, mip2[0], shift_y + 8, lfm);
break;
case BLOCK_8X16:
build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
if (mi_16_col_offset + 1 >= max_cols)
continue;
mip2 = mip + 1;
build_y_mask(lfi_n, mip2[0], shift_y + 1, lfm);
break;
default: {
const int shift_y = shift_32_y[idx_32] +
shift_16_y[idx_16] +
shift_8_y[0];
build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
mip += offset[0];
for (idx_8 = 1; idx_8 < 4; mip += offset[idx_8], ++idx_8) {
const int shift_y = shift_32_y[idx_32] +
shift_16_y[idx_16] +
shift_8_y[idx_8];
const int mi_8_col_offset = mi_16_col_offset +
((idx_8 & 1));
const int mi_8_row_offset = mi_16_row_offset +
((idx_8 >> 1));
if (mi_8_col_offset >= max_cols ||
mi_8_row_offset >= max_rows)
continue;
build_y_mask(lfi_n, mip[0], shift_y, lfm);
}
break;
}
}
}
break;
}
}
break;
}
// The largest loopfilter we have is 16x16 so we use the 16x16 mask
// for 32x32 transforms as well.
lfm->left_y[TX_16X16] |= lfm->left_y[TX_32X32];
lfm->above_y[TX_16X16] |= lfm->above_y[TX_32X32];
lfm->left_uv[TX_16X16] |= lfm->left_uv[TX_32X32];
lfm->above_uv[TX_16X16] |= lfm->above_uv[TX_32X32];
// We do at least an 8 tap filter on every 32x32 boundary, even if the
// transform size is 4x4. So if the 4x4 is set on a border pixel, add it to
// the 8x8 and remove it from the 4x4.
lfm->left_y[TX_8X8] |= lfm->left_y[TX_4X4] & left_border;
lfm->left_y[TX_4X4] &= ~left_border;
lfm->above_y[TX_8X8] |= lfm->above_y[TX_4X4] & above_border;
lfm->above_y[TX_4X4] &= ~above_border;
lfm->left_uv[TX_8X8] |= lfm->left_uv[TX_4X4] & left_border_uv;
lfm->left_uv[TX_4X4] &= ~left_border_uv;
lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_4X4] & above_border_uv;
lfm->above_uv[TX_4X4] &= ~above_border_uv;
// We do some special edge handling.
if (mi_row + MI_BLOCK_SIZE > cm->mi_rows) {
const uint64_t rows = cm->mi_rows - mi_row;
// Each position inside the border gets a 1.
const uint64_t mask_y = (((uint64_t) 1 << (rows << 3)) - 1);
const uint16_t mask_uv = (((uint16_t) 1 << (((rows + 1) >> 1) << 2)) - 1);
// Remove values completely outside our border.
for (i = 0; i < TX_32X32; i++) {
lfm->left_y[i] &= mask_y;
lfm->above_y[i] &= mask_y;
lfm->left_uv[i] &= mask_uv;
lfm->above_uv[i] &= mask_uv;
}
lfm->int_4x4_y &= mask_y;
lfm->int_4x4_uv &= mask_uv;
// We don't apply a wide loop filter on the last uv block row. If set,
// apply the shorter one instead.
if (rows == 1) {
lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_16X16];
lfm->above_uv[TX_16X16] = 0;
}
if (rows == 5) {
lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_16X16] & 0xff00;
lfm->above_uv[TX_16X16] &= ~(lfm->above_uv[TX_16X16] & 0xff00);
}
}
if (mi_col + MI_BLOCK_SIZE > cm->mi_cols) {
const uint64_t columns = cm->mi_cols - mi_col;
// Each position inside the border gets a 1; the multiply replicates the
// single-row pattern into every row of the mask.
const uint64_t mask_y = (((1 << columns) - 1)) * 0x0101010101010101;
const uint16_t mask_uv = ((1 << ((columns + 1) >> 1)) - 1) * 0x1111;
// Internal edges are not applied on the last column of the image, so we
// mask one extra column for the internal edges.
const uint16_t mask_uv_int = ((1 << (columns >> 1)) - 1) * 0x1111;
// Remove the bits outside the image edge.
for (i = 0; i < TX_32X32; i++) {
lfm->left_y[i] &= mask_y;
lfm->above_y[i] &= mask_y;
lfm->left_uv[i] &= mask_uv;
lfm->above_uv[i] &= mask_uv;
}
lfm->int_4x4_y &= mask_y;
lfm->int_4x4_uv &= mask_uv_int;
// We don't apply a wide loop filter on the last uv column. If set,
// apply the shorter one instead.
if (columns == 1) {
lfm->left_uv[TX_8X8] |= lfm->left_uv[TX_16X16];
lfm->left_uv[TX_16X16] = 0;
}
if (columns == 5) {
lfm->left_uv[TX_8X8] |= (lfm->left_uv[TX_16X16] & 0xcccc);
lfm->left_uv[TX_16X16] &= ~(lfm->left_uv[TX_16X16] & 0xcccc);
}
}
// We don't apply a loop filter on the first column in the image. Mask that out.
if (mi_col == 0) {
for (i = 0; i < TX_32X32; i++) {
lfm->left_y[i] &= 0xfefefefefefefefe;
lfm->left_uv[i] &= 0xeeee;
}
}
}
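The right-edge masking above is plain replication arithmetic; a worked example (illustrative, not from the patch) with 3 mi columns remaining at the image edge:
// Illustrative only: building the right-edge column mask.
static uint64_t sketch_right_edge_mask(void) {
  const int columns = 3;
  // (1 << 3) - 1 == 0x07 selects 3 columns within one byte-wide row; the
  // multiply replicates that row into all 8 rows of the 64-bit mask,
  // giving 0x0707070707070707.
  return ((1 << columns) - 1) * 0x0101010101010101ULL;
}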
#if CONFIG_NON420
static void filter_block_plane_non420(VP9_COMMON *cm,
struct macroblockd_plane *plane,
MODE_INFO **mi_8x8,
int mi_row, int mi_col) {
const int ss_x = plane->subsampling_x;
const int ss_y = plane->subsampling_y;
const int row_step = 1 << ss_x;
@@ -262,24 +816,25 @@ static void filter_block_plane(VP9_COMMON *const cm,
// Determine the vertical edges that need filtering
for (c = 0; c < MI_BLOCK_SIZE && mi_col + c < cm->mi_cols; c += col_step) {
const MODE_INFO *mi = mi_8x8[c];
const int skip_this = mi[0].mbmi.skip_coeff
&& is_inter_block(&mi[0].mbmi);
// left edge of current unit is block/partition edge -> no skip
const int block_edge_left = b_width_log2(mi[0].mbmi.sb_type) ?
!(c & ((1 << (b_width_log2(mi[0].mbmi.sb_type)-1)) - 1)) : 1;
const int skip_this_c = skip_this && !block_edge_left;
// top edge of current unit is block/partition edge -> no skip
const int block_edge_above = b_height_log2(mi[0].mbmi.sb_type) ?
!(r & ((1 << (b_height_log2(mi[0].mbmi.sb_type)-1)) - 1)) : 1;
const int skip_this_r = skip_this && !block_edge_above;
const TX_SIZE tx_size = (plane->plane_type == PLANE_TYPE_UV)
? get_uv_tx_size(&mi[0].mbmi)
: mi[0].mbmi.tx_size;
const int skip_border_4x4_c = ss_x && mi_col + c == cm->mi_cols - 1;
const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;
// Filter level can vary per MI
if (!build_lfi(&cm->lf_info, &mi[0].mbmi, lfi[r] + (c >> ss_x)))
continue;
// Build masks based on the transform size of each block
@@ -338,7 +893,7 @@ static void filter_block_plane(VP9_COMMON *const cm,
mask_4x4_c & border_mask,
mask_4x4_int[r], lfi[r]);
dst->buf += 8 * dst->stride;
mi_8x8 += row_step_stride;
}
// Now do horizontal pass
@@ -355,33 +910,146 @@ static void filter_block_plane(VP9_COMMON *const cm,
dst->buf += 8 * dst->stride;
}
}
#endif
static void filter_block_plane(VP9_COMMON *const cm,
struct macroblockd_plane *const plane,
MODE_INFO **mi_8x8,
int mi_row, int mi_col,
LOOP_FILTER_MASK *lfm) {
const int ss_x = plane->subsampling_x;
const int ss_y = plane->subsampling_y;
const int row_step = 1 << ss_x;
const int col_step = 1 << ss_y;
const int row_step_stride = cm->mode_info_stride * row_step;
struct buf_2d *const dst = &plane->dst;
uint8_t* const dst0 = dst->buf;
unsigned int mask_4x4_int[MI_BLOCK_SIZE] = {0};
struct loop_filter_info lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE];
int r, c;
int row_shift = 3 - ss_x;
int row_mask = 0xff >> (ss_x << 2);
#define MASK_ROW(value) ((value >> (r_sampled << row_shift)) & row_mask)
for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) {
int r_sampled = r >> ss_x;
// Determine the vertical edges that need filtering
for (c = 0; c < MI_BLOCK_SIZE && mi_col + c < cm->mi_cols; c += col_step) {
const MODE_INFO *mi = mi_8x8[c];
if (!build_lfi(&cm->lf_info, &mi[0].mbmi, lfi[r] + (c >> ss_x)))
continue;
}
if (!plane->plane_type) {
mask_4x4_int[r] = MASK_ROW(lfm->int_4x4_y);
// Disable filtering on the leftmost column
filter_selectively_vert(dst->buf, dst->stride,
MASK_ROW(lfm->left_y[TX_16X16]),
MASK_ROW(lfm->left_y[TX_8X8]),
MASK_ROW(lfm->left_y[TX_4X4]),
MASK_ROW(lfm->int_4x4_y),
lfi[r]);
} else {
mask_4x4_int[r] = MASK_ROW(lfm->int_4x4_uv);
// Disable filtering on the leftmost column
filter_selectively_vert(dst->buf, dst->stride,
MASK_ROW(lfm->left_uv[TX_16X16]),
MASK_ROW(lfm->left_uv[TX_8X8]),
MASK_ROW(lfm->left_uv[TX_4X4]),
MASK_ROW(lfm->int_4x4_uv),
lfi[r]);
}
dst->buf += 8 * dst->stride;
mi_8x8 += row_step_stride;
}
// Now do horizontal pass
dst->buf = dst0;
for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) {
const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;
const unsigned int mask_4x4_int_r = skip_border_4x4_r ? 0 : mask_4x4_int[r];
int r_sampled = r >> ss_x;
if (!plane->plane_type) {
filter_selectively_horiz(dst->buf, dst->stride,
MASK_ROW(lfm->above_y[TX_16X16]),
MASK_ROW(lfm->above_y[TX_8X8]),
MASK_ROW(lfm->above_y[TX_4X4]),
MASK_ROW(lfm->int_4x4_y),
mi_row + r == 0, lfi[r]);
} else {
filter_selectively_horiz(dst->buf, dst->stride,
MASK_ROW(lfm->above_uv[TX_16X16]),
MASK_ROW(lfm->above_uv[TX_8X8]),
MASK_ROW(lfm->above_uv[TX_4X4]),
mask_4x4_int_r,
mi_row + r == 0, lfi[r]);
}
dst->buf += 8 * dst->stride;
}
#undef MASK_ROW
}
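MASK_ROW just slices one mi row out of a packed mask; an equivalent spelled-out form (a sketch assuming the same locals) is:
// Luma (ss_x == 0): row_shift == 3, row_mask == 0xff -> byte r of the mask.
// 4:2:0 chroma (ss_x == 1): row_shift == 2, row_mask == 0x0f -> nibble r.
static INLINE unsigned int mask_row(uint64_t value, int r_sampled,
                                    int row_shift, int row_mask) {
  return (unsigned int)((value >> (r_sampled << row_shift)) & row_mask);
}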
void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer,
VP9_COMMON *cm, MACROBLOCKD *xd,
int start, int stop, int y_only) {
const int num_planes = y_only ? 1 : MAX_MB_PLANE;
int mi_row, mi_col;
LOOP_FILTER_MASK lfm;
#if CONFIG_NON420
int use_420 = y_only || (xd->plane[1].subsampling_y == 1 &&
xd->plane[1].subsampling_x == 1);
#endif
for (mi_row = start; mi_row < stop; mi_row += MI_BLOCK_SIZE) {
MODE_INFO **mi_8x8 = cm->mi_grid_visible + mi_row * cm->mode_info_stride;
for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {
int plane;
setup_dst_planes(xd, frame_buffer, mi_row, mi_col);
// TODO(JBB): Make setup_mask work for non 420.
#if CONFIG_NON420
if (use_420)
#endif
setup_mask(cm, mi_row, mi_col, mi_8x8 + mi_col, cm->mode_info_stride,
&lfm);
for (plane = 0; plane < num_planes; ++plane) {
#if CONFIG_NON420
if (use_420)
#endif
filter_block_plane(cm, &xd->plane[plane], mi_8x8 + mi_col, mi_row,
mi_col, &lfm);
#if CONFIG_NON420
else
filter_block_plane_non420(cm, &xd->plane[plane], mi_8x8 + mi_col,
mi_row, mi_col);
#endif
}
}
}
}
void vp9_loop_filter_frame(VP9_COMMON *cm, MACROBLOCKD *xd,
int frame_filter_level,
int y_only, int partial) {
int start_mi_row, end_mi_row, mi_rows_to_filter;
if (!frame_filter_level) return;
start_mi_row = 0;
mi_rows_to_filter = cm->mi_rows;
if (partial && cm->mi_rows > 8) {
start_mi_row = cm->mi_rows >> 1;
start_mi_row &= 0xfffffff8;
mi_rows_to_filter = MAX(cm->mi_rows / 8, 8);
}
end_mi_row = start_mi_row + mi_rows_to_filter;
vp9_loop_filter_frame_init(cm, frame_filter_level);
vp9_loop_filter_rows(cm->frame_to_show, cm, xd,
start_mi_row, end_mi_row,
y_only);
}
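Worked numbers for the partial path (illustrative): a 1080p frame has cm->mi_rows == 135 in 8-pixel units, so
// start_mi_row      = (135 >> 1) & 0xfffffff8 == 64
// mi_rows_to_filter = MAX(135 / 8, 8)         == 16
// end_mi_row        = 64 + 16                 == 80
// i.e. the partial pass filters mi rows [64, 80) around the frame middle.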
int vp9_loop_filter_worker(void *arg1, void *arg2) {


@@ -22,6 +22,27 @@
#define SIMD_WIDTH 16
#define MAX_REF_LF_DELTAS 4
#define MAX_MODE_LF_DELTAS 2
struct loopfilter {
int filter_level;
int sharpness_level;
int last_sharpness_level;
uint8_t mode_ref_delta_enabled;
uint8_t mode_ref_delta_update;
// 0 = Intra, Last, GF, ARF
signed char ref_deltas[MAX_REF_LF_DELTAS];
signed char last_ref_deltas[MAX_REF_LF_DELTAS];
// 0 = ZERO_MV, MV
signed char mode_deltas[MAX_MODE_LF_DELTAS];
signed char last_mode_deltas[MAX_MODE_LF_DELTAS];
};
// Need to align this structure so when it is declared and
// passed it can be loaded into vector registers.
typedef struct {
@@ -39,19 +60,17 @@ typedef struct {
struct VP9Common;
struct macroblockd;
void vp9_loop_filter_init(struct VP9Common *cm);
// Update the loop filter for the current frame.
// This should be called before vp9_loop_filter_rows();
// vp9_loop_filter_frame() calls this function directly.
void vp9_loop_filter_frame_init(struct VP9Common *cm, int default_filt_lvl);
void vp9_loop_filter_frame(struct VP9Common *cm,
struct macroblockd *mbd,
int filter_level,
int y_only, int partial);
// Apply the loop filter to [start, stop) macro block rows in frame_buffer.
void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer,


@@ -1,3 +1,4 @@
/*
* Copyright (c) 2012 The WebM project authors. All Rights Reserved.
*
@@ -36,7 +37,7 @@ static const int mode_2_counter[MB_MODE_COUNT] = {
9, // D135_PRED
9, // D117_PRED
9, // D153_PRED
9, // D207_PRED
9, // D63_PRED
9, // TM_PRED
0, // NEARESTMV
@@ -70,33 +71,33 @@ static const int counter_to_context[19] = {
BOTH_INTRA // 18
};
static const MV mv_ref_blocks[BLOCK_SIZES][MVREF_NEIGHBOURS] = {
// 4X4
{{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}},
// 4X8
{{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}},
// 8X4
{{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}},
// 8X8
{{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}},
// 8X16
{{0, -1}, {-1, 0}, {1, -1}, {-1, -1}, {0, -2}, {-2, 0}, {-2, -1}, {-1, -2}},
// 16X8
{{-1, 0}, {0, -1}, {-1, 1}, {-1, -1}, {-2, 0}, {0, -2}, {-1, -2}, {-2, -1}},
// 16X16
{{-1, 0}, {0, -1}, {-1, 1}, {1, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-3, -3}},
// 16X32
{{0, -1}, {-1, 0}, {2, -1}, {-1, -1}, {-1, 1}, {0, -3}, {-3, 0}, {-3, -3}},
// 32X16
{{-1, 0}, {0, -1}, {-1, 2}, {-1, -1}, {1, -1}, {-3, 0}, {0, -3}, {-3, -3}},
// 32X32
{{-1, 1}, {1, -1}, {-1, 2}, {2, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-3, -3}},
// 32X64
{{0, -1}, {-1, 0}, {4, -1}, {-1, 2}, {-1, -1}, {0, -3}, {-3, 0}, {2, -1}},
// 64X32
{{-1, 0}, {0, -1}, {-1, 4}, {2, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-1, 2}},
// 64X64
{{-1, 3}, {3, -1}, {-1, 4}, {4, -1}, {-1, -1}, {-1, 0}, {0, -1}, {-1, 6}}
};
static const int idx_n_column_to_subblock[4][2] = {
@@ -121,78 +122,75 @@ static void clamp_mv_ref(MV *mv, const MACROBLOCKD *xd) {
static INLINE int_mv get_sub_block_mv(const MODE_INFO *candidate,
int check_sub_blocks, int which_mv,
int search_col, int block_idx) {
return check_sub_blocks && candidate->mbmi.sb_type < BLOCK_8X8
? candidate->bmi[idx_n_column_to_subblock[block_idx][search_col == 0]]
.as_mv[which_mv]
: candidate->mbmi.mv[which_mv];
}
// Performs mv sign inversion if indicated by the reference frame combination.
static INLINE int_mv scale_mv(const MB_MODE_INFO *mbmi, int ref,
const MV_REFERENCE_FRAME this_ref_frame,
const int *ref_sign_bias) {
int_mv mv = mbmi->mv[ref];
if (ref_sign_bias[mbmi->ref_frame[ref]] != ref_sign_bias[this_ref_frame]) {
mv.as_mv.row *= -1;
mv.as_mv.col *= -1;
}
return mv;
}
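For concreteness (hypothetical values): with ref_sign_bias[LAST_FRAME] == 0 and ref_sign_bias[ALTREF_FRAME] == 1, a LAST candidate is mirrored when this_ref_frame is ALTREF_FRAME.
// mv.as_mv == {4, -6}  ->  {-4, 6}   (both components negated)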
// This macro is used to add a motion vector to the mv_ref list if it isn't
// already in the list. If it's the second motion vector it will also
// skip all additional processing and jump to Done!
#define ADD_MV_REF_LIST(MV) \
do { \
if (refmv_count) { \
if ((MV).as_int != mv_ref_list[0].as_int) { \
mv_ref_list[refmv_count] = (MV); \
goto Done; \
} \
} else { \
mv_ref_list[refmv_count++] = (MV); \
} \
} while (0)
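The do { } while (0) wrapper is the usual idiom for making a multi-statement macro behave as a single statement; a sketch of the failure it prevents (hypothetical caller):
// Without the wrapper, this 'else' would bind to the macro's internal 'if':
if (use_candidate)
  ADD_MV_REF_LIST(candidate_mv);  // expands to exactly one statement
else
  refmv_count = 0;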
// If either reference frame is different, not INTRA, and they
// are different from each other scale and add the mv to our list.
#define IF_DIFF_REF_FRAME_ADD_MV(CANDIDATE) \
do { \
if ((CANDIDATE)->ref_frame[0] != ref_frame) \
ADD_MV_REF_LIST(scale_mv((CANDIDATE), 0, ref_frame, ref_sign_bias)); \
if ((CANDIDATE)->ref_frame[1] != ref_frame && \
has_second_ref(CANDIDATE) && \
(CANDIDATE)->mv[1].as_int != (CANDIDATE)->mv[0].as_int) \
ADD_MV_REF_LIST(scale_mv((CANDIDATE), 1, ref_frame, ref_sign_bias)); \
} while (0)
// Checks that the given mi_row, mi_col and search point
// are inside the borders of the tile.
static INLINE int is_inside(const VP9_COMMON *cm, int mi_col, int mi_row,
const MV *mv) {
return !(mi_row + mv->row < 0 ||
mi_col + mv->col < cm->cur_tile_mi_col_start ||
mi_row + mv->row >= cm->mi_rows ||
mi_col + mv->col >= cm->cur_tile_mi_col_end);
}
// This function searches the neighbourhood of a given MB/SB
// to try and find candidate reference vectors.
void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
MODE_INFO *mi, const MODE_INFO *prev_mi,
MV_REFERENCE_FRAME ref_frame,
int_mv *mv_ref_list,
int block_idx,
int mi_row, int mi_col) {
const int *ref_sign_bias = cm->ref_frame_sign_bias;
int i, refmv_count = 0;
const MV *const mv_ref_search = mv_ref_blocks[mi->mbmi.sb_type];
const MB_MODE_INFO *const prev_mbmi = prev_mi ? &prev_mi->mbmi : NULL;
int different_ref_found = 0;
int context_counter = 0;
@@ -202,28 +200,27 @@ void vp9_find_mv_refs_idx(VP9_COMMON *cm, MACROBLOCKD *xd, MODE_INFO *here,
// The nearest 2 blocks are treated differently
// if the size < 8x8 we get the mv from the bmi substructure,
// and we also need to keep a mode count.
for (i = 0; i < 2; ++i) {
const MV *const mv_ref = &mv_ref_search[i];
if (is_inside(cm, mi_col, mi_row, mv_ref)) {
const int check_sub_blocks = block_idx >= 0;
const MODE_INFO *const candidate_mi = xd->mi_8x8[mv_ref->col + mv_ref->row
* xd->mode_info_stride];
const MB_MODE_INFO *const candidate = &candidate_mi->mbmi;
// Keep counts for entropy encoding.
context_counter += mode_2_counter[candidate->mode];
// Check if the candidate comes from the same reference frame.
if (candidate->ref_frame[0] == ref_frame) {
ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, check_sub_blocks, 0,
mv_ref->col, block_idx));
different_ref_found = candidate->ref_frame[1] != ref_frame;
} else {
if (candidate->ref_frame[1] == ref_frame)
// Add second motion vector if it has the same ref_frame.
ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, check_sub_blocks, 1,
mv_ref->col, block_idx));
different_ref_found = 1;
}
}
}
@@ -231,65 +228,59 @@ void vp9_find_mv_refs_idx(VP9_COMMON *cm, MACROBLOCKD *xd, MODE_INFO *here,
// Check the rest of the neighbors in much the same way
// as before except we don't need to keep track of sub blocks or
// mode counts.
for (; i < MVREF_NEIGHBOURS; ++i) {
const MV *const mv_ref = &mv_ref_search[i];
if (is_inside(cm, mi_col, mi_row, mv_ref)) {
const MB_MODE_INFO *const candidate = &xd->mi_8x8[mv_ref->col +
mv_ref->row
* xd->mode_info_stride]->mbmi;
if (candidate->ref_frame[0] == ref_frame) {
ADD_MV_REF_LIST(candidate->mv[0]);
different_ref_found = candidate->ref_frame[1] != ref_frame;
} else {
if (candidate->ref_frame[1] == ref_frame)
ADD_MV_REF_LIST(candidate->mv[1]);
different_ref_found = 1;
}
}
}
// Check the last frame's mode and mv info.
if (prev_mbmi) {
if (prev_mbmi->ref_frame[0] == ref_frame)
ADD_MV_REF_LIST(prev_mbmi->mv[0]);
else if (prev_mbmi->ref_frame[1] == ref_frame)
ADD_MV_REF_LIST(prev_mbmi->mv[1]);
}
// Since we couldn't find 2 mvs from the same reference frame,
// go back through the neighbors and find motion vectors from
// different reference frames.
if (different_ref_found) {
for (i = 0; i < MVREF_NEIGHBOURS; ++i) {
const MV *mv_ref = &mv_ref_search[i];
if (is_inside(cm, mi_col, mi_row, mv_ref)) {
const MB_MODE_INFO *const candidate = &xd->mi_8x8[mv_ref->col +
mv_ref->row
* xd->mode_info_stride]->mbmi;
// If the candidate is INTRA we don't want to consider its mv.
if (is_inter_block(candidate))
IF_DIFF_REF_FRAME_ADD_MV(candidate);
}
}
}
// Since we still don't have a candidate we'll try the last frame.
if (prev_mbmi && is_inter_block(prev_mbmi))
IF_DIFF_REF_FRAME_ADD_MV(prev_mbmi);
Done:
mi->mbmi.mode_context[ref_frame] = counter_to_context[context_counter];
// Clamp vectors
for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i)
clamp_mv_ref(&mv_ref_list[i].as_mv, xd);
}
#undef ADD_MV_REF_LIST
#undef IF_DIFF_REF_FRAME_ADD_MV


@@ -14,27 +14,20 @@
#ifndef VP9_COMMON_VP9_MVREF_COMMON_H_
#define VP9_COMMON_VP9_MVREF_COMMON_H_
void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
MODE_INFO *mi, const MODE_INFO *prev_mi,
MV_REFERENCE_FRAME ref_frame,
int_mv *mv_ref_list,
int block_idx,
int mi_row, int mi_col);
static INLINE void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd,
MODE_INFO *mi, const MODE_INFO *prev_mi,
MV_REFERENCE_FRAME ref_frame,
int_mv *mv_ref_list,
int mi_row, int mi_col) {
vp9_find_mv_refs_idx(cm, xd, mi, prev_mi, ref_frame,
mv_ref_list, -1, mi_row, mi_col);
}
#endif // VP9_COMMON_VP9_MVREF_COMMON_H_


@@ -46,7 +46,8 @@ extern "C"
typedef enum {
USAGE_STREAM_FROM_SERVER = 0x0,
USAGE_LOCAL_FILE_PLAYBACK = 0x1,
USAGE_CONSTRAINED_QUALITY = 0x2,
USAGE_CONSTANT_QUALITY = 0x3,
} END_USAGE;
@@ -130,6 +131,8 @@ extern "C"
// END DATARATE CONTROL OPTIONS
// ----------------------------------------------------------------
// Spatial scalability
int ss_number_layers;
// these parameters aren't to be used in the final build; don't use!!!
int play_alternate;
@@ -210,6 +213,13 @@ extern "C"
int vp9_set_internal_size(VP9_PTR comp,
VPX_SCALING horiz_mode, VPX_SCALING vert_mode);
int vp9_set_size_literal(VP9_PTR comp, unsigned int width,
unsigned int height);
int vp9_switch_layer(VP9_PTR comp, int layer);
void vp9_set_svc(VP9_PTR comp, int use_svc);
int vp9_get_quantizer(VP9_PTR c);
#ifdef __cplusplus


@@ -20,7 +20,7 @@
#include "vp9/common/vp9_entropymode.h"
#include "vp9/common/vp9_quant_common.h"
#if CONFIG_VP9_POSTPROC
#include "vp9/common/vp9_postproc.h"
#endif
@@ -38,14 +38,14 @@
#define NUM_FRAME_CONTEXTS (1 << NUM_FRAME_CONTEXTS_LOG2)
typedef struct frame_contexts {
vp9_prob y_mode_prob[BLOCK_SIZE_GROUPS][INTRA_MODES - 1];
vp9_prob uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
vp9_prob partition_prob[NUM_FRAME_TYPES][NUM_PARTITION_CONTEXTS]
[PARTITION_TYPES - 1];
vp9_coeff_probs_model coef_probs[TX_SIZES][BLOCK_TYPES];
vp9_prob switchable_interp_prob[SWITCHABLE_FILTERS + 1]
[SWITCHABLE_FILTERS - 1];
vp9_prob inter_mode_probs[INTER_MODE_CONTEXTS][INTER_MODES - 1];
vp9_prob intra_inter_prob[INTRA_INTER_CONTEXTS];
vp9_prob comp_inter_prob[COMP_INTER_CONTEXTS];
vp9_prob single_ref_prob[REF_CONTEXTS][2];
@@ -56,15 +56,15 @@ typedef struct frame_contexts {
} FRAME_CONTEXT;
typedef struct {
unsigned int y_mode[BLOCK_SIZE_GROUPS][INTRA_MODES];
unsigned int uv_mode[INTRA_MODES][INTRA_MODES];
unsigned int partition[NUM_PARTITION_CONTEXTS][PARTITION_TYPES];
vp9_coeff_count_model coef[TX_SIZES][BLOCK_TYPES];
unsigned int eob_branch[TX_SIZES][BLOCK_TYPES][REF_TYPES]
[COEF_BANDS][PREV_COEF_CONTEXTS];
unsigned int switchable_interp[SWITCHABLE_FILTERS + 1]
[SWITCHABLE_FILTERS];
unsigned int inter_mode[INTER_MODE_CONTEXTS][INTER_MODES];
unsigned int intra_inter[INTRA_INTER_CONTEXTS][2];
unsigned int comp_inter[COMP_INTER_CONTEXTS][2];
unsigned int single_ref[REF_CONTEXTS][2][2];
@@ -164,6 +164,10 @@ typedef struct VP9Common {
MODE_INFO *prev_mip; /* MODE_INFO array 'mip' from last decoded frame */
MODE_INFO *prev_mi; /* 'mi' from last frame (points into prev_mip) */
MODE_INFO **mi_grid_base;
MODE_INFO **mi_grid_visible;
MODE_INFO **prev_mi_grid_base;
MODE_INFO **prev_mi_grid_visible;
// Persistent mb segment id map used in prediction.
unsigned char *last_frame_seg_map;
@@ -176,6 +180,9 @@ typedef struct VP9Common {
int ref_frame_sign_bias[MAX_REF_FRAMES]; /* Two state 0, 1 */
struct loopfilter lf;
struct segmentation seg;
/* Y,U,V */
ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16];
@@ -198,7 +205,7 @@ typedef struct VP9Common {
unsigned int current_video_frame;
int version;
#if CONFIG_VP9_POSTPROC
struct postproc_state postproc_state;
#endif
@@ -231,7 +238,19 @@ static void ref_cnt_fb(int *buf, int *idx, int new_idx) {
}
static int mi_cols_aligned_to_sb(int n_mis) {
return ALIGN_POWER_OF_TWO(n_mis, MI_BLOCK_SIZE_LOG2);
}
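ALIGN_POWER_OF_TWO(n, 3) rounds up to a multiple of 8, one superblock's worth of mi units; for example (illustrative):
// mi_cols_aligned_to_sb(135) == (135 + 7) & ~7 == 136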
static INLINE void set_skip_context(VP9_COMMON *cm, MACROBLOCKD *xd,
int mi_row, int mi_col) {
const int above_idx = mi_col * 2;
const int left_idx = (mi_row * 2) & 15;
int i;
for (i = 0; i < MAX_MB_PLANE; i++) {
struct macroblockd_plane *const pd = &xd->plane[i];
pd->above_context = cm->above_context[i] + (above_idx >> pd->subsampling_x);
pd->left_context = cm->left_context[i] + (left_idx >> pd->subsampling_y);
}
}
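Worked offsets (illustrative): at mi_col == 5, mi_row == 11 with 4:2:0 subsampling the contexts land at
// luma   (ss == 0): above_context + 10, left_context + (22 & 15) == + 6
// chroma (ss == 1): above_context + 5,  left_context + 3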
static INLINE void set_partition_seg_context(VP9_COMMON *cm, MACROBLOCKD *xd,
@@ -240,25 +259,20 @@ static INLINE void set_partition_seg_context(VP9_COMMON *cm, MACROBLOCKD *xd,
xd->left_seg_context = cm->left_seg_context + (mi_row & MI_MASK);
}
// return the node index in the prob tree for binary coding
static int check_bsize_coverage(int bs, int mi_rows, int mi_cols,
int mi_row, int mi_col) {
const int r = (mi_row + bs < mi_rows);
const int c = (mi_col + bs < mi_cols);
if (r && c)
return 0;
if (c && !r)
return 1; // only allow horizontal/split partition types
if (r && !c)
return 2; // only allow vertical/split partition types
return -1;
}
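An example of the return values (hypothetical numbers): deciding a 64x64 partition (bs == 4 in mi units) near the bottom edge of a 1080p frame, mi_rows == 135 and mi_cols == 240, at mi_row == 132, mi_col == 232:
// r = (132 + 4 < 135) == 0, c = (232 + 4 < 240) == 1 -> returns 1,
// so only horizontal/split partition types may be coded here.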


@@ -53,7 +53,7 @@ static const unsigned char MB_PREDICTION_MODE_colors[MB_MODE_COUNT][3] = {
{ RGB_TO_YUV(0xCC33FF) }, /* Magenta */
};
static const unsigned char B_PREDICTION_MODE_colors[INTRA_MODES][3] = {
{ RGB_TO_YUV(0x6633ff) }, /* Purple */
{ RGB_TO_YUV(0xcc33ff) }, /* Magenta */
{ RGB_TO_YUV(0xff33cc) }, /* Pink */
@@ -630,23 +630,21 @@ static void constrain_line(int x0, int *x1, int y0, int *y1,
}
}
int vp9_post_proc_frame(struct VP9Common *cm,
YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *ppflags) {
int q = cm->lf.filter_level * 10 / 6;
int flags = ppflags->post_proc_flag;
int deblock_level = ppflags->deblocking_level;
int noise_level = ppflags->noise_level;
if (!cm->frame_to_show)
return -1;
if (q > 63)
q = 63;
if (!flags) {
*dest = *cm->frame_to_show;
return 0;
}
@@ -655,52 +653,52 @@ int vp9_post_proc_frame(struct VP9Common *oci,
#endif
if (flags & VP9D_DEMACROBLOCK) {
deblock_and_de_macro_block(cm->frame_to_show, &cm->post_proc_buffer,
q + (deblock_level - 5) * 10, 1, 0);
} else if (flags & VP9D_DEBLOCK) {
vp9_deblock(cm->frame_to_show, &cm->post_proc_buffer, q);
} else {
vp8_yv12_copy_frame(cm->frame_to_show, &cm->post_proc_buffer);
}
if (flags & VP9D_ADDNOISE) {
if (cm->postproc_state.last_q != q
|| cm->postproc_state.last_noise != noise_level) {
fillrd(&cm->postproc_state, 63 - q, noise_level);
}
vp9_plane_add_noise(cm->post_proc_buffer.y_buffer,
cm->postproc_state.noise,
cm->postproc_state.blackclamp,
cm->postproc_state.whiteclamp,
cm->postproc_state.bothclamp,
cm->post_proc_buffer.y_width,
cm->post_proc_buffer.y_height,
cm->post_proc_buffer.y_stride);
}
#if 0 && CONFIG_POSTPROC_VISUALIZER
if (flags & VP9D_DEBUG_TXT_FRAME_INFO) {
char message[512];
sprintf(message, "F%1dG%1dQ%3dF%3dP%d_s%dx%d",
(cm->frame_type == KEY_FRAME),
cm->refresh_golden_frame,
cm->base_qindex,
cm->filter_level,
flags,
cm->mb_cols, cm->mb_rows);
vp9_blit_text(message, cm->post_proc_buffer.y_buffer,
cm->post_proc_buffer.y_stride);
}
if (flags & VP9D_DEBUG_TXT_MBLK_MODES) {
int i, j;
uint8_t *y_ptr;
YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer;
int mb_rows = post->y_height >> 4;
int mb_cols = post->y_width >> 4;
int mb_index = 0;
MODE_INFO *mi = cm->mi;
y_ptr = post->y_buffer + 4 * post->y_stride + 4;
@@ -725,11 +723,11 @@ int vp9_post_proc_frame(struct VP9Common *oci,
if (flags & VP9D_DEBUG_TXT_DC_DIFF) {
int i, j;
uint8_t *y_ptr;
YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer;
int mb_rows = post->y_height >> 4;
int mb_cols = post->y_width >> 4;
int mb_index = 0;
MODE_INFO *mi = cm->mi;
y_ptr = post->y_buffer + 4 * post->y_stride + 4;
@@ -739,9 +737,9 @@ int vp9_post_proc_frame(struct VP9Common *oci,
char zz[4];
int dc_diff = !(mi[mb_index].mbmi.mode != I4X4_PRED &&
mi[mb_index].mbmi.mode != SPLITMV &&
mi[mb_index].mbmi.skip_coeff);
if (cm->frame_type == KEY_FRAME)
sprintf(zz, "a");
else
sprintf(zz, "%c", dc_diff + '0');
@@ -761,19 +759,19 @@ int vp9_post_proc_frame(struct VP9Common *oci,
char message[512];
snprintf(message, sizeof(message),
"Bitrate: %10.2f framerate: %10.2f ",
cm->bitrate, cm->framerate);
vp9_blit_text(message, cm->post_proc_buffer.y_buffer,
cm->post_proc_buffer.y_stride);
}
/* Draw motion vectors */
if ((flags & VP9D_DEBUG_DRAW_MV) && ppflags->display_mv_flag) {
YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer;
int width = post->y_width;
int height = post->y_height;
uint8_t *y_buffer = cm->post_proc_buffer.y_buffer;
int y_stride = cm->post_proc_buffer.y_stride;
MODE_INFO *mi = cm->mi;
int x0, y0;
for (y0 = 0; y0 < height; y0 += 16) {
@@ -882,7 +880,7 @@ int vp9_post_proc_frame(struct VP9Common *oci,
}
}
}
} else if (is_inter_mode(mi->mbmi.mode)) {
MV *mv = &mi->mbmi.mv.as_mv;
const int lx0 = x0 + 8;
const int ly0 = y0 + 8;
@@ -910,14 +908,14 @@ int vp9_post_proc_frame(struct VP9Common *oci,
if ((flags & VP9D_DEBUG_CLR_BLK_MODES)
&& (ppflags->display_mb_modes_flag || ppflags->display_b_modes_flag)) {
int y, x;
YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer;
int width = post->y_width;
int height = post->y_height;
uint8_t *y_ptr = cm->post_proc_buffer.y_buffer;
uint8_t *u_ptr = cm->post_proc_buffer.u_buffer;
uint8_t *v_ptr = cm->post_proc_buffer.v_buffer;
int y_stride = cm->post_proc_buffer.y_stride;
MODE_INFO *mi = cm->mi;
for (y = 0; y < height; y += 16) {
for (x = 0; x < width; x += 16) {
@@ -975,14 +973,14 @@ int vp9_post_proc_frame(struct VP9Common *oci,
if ((flags & VP9D_DEBUG_CLR_FRM_REF_BLKS) &&
ppflags->display_ref_frame_flag) {
int y, x;
YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer;
int width = post->y_width;
int height = post->y_height;
uint8_t *y_ptr = cm->post_proc_buffer.y_buffer;
uint8_t *u_ptr = cm->post_proc_buffer.u_buffer;
uint8_t *v_ptr = cm->post_proc_buffer.v_buffer;
int y_stride = cm->post_proc_buffer.y_stride;
MODE_INFO *mi = cm->mi;
for (y = 0; y < height; y += 16) {
for (x = 0; x < width; x += 16) {
@@ -1008,12 +1006,13 @@ int vp9_post_proc_frame(struct VP9Common *oci,
}
#endif
*dest = cm->post_proc_buffer;
/* handle problem with extending borders */
dest->y_width = cm->width;
dest->y_height = cm->height;
dest->uv_width = dest->y_width >> cm->subsampling_x;
dest->uv_height = dest->y_height >> cm->subsampling_y;
return 0;
}


@@ -26,7 +26,7 @@ struct postproc_state {
#include "vp9/common/vp9_onyxc_int.h"
#include "vp9/common/vp9_ppflags.h"
int vp9_post_proc_frame(struct VP9Common *cm,
YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *flags);
void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q);


@@ -18,48 +18,49 @@
// Returns a context number for the given MB prediction signal
unsigned char vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd) {
const MODE_INFO * const above_mi = xd->mi_8x8[-xd->mode_info_stride];
const MODE_INFO * const left_mi = xd->mi_8x8[-1];
const int left_in_image = xd->left_available && left_mi;
const int above_in_image = xd->up_available && above_mi;
// Note:
// The mode info data structure has a one element border above and to the
// left of the entries corresponding to real macroblocks.
// The prediction flags in these dummy entries are initialised to 0.
// left
const int left_mv_pred = left_in_image ? is_inter_mode(left_mi->mbmi.mode)
: 0;
const int left_interp = left_in_image && left_mv_pred
? left_mi->mbmi.interp_filter
: SWITCHABLE_FILTERS;
// above
const int above_mv_pred = above_in_image ? is_inter_mode(above_mi->mbmi.mode)
: 0;
const int above_interp = above_in_image && above_mv_pred
? above_mi->mbmi.interp_filter
: SWITCHABLE_FILTERS;
if (left_interp == above_interp)
return left_interp;
else if (left_interp == SWITCHABLE_FILTERS &&
above_interp != SWITCHABLE_FILTERS)
return above_interp;
else if (left_interp != SWITCHABLE_FILTERS &&
above_interp == SWITCHABLE_FILTERS)
return left_interp;
else
return SWITCHABLE_FILTERS;
}
// Returns a context number for the given MB prediction signal
unsigned char vp9_get_pred_context_intra_inter(const MACROBLOCKD *xd) {
const MODE_INFO * const above_mi = xd->mi_8x8[-xd->mode_info_stride];
const MODE_INFO * const left_mi = xd->mi_8x8[-1];
const MB_MODE_INFO *const above_mbmi = above_mi ? &above_mi->mbmi : 0;
const MB_MODE_INFO *const left_mbmi = left_mi ? &left_mi->mbmi : 0;
const int left_in_image = xd->left_available && left_mi;
const int above_in_image = xd->up_available && above_mi;
const int left_intra = left_in_image ? !is_inter_block(left_mbmi) : 1;
const int above_intra = above_in_image ? !is_inter_block(above_mbmi) : 1;
// The mode info data structure has a one element border above and to the
// left of the entries corresponding to real macroblocks.
@@ -80,35 +81,35 @@ unsigned char vp9_get_pred_context_intra_inter(const MACROBLOCKD *xd) {
unsigned char vp9_get_pred_context_comp_inter_inter(const VP9_COMMON *cm,
const MACROBLOCKD *xd) {
int pred_context;
const MODE_INFO * const above_mi = xd->mi_8x8[-xd->mode_info_stride];
const MODE_INFO * const left_mi = xd->mi_8x8[-1];
const MB_MODE_INFO *const above_mbmi = above_mi ? &above_mi->mbmi : 0;
const MB_MODE_INFO *const left_mbmi = left_mi ? &left_mi->mbmi : 0;
const int left_in_image = xd->left_available && left_mi;
const int above_in_image = xd->up_available && above_mi;
// Note:
// The mode info data structure has a one element border above and to the
// left of the entries corresponding to real macroblocks.
// The prediction flags in these dummy entries are initialised to 0.
if (above_in_image && left_in_image) { // both edges available
if (above_mbmi->ref_frame[1] <= INTRA_FRAME &&
left_mbmi->ref_frame[1] <= INTRA_FRAME)
if (!has_second_ref(above_mbmi) && !has_second_ref(left_mbmi))
// neither edge uses comp pred (0/1)
pred_context = (above_mbmi->ref_frame[0] == cm->comp_fixed_ref) ^
(left_mbmi->ref_frame[0] == cm->comp_fixed_ref);
else if (above_mbmi->ref_frame[1] <= INTRA_FRAME)
else if (!has_second_ref(above_mbmi))
// one of two edges uses comp pred (2/3)
pred_context = 2 + (above_mbmi->ref_frame[0] == cm->comp_fixed_ref ||
above_mbmi->ref_frame[0] == INTRA_FRAME);
else if (left_mbmi->ref_frame[1] <= INTRA_FRAME)
!is_inter_block(above_mbmi));
else if (!has_second_ref(left_mbmi))
// one of two edges uses comp pred (2/3)
pred_context = 2 + (left_mbmi->ref_frame[0] == cm->comp_fixed_ref ||
left_mbmi->ref_frame[0] == INTRA_FRAME);
!is_inter_block(left_mbmi));
else // both edges use comp pred (4)
pred_context = 4;
} else if (above_in_image || left_in_image) { // one edge available
const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
if (edge_mbmi->ref_frame[1] <= INTRA_FRAME)
if (!has_second_ref(edge_mbmi))
// edge does not use comp pred (0/1)
pred_context = edge_mbmi->ref_frame[0] == cm->comp_fixed_ref;
else
@@ -125,11 +126,14 @@ unsigned char vp9_get_pred_context_comp_inter_inter(const VP9_COMMON *cm,
unsigned char vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm,
const MACROBLOCKD *xd) {
int pred_context;
const MODE_INFO *const mi = xd->mode_info_context;
const MB_MODE_INFO *const above_mbmi = &mi[-cm->mode_info_stride].mbmi;
const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi;
const int left_in_image = xd->left_available && left_mbmi->mb_in_image;
const int above_in_image = xd->up_available && above_mbmi->mb_in_image;
const MODE_INFO * const above_mi = xd->mi_8x8[-cm->mode_info_stride];
const MODE_INFO * const left_mi = xd->mi_8x8[-1];
const MB_MODE_INFO *const above_mbmi = above_mi ? &above_mi->mbmi : 0;
const MB_MODE_INFO *const left_mbmi = left_mi ? &left_mi->mbmi : 0;
const int left_in_image = xd->left_available && left_mi;
const int above_in_image = xd->up_available && above_mi;
const int left_intra = left_in_image ? !is_inter_block(left_mbmi) : 1;
const int above_intra = above_in_image ? !is_inter_block(above_mbmi) : 1;
// Note:
// The mode info data structure has a one element border above and to the
// left of the entries corresponding to real macroblocks.
@@ -138,22 +142,19 @@ unsigned char vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm,
const int var_ref_idx = !fix_ref_idx;
if (above_in_image && left_in_image) { // both edges available
if (above_mbmi->ref_frame[0] == INTRA_FRAME &&
left_mbmi->ref_frame[0] == INTRA_FRAME) { // intra/intra (2)
if (above_intra && left_intra) { // intra/intra (2)
pred_context = 2;
} else if (above_mbmi->ref_frame[0] == INTRA_FRAME ||
left_mbmi->ref_frame[0] == INTRA_FRAME) { // intra/inter
const MB_MODE_INFO *edge_mbmi = above_mbmi->ref_frame[0] == INTRA_FRAME ?
left_mbmi : above_mbmi;
} else if (above_intra || left_intra) { // intra/inter
const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
if (edge_mbmi->ref_frame[1] <= INTRA_FRAME) // single pred (1/3)
if (!has_second_ref(edge_mbmi)) // single pred (1/3)
pred_context = 1 + 2 * (edge_mbmi->ref_frame[0] != cm->comp_var_ref[1]);
else // comp pred (1/3)
pred_context = 1 + 2 * (edge_mbmi->ref_frame[var_ref_idx]
!= cm->comp_var_ref[1]);
} else { // inter/inter
int l_sg = left_mbmi->ref_frame[1] <= INTRA_FRAME;
int a_sg = above_mbmi->ref_frame[1] <= INTRA_FRAME;
const int l_sg = !has_second_ref(left_mbmi);
const int a_sg = !has_second_ref(above_mbmi);
MV_REFERENCE_FRAME vrfa = a_sg ? above_mbmi->ref_frame[0]
: above_mbmi->ref_frame[var_ref_idx];
MV_REFERENCE_FRAME vrfl = l_sg ? left_mbmi->ref_frame[0]
@@ -187,13 +188,15 @@ unsigned char vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm,
} else if (above_in_image || left_in_image) { // one edge available
const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
if (edge_mbmi->ref_frame[0] == INTRA_FRAME)
if (!is_inter_block(edge_mbmi)) {
pred_context = 2;
else if (edge_mbmi->ref_frame[1] > INTRA_FRAME)
pred_context = 4 * (edge_mbmi->ref_frame[var_ref_idx]
} else {
if (has_second_ref(edge_mbmi))
pred_context = 4 * (edge_mbmi->ref_frame[var_ref_idx]
!= cm->comp_var_ref[1]);
else
pred_context = 3 * (edge_mbmi->ref_frame[0] != cm->comp_var_ref[1]);
else
pred_context = 3 * (edge_mbmi->ref_frame[0] != cm->comp_var_ref[1]);
}
} else { // no edges available (2)
pred_context = 2;
}
@@ -203,91 +206,91 @@ unsigned char vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm,
}
unsigned char vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) {
int pred_context;
const MODE_INFO *const mi = xd->mode_info_context;
const MB_MODE_INFO *const above_mbmi = &mi[-xd->mode_info_stride].mbmi;
const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi;
const int left_in_image = xd->left_available && left_mbmi->mb_in_image;
const int above_in_image = xd->up_available && above_mbmi->mb_in_image;
const MODE_INFO * const above_mi = xd->mi_8x8[-xd->mode_info_stride];
const MODE_INFO * const left_mi = xd->mi_8x8[-1];
const MB_MODE_INFO *const above_mbmi = above_mi ? &above_mi->mbmi : 0;
const MB_MODE_INFO *const left_mbmi = left_mi ? &left_mi->mbmi : 0;
const int left_in_image = xd->left_available && left_mi;
const int above_in_image = xd->up_available && above_mi;
const int left_intra = left_in_image ? !is_inter_block(left_mbmi) : 1;
const int above_intra = above_in_image ? !is_inter_block(above_mbmi) : 1;
// Note:
// The mode info data structure has a one element border above and to the
// left of the entries corresponding to real macroblocks.
// The prediction flags in these dummy entries are initialised to 0.
if (above_in_image && left_in_image) { // both edges available
if (above_mbmi->ref_frame[0] == INTRA_FRAME &&
left_mbmi->ref_frame[0] == INTRA_FRAME) {
if (above_intra && left_intra) { // intra/intra
pred_context = 2;
} else if (above_mbmi->ref_frame[0] == INTRA_FRAME ||
left_mbmi->ref_frame[0] == INTRA_FRAME) {
const MB_MODE_INFO *edge_mbmi = above_mbmi->ref_frame[0] == INTRA_FRAME ?
left_mbmi : above_mbmi;
if (edge_mbmi->ref_frame[1] <= INTRA_FRAME)
} else if (above_intra || left_intra) { // intra/inter or inter/intra
const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
if (!has_second_ref(edge_mbmi))
pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST_FRAME);
else
pred_context = 1 + (edge_mbmi->ref_frame[0] == LAST_FRAME ||
edge_mbmi->ref_frame[1] == LAST_FRAME);
} else if (above_mbmi->ref_frame[1] <= INTRA_FRAME &&
left_mbmi->ref_frame[1] <= INTRA_FRAME) {
pred_context = 2 * (above_mbmi->ref_frame[0] == LAST_FRAME) +
2 * (left_mbmi->ref_frame[0] == LAST_FRAME);
} else if (above_mbmi->ref_frame[1] > INTRA_FRAME &&
left_mbmi->ref_frame[1] > INTRA_FRAME) {
pred_context = 1 + (above_mbmi->ref_frame[0] == LAST_FRAME ||
above_mbmi->ref_frame[1] == LAST_FRAME ||
left_mbmi->ref_frame[0] == LAST_FRAME ||
left_mbmi->ref_frame[1] == LAST_FRAME);
} else {
MV_REFERENCE_FRAME rfs = above_mbmi->ref_frame[1] <= INTRA_FRAME ?
above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0];
MV_REFERENCE_FRAME crf1 = above_mbmi->ref_frame[1] > INTRA_FRAME ?
above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0];
MV_REFERENCE_FRAME crf2 = above_mbmi->ref_frame[1] > INTRA_FRAME ?
above_mbmi->ref_frame[1] : left_mbmi->ref_frame[1];
} else { // inter/inter
if (!has_second_ref(above_mbmi) && !has_second_ref(left_mbmi)) {
pred_context = 2 * (above_mbmi->ref_frame[0] == LAST_FRAME) +
2 * (left_mbmi->ref_frame[0] == LAST_FRAME);
} else if (has_second_ref(above_mbmi) && has_second_ref(left_mbmi)) {
pred_context = 1 + (above_mbmi->ref_frame[0] == LAST_FRAME ||
above_mbmi->ref_frame[1] == LAST_FRAME ||
left_mbmi->ref_frame[0] == LAST_FRAME ||
left_mbmi->ref_frame[1] == LAST_FRAME);
} else {
const MV_REFERENCE_FRAME rfs = !has_second_ref(above_mbmi) ?
above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0];
const MV_REFERENCE_FRAME crf1 = has_second_ref(above_mbmi) ?
above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0];
const MV_REFERENCE_FRAME crf2 = has_second_ref(above_mbmi) ?
above_mbmi->ref_frame[1] : left_mbmi->ref_frame[1];
if (rfs == LAST_FRAME)
pred_context = 3 + (crf1 == LAST_FRAME || crf2 == LAST_FRAME);
else
pred_context = crf1 == LAST_FRAME || crf2 == LAST_FRAME;
if (rfs == LAST_FRAME)
pred_context = 3 + (crf1 == LAST_FRAME || crf2 == LAST_FRAME);
else
pred_context = crf1 == LAST_FRAME || crf2 == LAST_FRAME;
}
}
} else if (above_in_image || left_in_image) { // one edge available
const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
if (edge_mbmi->ref_frame[0] == INTRA_FRAME)
if (!is_inter_block(edge_mbmi)) { // intra
pred_context = 2;
else if (edge_mbmi->ref_frame[1] <= INTRA_FRAME)
pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST_FRAME);
else
pred_context = 1 + (edge_mbmi->ref_frame[0] == LAST_FRAME ||
edge_mbmi->ref_frame[1] == LAST_FRAME);
} else { // no edges available (2)
} else { // inter
if (!has_second_ref(edge_mbmi))
pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST_FRAME);
else
pred_context = 1 + (edge_mbmi->ref_frame[0] == LAST_FRAME ||
edge_mbmi->ref_frame[1] == LAST_FRAME);
}
} else { // no edges available
pred_context = 2;
}
assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
return pred_context;
}
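For intuition, here is the "one edge available" branch of the p1 context above as a self-checking sketch; the reference-frame values are mirrored locally from the enum in vp9_blockd.h and should be treated as assumptions, not authoritative:

#include <assert.h>

enum { NONE = -1, INTRA_FRAME, LAST_FRAME, GOLDEN_FRAME, ALTREF_FRAME };

/* One available neighbour: intra gives the neutral context 2; a
 * single-ref inter neighbour gives 0 or 4 depending on LAST_FRAME use;
 * a compound neighbour gives 1 or 2. */
static int single_ref_p1_one_edge(int ref0, int ref1) {
  if (ref0 == INTRA_FRAME)
    return 2;
  if (ref1 <= INTRA_FRAME)                    /* no second reference */
    return 4 * (ref0 == LAST_FRAME);
  return 1 + (ref0 == LAST_FRAME || ref1 == LAST_FRAME);
}

int main(void) {
  assert(single_ref_p1_one_edge(INTRA_FRAME, NONE) == 2);
  assert(single_ref_p1_one_edge(LAST_FRAME, NONE) == 4);
  assert(single_ref_p1_one_edge(GOLDEN_FRAME, NONE) == 0);
  assert(single_ref_p1_one_edge(GOLDEN_FRAME, ALTREF_FRAME) == 1);
  assert(single_ref_p1_one_edge(LAST_FRAME, ALTREF_FRAME) == 2);
  return 0;
}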
unsigned char vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) {
int pred_context;
const MODE_INFO *const mi = xd->mode_info_context;
const MB_MODE_INFO *const above_mbmi = &mi[-xd->mode_info_stride].mbmi;
const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi;
const int left_in_image = xd->left_available && left_mbmi->mb_in_image;
const int above_in_image = xd->up_available && above_mbmi->mb_in_image;
const MODE_INFO * const above_mi = xd->mi_8x8[-xd->mode_info_stride];
const MODE_INFO * const left_mi = xd->mi_8x8[-1];
const MB_MODE_INFO *const above_mbmi = above_mi ? &above_mi->mbmi : 0;
const MB_MODE_INFO *const left_mbmi = left_mi ? &left_mi->mbmi : 0;
const int left_in_image = xd->left_available && left_mi;
const int above_in_image = xd->up_available && above_mi;
const int left_intra = left_in_image ? !is_inter_block(left_mbmi) : 1;
const int above_intra = above_in_image ? !is_inter_block(above_mbmi) : 1;
// Note:
// The mode info data structure has a one element border above and to the
// left of the entries corresponding to real macroblocks.
// The prediction flags in these dummy entries are initialised to 0.
if (above_in_image && left_in_image) { // both edges available
if (above_mbmi->ref_frame[0] == INTRA_FRAME &&
left_mbmi->ref_frame[0] == INTRA_FRAME) {
if (above_intra && left_intra) { // intra/intra
pred_context = 2;
} else if (above_mbmi->ref_frame[0] == INTRA_FRAME ||
left_mbmi->ref_frame[0] == INTRA_FRAME) {
const MB_MODE_INFO *edge_mbmi = above_mbmi->ref_frame[0] == INTRA_FRAME ?
left_mbmi : above_mbmi;
if (edge_mbmi->ref_frame[1] <= INTRA_FRAME) {
} else if (above_intra || left_intra) { // intra/inter or inter/intra
const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
if (!has_second_ref(edge_mbmi)) {
if (edge_mbmi->ref_frame[0] == LAST_FRAME)
pred_context = 3;
else
@@ -296,54 +299,53 @@ unsigned char vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) {
pred_context = 1 + 2 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME ||
edge_mbmi->ref_frame[1] == GOLDEN_FRAME);
}
} else if (above_mbmi->ref_frame[1] <= INTRA_FRAME &&
left_mbmi->ref_frame[1] <= INTRA_FRAME) {
if (above_mbmi->ref_frame[0] == LAST_FRAME &&
left_mbmi->ref_frame[0] == LAST_FRAME) {
pred_context = 3;
} else if (above_mbmi->ref_frame[0] == LAST_FRAME ||
left_mbmi->ref_frame[0] == LAST_FRAME) {
const MB_MODE_INFO *edge_mbmi = above_mbmi->ref_frame[0] == LAST_FRAME ?
left_mbmi : above_mbmi;
} else { // inter/inter
if (!has_second_ref(above_mbmi) && !has_second_ref(left_mbmi)) {
if (above_mbmi->ref_frame[0] == LAST_FRAME &&
left_mbmi->ref_frame[0] == LAST_FRAME) {
pred_context = 3;
} else if (above_mbmi->ref_frame[0] == LAST_FRAME ||
left_mbmi->ref_frame[0] == LAST_FRAME) {
const MB_MODE_INFO *edge_mbmi =
above_mbmi->ref_frame[0] == LAST_FRAME ? left_mbmi : above_mbmi;
pred_context = 4 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME);
pred_context = 4 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME);
} else {
pred_context = 2 * (above_mbmi->ref_frame[0] == GOLDEN_FRAME) +
2 * (left_mbmi->ref_frame[0] == GOLDEN_FRAME);
}
} else if (has_second_ref(above_mbmi) && has_second_ref(left_mbmi)) {
if (above_mbmi->ref_frame[0] == left_mbmi->ref_frame[0] &&
above_mbmi->ref_frame[1] == left_mbmi->ref_frame[1])
pred_context = 3 * (above_mbmi->ref_frame[0] == GOLDEN_FRAME ||
above_mbmi->ref_frame[1] == GOLDEN_FRAME ||
left_mbmi->ref_frame[0] == GOLDEN_FRAME ||
left_mbmi->ref_frame[1] == GOLDEN_FRAME);
else
pred_context = 2;
} else {
pred_context = 2 * (above_mbmi->ref_frame[0] == GOLDEN_FRAME) +
2 * (left_mbmi->ref_frame[0] == GOLDEN_FRAME);
}
} else if (above_mbmi->ref_frame[1] > INTRA_FRAME &&
left_mbmi->ref_frame[1] > INTRA_FRAME) {
if (above_mbmi->ref_frame[0] == left_mbmi->ref_frame[0] &&
above_mbmi->ref_frame[1] == left_mbmi->ref_frame[1])
pred_context = 3 * (above_mbmi->ref_frame[0] == GOLDEN_FRAME ||
above_mbmi->ref_frame[1] == GOLDEN_FRAME ||
left_mbmi->ref_frame[0] == GOLDEN_FRAME ||
left_mbmi->ref_frame[1] == GOLDEN_FRAME);
else
pred_context = 2;
} else {
MV_REFERENCE_FRAME rfs = above_mbmi->ref_frame[1] <= INTRA_FRAME ?
above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0];
MV_REFERENCE_FRAME crf1 = above_mbmi->ref_frame[1] > INTRA_FRAME ?
above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0];
MV_REFERENCE_FRAME crf2 = above_mbmi->ref_frame[1] > INTRA_FRAME ?
above_mbmi->ref_frame[1] : left_mbmi->ref_frame[1];
const MV_REFERENCE_FRAME rfs = !has_second_ref(above_mbmi) ?
above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0];
const MV_REFERENCE_FRAME crf1 = has_second_ref(above_mbmi) ?
above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0];
const MV_REFERENCE_FRAME crf2 = has_second_ref(above_mbmi) ?
above_mbmi->ref_frame[1] : left_mbmi->ref_frame[1];
if (rfs == GOLDEN_FRAME)
pred_context = 3 + (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME);
else if (rfs == ALTREF_FRAME)
pred_context = crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME;
else
pred_context = 1 + 2 * (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME);
if (rfs == GOLDEN_FRAME)
pred_context = 3 + (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME);
else if (rfs == ALTREF_FRAME)
pred_context = crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME;
else
pred_context = 1 + 2 * (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME);
}
}
} else if (above_in_image || left_in_image) { // one edge available
const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
if (edge_mbmi->ref_frame[0] == INTRA_FRAME ||
(edge_mbmi->ref_frame[0] == LAST_FRAME &&
edge_mbmi->ref_frame[1] <= INTRA_FRAME))
if (!is_inter_block(edge_mbmi) ||
(edge_mbmi->ref_frame[0] == LAST_FRAME && !has_second_ref(edge_mbmi)))
pred_context = 2;
else if (edge_mbmi->ref_frame[1] <= INTRA_FRAME)
else if (!has_second_ref(edge_mbmi))
pred_context = 4 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME);
else
pred_context = 3 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME ||
@@ -359,22 +361,23 @@ unsigned char vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) {
// left of the entries corresponding to real blocks.
// The prediction flags in these dummy entries are initialized to 0.
unsigned char vp9_get_pred_context_tx_size(const MACROBLOCKD *xd) {
const MODE_INFO *const mi = xd->mode_info_context;
const MB_MODE_INFO *const above_mbmi = &mi[-xd->mode_info_stride].mbmi;
const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi;
const int left_in_image = xd->left_available && left_mbmi->mb_in_image;
const int above_in_image = xd->up_available && above_mbmi->mb_in_image;
const int max_tx_size = max_txsize_lookup[mi->mbmi.sb_type];
const MODE_INFO * const above_mi = xd->mi_8x8[-xd->mode_info_stride];
const MODE_INFO * const left_mi = xd->mi_8x8[-1];
const MB_MODE_INFO *const above_mbmi = above_mi ? &above_mi->mbmi : 0;
const MB_MODE_INFO *const left_mbmi = left_mi ? &left_mi->mbmi : 0;
const int left_in_image = xd->left_available && left_mi;
const int above_in_image = xd->up_available && above_mi;
const int max_tx_size = max_txsize_lookup[xd->mi_8x8[0]->mbmi.sb_type];
int above_context = max_tx_size;
int left_context = max_tx_size;
if (above_in_image)
above_context = above_mbmi->mb_skip_coeff ? max_tx_size
: above_mbmi->txfm_size;
above_context = above_mbmi->skip_coeff ? max_tx_size
: above_mbmi->tx_size;
if (left_in_image)
left_context = left_mbmi->mb_skip_coeff ? max_tx_size
: left_mbmi->txfm_size;
left_context = left_mbmi->skip_coeff ? max_tx_size
: left_mbmi->tx_size;
if (!left_in_image)
left_context = above_context;
@@ -385,36 +388,17 @@ unsigned char vp9_get_pred_context_tx_size(const MACROBLOCKD *xd) {
return above_context + left_context > max_tx_size;
}
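The tx-size context thus collapses to a single comparison. A sketch under two assumptions: skip handling is folded into the caller (which passes max_tx_size for a skipped neighbour), and a missing above edge mirrors the left edge the same way the shown code mirrors a missing left edge:

#include <stdio.h>

/* Each neighbour contributes its transform size (or max_tx_size when
 * skipped or unavailable); a missing edge mirrors the other one. The
 * context is one bit: do the contributions exceed max_tx_size? */
static int tx_size_context(int have_above, int above_tx,
                           int have_left, int left_tx, int max_tx_size) {
  int above_ctx = have_above ? above_tx : max_tx_size;
  int left_ctx = have_left ? left_tx : max_tx_size;
  if (!have_left)
    left_ctx = above_ctx;
  if (!have_above)
    above_ctx = left_ctx;
  return above_ctx + left_ctx > max_tx_size;
}

int main(void) {
  /* Illustrative sizes: max_tx_size 3 for a 32x32 block (assumed). */
  printf("%d\n", tx_size_context(1, 1, 1, 1, 3));  /* 8x8 + 8x8     -> 0 */
  printf("%d\n", tx_size_context(1, 2, 1, 2, 3));  /* 16x16 + 16x16 -> 1 */
  printf("%d\n", tx_size_context(0, 0, 1, 3, 3));  /* left edge only -> 1 */
  return 0;
}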
void vp9_set_pred_flag_seg_id(VP9_COMMON *cm, BLOCK_SIZE_TYPE bsize,
int mi_row, int mi_col, uint8_t pred_flag) {
MODE_INFO *mi = &cm->mi[mi_row * cm->mode_info_stride + mi_col];
const int bw = 1 << mi_width_log2(bsize);
const int bh = 1 << mi_height_log2(bsize);
const int xmis = MIN(cm->mi_cols - mi_col, bw);
const int ymis = MIN(cm->mi_rows - mi_row, bh);
int x, y;
for (y = 0; y < ymis; y++)
for (x = 0; x < xmis; x++)
mi[y * cm->mode_info_stride + x].mbmi.seg_id_predicted = pred_flag;
void vp9_set_pred_flag_seg_id(MACROBLOCKD *xd, uint8_t pred_flag) {
xd->this_mi->mbmi.seg_id_predicted = pred_flag;
}
void vp9_set_pred_flag_mbskip(VP9_COMMON *cm, BLOCK_SIZE_TYPE bsize,
int mi_row, int mi_col, uint8_t pred_flag) {
MODE_INFO *mi = &cm->mi[mi_row * cm->mode_info_stride + mi_col];
const int bw = 1 << mi_width_log2(bsize);
const int bh = 1 << mi_height_log2(bsize);
const int xmis = MIN(cm->mi_cols - mi_col, bw);
const int ymis = MIN(cm->mi_rows - mi_row, bh);
int x, y;
for (y = 0; y < ymis; y++)
for (x = 0; x < xmis; x++)
mi[y * cm->mode_info_stride + x].mbmi.mb_skip_coeff = pred_flag;
void vp9_set_pred_flag_mbskip(MACROBLOCKD *xd, BLOCK_SIZE bsize,
uint8_t pred_flag) {
xd->this_mi->mbmi.skip_coeff = pred_flag;
}
int vp9_get_segment_id(VP9_COMMON *cm, const uint8_t *segment_ids,
BLOCK_SIZE_TYPE bsize, int mi_row, int mi_col) {
BLOCK_SIZE bsize, int mi_row, int mi_col) {
const int mi_offset = mi_row * cm->mi_cols + mi_col;
const int bw = 1 << mi_width_log2(bsize);
const int bh = 1 << mi_height_log2(bsize);


@@ -15,32 +15,32 @@
#include "vp9/common/vp9_onyxc_int.h"
int vp9_get_segment_id(VP9_COMMON *cm, const uint8_t *segment_ids,
BLOCK_SIZE_TYPE bsize, int mi_row, int mi_col);
BLOCK_SIZE bsize, int mi_row, int mi_col);
static INLINE int vp9_get_pred_context_seg_id(const MACROBLOCKD *xd) {
const MODE_INFO *const mi = xd->mode_info_context;
const MB_MODE_INFO *const above_mbmi = &mi[-xd->mode_info_stride].mbmi;
const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi;
const MODE_INFO * const above_mi = xd->mi_8x8[-xd->mode_info_stride];
const MODE_INFO * const left_mi = xd->mi_8x8[-1];
const int above_sip = above_mi ? above_mi->mbmi.seg_id_predicted : 0;
const int left_sip = left_mi ? left_mi->mbmi.seg_id_predicted : 0;
return above_mbmi->seg_id_predicted +
(xd->left_available ? left_mbmi->seg_id_predicted : 0);
return above_sip + (xd->left_available ? left_sip : 0);
}
static INLINE vp9_prob vp9_get_pred_prob_seg_id(const MACROBLOCKD *xd) {
return xd->seg.pred_probs[vp9_get_pred_context_seg_id(xd)];
static INLINE vp9_prob vp9_get_pred_prob_seg_id(struct segmentation *seg,
const MACROBLOCKD *xd) {
return seg->pred_probs[vp9_get_pred_context_seg_id(xd)];
}
void vp9_set_pred_flag_seg_id(VP9_COMMON *cm, BLOCK_SIZE_TYPE bsize,
int mi_row, int mi_col, uint8_t pred_flag);
void vp9_set_pred_flag_seg_id(MACROBLOCKD *xd, uint8_t pred_flag);
static INLINE int vp9_get_pred_context_mbskip(const MACROBLOCKD *xd) {
const MODE_INFO *const mi = xd->mode_info_context;
const MB_MODE_INFO *const above_mbmi = &mi[-xd->mode_info_stride].mbmi;
const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi;
const MODE_INFO * const above_mi = xd->mi_8x8[-xd->mode_info_stride];
const MODE_INFO * const left_mi = xd->mi_8x8[-1];
const int above_skip_coeff = above_mi ? above_mi->mbmi.skip_coeff : 0;
const int left_skip_coeff = left_mi ? left_mi->mbmi.skip_coeff : 0;
return above_mbmi->mb_skip_coeff +
(xd->left_available ? left_mbmi->mb_skip_coeff : 0);
return above_skip_coeff + (xd->left_available ? left_skip_coeff : 0);
}
static INLINE vp9_prob vp9_get_pred_prob_mbskip(const VP9_COMMON *cm,
@@ -49,11 +49,11 @@ static INLINE vp9_prob vp9_get_pred_prob_mbskip(const VP9_COMMON *cm,
}
static INLINE unsigned char vp9_get_pred_flag_mbskip(const MACROBLOCKD *xd) {
return xd->mode_info_context->mbmi.mb_skip_coeff;
return xd->this_mi->mbmi.skip_coeff;
}
void vp9_set_pred_flag_mbskip(VP9_COMMON *cm, BLOCK_SIZE_TYPE bsize,
int mi_row, int mi_col, uint8_t pred_flag);
void vp9_set_pred_flag_mbskip(MACROBLOCKD *xd, BLOCK_SIZE bsize,
uint8_t pred_flag);
unsigned char vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd);
@@ -102,7 +102,7 @@ static INLINE vp9_prob vp9_get_pred_prob_single_ref_p2(const VP9_COMMON *cm,
unsigned char vp9_get_pred_context_tx_size(const MACROBLOCKD *xd);
static const vp9_prob *get_tx_probs(BLOCK_SIZE_TYPE bsize, uint8_t context,
static const vp9_prob *get_tx_probs(BLOCK_SIZE bsize, uint8_t context,
const struct tx_probs *tx_probs) {
if (bsize < BLOCK_16X16)
return tx_probs->p8x8[context];
@@ -113,13 +113,14 @@ static const vp9_prob *get_tx_probs(BLOCK_SIZE_TYPE bsize, uint8_t context,
}
static const vp9_prob *get_tx_probs2(const MACROBLOCKD *xd,
const struct tx_probs *tx_probs) {
const BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;
const struct tx_probs *tx_probs,
const MODE_INFO *m) {
const BLOCK_SIZE bsize = m->mbmi.sb_type;
const int context = vp9_get_pred_context_tx_size(xd);
return get_tx_probs(bsize, context, tx_probs);
}
static void update_tx_counts(BLOCK_SIZE_TYPE bsize, uint8_t context,
static void update_tx_counts(BLOCK_SIZE bsize, uint8_t context,
TX_SIZE tx_size, struct tx_counts *tx_counts) {
if (bsize >= BLOCK_32X32)
tx_counts->p32x32[context][tx_size]++;


@@ -130,12 +130,12 @@ int16_t vp9_ac_quant(int qindex, int delta) {
}
int vp9_get_qindex(MACROBLOCKD *xd, int segment_id, int base_qindex) {
if (vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_ALT_Q)) {
const int data = vp9_get_segdata(&xd->seg, segment_id, SEG_LVL_ALT_Q);
return xd->seg.abs_delta == SEGMENT_ABSDATA ?
data : // Abs value
clamp(base_qindex + data, 0, MAXQ); // Delta value
int vp9_get_qindex(struct segmentation *seg, int segment_id, int base_qindex) {
if (vp9_segfeature_active(seg, segment_id, SEG_LVL_ALT_Q)) {
const int data = vp9_get_segdata(seg, segment_id, SEG_LVL_ALT_Q);
return seg->abs_delta == SEGMENT_ABSDATA ?
data : // Abs value
clamp(base_qindex + data, 0, MAXQ); // Delta value
} else {
return base_qindex;
}
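A worked standalone example of the rule above, with MAXQ mirrored as 255 (assumed to match vp9_quant_common.h):

#include <stdio.h>

#define MAXQ 255  /* assumed to match vp9_quant_common.h */

static int clamp_int(int v, int lo, int hi) {
  return v < lo ? lo : (v > hi ? hi : v);
}

/* active/abs_data stand in for vp9_segfeature_active() and
 * seg->abs_delta == SEGMENT_ABSDATA; data is the SEG_LVL_ALT_Q value. */
static int get_qindex(int active, int abs_data, int data, int base_qindex) {
  if (!active)
    return base_qindex;
  return abs_data ? data : clamp_int(base_qindex + data, 0, MAXQ);
}

int main(void) {
  printf("%d\n", get_qindex(0, 0, 0, 100));    /* feature off: 100 */
  printf("%d\n", get_qindex(1, 1, 40, 100));   /* absolute: 40 */
  printf("%d\n", get_qindex(1, 0, -30, 100));  /* delta: 70 */
  printf("%d\n", get_qindex(1, 0, 200, 100));  /* delta, clamped: 255 */
  return 0;
}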


@@ -23,6 +23,6 @@ void vp9_init_quant_tables();
int16_t vp9_dc_quant(int qindex, int delta);
int16_t vp9_ac_quant(int qindex, int delta);
int vp9_get_qindex(MACROBLOCKD *mb, int segment_id, int base_qindex);
int vp9_get_qindex(struct segmentation *seg, int segment_id, int base_qindex);
#endif // VP9_COMMON_VP9_QUANT_COMMON_H_


@@ -10,171 +10,27 @@
#include <assert.h>
#include "./vpx_scale_rtcd.h"
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_filter.h"
#include "vp9/common/vp9_reconinter.h"
#include "vp9/common/vp9_reconintra.h"
#include "./vpx_scale_rtcd.h"
static int scale_value_x_with_scaling(int val,
const struct scale_factors *scale) {
return (val * scale->x_scale_fp >> VP9_REF_SCALE_SHIFT);
}
static int scale_value_y_with_scaling(int val,
const struct scale_factors *scale) {
return (val * scale->y_scale_fp >> VP9_REF_SCALE_SHIFT);
}
static int unscaled_value(int val, const struct scale_factors *scale) {
(void) scale;
return val;
}
static MV32 mv_q3_to_q4_with_scaling(const MV *mv,
const struct scale_factors *scale) {
const MV32 res = {
((mv->row << 1) * scale->y_scale_fp >> VP9_REF_SCALE_SHIFT)
+ scale->y_offset_q4,
((mv->col << 1) * scale->x_scale_fp >> VP9_REF_SCALE_SHIFT)
+ scale->x_offset_q4
};
return res;
}
static MV32 mv_q3_to_q4_without_scaling(const MV *mv,
const struct scale_factors *scale) {
const MV32 res = {
mv->row << 1,
mv->col << 1
};
return res;
}
static MV32 mv_q4_with_scaling(const MV *mv,
const struct scale_factors *scale) {
const MV32 res = {
(mv->row * scale->y_scale_fp >> VP9_REF_SCALE_SHIFT) + scale->y_offset_q4,
(mv->col * scale->x_scale_fp >> VP9_REF_SCALE_SHIFT) + scale->x_offset_q4
};
return res;
}
static MV32 mv_q4_without_scaling(const MV *mv,
const struct scale_factors *scale) {
const MV32 res = {
mv->row,
mv->col
};
return res;
}
static void set_offsets_with_scaling(struct scale_factors *scale,
int row, int col) {
const int x_q4 = 16 * col;
const int y_q4 = 16 * row;
scale->x_offset_q4 = (x_q4 * scale->x_scale_fp >> VP9_REF_SCALE_SHIFT) & 0xf;
scale->y_offset_q4 = (y_q4 * scale->y_scale_fp >> VP9_REF_SCALE_SHIFT) & 0xf;
}
static void set_offsets_without_scaling(struct scale_factors *scale,
int row, int col) {
scale->x_offset_q4 = 0;
scale->y_offset_q4 = 0;
}
static int get_fixed_point_scale_factor(int other_size, int this_size) {
// Calculate scaling factor once for each reference frame
// and use fixed point scaling factors in decoding and encoding routines.
// Hardware implementations can calculate scale factor in device driver
// and use multiplication and shifting on hardware instead of division.
return (other_size << VP9_REF_SCALE_SHIFT) / this_size;
}
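A worked example of the fixed-point factor, assuming VP9_REF_SCALE_SHIFT is 14 (so factors are Q14 and steps are sixteenths of a source pixel):

#include <stdio.h>

#define REF_SCALE_SHIFT 14  /* assumed value of VP9_REF_SCALE_SHIFT */

int main(void) {
  /* Scaling a 1920-wide reference to a 1280-wide frame: the Q14 factor
   * is (1920 << 14) / 1280 = 24576, i.e. 1.5, and the per-output-pixel
   * step is 16 * 24576 >> 14 = 24 sixteenths of a source pixel.
   * An unscaled axis gives exactly 1 << 14 and a step of 16, which is
   * what the fast-path selection below tests for. */
  const int x_scale_fp = (1920 << REF_SCALE_SHIFT) / 1280;
  const int x_step_q4 = 16 * x_scale_fp >> REF_SCALE_SHIFT;
  printf("x_scale_fp = %d, x_step_q4 = %d\n", x_scale_fp, x_step_q4);
  return 0;
}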
void vp9_setup_scale_factors_for_frame(struct scale_factors *scale,
int other_w, int other_h,
int this_w, int this_h) {
scale->x_scale_fp = get_fixed_point_scale_factor(other_w, this_w);
scale->x_offset_q4 = 0; // calculated per-mb
scale->x_step_q4 = (16 * scale->x_scale_fp >> VP9_REF_SCALE_SHIFT);
scale->y_scale_fp = get_fixed_point_scale_factor(other_h, this_h);
scale->y_offset_q4 = 0; // calculated per-mb
scale->y_step_q4 = (16 * scale->y_scale_fp >> VP9_REF_SCALE_SHIFT);
if ((other_w == this_w) && (other_h == this_h)) {
scale->scale_value_x = unscaled_value;
scale->scale_value_y = unscaled_value;
scale->set_scaled_offsets = set_offsets_without_scaling;
scale->scale_mv_q3_to_q4 = mv_q3_to_q4_without_scaling;
scale->scale_mv_q4 = mv_q4_without_scaling;
} else {
scale->scale_value_x = scale_value_x_with_scaling;
scale->scale_value_y = scale_value_y_with_scaling;
scale->set_scaled_offsets = set_offsets_with_scaling;
scale->scale_mv_q3_to_q4 = mv_q3_to_q4_with_scaling;
scale->scale_mv_q4 = mv_q4_with_scaling;
}
// TODO(agrange): Investigate the best choice of functions to use here
// for EIGHTTAP_SMOOTH. Since it is not interpolating, need to choose what
// to do at full-pel offsets. The current selection, where the filter is
// applied in one direction only, and not at all for 0,0, seems to give the
// best quality, but it may be worth trying an additional mode that does
// do the filtering on full-pel.
if (scale->x_step_q4 == 16) {
if (scale->y_step_q4 == 16) {
// No scaling in either direction.
scale->predict[0][0][0] = vp9_convolve_copy;
scale->predict[0][0][1] = vp9_convolve_avg;
scale->predict[0][1][0] = vp9_convolve8_vert;
scale->predict[0][1][1] = vp9_convolve8_avg_vert;
scale->predict[1][0][0] = vp9_convolve8_horiz;
scale->predict[1][0][1] = vp9_convolve8_avg_horiz;
} else {
// No scaling in x direction. Must always scale in the y direction.
scale->predict[0][0][0] = vp9_convolve8_vert;
scale->predict[0][0][1] = vp9_convolve8_avg_vert;
scale->predict[0][1][0] = vp9_convolve8_vert;
scale->predict[0][1][1] = vp9_convolve8_avg_vert;
scale->predict[1][0][0] = vp9_convolve8;
scale->predict[1][0][1] = vp9_convolve8_avg;
}
} else {
if (scale->y_step_q4 == 16) {
// No scaling in the y direction. Must always scale in the x direction.
scale->predict[0][0][0] = vp9_convolve8_horiz;
scale->predict[0][0][1] = vp9_convolve8_avg_horiz;
scale->predict[0][1][0] = vp9_convolve8;
scale->predict[0][1][1] = vp9_convolve8_avg;
scale->predict[1][0][0] = vp9_convolve8_horiz;
scale->predict[1][0][1] = vp9_convolve8_avg_horiz;
} else {
// Must always scale in both directions.
scale->predict[0][0][0] = vp9_convolve8;
scale->predict[0][0][1] = vp9_convolve8_avg;
scale->predict[0][1][0] = vp9_convolve8;
scale->predict[0][1][1] = vp9_convolve8_avg;
scale->predict[1][0][0] = vp9_convolve8;
scale->predict[1][0][1] = vp9_convolve8_avg;
}
}
// 2D subpel motion always gets filtered in both directions
scale->predict[1][1][0] = vp9_convolve8;
scale->predict[1][1][1] = vp9_convolve8_avg;
}
void vp9_setup_interp_filters(MACROBLOCKD *xd,
INTERPOLATIONFILTERTYPE mcomp_filter_type,
VP9_COMMON *cm) {
if (xd->mode_info_context) {
MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
if (xd->mi_8x8 && xd->this_mi) {
MB_MODE_INFO * mbmi = &xd->this_mi->mbmi;
set_scale_factors(xd, mbmi->ref_frame[0] - 1, mbmi->ref_frame[1] - 1,
cm->active_ref_scale);
} else {
set_scale_factors(xd, -1, -1, cm->active_ref_scale);
}
switch (mcomp_filter_type) {
@@ -199,17 +55,18 @@ void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
const MV *src_mv,
const struct scale_factors *scale,
int w, int h, int weight,
int w, int h, int ref,
const struct subpix_fn_table *subpix,
enum mv_precision precision) {
const MV32 mv = precision == MV_PRECISION_Q4
? scale->scale_mv_q4(src_mv, scale)
: scale->scale_mv_q3_to_q4(src_mv, scale);
const int subpel_x = mv.col & 15;
const int subpel_y = mv.row & 15;
const int is_q4 = precision == MV_PRECISION_Q4;
const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row << 1,
is_q4 ? src_mv->col : src_mv->col << 1 };
const MV32 mv = scale->scale_mv(&mv_q4, scale);
const int subpel_x = mv.col & SUBPEL_MASK;
const int subpel_y = mv.row & SUBPEL_MASK;
src += (mv.row >> 4) * src_stride + (mv.col >> 4);
scale->predict[!!subpel_x][!!subpel_y][weight](
src += (mv.row >> SUBPEL_BITS) * src_stride + (mv.col >> SUBPEL_BITS);
scale->predict[subpel_x != 0][subpel_y != 0][ref](
src, src_stride, dst, dst_stride,
subpix->filter_x[subpel_x], scale->x_step_q4,
subpix->filter_y[subpel_y], scale->y_step_q4,
@@ -232,20 +89,16 @@ static MV mi_mv_pred_q4(const MODE_INFO *mi, int idx) {
return res;
}
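In vp9_build_inter_predictor above, a q4 vector splits into a full-pel offset (mv >> SUBPEL_BITS) and a 1/16-pel phase (mv & SUBPEL_MASK) that indexes the filter bank. A small worked example, with SUBPEL_BITS = 4 assumed:

#include <stdio.h>

#define SUBPEL_BITS 4                        /* assumed: q4, 1/16-pel MVs */
#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)

int main(void) {
  const int mv[2] = { 37, -5 };  /* q4 components: 2 + 5/16 and -5/16 */
  int i;
  for (i = 0; i < 2; ++i)
    printf("mv %3d -> full-pel %2d + subpel %2d/16\n",
           mv[i], mv[i] >> SUBPEL_BITS, mv[i] & SUBPEL_MASK);
  /* 37 -> 2 + 5/16; -5 -> -1 + 11/16: the arithmetic shift floors, so
   * the subpel phase used to pick the filter is always non-negative. */
  return 0;
}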
// TODO(jkoleszar): yet another mv clamping function :-(
MV clamp_mv_to_umv_border_sb(const MV *src_mv,
int bwl, int bhl, int ss_x, int ss_y,
int mb_to_left_edge, int mb_to_top_edge,
int mb_to_right_edge, int mb_to_bottom_edge) {
MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, const MV *src_mv,
int bw, int bh, int ss_x, int ss_y) {
// If the MV points so far into the UMV border that no visible pixels
// are used for reconstruction, the subpel part of the MV can be
// discarded and the MV limited to 16 pixels with equivalent results.
const int spel_left = (VP9_INTERP_EXTEND + (4 << bwl)) << 4;
const int spel_right = spel_left - (1 << 4);
const int spel_top = (VP9_INTERP_EXTEND + (4 << bhl)) << 4;
const int spel_bottom = spel_top - (1 << 4);
const int spel_left = (VP9_INTERP_EXTEND + bw) << SUBPEL_BITS;
const int spel_right = spel_left - SUBPEL_SHIFTS;
const int spel_top = (VP9_INTERP_EXTEND + bh) << SUBPEL_BITS;
const int spel_bottom = spel_top - SUBPEL_SHIFTS;
MV clamped_mv = {
src_mv->row << (1 - ss_y),
src_mv->col << (1 - ss_x)
@@ -253,130 +106,143 @@ MV clamp_mv_to_umv_border_sb(const MV *src_mv,
assert(ss_x <= 1);
assert(ss_y <= 1);
clamp_mv(&clamped_mv, (mb_to_left_edge << (1 - ss_x)) - spel_left,
(mb_to_right_edge << (1 - ss_x)) + spel_right,
(mb_to_top_edge << (1 - ss_y)) - spel_top,
(mb_to_bottom_edge << (1 - ss_y)) + spel_bottom);
clamp_mv(&clamped_mv, (xd->mb_to_left_edge << (1 - ss_x)) - spel_left,
(xd->mb_to_right_edge << (1 - ss_x)) + spel_right,
(xd->mb_to_top_edge << (1 - ss_y)) - spel_top,
(xd->mb_to_bottom_edge << (1 - ss_y)) + spel_bottom);
return clamped_mv;
}
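Plugging in numbers, assuming VP9_INTERP_EXTEND is 4 and q4 vectors (SUBPEL_BITS = 4, SUBPEL_SHIFTS = 16):

#include <stdio.h>

int main(void) {
  /* Assumed constants: VP9_INTERP_EXTEND = 4 pixels of filter reach,
   * SUBPEL_BITS = 4 and SUBPEL_SHIFTS = 16 for q4 motion vectors. */
  const int INTERP_EXTEND = 4, SUBPEL_BITS = 4, SUBPEL_SHIFTS = 16;
  const int bw = 16, bh = 16;  /* a 16x16 luma block */
  const int spel_left = (INTERP_EXTEND + bw) << SUBPEL_BITS;
  const int spel_right = spel_left - SUBPEL_SHIFTS;
  const int spel_top = (INTERP_EXTEND + bh) << SUBPEL_BITS;
  const int spel_bottom = spel_top - SUBPEL_SHIFTS;
  printf("left %d right %d top %d bottom %d\n",
         spel_left, spel_right, spel_top, spel_bottom);
  /* 320/304/320/304: up to 20 full pixels past the near edges before no
   * visible pixel is touched, one subpel step less on the far edges. */
  return 0;
}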
struct build_inter_predictors_args {
MACROBLOCKD *xd;
int x;
int y;
uint8_t* dst[MAX_MB_PLANE];
int dst_stride[MAX_MB_PLANE];
uint8_t* pre[2][MAX_MB_PLANE];
int pre_stride[2][MAX_MB_PLANE];
int x, y;
};
static void build_inter_predictors(int plane, int block,
BLOCK_SIZE_TYPE bsize,
static void build_inter_predictors(int plane, int block, BLOCK_SIZE bsize,
int pred_w, int pred_h,
void *argv) {
const struct build_inter_predictors_args* const arg = argv;
MACROBLOCKD * const xd = arg->xd;
const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y;
const int x = 4 * (block & ((1 << bwl) - 1)), y = 4 * (block >> bwl);
const MODE_INFO *const mi = xd->mode_info_context;
MACROBLOCKD *const xd = arg->xd;
struct macroblockd_plane *const pd = &xd->plane[plane];
const int bwl = b_width_log2(bsize) - pd->subsampling_x;
const int bw = 4 << bwl;
const int bh = plane_block_height(bsize, pd);
const int x = 4 * (block & ((1 << bwl) - 1));
const int y = 4 * (block >> bwl);
const MODE_INFO *mi = xd->this_mi;
const int use_second_ref = mi->mbmi.ref_frame[1] > 0;
int which_mv;
int ref;
assert(x < (4 << bwl));
assert(y < (4 << bhl));
assert(mi->mbmi.sb_type < BLOCK_8X8 || 4 << pred_w == (4 << bwl));
assert(mi->mbmi.sb_type < BLOCK_8X8 || 4 << pred_h == (4 << bhl));
assert(x < bw);
assert(y < bh);
assert(mi->mbmi.sb_type < BLOCK_8X8 || 4 << pred_w == bw);
assert(mi->mbmi.sb_type < BLOCK_8X8 || 4 << pred_h == bh);
for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {
// source
const uint8_t * const base_pre = arg->pre[which_mv][plane];
const int pre_stride = arg->pre_stride[which_mv][plane];
const uint8_t *const pre = base_pre +
scaled_buffer_offset(x, y, pre_stride, &xd->scale_factor[which_mv]);
struct scale_factors * const scale = &xd->scale_factor[which_mv];
for (ref = 0; ref < 1 + use_second_ref; ++ref) {
struct scale_factors *const scale = &xd->scale_factor[ref];
struct buf_2d *const pre_buf = &pd->pre[ref];
struct buf_2d *const dst_buf = &pd->dst;
// dest
uint8_t *const dst = arg->dst[plane] + arg->dst_stride[plane] * y + x;
const uint8_t *const pre = pre_buf->buf + scaled_buffer_offset(x, y,
pre_buf->stride, scale);
uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
// TODO(jkoleszar): All chroma MVs in SPLITMV mode are taken as the
// same MV (the average of the 4 luma MVs) but we could do something
// smarter for non-4:2:0. Just punt for now, pending the changes to get
// rid of SPLITMV mode entirely.
const MV mv = mi->mbmi.sb_type < BLOCK_8X8
? (plane == 0 ? mi->bmi[block].as_mv[which_mv].as_mv
: mi_mv_pred_q4(mi, which_mv))
: mi->mbmi.mv[which_mv].as_mv;
? (plane == 0 ? mi->bmi[block].as_mv[ref].as_mv
: mi_mv_pred_q4(mi, ref))
: mi->mbmi.mv[ref].as_mv;
// TODO(jkoleszar): This clamping is done in the incorrect place for the
// scaling case. It needs to be done on the scaled MV, not the pre-scaling
// MV. Note however that it performs the subsampling aware scaling so
// that the result is always q4.
const MV res_mv = clamp_mv_to_umv_border_sb(&mv, bwl, bhl,
xd->plane[plane].subsampling_x,
xd->plane[plane].subsampling_y,
xd->mb_to_left_edge,
xd->mb_to_top_edge,
xd->mb_to_right_edge,
xd->mb_to_bottom_edge);
const MV res_mv = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh,
pd->subsampling_x,
pd->subsampling_y);
scale->set_scaled_offsets(scale, arg->y + y, arg->x + x);
vp9_build_inter_predictor(pre, pre_stride,
dst, arg->dst_stride[plane],
&res_mv, &xd->scale_factor[which_mv],
4 << pred_w, 4 << pred_h, which_mv,
vp9_build_inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
&res_mv, scale,
4 << pred_w, 4 << pred_h, ref,
&xd->subpix, MV_PRECISION_Q4);
}
}
void vp9_build_inter_predictors_sby(MACROBLOCKD *xd,
int mi_row,
int mi_col,
BLOCK_SIZE_TYPE bsize) {
struct build_inter_predictors_args args = {
xd, mi_col * MI_SIZE, mi_row * MI_SIZE,
{xd->plane[0].dst.buf, NULL, NULL}, {xd->plane[0].dst.stride, 0, 0},
{{xd->plane[0].pre[0].buf, NULL, NULL},
{xd->plane[0].pre[1].buf, NULL, NULL}},
{{xd->plane[0].pre[0].stride, 0, 0}, {xd->plane[0].pre[1].stride, 0, 0}},
};
foreach_predicted_block_in_plane(xd, bsize, 0, build_inter_predictors, &args);
}
void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd,
int mi_row,
int mi_col,
BLOCK_SIZE_TYPE bsize) {
struct build_inter_predictors_args args = {
xd, mi_col * MI_SIZE, mi_row * MI_SIZE,
#if CONFIG_ALPHA
{NULL, xd->plane[1].dst.buf, xd->plane[2].dst.buf,
xd->plane[3].dst.buf},
{0, xd->plane[1].dst.stride, xd->plane[1].dst.stride,
xd->plane[3].dst.stride},
{{NULL, xd->plane[1].pre[0].buf, xd->plane[2].pre[0].buf,
xd->plane[3].pre[0].buf},
{NULL, xd->plane[1].pre[1].buf, xd->plane[2].pre[1].buf,
xd->plane[3].pre[1].buf}},
{{0, xd->plane[1].pre[0].stride, xd->plane[1].pre[0].stride,
xd->plane[3].pre[0].stride},
{0, xd->plane[1].pre[1].stride, xd->plane[1].pre[1].stride,
xd->plane[3].pre[1].stride}},
#else
{NULL, xd->plane[1].dst.buf, xd->plane[2].dst.buf},
{0, xd->plane[1].dst.stride, xd->plane[1].dst.stride},
{{NULL, xd->plane[1].pre[0].buf, xd->plane[2].pre[0].buf},
{NULL, xd->plane[1].pre[1].buf, xd->plane[2].pre[1].buf}},
{{0, xd->plane[1].pre[0].stride, xd->plane[1].pre[0].stride},
{0, xd->plane[1].pre[1].stride, xd->plane[1].pre[1].stride}},
#endif
};
foreach_predicted_block_uv(xd, bsize, build_inter_predictors, &args);
}
void vp9_build_inter_predictors_sb(MACROBLOCKD *xd,
int mi_row, int mi_col,
BLOCK_SIZE_TYPE bsize) {
// TODO(jkoleszar): In principle, pred_w, pred_h are unnecessary, as we could
// calculate the subsampled BLOCK_SIZE, but that type isn't defined for
// sizes smaller than 16x16 yet.
typedef void (*foreach_predicted_block_visitor)(int plane, int block,
BLOCK_SIZE bsize,
int pred_w, int pred_h,
void *arg);
static INLINE void foreach_predicted_block_in_plane(
const MACROBLOCKD* const xd, BLOCK_SIZE bsize, int plane,
foreach_predicted_block_visitor visit, void *arg) {
int i, x, y;
vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
vp9_build_inter_predictors_sbuv(xd, mi_row, mi_col, bsize);
// block sizes in number of 4x4 blocks log 2 ("*_b")
// 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8
// subsampled size of the block
const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y;
// size of the predictor to use.
int pred_w, pred_h;
if (xd->this_mi->mbmi.sb_type < BLOCK_8X8) {
assert(bsize == BLOCK_8X8);
pred_w = 0;
pred_h = 0;
} else {
pred_w = bwl;
pred_h = bhl;
}
assert(pred_w <= bwl);
assert(pred_h <= bhl);
// visit each subblock in raster order
i = 0;
for (y = 0; y < 1 << bhl; y += 1 << pred_h) {
for (x = 0; x < 1 << bwl; x += 1 << pred_w) {
visit(plane, i, bsize, pred_w, pred_h, arg);
i += 1 << pred_w;
}
i += (1 << (bwl + pred_h)) - (1 << bwl);
}
}
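A standalone walk of that visiting order, with illustrative sizes:

#include <stdio.h>

/* The plane is 1 << bwl sub-blocks wide; the visitor steps 1 << pred_w
 * across and 1 << pred_h down, with i advancing in raster 4x4 units. */
int main(void) {
  const int bwl = 2, bhl = 2;       /* a 16x16 luma plane in 4x4 units */
  const int pred_w = 1, pred_h = 1; /* 8x8 predictors */
  int i = 0, x, y;
  for (y = 0; y < 1 << bhl; y += 1 << pred_h) {
    for (x = 0; x < 1 << bwl; x += 1 << pred_w) {
      printf("visit block %d at (%d,%d)\n", i, x, y);
      i += 1 << pred_w;
    }
    i += (1 << (bwl + pred_h)) - (1 << bwl);
  }
  /* Prints blocks 0, 2, 8, 10: the top-left 4x4 index of each 8x8. */
  return 0;
}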
static void build_inter_predictors_for_planes(MACROBLOCKD *xd, BLOCK_SIZE bsize,
int mi_row, int mi_col,
int plane_from, int plane_to) {
int plane;
for (plane = plane_from; plane <= plane_to; ++plane) {
struct build_inter_predictors_args args = {
xd, mi_col * MI_SIZE, mi_row * MI_SIZE,
};
foreach_predicted_block_in_plane(xd, bsize, plane, build_inter_predictors,
&args);
}
}
void vp9_build_inter_predictors_sby(MACROBLOCKD *xd, int mi_row, int mi_col,
BLOCK_SIZE bsize) {
build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 0, 0);
}
void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd, int mi_row, int mi_col,
BLOCK_SIZE bsize) {
build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 1,
MAX_MB_PLANE - 1);
}
void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
BLOCK_SIZE bsize) {
build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 0,
MAX_MB_PLANE - 1);
}
// TODO(dkovalev): find better place for this function
@@ -391,8 +257,7 @@ void vp9_setup_scale_factors(VP9_COMMON *cm, int i) {
fb->y_crop_width, fb->y_crop_height,
cm->width, cm->height);
if (sf->x_scale_fp != VP9_REF_NO_SCALE ||
sf->y_scale_fp != VP9_REF_NO_SCALE)
if (vp9_is_scaled(sf))
vp9_extend_frame_borders(fb, cm->subsampling_x, cm->subsampling_y);
}
}


@@ -15,28 +15,19 @@
#include "vp9/common/vp9_onyxc_int.h"
struct subpix_fn_table;
void vp9_build_inter_predictors_sby(MACROBLOCKD *xd,
int mb_row,
int mb_col,
BLOCK_SIZE_TYPE bsize);
void vp9_build_inter_predictors_sby(MACROBLOCKD *xd, int mi_row, int mi_col,
BLOCK_SIZE bsize);
void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd,
int mb_row,
int mb_col,
BLOCK_SIZE_TYPE bsize);
void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd, int mi_row, int mi_col,
BLOCK_SIZE bsize);
void vp9_build_inter_predictors_sb(MACROBLOCKD *mb,
int mb_row, int mb_col,
BLOCK_SIZE_TYPE bsize);
void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
BLOCK_SIZE bsize);
void vp9_setup_interp_filters(MACROBLOCKD *xd,
INTERPOLATIONFILTERTYPE filter,
VP9_COMMON *cm);
void vp9_setup_scale_factors_for_frame(struct scale_factors *scale,
int other_w, int other_h,
int this_w, int this_h);
void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
const MV *mv_q3,


@@ -8,15 +8,16 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include <stdio.h>
#include "./vpx_config.h"
#include "vp9_rtcd.h"
#include "vp9/common/vp9_reconintra.h"
#include "vp9/common/vp9_onyxc_int.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/vpx_once.h"
#include "vp9_rtcd.h"
#include "vp9/common/vp9_reconintra.h"
#include "vp9/common/vp9_onyxc_int.h"
const TX_TYPE mode2txfm_map[MB_MODE_COUNT] = {
DCT_DCT, // DC
ADST_DCT, // V
@@ -25,7 +26,7 @@ const TX_TYPE mode2txfm_map[MB_MODE_COUNT] = {
ADST_ADST, // D135
ADST_DCT, // D117
DCT_ADST, // D153
DCT_ADST, // D27
DCT_ADST, // D207
ADST_DCT, // D63
ADST_ADST, // TM
DCT_DCT, // NEARESTMV
@@ -35,294 +36,256 @@ const TX_TYPE mode2txfm_map[MB_MODE_COUNT] = {
};
#define intra_pred_sized(type, size) \
void vp9_##type##_predictor_##size##x##size##_c(uint8_t *pred_ptr, \
ptrdiff_t stride, \
uint8_t *above_row, \
uint8_t *left_col) { \
type##_predictor(pred_ptr, stride, size, above_row, left_col); \
}
void vp9_##type##_predictor_##size##x##size##_c(uint8_t *dst, \
ptrdiff_t stride, \
const uint8_t *above, \
const uint8_t *left) { \
type##_predictor(dst, stride, size, above, left); \
}
#define intra_pred_allsizes(type) \
intra_pred_sized(type, 4) \
intra_pred_sized(type, 8) \
intra_pred_sized(type, 16) \
intra_pred_sized(type, 32)
static INLINE void d27_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs,
uint8_t *above_row, uint8_t *left_col) {
static INLINE void d207_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) {
int r, c;
// first column
for (r = 0; r < bs - 1; ++r) {
pred_ptr[r * stride] = ROUND_POWER_OF_TWO(left_col[r] +
left_col[r + 1], 1);
}
pred_ptr[(bs - 1) * stride] = left_col[bs - 1];
pred_ptr++;
for (r = 0; r < bs - 1; ++r)
dst[r * stride] = ROUND_POWER_OF_TWO(left[r] + left[r + 1], 1);
dst[(bs - 1) * stride] = left[bs - 1];
dst++;
// second column
for (r = 0; r < bs - 2; ++r) {
pred_ptr[r * stride] = ROUND_POWER_OF_TWO(left_col[r] +
left_col[r + 1] * 2 +
left_col[r + 2], 2);
}
pred_ptr[(bs - 2) * stride] = ROUND_POWER_OF_TWO(left_col[bs - 2] +
left_col[bs - 1] * 3,
2);
pred_ptr[(bs - 1) * stride] = left_col[bs - 1];
pred_ptr++;
for (r = 0; r < bs - 2; ++r)
dst[r * stride] = ROUND_POWER_OF_TWO(left[r] + left[r + 1] * 2 +
left[r + 2], 2);
dst[(bs - 2) * stride] = ROUND_POWER_OF_TWO(left[bs - 2] +
left[bs - 1] * 3, 2);
dst[(bs - 1) * stride] = left[bs - 1];
dst++;
// rest of last row
for (c = 0; c < bs - 2; ++c) {
pred_ptr[(bs - 1) * stride + c] = left_col[bs - 1];
}
for (c = 0; c < bs - 2; ++c)
dst[(bs - 1) * stride + c] = left[bs - 1];
for (r = bs - 2; r >= 0; --r) {
for (c = 0; c < bs - 2; ++c) {
pred_ptr[r * stride + c] = pred_ptr[(r + 1) * stride + c - 2];
}
}
for (r = bs - 2; r >= 0; --r)
for (c = 0; c < bs - 2; ++c)
dst[r * stride + c] = dst[(r + 1) * stride + c - 2];
}
intra_pred_allsizes(d27)
intra_pred_allsizes(d207)
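As a concrete check of the d207 fill, here is the same arithmetic instantiated for bs = 4, with the expected output in comments (a standalone sketch, not the library entry point):

#include <stdio.h>

#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

/* d207 for bs = 4: two filtered columns from the left edge, then the
 * remaining columns are diagonal copies from two columns back. */
static void d207_4x4(unsigned char *dst, int stride, const unsigned char *left) {
  int r, c;
  for (r = 0; r < 3; ++r)
    dst[r * stride] = ROUND_POWER_OF_TWO(left[r] + left[r + 1], 1);
  dst[3 * stride] = left[3];
  dst++;
  for (r = 0; r < 2; ++r)
    dst[r * stride] = ROUND_POWER_OF_TWO(left[r] + left[r + 1] * 2 + left[r + 2], 2);
  dst[2 * stride] = ROUND_POWER_OF_TWO(left[2] + left[3] * 3, 2);
  dst[3 * stride] = left[3];
  dst++;
  for (c = 0; c < 2; ++c)
    dst[3 * stride + c] = left[3];
  for (r = 2; r >= 0; --r)
    for (c = 0; c < 2; ++c)
      dst[r * stride + c] = dst[(r + 1) * stride + c - 2];
}

int main(void) {
  unsigned char out[16];
  const unsigned char left[4] = { 8, 16, 24, 32 };
  int r, c;
  d207_4x4(out, 4, left);
  for (r = 0; r < 4; ++r) {
    for (c = 0; c < 4; ++c)
      printf("%3d ", out[r * 4 + c]);
    printf("\n");
  }
  /* Expected:
   *  12 16 20 24
   *  20 24 28 30
   *  28 30 32 32
   *  32 32 32 32  */
  return 0;
}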
static INLINE void d63_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs,
uint8_t *above_row, uint8_t *left_col) {
static INLINE void d63_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) {
int r, c;
for (r = 0; r < bs; ++r) {
for (c = 0; c < bs; ++c) {
if (r & 1) {
pred_ptr[c] = ROUND_POWER_OF_TWO(above_row[r/2 + c] +
above_row[r/2 + c + 1] * 2 +
above_row[r/2 + c + 2], 2);
} else {
pred_ptr[c] = ROUND_POWER_OF_TWO(above_row[r/2 + c] +
above_row[r/2+ c + 1], 1);
}
}
pred_ptr += stride;
for (c = 0; c < bs; ++c)
dst[c] = r & 1 ? ROUND_POWER_OF_TWO(above[r/2 + c] +
above[r/2 + c + 1] * 2 +
above[r/2 + c + 2], 2)
: ROUND_POWER_OF_TWO(above[r/2 + c] +
above[r/2 + c + 1], 1);
dst += stride;
}
}
intra_pred_allsizes(d63)
static INLINE void d45_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs,
uint8_t *above_row, uint8_t *left_col) {
static INLINE void d45_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) {
int r, c;
for (r = 0; r < bs; ++r) {
for (c = 0; c < bs; ++c) {
if (r + c + 2 < bs * 2)
pred_ptr[c] = ROUND_POWER_OF_TWO(above_row[r + c] +
above_row[r + c + 1] * 2 +
above_row[r + c + 2], 2);
else
pred_ptr[c] = above_row[bs * 2 - 1];
}
pred_ptr += stride;
for (c = 0; c < bs; ++c)
dst[c] = r + c + 2 < bs * 2 ? ROUND_POWER_OF_TWO(above[r + c] +
above[r + c + 1] * 2 +
above[r + c + 2], 2)
: above[bs * 2 - 1];
dst += stride;
}
}
intra_pred_allsizes(d45)
static INLINE void d117_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs,
uint8_t *above_row, uint8_t *left_col) {
static INLINE void d117_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) {
int r, c;
// first row
for (c = 0; c < bs; c++)
pred_ptr[c] = ROUND_POWER_OF_TWO(above_row[c - 1] + above_row[c], 1);
pred_ptr += stride;
dst[c] = ROUND_POWER_OF_TWO(above[c - 1] + above[c], 1);
dst += stride;
// second row
pred_ptr[0] = ROUND_POWER_OF_TWO(left_col[0] +
above_row[-1] * 2 +
above_row[0], 2);
dst[0] = ROUND_POWER_OF_TWO(left[0] + above[-1] * 2 + above[0], 2);
for (c = 1; c < bs; c++)
pred_ptr[c] = ROUND_POWER_OF_TWO(above_row[c - 2] +
above_row[c - 1] * 2 +
above_row[c], 2);
pred_ptr += stride;
dst[c] = ROUND_POWER_OF_TWO(above[c - 2] + above[c - 1] * 2 + above[c], 2);
dst += stride;
// the rest of first col
pred_ptr[0] = ROUND_POWER_OF_TWO(above_row[-1] +
left_col[0] * 2 +
left_col[1], 2);
dst[0] = ROUND_POWER_OF_TWO(above[-1] + left[0] * 2 + left[1], 2);
for (r = 3; r < bs; ++r)
pred_ptr[(r-2) * stride] = ROUND_POWER_OF_TWO(left_col[r - 3] +
left_col[r - 2] * 2 +
left_col[r - 1], 2);
dst[(r - 2) * stride] = ROUND_POWER_OF_TWO(left[r - 3] + left[r - 2] * 2 +
left[r - 1], 2);
// the rest of the block
for (r = 2; r < bs; ++r) {
for (c = 1; c < bs; c++)
pred_ptr[c] = pred_ptr[-2 * stride + c - 1];
pred_ptr += stride;
dst[c] = dst[-2 * stride + c - 1];
dst += stride;
}
}
intra_pred_allsizes(d117)
static INLINE void d135_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs,
uint8_t *above_row, uint8_t *left_col) {
static INLINE void d135_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) {
int r, c;
pred_ptr[0] = ROUND_POWER_OF_TWO(left_col[0] +
above_row[-1] * 2 +
above_row[0], 2);
dst[0] = ROUND_POWER_OF_TWO(left[0] + above[-1] * 2 + above[0], 2);
for (c = 1; c < bs; c++)
pred_ptr[c] = ROUND_POWER_OF_TWO(above_row[c - 2] +
above_row[c - 1] * 2 +
above_row[c], 2);
dst[c] = ROUND_POWER_OF_TWO(above[c - 2] + above[c - 1] * 2 + above[c], 2);
pred_ptr[stride] = ROUND_POWER_OF_TWO(above_row[-1] +
left_col[0] * 2 +
left_col[1], 2);
dst[stride] = ROUND_POWER_OF_TWO(above[-1] + left[0] * 2 + left[1], 2);
for (r = 2; r < bs; ++r)
pred_ptr[r * stride] = ROUND_POWER_OF_TWO(left_col[r - 2] +
left_col[r - 1] * 2 +
left_col[r], 2);
dst[r * stride] = ROUND_POWER_OF_TWO(left[r - 2] + left[r - 1] * 2 +
left[r], 2);
pred_ptr += stride;
dst += stride;
for (r = 1; r < bs; ++r) {
for (c = 1; c < bs; c++)
pred_ptr[c] = pred_ptr[-stride + c - 1];
pred_ptr += stride;
dst[c] = dst[-stride + c - 1];
dst += stride;
}
}
intra_pred_allsizes(d135)
static INLINE void d153_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs,
uint8_t *above_row, uint8_t *left_col) {
static INLINE void d153_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) {
int r, c;
pred_ptr[0] = ROUND_POWER_OF_TWO(above_row[-1] + left_col[0], 1);
dst[0] = ROUND_POWER_OF_TWO(above[-1] + left[0], 1);
for (r = 1; r < bs; r++)
pred_ptr[r * stride] =
ROUND_POWER_OF_TWO(left_col[r - 1] + left_col[r], 1);
pred_ptr++;
dst[r * stride] = ROUND_POWER_OF_TWO(left[r - 1] + left[r], 1);
dst++;
pred_ptr[0] = ROUND_POWER_OF_TWO(left_col[0] +
above_row[-1] * 2 +
above_row[0], 2);
pred_ptr[stride] = ROUND_POWER_OF_TWO(above_row[-1] +
left_col[0] * 2 +
left_col[1], 2);
dst[0] = ROUND_POWER_OF_TWO(left[0] + above[-1] * 2 + above[0], 2);
dst[stride] = ROUND_POWER_OF_TWO(above[-1] + left[0] * 2 + left[1], 2);
for (r = 2; r < bs; r++)
pred_ptr[r * stride] = ROUND_POWER_OF_TWO(left_col[r - 2] +
left_col[r - 1] * 2 +
left_col[r], 2);
pred_ptr++;
dst[r * stride] = ROUND_POWER_OF_TWO(left[r - 2] + left[r - 1] * 2 +
left[r], 2);
dst++;
for (c = 0; c < bs - 2; c++)
pred_ptr[c] = ROUND_POWER_OF_TWO(above_row[c - 1] +
above_row[c] * 2 +
above_row[c + 1], 2);
pred_ptr += stride;
dst[c] = ROUND_POWER_OF_TWO(above[c - 1] + above[c] * 2 + above[c + 1], 2);
dst += stride;
for (r = 1; r < bs; ++r) {
for (c = 0; c < bs - 2; c++)
pred_ptr[c] = pred_ptr[-stride + c - 2];
pred_ptr += stride;
dst[c] = dst[-stride + c - 2];
dst += stride;
}
}
intra_pred_allsizes(d153)
static INLINE void v_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs,
uint8_t *above_row, uint8_t *left_col) {
static INLINE void v_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) {
int r;
for (r = 0; r < bs; r++) {
vpx_memcpy(pred_ptr, above_row, bs);
pred_ptr += stride;
vpx_memcpy(dst, above, bs);
dst += stride;
}
}
intra_pred_allsizes(v)
static INLINE void h_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs,
uint8_t *above_row, uint8_t *left_col) {
static INLINE void h_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) {
int r;
for (r = 0; r < bs; r++) {
vpx_memset(pred_ptr, left_col[r], bs);
pred_ptr += stride;
vpx_memset(dst, left[r], bs);
dst += stride;
}
}
intra_pred_allsizes(h)
static INLINE void tm_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs,
uint8_t *above_row, uint8_t *left_col) {
static INLINE void tm_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) {
int r, c;
int ytop_left = above_row[-1];
int ytop_left = above[-1];
for (r = 0; r < bs; r++) {
for (c = 0; c < bs; c++)
pred_ptr[c] = clip_pixel(left_col[r] + above_row[c] - ytop_left);
pred_ptr += stride;
dst[c] = clip_pixel(left[r] + above[c] - ytop_left);
dst += stride;
}
}
intra_pred_allsizes(tm)
static INLINE void dc_128_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs,
uint8_t *above_row, uint8_t *left_col) {
static INLINE void dc_128_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) {
int r;
for (r = 0; r < bs; r++) {
vpx_memset(pred_ptr, 128, bs);
pred_ptr += stride;
vpx_memset(dst, 128, bs);
dst += stride;
}
}
intra_pred_allsizes(dc_128)
static INLINE void dc_left_predictor(uint8_t *pred_ptr, ptrdiff_t stride,
int bs,
uint8_t *above_row, uint8_t *left_col) {
int i, r;
int expected_dc = 128;
int average = 0;
const int count = bs;
static INLINE void dc_left_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above,
const uint8_t *left) {
int i, r, expected_dc, sum = 0;
for (i = 0; i < bs; i++)
average += left_col[i];
expected_dc = (average + (count >> 1)) / count;
sum += left[i];
expected_dc = (sum + (bs >> 1)) / bs;
for (r = 0; r < bs; r++) {
vpx_memset(pred_ptr, expected_dc, bs);
pred_ptr += stride;
vpx_memset(dst, expected_dc, bs);
dst += stride;
}
}
intra_pred_allsizes(dc_left)
static INLINE void dc_top_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs,
uint8_t *above_row, uint8_t *left_col) {
int i, r;
int expected_dc = 128;
int average = 0;
const int count = bs;
static INLINE void dc_top_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) {
int i, r, expected_dc, sum = 0;
for (i = 0; i < bs; i++)
average += above_row[i];
expected_dc = (average + (count >> 1)) / count;
sum += above[i];
expected_dc = (sum + (bs >> 1)) / bs;
for (r = 0; r < bs; r++) {
vpx_memset(pred_ptr, expected_dc, bs);
pred_ptr += stride;
vpx_memset(dst, expected_dc, bs);
dst += stride;
}
}
intra_pred_allsizes(dc_top)
static INLINE void dc_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs,
uint8_t *above_row, uint8_t *left_col) {
int i, r;
int expected_dc = 128;
int average = 0;
static INLINE void dc_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) {
int i, r, expected_dc, sum = 0;
const int count = 2 * bs;
for (i = 0; i < bs; i++)
average += above_row[i];
for (i = 0; i < bs; i++)
average += left_col[i];
expected_dc = (average + (count >> 1)) / count;
for (i = 0; i < bs; i++) {
sum += above[i];
sum += left[i];
}
expected_dc = (sum + (count >> 1)) / count;
for (r = 0; r < bs; r++) {
vpx_memset(pred_ptr, expected_dc, bs);
pred_ptr += stride;
vpx_memset(dst, expected_dc, bs);
dst += stride;
}
}
intra_pred_allsizes(dc)
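A worked example of the dc_predictor rounding for a 4x4 block:

#include <stdio.h>

int main(void) {
  /* DC prediction for a 4x4 block: round-to-nearest average of the
   * 2 * bs = 8 border pixels, matching dc_predictor above. */
  const int above[4] = { 10, 12, 14, 16 };
  const int left[4] = { 8, 8, 8, 8 };
  const int count = 8;
  int i, sum = 0;
  for (i = 0; i < 4; ++i)
    sum += above[i] + left[i];
  printf("expected_dc = %d\n", (sum + (count >> 1)) / count);  /* (84+4)/8 = 11 */
  return 0;
}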
#undef intra_pred_allsizes
typedef void (*intra_pred_fn)(uint8_t *pred_ptr, ptrdiff_t stride,
uint8_t *above_row, uint8_t *left_col);
typedef void (*intra_pred_fn)(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left);
static intra_pred_fn pred[VP9_INTRA_MODES][4];
static intra_pred_fn pred[INTRA_MODES][4];
static intra_pred_fn dc_pred[2][2][4];
static void init_intra_pred_fn_ptrs(void) {
@@ -334,7 +297,7 @@ static void init_intra_pred_fn_ptrs(void) {
intra_pred_allsizes(pred[V_PRED], v);
intra_pred_allsizes(pred[H_PRED], h);
intra_pred_allsizes(pred[D27_PRED], d27);
intra_pred_allsizes(pred[D207_PRED], d207);
intra_pred_allsizes(pred[D45_PRED], d45);
intra_pred_allsizes(pred[D63_PRED], d63);
intra_pred_allsizes(pred[D117_PRED], d117);
@@ -350,16 +313,17 @@ static void init_intra_pred_fn_ptrs(void) {
#undef intra_pred_allsizes
}
static void build_intra_predictors(uint8_t *src, int src_stride,
uint8_t *pred_ptr, int stride,
MB_PREDICTION_MODE mode, TX_SIZE txsz,
static void build_intra_predictors(const uint8_t *ref, int ref_stride,
uint8_t *dst, int dst_stride,
MB_PREDICTION_MODE mode, TX_SIZE tx_size,
int up_available, int left_available,
int right_available) {
int i;
DECLARE_ALIGNED_ARRAY(16, uint8_t, left_col, 64);
DECLARE_ALIGNED_ARRAY(16, uint8_t, yabove_data, 128 + 16);
uint8_t *above_row = yabove_data + 16;
const int bs = 4 << txsz;
DECLARE_ALIGNED_ARRAY(16, uint8_t, above_data, 128 + 16);
uint8_t *above_row = above_data + 16;
const uint8_t *const_above_row = above_row;
const int bs = 4 << tx_size;
// 127 127 127 .. 127 127 127 127 127 127
// 129 A B .. Y Z
@@ -369,45 +333,46 @@ static void build_intra_predictors(uint8_t *src, int src_stride,
// ..
once(init_intra_pred_fn_ptrs);
// left
if (left_available) {
for (i = 0; i < bs; i++)
left_col[i] = src[i * src_stride - 1];
left_col[i] = ref[i * ref_stride - 1];
} else {
vpx_memset(left_col, 129, bs);
}
// above
if (up_available) {
uint8_t *above_ptr = src - src_stride;
const uint8_t *above_ref = ref - ref_stride;
if (bs == 4 && right_available && left_available) {
above_row = above_ptr;
const_above_row = above_ref;
} else {
vpx_memcpy(above_row, above_ptr, bs);
vpx_memcpy(above_row, above_ref, bs);
if (bs == 4 && right_available)
vpx_memcpy(above_row + bs, above_ptr + bs, bs);
vpx_memcpy(above_row + bs, above_ref + bs, bs);
else
vpx_memset(above_row + bs, above_row[bs - 1], bs);
above_row[-1] = left_available ? above_ptr[-1] : 129;
above_row[-1] = left_available ? above_ref[-1] : 129;
}
} else {
vpx_memset(above_row, 127, bs * 2);
above_row[-1] = 127;
}
// predict
if (mode == DC_PRED) {
dc_pred[left_available][up_available][txsz](pred_ptr, stride,
above_row, left_col);
dc_pred[left_available][up_available][tx_size](dst, dst_stride,
const_above_row, left_col);
} else {
pred[mode][txsz](pred_ptr, stride, above_row, left_col);
pred[mode][tx_size](dst, dst_stride, const_above_row, left_col);
}
}
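The missing-edge synthesis above follows the 127/129 diagram. A hedged sketch of just that fill rule (above_row must have one byte of headroom for above_row[-1], matching the above_data + 16 layout above; the available-edge copies are omitted):

#include <stdio.h>
#include <string.h>

static void fill_missing_edges(unsigned char *above_row, unsigned char *left_col,
                               int bs, int up_available, int left_available) {
  if (!up_available) {
    memset(above_row, 127, bs * 2);   /* top row plus top-right extension */
    above_row[-1] = 127;
  } else if (!left_available) {
    above_row[-1] = 129;              /* top-left corner falls back to 129 */
  }
  if (!left_available)
    memset(left_col, 129, bs);
}

int main(void) {
  unsigned char above_data[16 + 1], left_col[8];
  unsigned char *above_row = above_data + 1;  /* headroom for above_row[-1] */
  fill_missing_edges(above_row, left_col, 8, 0, 0);
  printf("corner %d above %d left %d\n", above_row[-1], above_row[0], left_col[0]);
  return 0;
}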
void vp9_predict_intra_block(MACROBLOCKD *xd,
int block_idx,
int bwl_in,
TX_SIZE tx_size,
int mode,
uint8_t *reference, int ref_stride,
uint8_t *predictor, int pre_stride) {
void vp9_predict_intra_block(MACROBLOCKD *xd, int block_idx, int bwl_in,
TX_SIZE tx_size, int mode,
const uint8_t *ref, int ref_stride,
uint8_t *dst, int dst_stride) {
const int bwl = bwl_in - tx_size;
const int wmask = (1 << bwl) - 1;
const int have_top = (block_idx >> bwl) || xd->up_available;
@@ -415,10 +380,6 @@ void vp9_predict_intra_block(MACROBLOCKD *xd,
const int have_right = ((block_idx & wmask) != wmask);
assert(bwl >= 0);
build_intra_predictors(reference, ref_stride,
predictor, pre_stride,
mode,
tx_size,
have_top, have_left,
have_right);
build_intra_predictors(ref, ref_stride, dst, dst_stride, mode, tx_size,
have_top, have_left, have_right);
}
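Availability per sub-block then falls out of the index arithmetic. A sketch for an 8x8 block split into 4x4 transforms at the top-left corner of the frame (the have_left line is elided by the hunk above and assumed analogous to have_top):

#include <stdio.h>

int main(void) {
  const int bwl = 1 - 0;             /* bwl_in - tx_size: 8x8 block, 4x4 tx */
  const int wmask = (1 << bwl) - 1;
  const int up_available = 0, left_available = 0;
  int block_idx;
  for (block_idx = 0; block_idx < 4; ++block_idx)
    printf("block %d: top %d left %d right %d\n", block_idx,
           (block_idx >> bwl) || up_available,
           (block_idx & wmask) || left_available,  /* assumed analogue */
           (block_idx & wmask) != wmask);
  /* Right neighbours exist only for the left column (blocks 0 and 2),
   * since those are decoded before the sub-blocks to their right. */
  return 0;
}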


@@ -14,17 +14,8 @@
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_blockd.h"
MB_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr,
int stride, int n,
int tx, int ty);
MB_PREDICTION_MODE vp9_find_bpred_context(MACROBLOCKD *xd, int block,
uint8_t *ptr, int stride);
void vp9_predict_intra_block(MACROBLOCKD *xd,
int block_idx,
int bwl_in,
TX_SIZE tx_size,
int mode, uint8_t *ref, int ref_stride,
uint8_t *predictor, int pre_stride);
void vp9_predict_intra_block(MACROBLOCKD *xd, int block_idx, int bwl_in,
TX_SIZE tx_size, int mode,
const uint8_t *ref, int ref_stride,
uint8_t *dst, int dst_stride);
#endif // VP9_COMMON_VP9_RECONINTRA_H_

Some files were not shown because too many files have changed in this diff.