Compare commits: stable-vp9...experiment (27 commits)
| SHA1 |
|---|
| a5a74224d3 |
| 769ce06eeb |
| fd44975bc8 |
| 6ce9f36322 |
| 8b05d6a248 |
| 1a9ef5bcd0 |
| 92fb82a980 |
| 1306c1b09b |
| f99fbcd682 |
| 0207fa679f |
| e2bb669a3d |
| 151ae7ae50 |
| 4dd3b07478 |
| 35abb70353 |
| 5a70b23158 |
| d9879e2c73 |
| 9536db22cd |
| 3904505d8e |
| 01c43e86d9 |
| 885d8a4397 |
| 8d139a5d29 |
| 54f86290b6 |
| 538e97ffd8 |
| 70dd502153 |
| 10eb64ab9f |
| df50e5c01a |
| 2ec602c8e2 |
build/make/armlink_adapter.sh:

@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 ##
 ## Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 ##
@@ -13,20 +13,20 @@
 verbose=0
 set -- $*
 for i; do
-    if [ "$i" = "-o" ]; then
+    if [ "$i" == "-o" ]; then
         on_of=1
-    elif [ "$i" = "-v" ]; then
+    elif [ "$i" == "-v" ]; then
         verbose=1
-    elif [ "$i" = "-g" ]; then
+    elif [ "$i" == "-g" ]; then
         args="${args} --debug"
-    elif [ "$on_of" = "1" ]; then
+    elif [ "$on_of" == "1" ]; then
         outfile=$i
         on_of=0
     elif [ -f "$i" ]; then
         infiles="$infiles $i"
-    elif [ "${i#-l}" != "$i" ]; then
+    elif [ "${i:0:2}" == "-l" ]; then
         libs="$libs ${i#-l}"
-    elif [ "${i#-L}" != "$i" ]; then
+    elif [ "${i:0:2}" == "-L" ]; then
         libpaths="${libpaths} ${i#-L}"
     else
         args="${args} ${i}"
build/make/configure.sh:

@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 ##
 ##  configure.sh
 ##
@@ -198,11 +198,11 @@ add_extralibs() {
 #
 # Boolean Manipulation Functions
 #
-enable_feature(){
+enable(){
   set_all yes $*
 }

-disable_feature(){
+disable(){
   set_all no $*
 }

@@ -219,7 +219,7 @@ soft_enable() {
   for var in $*; do
     if ! disabled $var; then
       log_echo "  enabling $var"
-      enable_feature $var
+      enable $var
     fi
   done
 }
@@ -228,7 +228,7 @@ soft_disable() {
   for var in $*; do
     if ! enabled $var; then
       log_echo "  disabling $var"
-      disable_feature $var
+      disable $var
     fi
   done
 }
@@ -251,10 +251,10 @@ tolower(){
 # Temporary File Functions
 #
 source_path=${0%/*}
-enable_feature source_path_used
+enable source_path_used
 if test -z "$source_path" -o "$source_path" = "." ; then
   source_path="`pwd`"
-  disable_feature source_path_used
+  disable source_path_used
 fi

 if test ! -z "$TMPDIR" ; then
@@ -264,13 +264,12 @@ elif test ! -z "$TEMPDIR" ; then
 else
   TMPDIRx="/tmp"
 fi
-RAND=$(awk 'BEGIN { srand(); printf "%d\n",(rand() * 32768)}')
-TMP_H="${TMPDIRx}/vpx-conf-$$-${RAND}.h"
-TMP_C="${TMPDIRx}/vpx-conf-$$-${RAND}.c"
-TMP_CC="${TMPDIRx}/vpx-conf-$$-${RAND}.cc"
-TMP_O="${TMPDIRx}/vpx-conf-$$-${RAND}.o"
-TMP_X="${TMPDIRx}/vpx-conf-$$-${RAND}.x"
-TMP_ASM="${TMPDIRx}/vpx-conf-$$-${RAND}.asm"
+TMP_H="${TMPDIRx}/vpx-conf-$$-${RANDOM}.h"
+TMP_C="${TMPDIRx}/vpx-conf-$$-${RANDOM}.c"
+TMP_CC="${TMPDIRx}/vpx-conf-$$-${RANDOM}.cc"
+TMP_O="${TMPDIRx}/vpx-conf-$$-${RANDOM}.o"
+TMP_X="${TMPDIRx}/vpx-conf-$$-${RANDOM}.x"
+TMP_ASM="${TMPDIRx}/vpx-conf-$$-${RANDOM}.asm"

 clean_temp_files() {
   rm -f ${TMP_C} ${TMP_CC} ${TMP_H} ${TMP_O} ${TMP_X} ${TMP_ASM}
@@ -317,8 +316,8 @@ check_header(){
   header=$1
   shift
   var=`echo $header | sed 's/[^A-Za-z0-9_]/_/g'`
-  disable_feature $var
-  check_cpp "$@" <<EOF && enable_feature $var
+  disable $var
+  check_cpp "$@" <<EOF && enable $var
 #include "$header"
 int x;
 EOF
@@ -480,7 +479,7 @@ process_common_cmdline() {
   for opt in "$@"; do
     optval="${opt#*=}"
    case "$opt" in
-    --child) enable_feature child
+    --child) enable child
    ;;
    --log*)
    logging="$optval"
@@ -492,7 +491,7 @@ process_common_cmdline() {
    ;;
    --target=*) toolchain="${toolchain:-${optval}}"
    ;;
-    --force-target=*) toolchain="${toolchain:-${optval}}"; enable_feature force_toolchain
+    --force-target=*) toolchain="${toolchain:-${optval}}"; enable force_toolchain
    ;;
    --cpu)
    ;;
@@ -512,7 +511,7 @@ process_common_cmdline() {
        echo "${CMDLINE_SELECT}" | grep "^ *$option\$" >/dev/null ||
          die_unknown $opt
      fi
-      ${action}_feature $option
+      $action $option
    ;;
    --require-?*)
      eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'`
@@ -524,11 +523,11 @@ process_common_cmdline() {
    ;;
    --force-enable-?*|--force-disable-?*)
      eval `echo "$opt" | sed 's/--force-/action=/;s/-/ option=/;s/-/_/g'`
-      ${action}_feature $option
+      $action $option
    ;;
    --libc=*)
      [ -d "${optval}" ] || die "Not a directory: ${optval}"
-      disable_feature builtin_libc
+      disable builtin_libc
      alt_libc="${optval}"
    ;;
    --as=*)
@@ -697,13 +696,13 @@ process_common_toolchain() {

   # Mark the specific ISA requested as enabled
   soft_enable ${tgt_isa}
-  enable_feature ${tgt_os}
-  enable_feature ${tgt_cc}
+  enable ${tgt_os}
+  enable ${tgt_cc}

   # Enable the architecture family
   case ${tgt_isa} in
-    arm*) enable_feature arm;;
-    mips*) enable_feature mips;;
+    arm*) enable arm;;
+    mips*) enable mips;;
   esac

   # PIC is probably what we want when building shared libs
@@ -766,7 +765,7 @@ process_common_toolchain() {
   case ${toolchain} in
     sparc-solaris-*)
       add_extralibs -lposix4
-      disable_feature fast_unaligned
+      disable fast_unaligned
      ;;
    *-solaris-*)
      add_extralibs -lposix4
@@ -791,7 +790,7 @@ process_common_toolchain() {
      ;;
    armv5te)
      soft_enable edsp
-      disable_feature fast_unaligned
+      disable fast_unaligned
      ;;
   esac

@@ -806,7 +805,7 @@ process_common_toolchain() {
        arch_int=${arch_int%%te}
        check_add_asflags --defsym ARCHITECTURE=${arch_int}
        tune_cflags="-mtune="
-        if [ ${tgt_isa} = "armv7" ]; then
+        if [ ${tgt_isa} == "armv7" ]; then
          if [ -z "${float_abi}" ]; then
            check_cpp <<EOF && float_abi=hard || float_abi=softfp
 #ifndef __ARM_PCS_VFP
@@ -843,8 +842,8 @@ EOF
        asm_conversion_cmd="${source_path}/build/make/ads2armasm_ms.pl"
        AS_SFX=.s
        msvs_arch_dir=arm-msvs
-        disable_feature multithread
-        disable_feature unit_tests
+        disable multithread
+        disable unit_tests
        ;;
      rvct)
        CC=armcc
@@ -856,7 +855,7 @@ EOF
        tune_cflags="--cpu="
        tune_asflags="--cpu="
        if [ -z "${tune_cpu}" ]; then
-          if [ ${tgt_isa} = "armv7" ]; then
+          if [ ${tgt_isa} == "armv7" ]; then
            if enabled neon
            then
              check_add_cflags --fpu=softvfp+vfpv3
@@ -881,8 +880,8 @@ EOF

    case ${tgt_os} in
      none*)
-        disable_feature multithread
-        disable_feature os_support
+        disable multithread
+        disable os_support
        ;;

      android*)
@@ -914,9 +913,9 @@ EOF
        # Cortex-A8 implementations (NDK Dev Guide)
        add_ldflags "-Wl,--fix-cortex-a8"

-        enable_feature pic
+        enable pic
        soft_enable realtime_only
-        if [ ${tgt_isa} = "armv7" ]; then
+        if [ ${tgt_isa} == "armv7" ]; then
          soft_enable runtime_cpu_detect
        fi
        if enabled runtime_cpu_detect; then
@@ -970,7 +969,7 @@ EOF
        ;;

      linux*)
-        enable_feature linux
+        enable linux
        if enabled rvct; then
          # Check if we have CodeSourcery GCC in PATH. Needed for
          # libraries
@@ -1001,14 +1000,14 @@ EOF
    tune_cflags="-mtune="
    if enabled dspr2; then
      check_add_cflags -mips32r2 -mdspr2
-      disable_feature fast_unaligned
+      disable fast_unaligned
    fi
    check_add_cflags -march=${tgt_isa}
    check_add_asflags -march=${tgt_isa}
    check_add_asflags -KPIC
    ;;
  ppc*)
-    enable_feature ppc
+    enable ppc
    bits=${tgt_isa##ppc}
    link_with_cc=gcc
    setup_gnu_toolchain
@@ -1156,7 +1155,7 @@ EOF
    ;;
  universal*|*-gcc|generic-gnu)
    link_with_cc=gcc
-    enable_feature gcc
+    enable gcc
    setup_gnu_toolchain
    ;;
 esac
@@ -1192,7 +1191,7 @@ EOF

  # default use_x86inc to yes if pic is no or 64bit or we are not on darwin
  echo " checking here for x86inc \"${tgt_isa}\" \"$pic\" "
-  if [ ${tgt_isa} = x86_64 -o ! "$pic" = "yes" -o "${tgt_os#darwin}" = "${tgt_os}" ]; then
+  if [ ${tgt_isa} = x86_64 -o ! "$pic" == "yes" -o ! ${tgt_os:0:6} = darwin ]; then
    soft_enable use_x86inc
  fi

@@ -1205,14 +1204,14 @@ EOF
  enabled linux && check_add_cflags -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=0

  # Check for strip utility variant
-  ${STRIP} -V 2>/dev/null | grep GNU >/dev/null && enable_feature gnu_strip
+  ${STRIP} -V 2>/dev/null | grep GNU >/dev/null && enable gnu_strip

  # Try to determine target endianness
  check_cc <<EOF
 unsigned int e = 'O'<<24 | '2'<<16 | 'B'<<8 | 'E';
 EOF
  [ -f "${TMP_O}" ] && od -A n -t x1 "${TMP_O}" | tr -d '\n' |
-    grep '4f *32 *42 *45' >/dev/null 2>&1 && enable_feature big_endian
+    grep '4f *32 *42 *45' >/dev/null 2>&1 && enable big_endian

  # Try to find which inline keywords are supported
  check_cc <<EOF && INLINE="inline"
@@ -1237,7 +1236,7 @@ EOF
      if enabled dspr2; then
        if enabled big_endian; then
          echo "dspr2 optimizations are available only for little endian platforms"
-          disable_feature dspr2
+          disable dspr2
        fi
      fi
      ;;
@@ -1288,8 +1287,8 @@ print_config_h() {

 print_webm_license() {
   local destination=$1
-  local prefix="$2"
-  local suffix="$3"
+  local prefix=$2
+  local suffix=$3
   shift 3
   cat <<EOF > ${destination}
 ${prefix} Copyright (c) 2011 The WebM project authors. All Rights Reserved.${suffix}
@@ -1310,7 +1309,7 @@ process_detect() {
   true;
 }

-enable_feature logging
+enable logging
 logfile="config.log"
 self=$0
 process() {
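
The endianness probe above works because the constant packs the ASCII codes of `O`, `2`, `B`, `E` most-significant-byte first: on a big-endian target the compiled object contains the bytes `4f 32 42 45` in that order, which the `od | grep` pipeline then matches. A minimal standalone sketch of the same idea (our own illustration, not part of the diff):

```cpp
#include <cstdio>
#include <cstring>

int main() {
  // The same constant configure compiles: ASCII 'O','2','B','E' packed
  // into a 32-bit value, most significant byte first.
  const unsigned int e = 'O' << 24 | '2' << 16 | 'B' << 8 | 'E';
  unsigned char bytes[sizeof(e)];
  std::memcpy(bytes, &e, sizeof(e));  // expose the in-memory byte order
  // Big-endian memory holds 4f 32 42 45 ("O2BE"), the pattern the
  // od | grep pipeline looks for; little-endian holds 45 42 32 4f.
  std::printf("%02x %02x %02x %02x -> %s-endian\n",
              bytes[0], bytes[1], bytes[2], bytes[3],
              bytes[0] == 0x4f ? "big" : "little");
  return 0;
}
```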
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 ##
 ## Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 ##
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 ##
 ## Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 ##
configure (vendored):
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 ##
 ##  configure
 ##
@@ -38,7 +38,6 @@ Advanced options:
 ${toggle_internal_stats}        output of encoder internal stats for debug, if supported (encoders)
 ${toggle_mem_tracker}           track memory usage
 ${toggle_postproc}              postprocessing
-${toggle_vp9_postproc}          vp9 specific postprocessing
 ${toggle_multithread}           multithreaded encoding and decoding
 ${toggle_spatial_resampling}    spatial sampling (scaling) support
 ${toggle_realtime_only}         enable this option while building for real-time encoding
@@ -154,7 +153,7 @@ all_targets="libs examples docs"

 # all targets available are enabled, by default.
 for t in ${all_targets}; do
-    [ -f ${source_path}/${t}.mk ] && enable_feature ${t}
+    [ -f ${source_path}/${t}.mk ] && enable ${t}
 done

 # check installed doxygen version
@@ -165,30 +164,30 @@ if [ ${doxy_major:-0} -ge 1 ]; then
     doxy_minor=${doxy_version%%.*}
     doxy_patch=${doxy_version##*.}

-    [ $doxy_major -gt 1 ] && enable_feature doxygen
-    [ $doxy_minor -gt 5 ] && enable_feature doxygen
-    [ $doxy_minor -eq 5 ] && [ $doxy_patch -ge 3 ] && enable_feature doxygen
+    [ $doxy_major -gt 1 ] && enable doxygen
+    [ $doxy_minor -gt 5 ] && enable doxygen
+    [ $doxy_minor -eq 5 ] && [ $doxy_patch -ge 3 ] && enable doxygen
 fi

 # install everything except the sources, by default. sources will have
 # to be enabled when doing dist builds, since that's no longer a common
 # case.
-enabled doxygen && php -v >/dev/null 2>&1 && enable_feature install_docs
-enable_feature install_bins
-enable_feature install_libs
+enabled doxygen && php -v >/dev/null 2>&1 && enable install_docs
+enable install_bins
+enable install_libs

-enable_feature static
-enable_feature optimizations
-enable_feature fast_unaligned #allow unaligned accesses, if supported by hw
-enable_feature md5
-enable_feature spatial_resampling
-enable_feature multithread
-enable_feature os_support
-enable_feature temporal_denoising
+enable static
+enable optimizations
+enable fast_unaligned #allow unaligned accesses, if supported by hw
+enable md5
+enable spatial_resampling
+enable multithread
+enable os_support
+enable temporal_denoising

-[ -d ${source_path}/../include ] && enable_feature alt_tree_layout
+[ -d ${source_path}/../include ] && enable alt_tree_layout
 for d in vp8 vp9; do
-    [ -d ${source_path}/${d} ] && disable_feature alt_tree_layout;
+    [ -d ${source_path}/${d} ] && disable alt_tree_layout;
 done

 if ! enabled alt_tree_layout; then
@@ -201,10 +200,10 @@ else
 [ -f ${source_path}/../include/vpx/vp8dx.h ] && CODECS="${CODECS} vp8_decoder"
 [ -f ${source_path}/../include/vpx/vp9cx.h ] && CODECS="${CODECS} vp9_encoder"
 [ -f ${source_path}/../include/vpx/vp9dx.h ] && CODECS="${CODECS} vp9_decoder"
-[ -f ${source_path}/../include/vpx/vp8cx.h ] || disable_feature vp8_encoder
-[ -f ${source_path}/../include/vpx/vp8dx.h ] || disable_feature vp8_decoder
-[ -f ${source_path}/../include/vpx/vp9cx.h ] || disable_feature vp9_encoder
-[ -f ${source_path}/../include/vpx/vp9dx.h ] || disable_feature vp9_decoder
+[ -f ${source_path}/../include/vpx/vp8cx.h ] || disable vp8_encoder
+[ -f ${source_path}/../include/vpx/vp8dx.h ] || disable vp8_decoder
+[ -f ${source_path}/../include/vpx/vp9cx.h ] || disable vp9_encoder
+[ -f ${source_path}/../include/vpx/vp9dx.h ] || disable vp9_decoder

 [ -f ${source_path}/../lib/*/*mt.lib ] && soft_enable static_msvcrt
 fi
@@ -251,6 +250,10 @@ EXPERIMENT_LIST="
     multiple_arf
     non420
     alpha
+    interintra
+    filterintra
+    masked_interintra
+    masked_interinter
 "
 CONFIG_LIST="
     external_build
@@ -280,7 +283,6 @@ CONFIG_LIST="
     dc_recon
     runtime_cpu_detect
     postproc
-    vp9_postproc
     multithread
     internal_stats
     ${CODECS}
@@ -335,7 +337,6 @@ CMDLINE_SELECT="
     dequant_tokens
     dc_recon
     postproc
-    vp9_postproc
     multithread
     internal_stats
     ${CODECS}
@@ -361,12 +362,12 @@ process_cmdline() {
     for opt do
         optval="${opt#*=}"
        case "$opt" in
-        --disable-codecs) for c in ${CODECS}; do disable_feature $c; done ;;
+        --disable-codecs) for c in ${CODECS}; do disable $c; done ;;
        --enable-?*|--disable-?*)
        eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'`
        if echo "${EXPERIMENT_LIST}" | grep "^ *$option\$" >/dev/null; then
            if enabled experimental; then
-                ${action}_feature $option
+                $action $option
            else
                log_echo "Ignoring $opt -- not in experimental mode."
            fi
@@ -387,8 +388,8 @@ post_process_cmdline() {
     # If the codec family is enabled, enable all components of that family.
     log_echo "Configuring selected codecs"
     for c in ${CODECS}; do
-        disabled ${c%%_*} && disable_feature ${c}
-        enabled ${c%%_*} && enable_feature ${c}
+        disabled ${c%%_*} && disable ${c}
+        enabled ${c%%_*} && enable ${c}
     done

     # Enable all detected codecs, if they haven't been disabled
@@ -396,12 +397,12 @@ post_process_cmdline() {

     # Enable the codec family if any component of that family is enabled
     for c in ${CODECS}; do
-        enabled $c && enable_feature ${c%_*}
+        enabled $c && enable ${c%_*}
     done

     # Set the {en,de}coders variable if any algorithm in that class is enabled
     for c in ${CODECS}; do
-        enabled ${c} && enable_feature ${c##*_}s
+        enabled ${c} && enable ${c##*_}s
     done
 }

@@ -441,7 +442,7 @@ process_targets() {
     done
     enabled debug_libs && DIST_DIR="${DIST_DIR}-debug"
     enabled codec_srcs && DIST_DIR="${DIST_DIR}-src"
-    ! enabled postproc && ! enabled vp9_postproc && DIST_DIR="${DIST_DIR}-nopost"
+    ! enabled postproc && DIST_DIR="${DIST_DIR}-nopost"
     ! enabled multithread && DIST_DIR="${DIST_DIR}-nomt"
     ! enabled install_docs && DIST_DIR="${DIST_DIR}-nodocs"
     DIST_DIR="${DIST_DIR}-${tgt_isa}-${tgt_os}"
@@ -511,13 +512,13 @@ process_detect() {
     fi
     if [ -z "$CC" ] || enabled external_build; then
         echo "Bypassing toolchain for environment detection."
-        enable_feature external_build
+        enable external_build
        check_header() {
            log fake_check_header "$@"
            header=$1
            shift
            var=`echo $header | sed 's/[^A-Za-z0-9_]/_/g'`
-            disable_feature $var
+            disable $var
            # Headers common to all environments
            case $header in
                stdio.h)
@@ -529,7 +530,7 @@ process_detect() {
                        [ -f "${d##-I}/$header" ] && result=true && break
                    done
                    ${result:-true}
-            esac && enable_feature $var
+            esac && enable $var

            # Specialize windows and POSIX environments.
            case $toolchain in
@@ -537,7 +538,7 @@ process_detect() {
                    case $header-$toolchain in
                        stdint*-gcc) true;;
                        *) false;;
-                    esac && enable_feature $var
+                    esac && enable $var
                    ;;
                *)
                    case $header in
@@ -546,7 +547,7 @@ process_detect() {
                        sys/mman.h) true;;
                        unistd.h) true;;
                        *) false;;
-                    esac && enable_feature $var
+                    esac && enable $var
            esac
            enabled $var
        }
@@ -564,7 +565,7 @@ EOF
     check_header sys/mman.h
     check_header unistd.h # for sysconf(3) and friends.

-    check_header vpx/vpx_integer.h -I${source_path} && enable_feature vpx_ports
+    check_header vpx/vpx_integer.h -I${source_path} && enable vpx_ports
 }

 process_toolchain() {
@@ -646,18 +647,14 @@ process_toolchain() {
     # ccache only really works on gcc toolchains
     enabled gcc || soft_disable ccache
     if enabled mips; then
-        enable_feature dequant_tokens
-        enable_feature dc_recon
-    fi
-
-    if enabled internal_stats; then
-        enable_feature vp9_postproc
+        enable dequant_tokens
+        enable dc_recon
     fi

     # Enable the postbuild target if building for visual studio.
     case "$tgt_cc" in
-        vs*) enable_feature msvs
-             enable_feature solution
+        vs*) enable msvs
+             enable solution
             vs_version=${tgt_cc##vs}
             case $vs_version in
                 [789])
examples.mk:

@@ -49,9 +49,6 @@ vpxenc.DESCRIPTION = Full featured encoder
 UTILS-$(CONFIG_VP8_ENCODER) += vp8_scalable_patterns.c
 vp8_scalable_patterns.GUID = 0D6A210B-F482-4D6F-8570-4A9C01ACC88C
 vp8_scalable_patterns.DESCRIPTION = Temporal Scalability Encoder
-UTILS-$(CONFIG_VP8_ENCODER) += vp9_spatial_scalable_encoder.c
-vp8_scalable_patterns.GUID = 4A38598D-627D-4505-9C7B-D4020C84100D
-vp8_scalable_patterns.DESCRIPTION = Spatial Scalable Encoder

 # Clean up old ivfenc, ivfdec binaries.
 ifeq ($(CONFIG_MSVS),yes)
test/acm_random.h:

@@ -8,8 +8,8 @@
  * be found in the AUTHORS file in the root of the source tree.
  */

-#ifndef TEST_ACM_RANDOM_H_
-#define TEST_ACM_RANDOM_H_
+#ifndef LIBVPX_TEST_ACM_RANDOM_H_
+#define LIBVPX_TEST_ACM_RANDOM_H_

 #include "third_party/googletest/src/include/gtest/gtest.h"

@@ -38,7 +38,7 @@ class ACMRandom {
     // Returns a random value near 0 or near 255, to better exercise
     // saturation behavior.
     const uint8_t r = Rand8();
-    return r < 128 ? r << 4 : r >> 4;
+    return r <= 128 ? 255 - (r >> 4) : r >> 4;
   }

   int PseudoUniform(int range) {
@@ -59,4 +59,4 @@ class ACMRandom {

 }  // namespace libvpx_test

-#endif  // TEST_ACM_RANDOM_H_
+#endif  // LIBVPX_TEST_ACM_RANDOM_H_
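
The `Rand8Extremes` change swaps the value mapping. Written out, `r <= 128` maps to `255 - (r >> 4)`, which is 247..255, and `r > 128` maps to `r >> 4`, which is 8..15, so every result lands near an extreme; the other variant's `r << 4` wraps modulo 256 on the `uint8_t` return and can land mid-range. A quick sketch (ours, not from the tree) that tabulates both:

```cpp
#include <cstdint>
#include <cstdio>

// The two Rand8Extremes mappings from the diff, side by side.
uint8_t extremes_a(uint8_t r) { return r < 128 ? r << 4 : r >> 4; }
uint8_t extremes_b(uint8_t r) { return r <= 128 ? 255 - (r >> 4) : r >> 4; }

int main() {
  int min_a = 255, max_a = 0, min_b = 255, max_b = 0;
  for (int r = 0; r <= 255; ++r) {
    const uint8_t a = extremes_a(static_cast<uint8_t>(r));
    const uint8_t b = extremes_b(static_cast<uint8_t>(r));
    min_a = a < min_a ? a : min_a;
    max_a = a > max_a ? a : max_a;
    min_b = b < min_b ? b : min_b;
    max_b = b > max_b ? b : max_b;
  }
  // b stays within [8, 15] or [247, 255]; a can land mid-range because
  // r << 4 is truncated modulo 256 on the return.
  std::printf("a: [%d, %d]  b: [%d, %d]\n", min_a, max_a, min_b, max_b);
  return 0;
}
```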
test/borders_test.cc:

@@ -29,8 +29,8 @@ class BordersTest : public ::libvpx_test::EncoderTest,

   virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
                                   ::libvpx_test::Encoder *encoder) {
-    if (video->frame() == 1) {
-      encoder->Control(VP8E_SET_CPUUSED, 1);
+    if ( video->frame() == 1) {
+      encoder->Control(VP8E_SET_CPUUSED, 0);
       encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
       encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
       encoder->Control(VP8E_SET_ARNR_STRENGTH, 5);
test/clear_system_state.h:

@@ -10,7 +10,7 @@
 #ifndef TEST_CLEAR_SYSTEM_STATE_H_
 #define TEST_CLEAR_SYSTEM_STATE_H_

-#include "./vpx_config.h"
+#include "vpx_config.h"
 extern "C" {
 #if ARCH_X86 || ARCH_X86_64
 # include "vpx_ports/x86.h"
test/convolve_test.cc:

@@ -8,7 +8,6 @@
  * be found in the AUTHORS file in the root of the source tree.
  */

-#include <string.h>
 #include "test/acm_random.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
@@ -188,7 +187,7 @@ class ConvolveTest : public PARAMS(int, int, const ConvolveFunctions*) {

  protected:
   static const int kDataAlignment = 16;
-  static const int kOuterBlockSize = 256;
+  static const int kOuterBlockSize = 128;
   static const int kInputStride = kOuterBlockSize;
   static const int kOutputStride = kOuterBlockSize;
   static const int kMaxDimension = 64;
@@ -225,10 +224,6 @@ class ConvolveTest : public PARAMS(int, int, const ConvolveFunctions*) {
       input_[i] = prng.Rand8Extremes();
   }

-  void SetConstantInput(int value) {
-    memset(input_, value, kInputBufferSize);
-  }
-
   void CheckGuardBlocks() {
     for (int i = 0; i < kOutputBufferSize; ++i) {
       if (IsIndexInBorder(i))
@@ -461,86 +456,45 @@ DECLARE_ALIGNED(256, const int16_t, kChangeFilters[16][8]) = {
     { 128}
 };

 /* This test exercises the horizontal and vertical filter functions. */
 TEST_P(ConvolveTest, ChangeFilterWorks) {
   uint8_t* const in = input();
   uint8_t* const out = output();

-  /* Assume that the first input sample is at the 8/16th position. */
-  const int kInitialSubPelOffset = 8;
-
-  /* Filters are 8-tap, so the first filter tap will be applied to the pixel
-   * at position -3 with respect to the current filtering position. Since
-   * kInitialSubPelOffset is set to 8, we first select sub-pixel filter 8,
-   * which is non-zero only in the last tap. So, applying the filter at the
-   * current input position will result in an output equal to the pixel at
-   * offset +4 (-3 + 7) with respect to the current filtering position.
-   */
   const int kPixelSelected = 4;

-  /* Assume that each output pixel requires us to step on by 17/16th pixels in
-   * the input.
-   */
-  const int kInputPixelStep = 17;
-
-  /* The filters are setup in such a way that the expected output produces
-   * sets of 8 identical output samples. As the filter position moves to the
-   * next 1/16th pixel position the only active (=128) filter tap moves one
-   * position to the left, resulting in the same input pixel being replicated
-   * in to the output for 8 consecutive samples. After each set of 8 positions
-   * the filters select a different input pixel. kFilterPeriodAdjust below
-   * computes which input pixel is written to the output for a specified
-   * x or y position.
-   */
-
   /* Test the horizontal filter. */
   REGISTER_STATE_CHECK(UUT_->h8_(in, kInputStride, out, kOutputStride,
-                                 kChangeFilters[kInitialSubPelOffset],
-                                 kInputPixelStep, NULL, 0, Width(), Height()));
+                                 kChangeFilters[8], 17, kChangeFilters[4], 16,
+                                 Width(), Height()));

   for (int x = 0; x < Width(); ++x) {
+    const int kQ4StepAdjust = x >> 4;
     const int kFilterPeriodAdjust = (x >> 3) << 3;
-    const int ref_x =
-        kPixelSelected + ((kInitialSubPelOffset
-                           + kFilterPeriodAdjust * kInputPixelStep)
-                          >> SUBPEL_BITS);
-    ASSERT_EQ(in[ref_x], out[x]) << "x == " << x << "width = " << Width();
+    const int ref_x = kQ4StepAdjust + kFilterPeriodAdjust + kPixelSelected;
+    ASSERT_EQ(in[ref_x], out[x]) << "x == " << x;
   }

   /* Test the vertical filter. */
   REGISTER_STATE_CHECK(UUT_->v8_(in, kInputStride, out, kOutputStride,
-                                 NULL, 0, kChangeFilters[kInitialSubPelOffset],
-                                 kInputPixelStep, Width(), Height()));
+                                 kChangeFilters[4], 16, kChangeFilters[8], 17,
+                                 Width(), Height()));

   for (int y = 0; y < Height(); ++y) {
+    const int kQ4StepAdjust = y >> 4;
     const int kFilterPeriodAdjust = (y >> 3) << 3;
-    const int ref_y =
-        kPixelSelected + ((kInitialSubPelOffset
-                           + kFilterPeriodAdjust * kInputPixelStep)
-                          >> SUBPEL_BITS);
+    const int ref_y = kQ4StepAdjust + kFilterPeriodAdjust + kPixelSelected;
     ASSERT_EQ(in[ref_y * kInputStride], out[y * kInputStride]) << "y == " << y;
   }

   /* Test the horizontal and vertical filters in combination. */
   REGISTER_STATE_CHECK(UUT_->hv8_(in, kInputStride, out, kOutputStride,
-                                  kChangeFilters[kInitialSubPelOffset],
-                                  kInputPixelStep,
-                                  kChangeFilters[kInitialSubPelOffset],
-                                  kInputPixelStep,
+                                  kChangeFilters[8], 17, kChangeFilters[8], 17,
                                   Width(), Height()));

   for (int y = 0; y < Height(); ++y) {
+    const int kQ4StepAdjustY = y >> 4;
     const int kFilterPeriodAdjustY = (y >> 3) << 3;
-    const int ref_y =
-        kPixelSelected + ((kInitialSubPelOffset
-                           + kFilterPeriodAdjustY * kInputPixelStep)
-                          >> SUBPEL_BITS);
+    const int ref_y = kQ4StepAdjustY + kFilterPeriodAdjustY + kPixelSelected;
     for (int x = 0; x < Width(); ++x) {
+      const int kQ4StepAdjustX = x >> 4;
       const int kFilterPeriodAdjustX = (x >> 3) << 3;
-      const int ref_x =
-          kPixelSelected + ((kInitialSubPelOffset
-                             + kFilterPeriodAdjustX * kInputPixelStep)
-                            >> SUBPEL_BITS);
+      const int ref_x = kQ4StepAdjustX + kFilterPeriodAdjustX + kPixelSelected;

       ASSERT_EQ(in[ref_y * kInputStride + ref_x], out[y * kOutputStride + x])
           << "x == " << x << ", y == " << y;
@@ -548,34 +502,6 @@ TEST_P(ConvolveTest, ChangeFilterWorks) {
     }
   }
 }

-/* This test exercises that enough rows and columns are filtered with every
-   possible initial fractional positions and scaling steps. */
-TEST_P(ConvolveTest, CheckScalingFiltering) {
-  uint8_t* const in = input();
-  uint8_t* const out = output();
-
-  SetConstantInput(127);
-
-  for (int frac = 0; frac < 16; ++frac) {
-    for (int step = 1; step <= 32; ++step) {
-      /* Test the horizontal and vertical filters in combination. */
-      REGISTER_STATE_CHECK(UUT_->hv8_(in, kInputStride, out, kOutputStride,
-                                      vp9_sub_pel_filters_8[frac], step,
-                                      vp9_sub_pel_filters_8[frac], step,
-                                      Width(), Height()));
-
-      CheckGuardBlocks();
-
-      for (int y = 0; y < Height(); ++y) {
-        for (int x = 0; x < Width(); ++x) {
-          ASSERT_EQ(in[y * kInputStride + x], out[y * kOutputStride + x])
-              << "x == " << x << ", y == " << y
-              << ", frac == " << frac << ", step == " << step;
-        }
-      }
-    }
-  }
-}
-
 using std::tr1::make_tuple;
||||
test/cpu_speed_test.cc:

@@ -108,5 +108,5 @@ using std::tr1::make_tuple;
 VP9_INSTANTIATE_TEST_CASE(
     CpuSpeedTest,
     ::testing::Values(::libvpx_test::kTwoPassGood),
-    ::testing::Range(0, 5));
+    ::testing::Range(0, 3));
 }  // namespace
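
`::testing::Range(begin, end)` is half-open, so this edit shrinks the tested speed settings from {0, 1, 2, 3, 4} to {0, 1, 2}. A self-contained sketch of the pattern (ours, using the same gtest macros the file uses):

```cpp
#include "third_party/googletest/src/include/gtest/gtest.h"

class RangeDemo : public ::testing::TestWithParam<int> {};

// With ::testing::Range(0, 3) the suite runs for parameters 0, 1 and 2;
// the end bound is exclusive, mirroring the Range(0, 5) -> Range(0, 3) edit.
TEST_P(RangeDemo, ParamIsInHalfOpenRange) {
  EXPECT_GE(GetParam(), 0);
  EXPECT_LT(GetParam(), 3);
}

INSTANTIATE_TEST_CASE_P(Demo, RangeDemo, ::testing::Range(0, 3));
```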
test/datarate_test.cc:

@@ -75,7 +75,7 @@ class DatarateTest : public ::libvpx_test::EncoderTest,
     bits_in_buffer_model_ -= frame_size_in_bits;

     // Update the running total of bits for end of test datarate checks.
-    bits_total_ += frame_size_in_bits;
+    bits_total_ += frame_size_in_bits ;

     // If first drop not set and we have a drop set it to this time.
     if (!first_drop_ && duration > 1)
test/dct16x16_test.cc:

@@ -13,16 +13,15 @@
 #include <string.h>

 #include "third_party/googletest/src/include/gtest/gtest.h"
-#include "test/acm_random.h"
-#include "test/clear_system_state.h"
-#include "test/register_state_check.h"
-#include "test/util.h"
-#include "vpx_ports/mem.h"

 extern "C" {
 #include "vp9/common/vp9_entropy.h"
-#include "./vp9_rtcd.h"
-void vp9_short_idct16x16_add_c(int16_t *input, uint8_t *output, int pitch);
+#include "vp9_rtcd.h"
+void vp9_short_idct16x16_add_c(short *input, uint8_t *output, int pitch);
 }

+#include "acm_random.h"
 #include "vpx/vpx_integer.h"

 using libvpx_test::ACMRandom;
@@ -32,13 +31,12 @@ namespace {
 #ifdef _MSC_VER
 static int round(double x) {
   if (x < 0)
-    return static_cast<int>(ceil(x - 0.5));
+    return (int)ceil(x - 0.5);
   else
-    return static_cast<int>(floor(x + 0.5));
+    return (int)floor(x + 0.5);
 }
 #endif

-const int kNumCoeffs = 256;
 const double PI = 3.1415926535898;
 void reference2_16x16_idct_2d(double *input, double *output) {
   double x;
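
Pre-C99 MSVC runtimes lacked `round()`, which is why both DCT test files define their own under `#ifdef _MSC_VER`; the helper rounds half away from zero. A standalone sketch of ours showing the same behavior:

```cpp
#include <cmath>
#include <cstdio>

// Round half away from zero, as the #ifdef _MSC_VER helper in the diff does.
static int round_away(double x) {
  return x < 0 ? static_cast<int>(std::ceil(x - 0.5))
               : static_cast<int>(std::floor(x + 0.5));
}

int main() {
  std::printf("%d %d %d %d\n", round_away(2.5), round_away(-2.5),
              round_away(0.4), round_away(-0.4));  // prints: 3 -3 0 0
  return 0;
}
```

The diff continues: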
@@ -47,9 +45,7 @@ void reference2_16x16_idct_2d(double *input, double *output) {
       double s = 0;
       for (int i = 0; i < 16; ++i) {
         for (int j = 0; j < 16; ++j) {
-          x = cos(PI * j * (l + 0.5) / 16.0) *
-              cos(PI * i * (k + 0.5) / 16.0) *
-              input[i * 16 + j] / 256;
+          x=cos(PI*j*(l+0.5)/16.0)*cos(PI*i*(k+0.5)/16.0)*input[i*16+j]/256;
           if (i != 0)
             x *= sqrt(2.0);
           if (j != 0)
@@ -63,23 +59,23 @@ void reference2_16x16_idct_2d(double *input, double *output) {
 }


-const double C1 = 0.995184726672197;
-const double C2 = 0.98078528040323;
-const double C3 = 0.956940335732209;
-const double C4 = 0.923879532511287;
-const double C5 = 0.881921264348355;
-const double C6 = 0.831469612302545;
-const double C7 = 0.773010453362737;
-const double C8 = 0.707106781186548;
-const double C9 = 0.634393284163646;
-const double C10 = 0.555570233019602;
-const double C11 = 0.471396736825998;
-const double C12 = 0.38268343236509;
-const double C13 = 0.290284677254462;
-const double C14 = 0.195090322016128;
-const double C15 = 0.098017140329561;
+static const double C1 = 0.995184726672197;
+static const double C2 = 0.98078528040323;
+static const double C3 = 0.956940335732209;
+static const double C4 = 0.923879532511287;
+static const double C5 = 0.881921264348355;
+static const double C6 = 0.831469612302545;
+static const double C7 = 0.773010453362737;
+static const double C8 = 0.707106781186548;
+static const double C9 = 0.634393284163646;
+static const double C10 = 0.555570233019602;
+static const double C11 = 0.471396736825998;
+static const double C12 = 0.38268343236509;
+static const double C13 = 0.290284677254462;
+static const double C14 = 0.195090322016128;
+static const double C15 = 0.098017140329561;

-void butterfly_16x16_dct_1d(double input[16], double output[16]) {
+static void butterfly_16x16_dct_1d(double input[16], double output[16]) {
   double step[16];
   double intermediate[16];
   double temp1, temp2;
@@ -112,36 +108,36 @@ void butterfly_16x16_dct_1d(double input[16], double output[16]) {
   output[6] = step[1] - step[6];
   output[7] = step[0] - step[7];

-  temp1 = step[ 8] * C7;
-  temp2 = step[15] * C9;
+  temp1 = step[ 8]*C7;
+  temp2 = step[15]*C9;
   output[ 8] = temp1 + temp2;

-  temp1 = step[ 9] * C11;
-  temp2 = step[14] * C5;
+  temp1 = step[ 9]*C11;
+  temp2 = step[14]*C5;
   output[ 9] = temp1 - temp2;

-  temp1 = step[10] * C3;
-  temp2 = step[13] * C13;
+  temp1 = step[10]*C3;
+  temp2 = step[13]*C13;
   output[10] = temp1 + temp2;

-  temp1 = step[11] * C15;
-  temp2 = step[12] * C1;
+  temp1 = step[11]*C15;
+  temp2 = step[12]*C1;
   output[11] = temp1 - temp2;

-  temp1 = step[11] * C1;
-  temp2 = step[12] * C15;
+  temp1 = step[11]*C1;
+  temp2 = step[12]*C15;
   output[12] = temp2 + temp1;

-  temp1 = step[10] * C13;
-  temp2 = step[13] * C3;
+  temp1 = step[10]*C13;
+  temp2 = step[13]*C3;
   output[13] = temp2 - temp1;

-  temp1 = step[ 9] * C5;
-  temp2 = step[14] * C11;
+  temp1 = step[ 9]*C5;
+  temp2 = step[14]*C11;
   output[14] = temp2 + temp1;

-  temp1 = step[ 8] * C9;
-  temp2 = step[15] * C7;
+  temp1 = step[ 8]*C9;
+  temp2 = step[15]*C7;
   output[15] = temp2 - temp1;

   // step 3
@@ -150,20 +146,20 @@ void butterfly_16x16_dct_1d(double input[16], double output[16]) {
   step[ 2] = output[1] - output[2];
   step[ 3] = output[0] - output[3];

-  temp1 = output[4] * C14;
-  temp2 = output[7] * C2;
+  temp1 = output[4]*C14;
+  temp2 = output[7]*C2;
   step[ 4] = temp1 + temp2;

-  temp1 = output[5] * C10;
-  temp2 = output[6] * C6;
+  temp1 = output[5]*C10;
+  temp2 = output[6]*C6;
   step[ 5] = temp1 + temp2;

-  temp1 = output[5] * C6;
-  temp2 = output[6] * C10;
+  temp1 = output[5]*C6;
+  temp2 = output[6]*C10;
   step[ 6] = temp2 - temp1;

-  temp1 = output[4] * C2;
-  temp2 = output[7] * C14;
+  temp1 = output[4]*C2;
+  temp2 = output[7]*C14;
   step[ 7] = temp2 - temp1;

   step[ 8] = output[ 8] + output[11];
@@ -180,18 +176,18 @@ void butterfly_16x16_dct_1d(double input[16], double output[16]) {
   output[ 0] = (step[ 0] + step[ 1]);
   output[ 8] = (step[ 0] - step[ 1]);

-  temp1 = step[2] * C12;
-  temp2 = step[3] * C4;
+  temp1 = step[2]*C12;
+  temp2 = step[3]*C4;
   temp1 = temp1 + temp2;
-  output[ 4] = 2*(temp1 * C8);
+  output[ 4] = 2*(temp1*C8);

-  temp1 = step[2] * C4;
-  temp2 = step[3] * C12;
+  temp1 = step[2]*C4;
+  temp2 = step[3]*C12;
   temp1 = temp2 - temp1;
-  output[12] = 2 * (temp1 * C8);
+  output[12] = 2*(temp1*C8);

-  output[ 2] = 2 * ((step[4] + step[ 5]) * C8);
-  output[14] = 2 * ((step[7] - step[ 6]) * C8);
+  output[ 2] = 2*((step[4] + step[ 5])*C8);
+  output[14] = 2*((step[7] - step[ 6])*C8);

   temp1 = step[4] - step[5];
   temp2 = step[6] + step[7];
@@ -201,17 +197,17 @@ void butterfly_16x16_dct_1d(double input[16], double output[16]) {
   intermediate[8] = step[8] + step[14];
   intermediate[9] = step[9] + step[15];

-  temp1 = intermediate[8] * C12;
-  temp2 = intermediate[9] * C4;
+  temp1 = intermediate[8]*C12;
+  temp2 = intermediate[9]*C4;
   temp1 = temp1 - temp2;
-  output[3] = 2 * (temp1 * C8);
+  output[3] = 2*(temp1*C8);

-  temp1 = intermediate[8] * C4;
-  temp2 = intermediate[9] * C12;
+  temp1 = intermediate[8]*C4;
+  temp2 = intermediate[9]*C12;
   temp1 = temp2 + temp1;
-  output[13] = 2 * (temp1 * C8);
+  output[13] = 2*(temp1*C8);

-  output[ 9] = 2 * ((step[10] + step[11]) * C8);
+  output[ 9] = 2*((step[10] + step[11])*C8);

   intermediate[11] = step[10] - step[11];
   intermediate[12] = step[12] + step[13];
@@ -222,300 +218,207 @@ void butterfly_16x16_dct_1d(double input[16], double output[16]) {
   output[15] = (intermediate[11] + intermediate[12]);
   output[ 1] = -(intermediate[11] - intermediate[12]);

-  output[ 7] = 2 * (intermediate[13] * C8);
+  output[ 7] = 2*(intermediate[13]*C8);

-  temp1 = intermediate[14] * C12;
-  temp2 = intermediate[15] * C4;
+  temp1 = intermediate[14]*C12;
+  temp2 = intermediate[15]*C4;
   temp1 = temp1 - temp2;
-  output[11] = -2 * (temp1 * C8);
+  output[11] = -2*(temp1*C8);

-  temp1 = intermediate[14] * C4;
-  temp2 = intermediate[15] * C12;
+  temp1 = intermediate[14]*C4;
+  temp2 = intermediate[15]*C12;
   temp1 = temp2 + temp1;
-  output[ 5] = 2 * (temp1 * C8);
+  output[ 5] = 2*(temp1*C8);
 }

-void reference_16x16_dct_2d(int16_t input[256], double output[256]) {
+static void reference_16x16_dct_1d(double in[16], double out[16]) {
+  const double kPi = 3.141592653589793238462643383279502884;
+  const double kInvSqrt2 = 0.707106781186547524400844362104;
+  for (int k = 0; k < 16; k++) {
+    out[k] = 0.0;
+    for (int n = 0; n < 16; n++)
+      out[k] += in[n]*cos(kPi*(2*n+1)*k/32.0);
+    if (k == 0)
+      out[k] = out[k]*kInvSqrt2;
+  }
+}
+
+void reference_16x16_dct_2d(int16_t input[16*16], double output[16*16]) {
   // First transform columns
   for (int i = 0; i < 16; ++i) {
     double temp_in[16], temp_out[16];
     for (int j = 0; j < 16; ++j)
-      temp_in[j] = input[j * 16 + i];
+      temp_in[j] = input[j*16 + i];
     butterfly_16x16_dct_1d(temp_in, temp_out);
     for (int j = 0; j < 16; ++j)
-      output[j * 16 + i] = temp_out[j];
+      output[j*16 + i] = temp_out[j];
   }
   // Then transform rows
   for (int i = 0; i < 16; ++i) {
     double temp_in[16], temp_out[16];
     for (int j = 0; j < 16; ++j)
-      temp_in[j] = output[j + i * 16];
+      temp_in[j] = output[j + i*16];
     butterfly_16x16_dct_1d(temp_in, temp_out);
     // Scale by some magic number
     for (int j = 0; j < 16; ++j)
-      output[j + i * 16] = temp_out[j]/2;
+      output[j + i*16] = temp_out[j]/2;
   }
 }

-typedef void (*fdct_t)(int16_t *in, int16_t *out, int stride);
-typedef void (*idct_t)(int16_t *in, uint8_t *out, int stride);
-typedef void (*fht_t) (int16_t *in, int16_t *out, int stride, int tx_type);
-typedef void (*iht_t) (int16_t *in, uint8_t *dst, int stride, int tx_type);
-
-void fdct16x16_ref(int16_t *in, int16_t *out, int stride, int tx_type) {
+void fdct16x16(int16_t *in, int16_t *out, uint8_t* /*dst*/,
+               int stride, int /*tx_type*/) {
   vp9_short_fdct16x16_c(in, out, stride);
 }
-
-void fht16x16_ref(int16_t *in, int16_t *out, int stride, int tx_type) {
-  vp9_short_fht16x16_c(in, out, stride, tx_type);
+void idct16x16_add(int16_t* /*in*/, int16_t *out, uint8_t *dst,
+                   int stride, int /*tx_type*/) {
+  vp9_short_idct16x16_add_c(out, dst, stride >> 1);
 }

-class Trans16x16TestBase {
- public:
-  virtual ~Trans16x16TestBase() {}
-
- protected:
-  virtual void RunFwdTxfm(int16_t *in, int16_t *out, int stride) = 0;
-
-  virtual void RunInvTxfm(int16_t *out, uint8_t *dst, int stride) = 0;
-
-  void RunAccuracyCheck() {
-    ACMRandom rnd(ACMRandom::DeterministicSeed());
-    uint32_t max_error = 0;
-    int64_t total_error = 0;
-    const int count_test_block = 10000;
-    for (int i = 0; i < count_test_block; ++i) {
-      DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, kNumCoeffs);
-      DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, kNumCoeffs);
-      DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
-      DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
-
-      // Initialize a test block with input range [-255, 255].
-      for (int j = 0; j < kNumCoeffs; ++j) {
-        src[j] = rnd.Rand8();
-        dst[j] = rnd.Rand8();
-        test_input_block[j] = src[j] - dst[j];
-      }
-
-      REGISTER_STATE_CHECK(RunFwdTxfm(test_input_block,
-                                      test_temp_block, pitch_));
-      REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_));
-
-      for (int j = 0; j < kNumCoeffs; ++j) {
-        const uint32_t diff = dst[j] - src[j];
-        const uint32_t error = diff * diff;
-        if (max_error < error)
-          max_error = error;
-        total_error += error;
-      }
-    }
-
-    EXPECT_GE(1u, max_error)
-        << "Error: 16x16 FHT/IHT has an individual round trip error > 1";
-
-    EXPECT_GE(count_test_block , total_error)
-        << "Error: 16x16 FHT/IHT has average round trip error > 1 per block";
-  }
-
-  void RunCoeffCheck() {
-    ACMRandom rnd(ACMRandom::DeterministicSeed());
-    const int count_test_block = 1000;
-    DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);
-
-    for (int i = 0; i < count_test_block; ++i) {
-      // Initialize a test block with input range [-255, 255].
-      for (int j = 0; j < kNumCoeffs; ++j)
-        input_block[j] = rnd.Rand8() - rnd.Rand8();
-
-      fwd_txfm_ref(input_block, output_ref_block, pitch_, tx_type_);
-      REGISTER_STATE_CHECK(RunFwdTxfm(input_block, output_block, pitch_));
-
-      // The minimum quant value is 4.
-      for (int j = 0; j < kNumCoeffs; ++j)
-        EXPECT_EQ(output_block[j], output_ref_block[j]);
-    }
-  }
-
-  void RunMemCheck() {
-    ACMRandom rnd(ACMRandom::DeterministicSeed());
-    const int count_test_block = 1000;
-    DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);
-
-    for (int i = 0; i < count_test_block; ++i) {
-      // Initialize a test block with input range [-255, 255].
-      for (int j = 0; j < kNumCoeffs; ++j) {
-        input_block[j] = rnd.Rand8() - rnd.Rand8();
-        input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
-      }
-      if (i == 0)
-        for (int j = 0; j < kNumCoeffs; ++j)
-          input_extreme_block[j] = 255;
-      if (i == 1)
-        for (int j = 0; j < kNumCoeffs; ++j)
-          input_extreme_block[j] = -255;
-
-      fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, tx_type_);
-      REGISTER_STATE_CHECK(RunFwdTxfm(input_extreme_block,
-                                      output_block, pitch_));
-
-      // The minimum quant value is 4.
-      for (int j = 0; j < kNumCoeffs; ++j) {
-        EXPECT_EQ(output_block[j], output_ref_block[j]);
-        EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_block[j]))
-            << "Error: 16x16 FDCT has coefficient larger than 4*DCT_MAX_VALUE";
-      }
-    }
-  }
-
-  void RunInvAccuracyCheck() {
-    ACMRandom rnd(ACMRandom::DeterministicSeed());
-    const int count_test_block = 1000;
-    DECLARE_ALIGNED_ARRAY(16, int16_t, in, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, coeff, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
-
-    for (int i = 0; i < count_test_block; ++i) {
-      double out_r[kNumCoeffs];
-
-      // Initialize a test block with input range [-255, 255].
-      for (int j = 0; j < kNumCoeffs; ++j) {
-        src[j] = rnd.Rand8();
-        dst[j] = rnd.Rand8();
-        in[j] = src[j] - dst[j];
-      }
-
-      reference_16x16_dct_2d(in, out_r);
-      for (int j = 0; j < kNumCoeffs; ++j)
-        coeff[j] = round(out_r[j]);
-
-      const int pitch = 32;
-      REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch));
-
-      for (int j = 0; j < kNumCoeffs; ++j) {
-        const uint32_t diff = dst[j] - src[j];
-        const uint32_t error = diff * diff;
-        EXPECT_GE(1u, error)
-            << "Error: 16x16 IDCT has error " << error
-            << " at index " << j;
-      }
-    }
-  }
-  int pitch_;
-  int tx_type_;
-  fht_t fwd_txfm_ref;
-};
-
-class Trans16x16DCT : public Trans16x16TestBase,
-                      public PARAMS(fdct_t, idct_t, int) {
- public:
-  virtual ~Trans16x16DCT() {}
-
-  virtual void SetUp() {
-    fwd_txfm_ = GET_PARAM(0);
-    inv_txfm_ = GET_PARAM(1);
-    tx_type_ = GET_PARAM(2);
-    pitch_ = 32;
-    fwd_txfm_ref = fdct16x16_ref;
-  }
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
-
- protected:
-  void RunFwdTxfm(int16_t *in, int16_t *out, int stride) {
-    fwd_txfm_(in, out, stride);
-  }
-  void RunInvTxfm(int16_t *out, uint8_t *dst, int stride) {
-    inv_txfm_(out, dst, stride >> 1);
-  }
-
-  fdct_t fwd_txfm_;
-  idct_t inv_txfm_;
-};
-
-TEST_P(Trans16x16DCT, AccuracyCheck) {
-  RunAccuracyCheck();
-}
-
-TEST_P(Trans16x16DCT, CoeffCheck) {
-  RunCoeffCheck();
-}
-
-TEST_P(Trans16x16DCT, MemCheck) {
-  RunMemCheck();
-}
-
-TEST_P(Trans16x16DCT, InvAccuracyCheck) {
-  RunInvAccuracyCheck();
-}
-
-class Trans16x16HT : public Trans16x16TestBase,
-                     public PARAMS(fht_t, iht_t, int) {
- public:
-  virtual ~Trans16x16HT() {}
-
-  virtual void SetUp() {
-    fwd_txfm_ = GET_PARAM(0);
-    inv_txfm_ = GET_PARAM(1);
-    tx_type_ = GET_PARAM(2);
-    pitch_ = 16;
-    fwd_txfm_ref = fht16x16_ref;
-  }
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
-
- protected:
-  void RunFwdTxfm(int16_t *in, int16_t *out, int stride) {
-    fwd_txfm_(in, out, stride, tx_type_);
-  }
-  void RunInvTxfm(int16_t *out, uint8_t *dst, int stride) {
-    inv_txfm_(out, dst, stride, tx_type_);
-  }
-
-  fht_t fwd_txfm_;
-  iht_t inv_txfm_;
-};
-
-TEST_P(Trans16x16HT, AccuracyCheck) {
-  RunAccuracyCheck();
-}
-
-TEST_P(Trans16x16HT, CoeffCheck) {
-  RunCoeffCheck();
-}
-
-TEST_P(Trans16x16HT, MemCheck) {
-  RunMemCheck();
-}
-
-using std::tr1::make_tuple;
-
-INSTANTIATE_TEST_CASE_P(
-    C, Trans16x16DCT,
-    ::testing::Values(
-        make_tuple(&vp9_short_fdct16x16_c, &vp9_short_idct16x16_add_c, 0)));
-INSTANTIATE_TEST_CASE_P(
-    C, Trans16x16HT,
-    ::testing::Values(
-        make_tuple(&vp9_short_fht16x16_c, &vp9_short_iht16x16_add_c, 0),
-        make_tuple(&vp9_short_fht16x16_c, &vp9_short_iht16x16_add_c, 1),
-        make_tuple(&vp9_short_fht16x16_c, &vp9_short_iht16x16_add_c, 2),
-        make_tuple(&vp9_short_fht16x16_c, &vp9_short_iht16x16_add_c, 3)));
-
+void fht16x16(int16_t *in, int16_t *out, uint8_t* /*dst*/,
+              int stride, int tx_type) {
+  // FIXME(jingning): need to test both SSE2 and c
 #if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(
-    SSE2, Trans16x16DCT,
-    ::testing::Values(
-        make_tuple(&vp9_short_fdct16x16_sse2, &vp9_short_idct16x16_add_c, 0)));
-INSTANTIATE_TEST_CASE_P(
-    SSE2, Trans16x16HT,
-    ::testing::Values(
-        make_tuple(&vp9_short_fht16x16_sse2, &vp9_short_iht16x16_add_sse2, 0),
-        make_tuple(&vp9_short_fht16x16_sse2, &vp9_short_iht16x16_add_sse2, 1),
-        make_tuple(&vp9_short_fht16x16_sse2, &vp9_short_iht16x16_add_sse2, 2),
-        make_tuple(&vp9_short_fht16x16_sse2, &vp9_short_iht16x16_add_sse2, 3)));
+  vp9_short_fht16x16_sse2(in, out, stride >> 1, tx_type);
+#else
+  vp9_short_fht16x16_c(in, out, stride >> 1, tx_type);
 #endif
+}
+void iht16x16_add(int16_t* /*in*/, int16_t *out, uint8_t *dst,
+                  int stride, int tx_type) {
+  vp9_short_iht16x16_add_c(out, dst, stride >> 1, tx_type);
+}
+
+class FwdTrans16x16Test : public ::testing::TestWithParam<int> {
+ public:
+  virtual ~FwdTrans16x16Test() {}
+
+  virtual void SetUp() {
+    tx_type_ = GetParam();
+    if (tx_type_ == 0) {
+      fwd_txfm = fdct16x16;
+      inv_txfm = idct16x16_add;
+    } else {
+      fwd_txfm = fht16x16;
+      inv_txfm = iht16x16_add;
+    }
+  }
+
+ protected:
+  void RunFwdTxfm(int16_t *in, int16_t *out, uint8_t *dst,
+                  int stride, int tx_type) {
+    (*fwd_txfm)(in, out, dst, stride, tx_type);
+  }
+  void RunInvTxfm(int16_t *in, int16_t *out, uint8_t *dst,
+                  int stride, int tx_type) {
+    (*inv_txfm)(in, out, dst, stride, tx_type);
+  }
+
+  int tx_type_;
+  void (*fwd_txfm)(int16_t*, int16_t*, uint8_t*, int, int);
+  void (*inv_txfm)(int16_t*, int16_t*, uint8_t*, int, int);
+};
+
+TEST_P(FwdTrans16x16Test, AccuracyCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  int max_error = 0;
+  double total_error = 0;
+  const int count_test_block = 10000;
+  for (int i = 0; i < count_test_block; ++i) {
+    DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 256);
+    DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, 256);
+    DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 256);
+    DECLARE_ALIGNED_ARRAY(16, uint8_t, src, 256);
+
+    for (int j = 0; j < 256; ++j) {
+      src[j] = rnd.Rand8();
+      dst[j] = rnd.Rand8();
+      // Initialize a test block with input range [-255, 255].
+      test_input_block[j] = src[j] - dst[j];
+    }
+
+    const int pitch = 32;
+    RunFwdTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
+    RunInvTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
+
+    for (int j = 0; j < 256; ++j) {
+      const int diff = dst[j] - src[j];
+      const int error = diff * diff;
+      if (max_error < error)
+        max_error = error;
+      total_error += error;
+    }
+  }
+
+  EXPECT_GE(1, max_error)
+      << "Error: 16x16 FHT/IHT has an individual round trip error > 1";
+
+  EXPECT_GE(count_test_block , total_error)
+      << "Error: 16x16 FHT/IHT has average round trip error > 1 per block";
+}
+
+TEST_P(FwdTrans16x16Test, CoeffSizeCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = 1000;
+  for (int i = 0; i < count_test_block; ++i) {
+    DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, 256);
+    DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, 256);
+    DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, 256);
+    DECLARE_ALIGNED_ARRAY(16, int16_t, output_extreme_block, 256);
+    DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 256);
+
+    // Initialize a test block with input range [-255, 255].
+    for (int j = 0; j < 256; ++j) {
+      input_block[j] = rnd.Rand8() - rnd.Rand8();
+      input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
+    }
+    if (i == 0)
+      for (int j = 0; j < 256; ++j)
+        input_extreme_block[j] = 255;
+
+    const int pitch = 32;
+    RunFwdTxfm(input_block, output_block, dst, pitch, tx_type_);
+    RunFwdTxfm(input_extreme_block, output_extreme_block, dst, pitch, tx_type_);
+
+    // The minimum quant value is 4.
+    for (int j = 0; j < 256; ++j) {
+      EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_block[j]))
+          << "Error: 16x16 FDCT has coefficient larger than 4*DCT_MAX_VALUE";
+      EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_extreme_block[j]))
+          << "Error: 16x16 FDCT extreme has coefficient larger "
+          << "than 4*DCT_MAX_VALUE";
+    }
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(VP9, FwdTrans16x16Test, ::testing::Range(0, 4));
+
+TEST(VP9Idct16x16Test, AccuracyCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = 1000;
+  for (int i = 0; i < count_test_block; ++i) {
+    int16_t in[256], coeff[256];
+    uint8_t dst[256], src[256];
+    double out_r[256];
+
+    for (int j = 0; j < 256; ++j) {
+      src[j] = rnd.Rand8();
+      dst[j] = rnd.Rand8();
+    }
+    // Initialize a test block with input range [-255, 255].
+    for (int j = 0; j < 256; ++j)
+      in[j] = src[j] - dst[j];
+
+    reference_16x16_dct_2d(in, out_r);
+    for (int j = 0; j < 256; j++)
+      coeff[j] = round(out_r[j]);
+    vp9_short_idct16x16_add_c(coeff, dst, 16);
+    for (int j = 0; j < 256; ++j) {
+      const int diff = dst[j] - src[j];
+      const int error = diff * diff;
+      EXPECT_GE(1, error)
+          << "Error: 16x16 IDCT has error " << error
+          << " at index " << j;
+    }
+  }
+}
+
 }  // namespace
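
For reference, the `reference_16x16_dct_1d` helper added on the experiment side computes the (unnormalized) DCT-II, and the 2-D reference transform applies it down each column and then across each row. Written out:

$$\mathrm{out}[k] = c_k \sum_{n=0}^{15} \mathrm{in}[n]\,\cos\!\left(\frac{\pi\,(2n+1)\,k}{32}\right), \qquad c_0 = \tfrac{1}{\sqrt{2}},\quad c_k = 1 \;\;(k \ge 1)$$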
@@ -13,17 +13,15 @@
|
||||
#include <string.h>
|
||||
|
||||
#include "third_party/googletest/src/include/gtest/gtest.h"
|
||||
#include "test/acm_random.h"
|
||||
#include "test/clear_system_state.h"
|
||||
#include "test/register_state_check.h"
|
||||
#include "test/util.h"
|
||||
|
||||
extern "C" {
|
||||
#include "./vpx_config.h"
|
||||
#include "vp9/common/vp9_entropy.h"
|
||||
#include "./vp9_rtcd.h"
|
||||
void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch);
|
||||
void vp9_short_idct32x32_add_c(short *input, uint8_t *output, int pitch);
|
||||
}
|
||||
|
||||
#include "test/acm_random.h"
|
||||
#include "vpx/vpx_integer.h"
|
||||
|
||||
using libvpx_test::ACMRandom;
|
||||
@@ -32,15 +30,35 @@ namespace {
|
||||
#ifdef _MSC_VER
|
||||
static int round(double x) {
|
||||
if (x < 0)
|
||||
return static_cast<int>(ceil(x - 0.5));
|
||||
return (int)ceil(x - 0.5);
|
||||
else
|
||||
return static_cast<int>(floor(x + 0.5));
|
||||
return (int)floor(x + 0.5);
|
||||
}
|
||||
#endif
|
||||
|
||||
const int kNumCoeffs = 1024;
|
||||
const double kPi = 3.141592653589793238462643383279502884;
|
||||
void reference_32x32_dct_1d(const double in[32], double out[32], int stride) {
|
||||
static const double kPi = 3.141592653589793238462643383279502884;
|
||||
static void reference2_32x32_idct_2d(double *input, double *output) {
|
||||
double x;
|
||||
for (int l = 0; l < 32; ++l) {
|
||||
for (int k = 0; k < 32; ++k) {
|
||||
double s = 0;
|
||||
for (int i = 0; i < 32; ++i) {
|
||||
for (int j = 0; j < 32; ++j) {
|
||||
x = cos(kPi * j * (l + 0.5) / 32.0) *
|
||||
cos(kPi * i * (k + 0.5) / 32.0) * input[i * 32 + j] / 1024;
|
||||
if (i != 0)
|
||||
x *= sqrt(2.0);
|
||||
if (j != 0)
|
||||
x *= sqrt(2.0);
|
||||
s += x;
|
||||
}
|
||||
}
|
||||
output[k * 32 + l] = s / 4;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void reference_32x32_dct_1d(double in[32], double out[32], int stride) {
  const double kInvSqrt2 = 0.707106781186547524400844362104;
  for (int k = 0; k < 32; k++) {
    out[k] = 0.0;
@@ -51,8 +69,7 @@ void reference_32x32_dct_1d(const double in[32], double out[32], int stride) {
  }
}

void reference_32x32_dct_2d(const int16_t input[kNumCoeffs],
                            double output[kNumCoeffs]) {
static void reference_32x32_dct_2d(int16_t input[32*32], double output[32*32]) {
  // First transform columns
  for (int i = 0; i < 32; ++i) {
    double temp_in[32], temp_out[32];
@@ -74,165 +91,27 @@ void reference_32x32_dct_2d(const int16_t input[kNumCoeffs],
  }
}

typedef void (*fwd_txfm_t)(int16_t *in, int16_t *out, int stride);
typedef void (*inv_txfm_t)(int16_t *in, uint8_t *dst, int stride);

class Trans32x32Test : public PARAMS(fwd_txfm_t, inv_txfm_t, int) {
 public:
  virtual ~Trans32x32Test() {}
  virtual void SetUp() {
    fwd_txfm_ = GET_PARAM(0);
    inv_txfm_ = GET_PARAM(1);
    version_ = GET_PARAM(2);  // 0: high precision forward transform
                              // 1: low precision version for rd loop
  }

  virtual void TearDown() { libvpx_test::ClearSystemState(); }

 protected:
  int version_;
  fwd_txfm_t fwd_txfm_;
  inv_txfm_t inv_txfm_;
};

TEST_P(Trans32x32Test, AccuracyCheck) {
TEST(VP9Idct32x32Test, AccuracyCheck) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  uint32_t max_error = 0;
  int64_t total_error = 0;
  const int count_test_block = 1000;
  DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, kNumCoeffs);
  DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, kNumCoeffs);
  DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
  DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);

  for (int i = 0; i < count_test_block; ++i) {
    // Initialize a test block with input range [-255, 255].
    for (int j = 0; j < kNumCoeffs; ++j) {
    int16_t in[1024], coeff[1024];
    uint8_t dst[1024], src[1024];
    double out_r[1024];

    for (int j = 0; j < 1024; ++j) {
      src[j] = rnd.Rand8();
      dst[j] = rnd.Rand8();
      test_input_block[j] = src[j] - dst[j];
    }

    const int pitch = 64;
    REGISTER_STATE_CHECK(fwd_txfm_(test_input_block, test_temp_block, pitch));
    REGISTER_STATE_CHECK(inv_txfm_(test_temp_block, dst, 32));

    for (int j = 0; j < kNumCoeffs; ++j) {
      const uint32_t diff = dst[j] - src[j];
      const uint32_t error = diff * diff;
      if (max_error < error)
        max_error = error;
      total_error += error;
    }
  }

  if (version_ == 1) {
    max_error /= 2;
    total_error /= 45;
  }

  EXPECT_GE(1u, max_error)
      << "Error: 32x32 FDCT/IDCT has an individual round-trip error > 1";

  EXPECT_GE(count_test_block, total_error)
      << "Error: 32x32 FDCT/IDCT has average round-trip error > 1 per block";
}

TEST_P(Trans32x32Test, CoeffCheck) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  const int count_test_block = 1000;

  DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
  DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
  DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);

  for (int i = 0; i < count_test_block; ++i) {
    for (int j = 0; j < kNumCoeffs; ++j)
      input_block[j] = rnd.Rand8() - rnd.Rand8();

    const int pitch = 64;
    vp9_short_fdct32x32_c(input_block, output_ref_block, pitch);
    REGISTER_STATE_CHECK(fwd_txfm_(input_block, output_block, pitch));

    if (version_ == 0) {
      for (int j = 0; j < kNumCoeffs; ++j)
        EXPECT_EQ(output_block[j], output_ref_block[j])
            << "Error: 32x32 FDCT versions have mismatched coefficients";
    } else {
      for (int j = 0; j < kNumCoeffs; ++j)
        EXPECT_GE(6, abs(output_block[j] - output_ref_block[j]))
            << "Error: 32x32 FDCT rd has mismatched coefficients";
    }
  }
}

TEST_P(Trans32x32Test, MemCheck) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  const int count_test_block = 2000;

  DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
  DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, kNumCoeffs);
  DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
  DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);

  for (int i = 0; i < count_test_block; ++i) {
    // Initialize a test block with input range [-255, 255].
    for (int j = 0; j < kNumCoeffs; ++j) {
      input_block[j] = rnd.Rand8() - rnd.Rand8();
      input_extreme_block[j] = rnd.Rand8() & 1 ? 255 : -255;
    }
    if (i == 0)
      for (int j = 0; j < kNumCoeffs; ++j)
        input_extreme_block[j] = 255;
    if (i == 1)
      for (int j = 0; j < kNumCoeffs; ++j)
        input_extreme_block[j] = -255;

    const int pitch = 64;
    vp9_short_fdct32x32_c(input_extreme_block, output_ref_block, pitch);
    REGISTER_STATE_CHECK(fwd_txfm_(input_extreme_block, output_block, pitch));

    // The minimum quant value is 4.
    for (int j = 0; j < kNumCoeffs; ++j) {
      if (version_ == 0) {
        EXPECT_EQ(output_block[j], output_ref_block[j])
            << "Error: 32x32 FDCT versions have mismatched coefficients";
      } else {
        EXPECT_GE(6, abs(output_block[j] - output_ref_block[j]))
            << "Error: 32x32 FDCT rd has mismatched coefficients";
      }
      EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_ref_block[j]))
          << "Error: 32x32 FDCT C has coefficient larger than 4*DCT_MAX_VALUE";
      EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_block[j]))
          << "Error: 32x32 FDCT has coefficient larger than "
          << "4*DCT_MAX_VALUE";
    }
  }
}

TEST_P(Trans32x32Test, InverseAccuracy) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  const int count_test_block = 1000;
  DECLARE_ALIGNED_ARRAY(16, int16_t, in, kNumCoeffs);
  DECLARE_ALIGNED_ARRAY(16, int16_t, coeff, kNumCoeffs);
  DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
  DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);

  for (int i = 0; i < count_test_block; ++i) {
    double out_r[kNumCoeffs];

    // Initialize a test block with input range [-255, 255]
    for (int j = 0; j < kNumCoeffs; ++j) {
      src[j] = rnd.Rand8();
      dst[j] = rnd.Rand8();
    for (int j = 0; j < 1024; ++j)
      in[j] = src[j] - dst[j];
    }

    reference_32x32_dct_2d(in, out_r);
    for (int j = 0; j < kNumCoeffs; ++j)
    for (int j = 0; j < 1024; j++)
      coeff[j] = round(out_r[j]);
    REGISTER_STATE_CHECK(inv_txfm_(coeff, dst, 32));
    for (int j = 0; j < kNumCoeffs; ++j) {
    vp9_short_idct32x32_add_c(coeff, dst, 32);
    for (int j = 0; j < 1024; ++j) {
      const int diff = dst[j] - src[j];
      const int error = diff * diff;
      EXPECT_GE(1, error)
@@ -242,21 +121,72 @@ TEST_P(Trans32x32Test, InverseAccuracy) {
  }
}

using std::tr1::make_tuple;
TEST(VP9Fdct32x32Test, AccuracyCheck) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  unsigned int max_error = 0;
  int64_t total_error = 0;
  const int count_test_block = 1000;
  for (int i = 0; i < count_test_block; ++i) {
    int16_t test_input_block[1024];
    int16_t test_temp_block[1024];
    uint8_t dst[1024], src[1024];

INSTANTIATE_TEST_CASE_P(
    C, Trans32x32Test,
    ::testing::Values(
        make_tuple(&vp9_short_fdct32x32_c, &vp9_short_idct32x32_add_c, 0),
        make_tuple(&vp9_short_fdct32x32_rd_c, &vp9_short_idct32x32_add_c, 1)));
    for (int j = 0; j < 1024; ++j) {
      src[j] = rnd.Rand8();
      dst[j] = rnd.Rand8();
    }
    // Initialize a test block with input range [-255, 255].
    for (int j = 0; j < 1024; ++j)
      test_input_block[j] = src[j] - dst[j];

#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P(
    SSE2, Trans32x32Test,
    ::testing::Values(
        make_tuple(&vp9_short_fdct32x32_sse2,
                   &vp9_short_idct32x32_add_sse2, 0),
        make_tuple(&vp9_short_fdct32x32_rd_sse2,
                   &vp9_short_idct32x32_add_sse2, 1)));
#endif
    const int pitch = 64;
    vp9_short_fdct32x32_c(test_input_block, test_temp_block, pitch);
    vp9_short_idct32x32_add_c(test_temp_block, dst, 32);

    for (int j = 0; j < 1024; ++j) {
      const unsigned diff = dst[j] - src[j];
      const unsigned error = diff * diff;
      if (max_error < error)
        max_error = error;
      total_error += error;
    }
  }

  EXPECT_GE(1u, max_error)
      << "Error: 32x32 FDCT/IDCT has an individual roundtrip error > 1";

  EXPECT_GE(count_test_block, total_error)
      << "Error: 32x32 FDCT/IDCT has average roundtrip error > 1 per block";
}

TEST(VP9Fdct32x32Test, CoeffSizeCheck) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  const int count_test_block = 1000;
  for (int i = 0; i < count_test_block; ++i) {
    int16_t input_block[1024], input_extreme_block[1024];
    int16_t output_block[1024], output_extreme_block[1024];

    // Initialize a test block with input range [-255, 255].
    for (int j = 0; j < 1024; ++j) {
      input_block[j] = rnd.Rand8() - rnd.Rand8();
      input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
    }
    if (i == 0)
      for (int j = 0; j < 1024; ++j)
        input_extreme_block[j] = 255;

    const int pitch = 64;
    vp9_short_fdct32x32_c(input_block, output_block, pitch);
    vp9_short_fdct32x32_c(input_extreme_block, output_extreme_block, pitch);

    // The minimum quant value is 4.
    for (int j = 0; j < 1024; ++j) {
      EXPECT_GE(4*DCT_MAX_VALUE, abs(output_block[j]))
          << "Error: 32x32 FDCT has coefficient larger than 4*DCT_MAX_VALUE";
      EXPECT_GE(4*DCT_MAX_VALUE, abs(output_extreme_block[j]))
          << "Error: 32x32 FDCT extreme has coefficient larger than "
          "4*DCT_MAX_VALUE";
    }
  }
}
} // namespace

@@ -12,7 +12,7 @@
#define TEST_DECODE_TEST_DRIVER_H_
#include <cstring>
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "./vpx_config.h"
#include "vpx_config.h"
#include "vpx/vpx_decoder.h"

namespace libvpx_test {
@@ -36,8 +36,9 @@ class DxDataIterator {
};

// Provides a simplified interface to manage one video decoding.
// Similar to Encoder class, the exact services should be added
// as more tests are added.
//
// TODO: similar to Encoder class, the exact services should be
// added as more tests are added.
class Decoder {
 public:
  Decoder(vpx_codec_dec_cfg_t cfg, unsigned long deadline)

@@ -8,7 +8,7 @@
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_config.h"
#include "vpx_config.h"
#include "test/codec_factory.h"
#include "test/encode_test_driver.h"
#include "test/decode_test_driver.h"
@@ -114,19 +114,19 @@ static bool compare_img(const vpx_image_t *img1,
  const unsigned int height_y = img1->d_h;
  unsigned int i;
  for (i = 0; i < height_y; ++i)
    match = (memcmp(img1->planes[VPX_PLANE_Y] + i * img1->stride[VPX_PLANE_Y],
                    img2->planes[VPX_PLANE_Y] + i * img2->stride[VPX_PLANE_Y],
                    width_y) == 0) && match;
    match = ( memcmp(img1->planes[VPX_PLANE_Y] + i * img1->stride[VPX_PLANE_Y],
                     img2->planes[VPX_PLANE_Y] + i * img2->stride[VPX_PLANE_Y],
                     width_y) == 0) && match;
  const unsigned int width_uv = (img1->d_w + 1) >> 1;
  const unsigned int height_uv = (img1->d_h + 1) >> 1;
  for (i = 0; i < height_uv; ++i)
    match = (memcmp(img1->planes[VPX_PLANE_U] + i * img1->stride[VPX_PLANE_U],
                    img2->planes[VPX_PLANE_U] + i * img2->stride[VPX_PLANE_U],
                    width_uv) == 0) && match;
    match = ( memcmp(img1->planes[VPX_PLANE_U] + i * img1->stride[VPX_PLANE_U],
                     img2->planes[VPX_PLANE_U] + i * img2->stride[VPX_PLANE_U],
                     width_uv) == 0) && match;
  for (i = 0; i < height_uv; ++i)
    match = (memcmp(img1->planes[VPX_PLANE_V] + i * img1->stride[VPX_PLANE_V],
                    img2->planes[VPX_PLANE_V] + i * img2->stride[VPX_PLANE_V],
                    width_uv) == 0) && match;
    match = ( memcmp(img1->planes[VPX_PLANE_V] + i * img1->stride[VPX_PLANE_V],
                     img2->planes[VPX_PLANE_V] + i * img2->stride[VPX_PLANE_V],
                     width_uv) == 0) && match;
  return match;
}

@@ -158,7 +158,7 @@ void EncoderTest::RunLoop(VideoSource *video) {
  Decoder* const decoder = codec_->CreateDecoder(dec_cfg, 0);
  bool again;
  for (again = true, video->Begin(); again; video->Next()) {
    again = (video->img() != NULL);
    again = video->img() != NULL;

    PreEncodeFrameHook(video);
    PreEncodeFrameHook(video, encoder);

@@ -62,7 +62,7 @@ class ErrorResilienceTest : public ::libvpx_test::EncoderTest,
    if (droppable_nframes_ > 0 &&
        (cfg_.g_pass == VPX_RC_LAST_PASS || cfg_.g_pass == VPX_RC_ONE_PASS)) {
      for (unsigned int i = 0; i < droppable_nframes_; ++i) {
        if (droppable_frames_[i] == video->frame()) {
        if (droppable_frames_[i] == nframes_) {
          std::cout << " Encoding droppable frame: "
                    << droppable_frames_[i] << "\n";
          frame_flags_ |= (VP8_EFLAG_NO_UPD_LAST |
@@ -148,7 +148,7 @@ TEST_P(ErrorResilienceTest, OnVersusOff) {
  const vpx_rational timebase = { 33333333, 1000000000 };
  cfg_.g_timebase = timebase;
  cfg_.rc_target_bitrate = 2000;
  cfg_.g_lag_in_frames = 10;
  cfg_.g_lag_in_frames = 25;

  init_flags_ = VPX_CODEC_USE_PSNR;

@@ -179,9 +179,6 @@ TEST_P(ErrorResilienceTest, DropFramesWithoutRecovery) {
  const vpx_rational timebase = { 33333333, 1000000000 };
  cfg_.g_timebase = timebase;
  cfg_.rc_target_bitrate = 500;
  // FIXME(debargha): Fix this to work for any lag.
  // Currently this test only works for lag = 0
  cfg_.g_lag_in_frames = 0;

  init_flags_ = VPX_CODEC_USE_PSNR;


@@ -15,10 +15,10 @@
#include "third_party/googletest/src/include/gtest/gtest.h"

extern "C" {
#include "./vp9_rtcd.h"
#include "vp9_rtcd.h"
}

#include "test/acm_random.h"
#include "acm_random.h"
#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"

@@ -136,7 +136,7 @@ TEST_P(FwdTrans4x4Test, RoundTripErrorCheck) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());

  int max_error = 0;
  int total_error = 0;
  double total_error = 0;
  const int count_test_block = 1000000;
  for (int i = 0; i < count_test_block; ++i) {
    DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 16);
@@ -156,7 +156,7 @@ TEST_P(FwdTrans4x4Test, RoundTripErrorCheck) {
    RunFwdTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);

    for (int j = 0; j < 16; ++j) {
      if (test_temp_block[j] > 0) {
      if(test_temp_block[j] > 0) {
        test_temp_block[j] += 2;
        test_temp_block[j] /= 4;
        test_temp_block[j] *= 4;

@@ -13,16 +13,14 @@
#include <string.h>

#include "third_party/googletest/src/include/gtest/gtest.h"
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
#include "vpx_ports/mem.h"

extern "C" {
#include "./vp9_rtcd.h"
void vp9_short_idct8x8_add_c(int16_t *input, uint8_t *output, int pitch);
#include "vp9_rtcd.h"
void vp9_short_idct8x8_add_c(short *input, uint8_t *output, int pitch);
}

#include "test/acm_random.h"
#include "acm_random.h"
#include "vpx/vpx_integer.h"

using libvpx_test::ACMRandom;
@@ -64,7 +62,6 @@ class FwdTrans8x8Test : public ::testing::TestWithParam<int> {
      inv_txfm = iht8x8_add;
    }
  }
  virtual void TearDown() { libvpx_test::ClearSystemState(); }

 protected:
  void RunFwdTxfm(int16_t *in, int16_t *out, uint8_t *dst,
@@ -95,9 +92,8 @@ TEST_P(FwdTrans8x8Test, SignBiasCheck) {
    // Initialize a test block with input range [-255, 255].
    for (int j = 0; j < 64; ++j)
      test_input_block[j] = rnd.Rand8() - rnd.Rand8();
    REGISTER_STATE_CHECK(
        RunFwdTxfm(test_input_block, test_output_block,
                   NULL, pitch, tx_type_));

    RunFwdTxfm(test_input_block, test_output_block, NULL, pitch, tx_type_);

    for (int j = 0; j < 64; ++j) {
      if (test_output_block[j] < 0)
@@ -125,9 +121,8 @@ TEST_P(FwdTrans8x8Test, SignBiasCheck) {
    // Initialize a test block with input range [-15, 15].
    for (int j = 0; j < 64; ++j)
      test_input_block[j] = (rnd.Rand8() >> 4) - (rnd.Rand8() >> 4);
    REGISTER_STATE_CHECK(
        RunFwdTxfm(test_input_block, test_output_block,
                   NULL, pitch, tx_type_));

    RunFwdTxfm(test_input_block, test_output_block, NULL, pitch, tx_type_);

    for (int j = 0; j < 64; ++j) {
      if (test_output_block[j] < 0)
@@ -153,7 +148,7 @@ TEST_P(FwdTrans8x8Test, SignBiasCheck) {
TEST_P(FwdTrans8x8Test, RoundTripErrorCheck) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  int max_error = 0;
  int total_error = 0;
  double total_error = 0;
  const int count_test_block = 100000;
  for (int i = 0; i < count_test_block; ++i) {
    DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 64);
@@ -170,11 +165,9 @@ TEST_P(FwdTrans8x8Test, RoundTripErrorCheck) {
      test_input_block[j] = src[j] - dst[j];

    const int pitch = 16;
    REGISTER_STATE_CHECK(
        RunFwdTxfm(test_input_block, test_temp_block,
                   dst, pitch, tx_type_));
    for (int j = 0; j < 64; ++j) {
      if (test_temp_block[j] > 0) {
    RunFwdTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
    for (int j = 0; j < 64; ++j){
      if(test_temp_block[j] > 0) {
        test_temp_block[j] += 2;
        test_temp_block[j] /= 4;
        test_temp_block[j] *= 4;
@@ -184,9 +177,7 @@ TEST_P(FwdTrans8x8Test, RoundTripErrorCheck) {
        test_temp_block[j] *= 4;
      }
    }
    REGISTER_STATE_CHECK(
        RunInvTxfm(test_input_block, test_temp_block,
                   dst, pitch, tx_type_));
    RunInvTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);

    for (int j = 0; j < 64; ++j) {
      const int diff = dst[j] - src[j];
@@ -208,7 +199,7 @@ TEST_P(FwdTrans8x8Test, RoundTripErrorCheck) {
TEST_P(FwdTrans8x8Test, ExtremalCheck) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  int max_error = 0;
  int total_error = 0;
  double total_error = 0;
  const int count_test_block = 100000;
  for (int i = 0; i < count_test_block; ++i) {
    DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 64);
@@ -225,12 +216,8 @@ TEST_P(FwdTrans8x8Test, ExtremalCheck) {
      test_input_block[j] = src[j] - dst[j];

    const int pitch = 16;
    REGISTER_STATE_CHECK(
        RunFwdTxfm(test_input_block, test_temp_block,
                   dst, pitch, tx_type_));
    REGISTER_STATE_CHECK(
        RunInvTxfm(test_input_block, test_temp_block,
                   dst, pitch, tx_type_));
    RunFwdTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
    RunInvTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);

    for (int j = 0; j < 64; ++j) {
      const int diff = dst[j] - src[j];

@@ -11,7 +11,6 @@
#define TEST_I420_VIDEO_SOURCE_H_
#include <cstdio>
#include <cstdlib>
#include <string>

#include "test/video_source.h"

@@ -35,6 +34,7 @@ class I420VideoSource : public VideoSource {
      height_(0),
      framerate_numerator_(rate_numerator),
      framerate_denominator_(rate_denominator) {

    // This initializes raw_sz_, width_, height_ and allocates an img.
    SetSize(width, height);
  }

@@ -15,10 +15,10 @@
#include "third_party/googletest/src/include/gtest/gtest.h"

extern "C" {
#include "./vp9_rtcd.h"
#include "vp9_rtcd.h"
}

#include "test/acm_random.h"
#include "acm_random.h"
#include "vpx/vpx_integer.h"

using libvpx_test::ACMRandom;
@@ -27,10 +27,10 @@ namespace {

#ifdef _MSC_VER
static int round(double x) {
  if (x < 0)
    return static_cast<int>(ceil(x - 0.5));
  if(x < 0)
    return (int)ceil(x - 0.5);
  else
    return static_cast<int>(floor(x + 0.5));
    return (int)floor(x + 0.5);
}
#endif


@@ -16,9 +16,7 @@ extern "C" {
#include "test/register_state_check.h"
#include "third_party/googletest/src/include/gtest/gtest.h"

#include "vpx/vpx_integer.h"

typedef void (*idct_fn_t)(int16_t *input, unsigned char *pred_ptr,
typedef void (*idct_fn_t)(short *input, unsigned char *pred_ptr,
                          int pred_stride, unsigned char *dst_ptr,
                          int dst_stride);
namespace {
@@ -36,7 +34,7 @@ class IDCTTest : public ::testing::TestWithParam<idct_fn_t> {
  virtual void TearDown() { libvpx_test::ClearSystemState(); }

  idct_fn_t UUT;
  int16_t input[16];
  short input[16];
  unsigned char output[256];
  unsigned char predict[256];
};

@@ -15,8 +15,8 @@
#include "test/register_state_check.h"
#include "third_party/googletest/src/include/gtest/gtest.h"
extern "C" {
#include "./vpx_config.h"
#include "./vp8_rtcd.h"
#include "vpx_config.h"
#include "vp8_rtcd.h"
#include "vp8/common/blockd.h"
#include "vpx_mem/vpx_mem.h"
}
@@ -106,9 +106,9 @@ class IntraPredBase {
        for (int y = 0; y < block_size_; y++)
          sum += data_ptr_[p][y * stride_ - 1];
        expected = (sum + (1 << (shift - 1))) >> shift;
      } else {
      } else
        expected = 0x80;
      }

      // check that all subsequent lines are equal to the first
      for (int y = 1; y < block_size_; ++y)
        ASSERT_EQ(0, memcmp(data_ptr_[p], &data_ptr_[p][y * stride_],

@@ -28,7 +28,7 @@ static unsigned int MemGetLe32(const uint8_t *mem) {
// so that we can do actual file decodes.
class IVFVideoSource : public CompressedVideoSource {
 public:
  explicit IVFVideoSource(const std::string &file_name)
  IVFVideoSource(const std::string &file_name)
      : file_name_(file_name),
        input_file_(NULL),
        compressed_frame_buf_(NULL),

@@ -132,6 +132,7 @@ TEST_P(KeyframeTest, TestAutoKeyframe) {
  // Verify that keyframes match the file keyframes in the file.
  for (std::vector<vpx_codec_pts_t>::const_iterator iter = kf_pts_list_.begin();
       iter != kf_pts_list_.end(); ++iter) {

    if (deadline_ == VPX_DL_REALTIME && *iter > 0)
      EXPECT_EQ(0, (*iter - 1) % 30) << "Unexpected keyframe at frame "
                                     << *iter;

@@ -8,8 +8,8 @@
 * be found in the AUTHORS file in the root of the source tree.
 */

#ifndef TEST_MD5_HELPER_H_
#define TEST_MD5_HELPER_H_
#ifndef LIBVPX_TEST_MD5_HELPER_H_
#define LIBVPX_TEST_MD5_HELPER_H_

extern "C" {
#include "./md5_utils.h"
@@ -25,15 +25,9 @@ class MD5 {

  void Add(const vpx_image_t *img) {
    for (int plane = 0; plane < 3; ++plane) {
      const uint8_t *buf = img->planes[plane];
      // Calculate the width and height to do the md5 check. For the chroma
      // plane, we never want to round down and thus skip a pixel so if
      // we are shifting by 1 (chroma_shift) we add 1 before doing the shift.
      // This works only for chroma_shift of 0 and 1.
      const int h = plane ? (img->d_h + img->y_chroma_shift) >>
                            img->y_chroma_shift : img->d_h;
      const int w = plane ? (img->d_w + img->x_chroma_shift) >>
                            img->x_chroma_shift : img->d_w;
      uint8_t *buf = img->planes[plane];
      const int h = plane ? (img->d_h + 1) >> 1 : img->d_h;
      const int w = plane ? (img->d_w + 1) >> 1 : img->d_w;

      for (int y = 0; y < h; ++y) {
        MD5Update(&md5_, buf, w);
@@ -67,4 +61,4 @@ class MD5 {

} // namespace libvpx_test

#endif // TEST_MD5_HELPER_H_
#endif // LIBVPX_TEST_MD5_HELPER_H_

@@ -11,8 +11,8 @@
#include "test/register_state_check.h"
#include "third_party/googletest/src/include/gtest/gtest.h"
extern "C" {
#include "./vpx_config.h"
#include "./vp8_rtcd.h"
#include "vpx_config.h"
#include "vp8_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_mem/vpx_mem.h"
}
@@ -63,8 +63,7 @@ TEST_P(Vp8PostProcessingFilterTest, FilterOutputCheck) {
  // Pointers to top-left pixel of block in the input and output images.
  uint8_t *const src_image_ptr = src_image + (input_stride << 1);
  uint8_t *const dst_image_ptr = dst_image + 8;
  uint8_t *const flimits =
      reinterpret_cast<uint8_t *>(vpx_memalign(16, block_width));
  uint8_t *const flimits = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_width));
  (void)vpx_memset(flimits, 255, block_width);

  // Initialize pixels in the input:

@@ -8,8 +8,8 @@
 * be found in the AUTHORS file in the root of the source tree.
 */

#ifndef TEST_REGISTER_STATE_CHECK_H_
#define TEST_REGISTER_STATE_CHECK_H_
#ifndef LIBVPX_TEST_REGISTER_STATE_CHECK_H_
#define LIBVPX_TEST_REGISTER_STATE_CHECK_H_

#ifdef _WIN64

@@ -92,4 +92,4 @@ class RegisterStateCheck {};

#endif // _WIN64

#endif // TEST_REGISTER_STATE_CHECK_H_
#endif // LIBVPX_TEST_REGISTER_STATE_CHECK_H_

@@ -16,68 +16,8 @@
#include "test/video_source.h"
#include "test/util.h"

// Enable(1) or Disable(0) writing of the compressed bitstream.
#define WRITE_COMPRESSED_STREAM 0

namespace {

#if WRITE_COMPRESSED_STREAM
static void mem_put_le16(char *const mem, const unsigned int val) {
  mem[0] = val;
  mem[1] = val >> 8;
}

static void mem_put_le32(char *const mem, const unsigned int val) {
  mem[0] = val;
  mem[1] = val >> 8;
  mem[2] = val >> 16;
  mem[3] = val >> 24;
}

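These helpers store multi-byte values least-significant byte first, so the IVF header written below is little-endian regardless of host byte order. A minimal round-trip sketch of the idea (illustrative only; put_le32/get_le32 are hypothetical names, not part of the patch):

    #include <stdio.h>

    static void put_le32(unsigned char *mem, unsigned int val) {
      mem[0] = val & 0xff;
      mem[1] = (val >> 8) & 0xff;
      mem[2] = (val >> 16) & 0xff;
      mem[3] = (val >> 24) & 0xff;
    }

    static unsigned int get_le32(const unsigned char *mem) {
      return (unsigned int)mem[0] | ((unsigned int)mem[1] << 8) |
             ((unsigned int)mem[2] << 16) | ((unsigned int)mem[3] << 24);
    }

    int main(void) {
      unsigned char buf[4];
      put_le32(buf, 0x30395056);  /* the VP9 fourcc used in the header below */
      printf("%c%c%c%c\n", buf[0], buf[1], buf[2], buf[3]);  /* prints VP90 */
      printf("%d\n", get_le32(buf) == 0x30395056);           /* prints 1 */
      return 0;
    }
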
static void write_ivf_file_header(const vpx_codec_enc_cfg_t *const cfg,
                                  int frame_cnt, FILE *const outfile) {
  char header[32];

  header[0] = 'D';
  header[1] = 'K';
  header[2] = 'I';
  header[3] = 'F';
  mem_put_le16(header + 4, 0);                    /* version */
  mem_put_le16(header + 6, 32);                   /* headersize */
  mem_put_le32(header + 8, 0x30395056);           /* fourcc (vp9) */
  mem_put_le16(header + 12, cfg->g_w);            /* width */
  mem_put_le16(header + 14, cfg->g_h);            /* height */
  mem_put_le32(header + 16, cfg->g_timebase.den); /* rate */
  mem_put_le32(header + 20, cfg->g_timebase.num); /* scale */
  mem_put_le32(header + 24, frame_cnt);           /* length */
  mem_put_le32(header + 28, 0);                   /* unused */

  (void)fwrite(header, 1, 32, outfile);
}

static void write_ivf_frame_size(FILE *const outfile, const size_t size) {
  char header[4];
  mem_put_le32(header, static_cast<unsigned int>(size));
  (void)fwrite(header, 1, 4, outfile);
}

static void write_ivf_frame_header(const vpx_codec_cx_pkt_t *const pkt,
                                   FILE *const outfile) {
  char header[12];
  vpx_codec_pts_t pts;

  if (pkt->kind != VPX_CODEC_CX_FRAME_PKT)
    return;

  pts = pkt->data.frame.pts;
  mem_put_le32(header, static_cast<unsigned int>(pkt->data.frame.sz));
  mem_put_le32(header + 4, pts & 0xFFFFFFFF);
  mem_put_le32(header + 8, pts >> 32);

  (void)fwrite(header, 1, 12, outfile);
}
#endif  // WRITE_COMPRESSED_STREAM

const unsigned int kInitialWidth = 320;
const unsigned int kInitialHeight = 240;

@@ -102,8 +42,6 @@ class ResizingVideoSource : public ::libvpx_test::DummyVideoSource {
    limit_ = 60;
  }

  virtual ~ResizingVideoSource() {}

 protected:
  virtual void Next() {
    ++frame_;
@@ -118,15 +56,13 @@ class ResizeTest : public ::libvpx_test::EncoderTest,
 protected:
  ResizeTest() : EncoderTest(GET_PARAM(0)) {}

  virtual ~ResizeTest() {}

  struct FrameInfo {
    FrameInfo(vpx_codec_pts_t _pts, unsigned int _w, unsigned int _h)
        : pts(_pts), w(_w), h(_h) {}

    vpx_codec_pts_t pts;
    unsigned int w;
    unsigned int h;
    unsigned int    w;
    unsigned int    h;
  };

  virtual void SetUp() {
@@ -159,47 +95,17 @@ TEST_P(ResizeTest, TestExternalResizeWorks) {
  }
}

const unsigned int kStepDownFrame = 3;
const unsigned int kStepUpFrame = 6;

class ResizeInternalTest : public ResizeTest {
 protected:
#if WRITE_COMPRESSED_STREAM
  ResizeInternalTest()
      : ResizeTest(),
        frame0_psnr_(0.0),
        outfile_(NULL),
        out_frames_(0) {}
#else
  ResizeInternalTest() : ResizeTest(), frame0_psnr_(0.0) {}
#endif

  virtual ~ResizeInternalTest() {}

  virtual void BeginPassHook(unsigned int /*pass*/) {
#if WRITE_COMPRESSED_STREAM
    outfile_ = fopen("vp90-2-05-resize.ivf", "wb");
#endif
  }

  virtual void EndPassHook() {
#if WRITE_COMPRESSED_STREAM
    if (outfile_) {
      if (!fseek(outfile_, 0, SEEK_SET))
        write_ivf_file_header(&cfg_, out_frames_, outfile_);
      fclose(outfile_);
      outfile_ = NULL;
    }
#endif
  }

  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
                                  libvpx_test::Encoder *encoder) {
    if (video->frame() == kStepDownFrame) {
    if (video->frame() == 3) {
      struct vpx_scaling_mode mode = {VP8E_FOURFIVE, VP8E_THREEFIVE};
      encoder->Control(VP8E_SET_SCALEMODE, &mode);
    }
    if (video->frame() == kStepUpFrame) {
    if (video->frame() == 6) {
      struct vpx_scaling_mode mode = {VP8E_NORMAL, VP8E_NORMAL};
      encoder->Control(VP8E_SET_SCALEMODE, &mode);
    }
@@ -211,46 +117,21 @@ class ResizeInternalTest : public ResizeTest {
      EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 1.0);
  }

  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
#if WRITE_COMPRESSED_STREAM
    ++out_frames_;

    // Write initial file header if first frame.
    if (pkt->data.frame.pts == 0)
      write_ivf_file_header(&cfg_, 0, outfile_);

    // Write frame header and data.
    write_ivf_frame_header(pkt, outfile_);
    (void)fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, outfile_);
#endif
  }

  double frame0_psnr_;
#if WRITE_COMPRESSED_STREAM
  FILE *outfile_;
  unsigned int out_frames_;
#endif
};

TEST_P(ResizeInternalTest, TestInternalResizeWorks) {
  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
                                       30, 1, 0, 10);
  init_flags_ = VPX_CODEC_USE_PSNR;

  // q picked such that initial keyframe on this clip is ~30dB PSNR
  cfg_.rc_min_quantizer = cfg_.rc_max_quantizer = 48;

  // If the number of frames being encoded is smaller than g_lag_in_frames
  // the encoded frame is unavailable using the current API. Comparing
  // frames to detect mismatch would then not be possible. Set
  // g_lag_in_frames = 0 to get around this.
  cfg_.g_lag_in_frames = 0;
  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));

  for (std::vector<FrameInfo>::iterator info = frame_info_list_.begin();
       info != frame_info_list_.end(); ++info) {
    const vpx_codec_pts_t pts = info->pts;
    if (pts >= kStepDownFrame && pts < kStepUpFrame) {
    if (pts >= 3 && pts < 6) {
      ASSERT_EQ(282U, info->w) << "Frame " << pts << " had unexpected width";
      ASSERT_EQ(173U, info->h) << "Frame " << pts << " had unexpected height";
    } else {

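The 282x173 expectation in the step-down window follows from the VP8E_FOURFIVE (4/5) horizontal and VP8E_THREEFIVE (3/5) vertical scaling applied to the 352x288 source, assuming the codec rounds the scaled dimensions to the nearest integer:

\[
352 \cdot \tfrac{4}{5} = 281.6 \approx 282, \qquad 288 \cdot \tfrac{3}{5} = 172.8 \approx 173.
\]
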
@@ -17,6 +17,7 @@ extern "C" {
#include "./vpx_config.h"
#if CONFIG_VP8_ENCODER
#include "./vp8_rtcd.h"
//#include "vp8/common/blockd.h"
#endif
#if CONFIG_VP9_ENCODER
#include "./vp9_rtcd.h"

@@ -17,19 +17,15 @@
#include <sys/types.h>

#include "third_party/googletest/src/include/gtest/gtest.h"
#include "test/acm_random.h"
#include "vpx/vpx_integer.h"
#include "vpx_mem/vpx_mem.h"
extern "C" {
#include "vp8/encoder/onyx_int.h"
}

using libvpx_test::ACMRandom;

namespace {

TEST(Vp8RoiMapTest, ParameterCheck) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  int delta_q[MAX_MB_SEGMENTS] = { -2, -25, 0, 31 };
  int delta_lf[MAX_MB_SEGMENTS] = { -2, -25, 0, 31 };
  unsigned int threshold[MAX_MB_SEGMENTS] = { 0, 100, 200, 300 };
@@ -125,10 +121,10 @@ TEST(Vp8RoiMapTest, ParameterCheck) {
  for (int i = 0; i < 1000; ++i) {
    int rand_deltas[4];
    int deltas_valid;
    rand_deltas[0] = rnd(160) - 80;
    rand_deltas[1] = rnd(160) - 80;
    rand_deltas[2] = rnd(160) - 80;
    rand_deltas[3] = rnd(160) - 80;
    rand_deltas[0] = (rand() % 160) - 80;
    rand_deltas[1] = (rand() % 160) - 80;
    rand_deltas[2] = (rand() % 160) - 80;
    rand_deltas[3] = (rand() % 160) - 80;

    deltas_valid = ((abs(rand_deltas[0]) <= 63) &&
                    (abs(rand_deltas[1]) <= 63) &&

@@ -13,8 +13,8 @@
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
extern "C" {
#include "./vpx_config.h"
#include "./vp8_rtcd.h"
#include "vpx_config.h"
#include "vp8_rtcd.h"
#include "vp8/common/blockd.h"
#include "vp8/encoder/block.h"
#include "vpx_mem/vpx_mem.h"
@@ -51,7 +51,7 @@ TEST_P(SubtractBlockTest, SimpleSubtract) {
  bd.predictor = reinterpret_cast<unsigned char*>(
      vpx_memalign(16, kBlockHeight * kDiffPredStride * sizeof(*bd.predictor)));

  for (int i = 0; kSrcStride[i] > 0; ++i) {
  for(int i = 0; kSrcStride[i] > 0; ++i) {
    // start at block0
    be.src = 0;
    be.base_src = &source;

@@ -520,12 +520,3 @@ d17bc08eedfc60c4c23d576a6c964a21bf854d1f vp90-2-03-size-226x202.webm
83c6d8f2969b759e10e5c6542baca1265c874c29 vp90-2-03-size-226x224.webm.md5
fe0af2ee47b1e5f6a66db369e2d7e9d870b38dce vp90-2-03-size-226x226.webm
94ad19b8b699cea105e2ff18f0df2afd7242bcf7 vp90-2-03-size-226x226.webm.md5
b6524e4084d15b5d0caaa3d3d1368db30cbee69c vp90-2-03-deltaq.webm
65f45ec9a55537aac76104818278e0978f94a678 vp90-2-03-deltaq.webm.md5
4dbb87494c7f565ffc266c98d17d0d8c7a5c5aba vp90-2-05-resize.ivf
7f6d8879336239a43dbb6c9f13178cb11cf7ed09 vp90-2-05-resize.ivf.md5
bf61ddc1f716eba58d4c9837d4e91031d9ce4ffe vp90-2-06-bilinear.webm
f6235f937552e11d8eb331ec55da6b3aa596b9ac vp90-2-06-bilinear.webm.md5
495256cfd123fe777b2c0406862ed8468a1f4677 vp91-2-04-yv444.webm
65e3a7ffef61ab340d9140f335ecc49125970c2c vp91-2-04-yv444.webm.md5

test/test.mk
@@ -24,7 +24,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += error_resilience_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += i420_video_source.h
LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += keyframe_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += borders_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += resize_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += resize_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += cpu_speed_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_lossless_test.cc

@@ -629,11 +629,3 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x224.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x224.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x226.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x226.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-deltaq.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-deltaq.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-05-resize.ivf
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-05-resize.ivf.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-06-bilinear.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-06-bilinear.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yv444.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yv444.webm.md5

@@ -8,7 +8,7 @@
 * be found in the AUTHORS file in the root of the source tree.
 */
#include <string>
#include "./vpx_config.h"
#include "vpx_config.h"
extern "C" {
#if ARCH_X86 || ARCH_X86_64
#include "vpx_ports/x86.h"
@@ -48,9 +48,7 @@ int main(int argc, char **argv) {
#endif

#if !CONFIG_SHARED
  // Shared library builds don't support whitebox tests
  // that exercise internal symbols.

  /* Shared library builds don't support whitebox tests that exercise internal symbols. */
#if CONFIG_VP8
  vp8_rtcd();
#endif

@@ -159,11 +159,7 @@ const char *kVP9TestVectors[] = {
  "vp90-2-03-size-226x198.webm", "vp90-2-03-size-226x200.webm",
  "vp90-2-03-size-226x202.webm", "vp90-2-03-size-226x208.webm",
  "vp90-2-03-size-226x210.webm", "vp90-2-03-size-226x224.webm",
  "vp90-2-03-size-226x226.webm", "vp90-2-03-deltaq.webm",
  "vp90-2-05-resize.ivf", "vp90-2-06-bilinear.webm",
#if CONFIG_NON420
  "vp91-2-04-yv444.webm"
#endif
  "vp90-2-03-size-226x226.webm"
};
#endif

@@ -16,16 +16,16 @@
#include "test/register_state_check.h"

#include "vpx/vpx_integer.h"
#include "./vpx_config.h"
#include "vpx_config.h"
extern "C" {
#include "vpx_mem/vpx_mem.h"
#if CONFIG_VP8_ENCODER
# include "vp8/common/variance.h"
# include "./vp8_rtcd.h"
# include "vp8_rtcd.h"
#endif
#if CONFIG_VP9_ENCODER
# include "vp9/encoder/vp9_variance.h"
# include "./vp9_rtcd.h"
# include "vp9_rtcd.h"
#endif
}
#include "test/acm_random.h"
@@ -107,8 +107,8 @@ static unsigned int subpel_avg_variance_ref(const uint8_t *ref,
}

template<typename VarianceFunctionType>
class VarianceTest
    : public ::testing::TestWithParam<tuple<int, int, VarianceFunctionType> > {
class VarianceTest :
    public ::testing::TestWithParam<tuple<int, int, VarianceFunctionType> > {
 public:
  virtual void SetUp() {
    const tuple<int, int, VarianceFunctionType>& params = this->GetParam();
@@ -191,9 +191,9 @@ void VarianceTest<VarianceFunctionType>::OneQuarterTest() {
}

template<typename SubpelVarianceFunctionType>
class SubpelVarianceTest
    : public ::testing::TestWithParam<tuple<int, int,
                                            SubpelVarianceFunctionType> > {
class SubpelVarianceTest :
    public ::testing::TestWithParam<tuple<int, int,
                                          SubpelVarianceFunctionType> > {
 public:
  virtual void SetUp() {
    const tuple<int, int, SubpelVarianceFunctionType>& params =

@@ -8,6 +8,10 @@
 * be found in the AUTHORS file in the root of the source tree.
 */

extern "C" {
#include "vp8/encoder/boolhuff.h"
#include "vp8/decoder/dboolhuff.h"
}

#include <math.h>
#include <stddef.h>
@@ -20,11 +24,6 @@
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "vpx/vpx_integer.h"

extern "C" {
#include "vp8/encoder/boolhuff.h"
#include "vp8/decoder/dboolhuff.h"
}

namespace {
const int num_tests = 10;

@@ -45,7 +44,7 @@ void encrypt_buffer(uint8_t *buffer, int size) {

void test_decrypt_cb(void *decrypt_state, const uint8_t *input,
                     uint8_t *output, int count) {
  int offset = input - reinterpret_cast<uint8_t *>(decrypt_state);
  int offset = input - (uint8_t *)decrypt_state;
  for (int i = 0; i < count; i++) {
    output[i] = input[i] ^ secret_key[(offset + i) & 15];
  }
@@ -59,10 +58,10 @@ TEST(VP8, TestBitIO) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  for (int n = 0; n < num_tests; ++n) {
    for (int method = 0; method <= 7; ++method) {   // we generate various proba
      const int kBitsToTest = 1000;
      uint8_t probas[kBitsToTest];
      const int bits_to_test = 1000;
      uint8_t probas[bits_to_test];

      for (int i = 0; i < kBitsToTest; ++i) {
      for (int i = 0; i < bits_to_test; ++i) {
        const int parity = i & 1;
        probas[i] =
            (method == 0) ? 0 : (method == 1) ? 255 :
@@ -77,14 +76,14 @@ TEST(VP8, TestBitIO) {
      }
      for (int bit_method = 0; bit_method <= 3; ++bit_method) {
        const int random_seed = 6432;
        const int kBufferSize = 10000;
        const int buffer_size = 10000;
        ACMRandom bit_rnd(random_seed);
        BOOL_CODER bw;
        uint8_t bw_buffer[kBufferSize];
        vp8_start_encode(&bw, bw_buffer, bw_buffer + kBufferSize);
        uint8_t bw_buffer[buffer_size];
        vp8_start_encode(&bw, bw_buffer, bw_buffer + buffer_size);

        int bit = (bit_method == 0) ? 0 : (bit_method == 1) ? 1 : 0;
        for (int i = 0; i < kBitsToTest; ++i) {
        for (int i = 0; i < bits_to_test; ++i) {
          if (bit_method == 2) {
            bit = (i & 1);
          } else if (bit_method == 3) {
@@ -99,20 +98,19 @@ TEST(VP8, TestBitIO) {
#if CONFIG_DECRYPT
        encrypt_buffer(bw_buffer, buffer_size);
        vp8dx_start_decode(&br, bw_buffer, buffer_size,
                           test_decrypt_cb,
                           reinterpret_cast<void *>(bw_buffer));
                           test_decrypt_cb, (void *)bw_buffer);
#else
        vp8dx_start_decode(&br, bw_buffer, kBufferSize, NULL, NULL);
        vp8dx_start_decode(&br, bw_buffer, buffer_size, NULL, NULL);
#endif
        bit_rnd.Reset(random_seed);
        for (int i = 0; i < kBitsToTest; ++i) {
        for (int i = 0; i < bits_to_test; ++i) {
          if (bit_method == 2) {
            bit = (i & 1);
          } else if (bit_method == 3) {
            bit = bit_rnd(2);
          }
          GTEST_ASSERT_EQ(vp8dx_decode_bool(&br, probas[i]), bit)
              << "pos: "<< i << " / " << kBitsToTest
              << "pos: "<< i << " / " << bits_to_test
              << " bit_method: " << bit_method
              << " method: " << method;
        }

@@ -26,8 +26,7 @@ const uint8_t test_key[16] = {
  0x89, 0x9a, 0xab, 0xbc, 0xcd, 0xde, 0xef, 0xf0
};

void encrypt_buffer(const uint8_t *src, uint8_t *dst,
                    int size, int offset = 0) {
void encrypt_buffer(const uint8_t *src, uint8_t *dst, int size, int offset = 0) {
  for (int i = 0; i < size; ++i) {
    dst[i] = src[i] ^ test_key[(offset + i) & 15];
  }
@@ -35,11 +34,10 @@ void encrypt_buffer(const uint8_t *src, uint8_t *dst,

void test_decrypt_cb(void *decrypt_state, const uint8_t *input,
                     uint8_t *output, int count) {
  encrypt_buffer(input, output, count,
                 input - reinterpret_cast<uint8_t *>(decrypt_state));
  encrypt_buffer(input, output, count, input - (uint8_t *)decrypt_state);
}

}  // namespace
} // namespace

namespace libvpx_test {


@@ -18,7 +18,7 @@


extern "C" {
#include "./vp8_rtcd.h"
#include "vp8_rtcd.h"
}

#include "test/acm_random.h"

@@ -19,7 +19,7 @@ extern "C" {
#include "vp9/decoder/vp9_dboolhuff.h"
}

#include "test/acm_random.h"
#include "acm_random.h"
#include "vpx/vpx_integer.h"

using libvpx_test::ACMRandom;
@@ -32,10 +32,10 @@ TEST(VP9, TestBitIO) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  for (int n = 0; n < num_tests; ++n) {
    for (int method = 0; method <= 7; ++method) {   // we generate various proba
      const int kBitsToTest = 1000;
      uint8_t probas[kBitsToTest];
      const int bits_to_test = 1000;
      uint8_t probas[bits_to_test];

      for (int i = 0; i < kBitsToTest; ++i) {
      for (int i = 0; i < bits_to_test; ++i) {
        const int parity = i & 1;
        probas[i] =
            (method == 0) ? 0 : (method == 1) ? 255 :
@@ -50,14 +50,14 @@ TEST(VP9, TestBitIO) {
      }
      for (int bit_method = 0; bit_method <= 3; ++bit_method) {
        const int random_seed = 6432;
        const int kBufferSize = 10000;
        const int buffer_size = 10000;
        ACMRandom bit_rnd(random_seed);
        vp9_writer bw;
        uint8_t bw_buffer[kBufferSize];
        uint8_t bw_buffer[buffer_size];
        vp9_start_encode(&bw, bw_buffer);

        int bit = (bit_method == 0) ? 0 : (bit_method == 1) ? 1 : 0;
        for (int i = 0; i < kBitsToTest; ++i) {
        for (int i = 0; i < bits_to_test; ++i) {
          if (bit_method == 2) {
            bit = (i & 1);
          } else if (bit_method == 3) {
@@ -72,16 +72,16 @@ TEST(VP9, TestBitIO) {
        GTEST_ASSERT_EQ(bw_buffer[0] & 0x80, 0);

        vp9_reader br;
        vp9_reader_init(&br, bw_buffer, kBufferSize);
        vp9_reader_init(&br, bw_buffer, buffer_size);
        bit_rnd.Reset(random_seed);
        for (int i = 0; i < kBitsToTest; ++i) {
        for (int i = 0; i < bits_to_test; ++i) {
          if (bit_method == 2) {
            bit = (i & 1);
          } else if (bit_method == 3) {
            bit = bit_rnd(2);
          }
          GTEST_ASSERT_EQ(vp9_read(&br, probas[i]), bit)
              << "pos: " << i << " / " << kBitsToTest
              << "pos: " << i << " / " << bits_to_test
              << " bit_method: " << bit_method
              << " method: " << method;
        }

@@ -39,8 +39,8 @@ TEST_P(VP9SubtractBlockTest, SimpleSubtract) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());

  // FIXME(rbultje) split in its own file
  for (BLOCK_SIZE bsize = BLOCK_4X4; bsize < BLOCK_SIZES;
       bsize = static_cast<BLOCK_SIZE>(static_cast<int>(bsize) + 1)) {
  for (BLOCK_SIZE_TYPE bsize = BLOCK_4X4; bsize < BLOCK_SIZE_TYPES;
       bsize = static_cast<BLOCK_SIZE_TYPE>(static_cast<int>(bsize) + 1)) {
    const int block_width = 4 << b_width_log2(bsize);
    const int block_height = 4 << b_height_log2(bsize);
    int16_t *diff = reinterpret_cast<int16_t *>(

@@ -41,8 +41,7 @@ extern "C"
    {
        USAGE_STREAM_FROM_SERVER    = 0x0,
        USAGE_LOCAL_FILE_PLAYBACK   = 0x1,
        USAGE_CONSTRAINED_QUALITY   = 0x2,
        USAGE_CONSTANT_QUALITY      = 0x3
        USAGE_CONSTRAINED_QUALITY   = 0x2
    } END_USAGE;


@@ -313,7 +313,7 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
    /* Get baseline error score */

    /* Copy the unfiltered / processed recon buffer to the new buffer */
    vpx_yv12_copy_y(saved_frame, cm->frame_to_show);
    vp8_yv12_copy_y(saved_frame, cm->frame_to_show);

    vp8cx_set_alt_lf_level(cpi, filt_mid);
    vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_mid);
@@ -339,7 +339,7 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
        if(ss_err[filt_low] == 0)
        {
            /* Get Low filter error score */
            vpx_yv12_copy_y(saved_frame, cm->frame_to_show);
            vp8_yv12_copy_y(saved_frame, cm->frame_to_show);
            vp8cx_set_alt_lf_level(cpi, filt_low);
            vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_low);

@@ -367,7 +367,7 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
        {
            if(ss_err[filt_high] == 0)
            {
                vpx_yv12_copy_y(saved_frame, cm->frame_to_show);
                vp8_yv12_copy_y(saved_frame, cm->frame_to_show);
                vp8cx_set_alt_lf_level(cpi, filt_high);
                vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_high);


@@ -153,7 +153,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
#else
    RANGE_CHECK_HI(cfg, g_lag_in_frames, 25);
#endif
    RANGE_CHECK(cfg, rc_end_usage, VPX_VBR, VPX_Q);
    RANGE_CHECK(cfg, rc_end_usage, VPX_VBR, VPX_CQ);
    RANGE_CHECK_HI(cfg, rc_undershoot_pct, 1000);
    RANGE_CHECK_HI(cfg, rc_overshoot_pct, 1000);
    RANGE_CHECK_HI(cfg, rc_2pass_vbr_bias_pct, 100);
@@ -204,7 +204,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
    RANGE_CHECK_HI(vp8_cfg, arnr_strength, 6);
    RANGE_CHECK(vp8_cfg, arnr_type, 1, 3);
    RANGE_CHECK(vp8_cfg, cq_level, 0, 63);
    if (finalize && (cfg->rc_end_usage == VPX_CQ || cfg->rc_end_usage == VPX_Q))
    if(finalize && cfg->rc_end_usage == VPX_CQ)
        RANGE_CHECK(vp8_cfg, cq_level,
                    cfg->rc_min_quantizer, cfg->rc_max_quantizer);

@@ -327,14 +327,17 @@ static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf,
    oxcf->resample_up_water_mark   = cfg.rc_resize_up_thresh;
    oxcf->resample_down_water_mark = cfg.rc_resize_down_thresh;

    if (cfg.rc_end_usage == VPX_VBR) {
      oxcf->end_usage = USAGE_LOCAL_FILE_PLAYBACK;
    } else if (cfg.rc_end_usage == VPX_CBR) {
      oxcf->end_usage = USAGE_STREAM_FROM_SERVER;
    } else if (cfg.rc_end_usage == VPX_CQ) {
      oxcf->end_usage = USAGE_CONSTRAINED_QUALITY;
    } else if (cfg.rc_end_usage == VPX_Q) {
      oxcf->end_usage = USAGE_CONSTANT_QUALITY;
    if (cfg.rc_end_usage == VPX_VBR)
    {
        oxcf->end_usage = USAGE_LOCAL_FILE_PLAYBACK;
    }
    else if (cfg.rc_end_usage == VPX_CBR)
    {
        oxcf->end_usage = USAGE_STREAM_FROM_SERVER;
    }
    else if (cfg.rc_end_usage == VPX_CQ)
    {
        oxcf->end_usage = USAGE_CONSTRAINED_QUALITY;
    }

    oxcf->target_bandwidth = cfg.rc_target_bitrate;
@@ -1269,7 +1272,7 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] =
        1,                          /* g_delete_first_pass_file */
        "vp8.fpf"                   /* first pass filename */
#endif
        VPX_SS_DEFAULT_LAYERS,      /* ss_number_layers */

        1,                          /* ts_number_layers */
        {0},                        /* ts_target_bitrate */
        {0},                        /* ts_rate_decimator */

@@ -18,6 +18,7 @@
#include <stdlib.h>
#include <stdarg.h>
#include <string.h>
#include <time.h>
#define VPX_CODEC_DISABLE_COMPAT 1
#include "vpx/vpx_encoder.h"
#include "vpx/vp8cx.h"
@@ -137,6 +138,8 @@ int main(int argc, char **argv) {
  int layer_flags[VPX_TS_MAX_PERIODICITY] = {0};
  int flag_periodicity;
  int max_intra_size_pct;
  clock_t before;
  clock_t after;

  /* Check usage and arguments */
  if (argc < 9)
@@ -639,6 +642,7 @@ int main(int argc, char **argv) {
  vpx_codec_control(&codec, VP8E_SET_MAX_INTRA_BITRATE_PCT,
                    max_intra_size_pct);

  before = clock();
  frame_avail = 1;
  while (frame_avail || got_data) {
    vpx_codec_iter_t iter = NULL;
@@ -660,8 +664,8 @@ int main(int argc, char **argv) {
      got_data = 1;
      switch (pkt->kind) {
        case VPX_CODEC_CX_FRAME_PKT:
          for (i=cfg.ts_layer_id[frame_cnt % cfg.ts_periodicity];
               i<cfg.ts_number_layers; i++)
          for (i = cfg.ts_layer_id[frame_cnt % cfg.ts_periodicity];
               i < cfg.ts_number_layers; i++)
          {
            write_ivf_frame_header(outfile[i], pkt);
            (void) fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz,
@@ -676,9 +680,13 @@ int main(int argc, char **argv) {
    frame_cnt++;
    pts += frame_duration;
  }
  after = clock();

  printf("Processed %d frames in %ld ms.\n", frame_cnt-1,
         (int) (after - before) / (CLOCKS_PER_SEC / 1000));

  fclose (infile);

  printf ("Processed %d frames.\n",frame_cnt-1);
  if (vpx_codec_destroy(&codec))
    die_codec (&codec, "Failed to destroy codec");

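The timing added above uses clock(), which counts processor ticks; converting to milliseconds needs CLOCKS_PER_SEC. A minimal standalone sketch of the same pattern (illustrative only; the multiply-first form avoids truncation when CLOCKS_PER_SEC is not a multiple of 1000):

    #include <stdio.h>
    #include <time.h>

    int main(void) {
      const clock_t before = clock();
      /* ... the work being timed would run here ... */
      const clock_t after = clock();
      /* Scale ticks to milliseconds. */
      const long ms = (long)(((after - before) * 1000) / CLOCKS_PER_SEC);
      printf("Elapsed: %ld ms\n", ms);
      return 0;
    }
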
@@ -1,116 +0,0 @@
;
;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

    EXPORT  |vp9_convolve_avg_neon|
    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

|vp9_convolve_avg_neon| PROC
    push                {r4-r6, lr}
    ldrd                r4, r5, [sp, #32]
    mov                 r6, r2

    cmp                 r4, #32
    bgt                 avg64
    beq                 avg32
    cmp                 r4, #8
    bgt                 avg16
    beq                 avg8
    b                   avg4

avg64
    sub                 lr, r1, #32
    sub                 r4, r3, #32
avg64_h
    pld                 [r0, r1, lsl #1]
    vld1.8              {q0-q1}, [r0]!
    vld1.8              {q2-q3}, [r0], lr
    pld                 [r2, r3]
    vld1.8              {q8-q9},   [r6@128]!
    vld1.8              {q10-q11}, [r6@128], r4
    vrhadd.u8           q0, q0, q8
    vrhadd.u8           q1, q1, q9
    vrhadd.u8           q2, q2, q10
    vrhadd.u8           q3, q3, q11
    vst1.8              {q0-q1}, [r2@128]!
    vst1.8              {q2-q3}, [r2@128], r4
    subs                r5, r5, #1
    bgt                 avg64_h
    pop                 {r4-r6, pc}

avg32
    vld1.8              {q0-q1}, [r0], r1
    vld1.8              {q2-q3}, [r0], r1
    vld1.8              {q8-q9},   [r6@128], r3
    vld1.8              {q10-q11}, [r6@128], r3
    pld                 [r0]
    vrhadd.u8           q0, q0, q8
    pld                 [r0, r1]
    vrhadd.u8           q1, q1, q9
    pld                 [r6]
    vrhadd.u8           q2, q2, q10
    pld                 [r6, r3]
    vrhadd.u8           q3, q3, q11
    vst1.8              {q0-q1}, [r2@128], r3
    vst1.8              {q2-q3}, [r2@128], r3
    subs                r5, r5, #2
    bgt                 avg32
    pop                 {r4-r6, pc}

avg16
    vld1.8              {q0}, [r0], r1
    vld1.8              {q1}, [r0], r1
    vld1.8              {q2}, [r6@128], r3
    vld1.8              {q3}, [r6@128], r3
    pld                 [r0]
    pld                 [r0, r1]
    vrhadd.u8           q0, q0, q2
    pld                 [r6]
    pld                 [r6, r3]
    vrhadd.u8           q1, q1, q3
    vst1.8              {q0}, [r2@128], r3
    vst1.8              {q1}, [r2@128], r3
    subs                r5, r5, #2
    bgt                 avg16
    pop                 {r4-r6, pc}

avg8
    vld1.8              {d0}, [r0], r1
    vld1.8              {d1}, [r0], r1
    vld1.8              {d2}, [r6@64], r3
    vld1.8              {d3}, [r6@64], r3
    pld                 [r0]
    pld                 [r0, r1]
    vrhadd.u8           q0, q0, q1
    pld                 [r6]
    pld                 [r6, r3]
    vst1.8              {d0}, [r2@64], r3
    vst1.8              {d1}, [r2@64], r3
    subs                r5, r5, #2
    bgt                 avg8
    pop                 {r4-r6, pc}

avg4
    vld1.32             {d0[0]}, [r0], r1
    vld1.32             {d0[1]}, [r0], r1
    vld1.32             {d2[0]}, [r6@32], r3
    vld1.32             {d2[1]}, [r6@32], r3
    vrhadd.u8           d0, d0, d2
    vst1.32             {d0[0]}, [r2@32], r3
    vst1.32             {d0[1]}, [r2@32], r3
    subs                r5, r5, #2
    bgt                 avg4
    pop                 {r4-r6, pc}
    ENDP

    END
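The core of the deleted kernel is vrhadd.u8, a rounding halving add that averages the new pixels into the destination. A plain-C sketch of what one row of the averaging computes (illustrative only; the function name is ours):

    #include <stdint.h>

    /* Scalar equivalent of NEON vrhadd.u8: average src into dst,
     * rounding halves upward, one byte at a time. */
    static void convolve_avg_row(const uint8_t *src, uint8_t *dst, int w) {
      int i;
      for (i = 0; i < w; ++i)
        dst[i] = (uint8_t)((src[i] + dst[i] + 1) >> 1);
    }
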
@@ -66,64 +66,46 @@

    vld1.s16 {q0}, [r5]  ; filter_x

    sub r8, r1, r1, lsl #2  ; -src_stride * 3
    add r8, r8, #4  ; -src_stride * 3 + 4
    add r8, r1, r1, lsl #1  ; src_stride * 3
    add r8, r8, #4  ; src_stride * 3 + 4
    rsb r8, r8, #0  ; reset for src

    sub r4, r3, r3, lsl #2  ; -dst_stride * 3
    add r4, r4, #4  ; -dst_stride * 3 + 4
    add r4, r3, r3, lsl #1  ; dst_stride * 3
    sub r4, r4, #4  ; dst_stride * 3 - 4
    rsb r4, r4, #0  ; reset for dst

    rsb r9, r6, r1, lsl #2  ; reset src for outer loop
    sub r9, r9, #7
    sub r9, r1, #8  ; post increment for src load

    rsb r1, r6, r1, lsl #2  ; reset src for outer loop
    rsb r12, r6, r3, lsl #2  ; reset dst for outer loop

    mov r10, r6  ; w loop counter

loop_horiz_v
    vld1.8 {d24}, [r0], r1
    vld1.8 {d25}, [r0], r1
    vld1.8 {d26}, [r0], r1
    vld1.8 {d27}, [r0], r8
loop_horiz
    vld1.8 {d24}, [r0]!
    vld3.u8 {d28[0], d29[0], d30[0]}, [r0], r9

    vld1.8 {d25}, [r0]!
    vld3.u8 {d28[1], d29[1], d30[1]}, [r0], r9

    vld1.8 {d26}, [r0]!
    vld3.u8 {d28[2], d29[2], d30[2]}, [r0], r9

    vld1.8 {d27}, [r0]!
    vld3.u8 {d28[3], d29[3], d30[3]}, [r0], r8

    vtrn.16 q12, q13
    vtrn.8 d24, d25
    vtrn.8 d26, d27

    pld [r0, r1, lsl #2]

    ; extract to s16
    vmovl.u8 q8, d24
    vmovl.u8 q9, d25
    vmovl.u8 q10, d26
    vmovl.u8 q11, d27

    ; save a few instructions in the inner loop
    vswp d17, d18
    vmov d23, d21

    add r0, r0, #3

loop_horiz
    add r5, r0, #64

    vld1.32 {d28[]}, [r0], r1
    vld1.32 {d29[]}, [r0], r1
    vld1.32 {d31[]}, [r0], r1
    vld1.32 {d30[]}, [r0], r8

    pld [r5]

    vtrn.16 d28, d31
    vtrn.16 d29, d30
    vtrn.8 d28, d29
    vtrn.8 d31, d30

    pld [r5, r1]

    ; extract to s16
    vtrn.32 q14, q15
    vtrn.32 d28, d29  ; only the first half is populated
    vmovl.u8 q12, d28
    vmovl.u8 q13, d29

    pld [r5, r1, lsl #1]
    vmovl.u8 q13, d30

    ; slightly out of order load to match the existing data
    vld1.u32 {d6[0]}, [r2], r3
@@ -134,12 +116,10 @@ loop_horiz
    sub r2, r2, r3, lsl #2  ; reset for store

    ; src[] * filter_x
    MULTIPLY_BY_Q0 q1, d16, d17, d20, d22, d18, d19, d23, d24
    MULTIPLY_BY_Q0 q2, d17, d20, d22, d18, d19, d23, d24, d26
    MULTIPLY_BY_Q0 q14, d20, d22, d18, d19, d23, d24, d26, d27
    MULTIPLY_BY_Q0 q15, d22, d18, d19, d23, d24, d26, d27, d25

    pld [r5, -r8]
    MULTIPLY_BY_Q0 q1, d16, d18, d20, d22, d17, d19, d21, d23
    MULTIPLY_BY_Q0 q2, d18, d20, d22, d17, d19, d21, d23, d24
    MULTIPLY_BY_Q0 q14, d20, d22, d17, d19, d21, d23, d24, d25
    MULTIPLY_BY_Q0 q15, d22, d17, d19, d21, d23, d24, d25, d26

    ; += 64 >> 7
    vqrshrun.s32 d2, q1, #7
@@ -155,29 +135,24 @@ loop_horiz
    vtrn.16 d2, d3
    vtrn.32 d2, d3
    vtrn.8 d2, d3


    ; average the new value and the dst value
    vrhadd.u8 q1, q1, q3

    vst1.u32 {d2[0]}, [r2@32], r3
    vst1.u32 {d3[0]}, [r2@32], r3
    vst1.u32 {d2[1]}, [r2@32], r3
    vst1.u32 {d3[1]}, [r2@32], r4

    vmov q8, q9
    vmov d20, d23
    vmov q11, q12
    vmov q9, q13
    vst1.u32 {d2[0]}, [r2], r3
    vst1.u32 {d3[0]}, [r2], r3
    vst1.u32 {d2[1]}, [r2], r3
    vst1.u32 {d3[1]}, [r2], r4

    subs r6, r6, #4  ; w -= 4
    bgt loop_horiz

    ; outer loop
    mov r6, r10  ; restore w counter
    add r0, r0, r9  ; src += src_stride * 4 - w
    add r0, r0, r1  ; src += src_stride * 4 - w
    add r2, r2, r12  ; dst += dst_stride * 4 - w
    subs r7, r7, #4  ; h -= 4
    bgt loop_horiz_v
    bgt loop_horiz

    pop {r4-r10, pc}

@@ -188,77 +163,66 @@ loop_horiz
    cmp r12, #16
    bne vp9_convolve8_avg_vert_c

    push {r4-r8, lr}
    push {r4-r10, lr}

    ; adjust for taps
    sub r0, r0, r1
    sub r0, r0, r1, lsl #1

    ldr r4, [sp, #32]  ; filter_y
    ldr r6, [sp, #40]  ; w
    ldr lr, [sp, #44]  ; h
    ldr r7, [sp, #40]  ; filter_y
    ldr r8, [sp, #48]  ; w
    ldr r9, [sp, #52]  ; h

    vld1.s16 {q0}, [r4]  ; filter_y
    vld1.s16 {q0}, [r7]  ; filter_y

    lsl r1, r1, #1
    lsl r3, r3, #1
    mov r5, r1, lsl #1  ; src_stride * 2
    add r5, r5, r1, lsl #3  ; src_stride * 10
    sub r5, r5, #4  ; src_stride * 10 - 4
    rsb r5, r5, #0  ; reset for src

loop_vert_h
    mov r4, r0
    add r7, r0, r1, asr #1
    mov r5, r2
    add r8, r2, r3, asr #1
    mov r12, lr  ; h loop counter
    add r6, r3, r3, lsl #1  ; dst_stride * 3
    sub r6, r6, #4  ; dst_stride * 3 - 4
    rsb r6, r6, #0  ; reset for dst

    vld1.u32 {d16[0]}, [r4], r1
    vld1.u32 {d16[1]}, [r7], r1
    vld1.u32 {d18[0]}, [r4], r1
    vld1.u32 {d18[1]}, [r7], r1
    vld1.u32 {d20[0]}, [r4], r1
    vld1.u32 {d20[1]}, [r7], r1
    vld1.u32 {d22[0]}, [r4], r1
    rsb r7, r8, r1, lsl #2  ; reset src for outer loop
    rsb r12, r8, r3, lsl #2  ; reset dst for outer loop

    mov r10, r8  ; w loop counter

loop_vert
    ; always process a 4x4 block at a time
    vld1.u32 {d16[0]}, [r0], r1
    vld1.u32 {d16[1]}, [r0], r1
    vld1.u32 {d18[0]}, [r0], r1
    vld1.u32 {d18[1]}, [r0], r1
    vld1.u32 {d20[0]}, [r0], r1
    vld1.u32 {d20[1]}, [r0], r1
    vld1.u32 {d22[0]}, [r0], r1
    vld1.u32 {d22[1]}, [r0], r1
    vld1.u32 {d24[0]}, [r0], r1
    vld1.u32 {d24[1]}, [r0], r1
    vld1.u32 {d26[0]}, [r0], r5

    ; extract to s16
    vmovl.u8 q8, d16
    vmovl.u8 q9, d18
    vmovl.u8 q10, d20
    vmovl.u8 q11, d22

loop_vert
    ; always process a 4x4 block at a time
    vld1.u32 {d24[0]}, [r7], r1
    vld1.u32 {d26[0]}, [r4], r1
    vld1.u32 {d26[1]}, [r7], r1
    vld1.u32 {d24[1]}, [r4], r1

    ; extract to s16
    vmovl.u8 q12, d24
    vmovl.u8 q13, d26

    vld1.u32 {d6[0]}, [r5@32], r3
    vld1.u32 {d6[1]}, [r8@32], r3
    vld1.u32 {d7[0]}, [r5@32], r3
    vld1.u32 {d7[1]}, [r8@32], r3
    vld1.u32 {d6[0]}, [r2], r3
    vld1.u32 {d6[1]}, [r2], r3
    vld1.u32 {d7[0]}, [r2], r3
    vld1.u32 {d7[1]}, [r2], r3

    pld [r7]
    pld [r4]
    sub r2, r2, r3, lsl #2  ; reset for store

    ; src[] * filter_y
    MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d24

    pld [r7, r1]
    pld [r4, r1]

    MULTIPLY_BY_Q0 q2, d17, d18, d19, d20, d21, d22, d24, d26

    pld [r5]
    pld [r8]

    MULTIPLY_BY_Q0 q14, d18, d19, d20, d21, d22, d24, d26, d27

    pld [r5, r3]
    pld [r8, r3]

    MULTIPLY_BY_Q0 q15, d19, d20, d21, d22, d24, d26, d27, d25
    MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d23
    MULTIPLY_BY_Q0 q2, d17, d18, d19, d20, d21, d22, d23, d24
    MULTIPLY_BY_Q0 q14, d18, d19, d20, d21, d22, d23, d24, d25
    MULTIPLY_BY_Q0 q15, d19, d20, d21, d22, d23, d24, d25, d26

    ; += 64 >> 7
    vqrshrun.s32 d2, q1, #7
@@ -273,30 +237,22 @@ loop_vert
    ; average the new value and the dst value
    vrhadd.u8 q1, q1, q3

    sub r5, r5, r3, lsl #1  ; reset for store
    sub r8, r8, r3, lsl #1
    vst1.u32 {d2[0]}, [r2], r3
    vst1.u32 {d2[1]}, [r2], r3
    vst1.u32 {d3[0]}, [r2], r3
    vst1.u32 {d3[1]}, [r2], r6

    vst1.u32 {d2[0]}, [r5@32], r3
    vst1.u32 {d2[1]}, [r8@32], r3
    vst1.u32 {d3[0]}, [r5@32], r3
    vst1.u32 {d3[1]}, [r8@32], r3

    vmov q8, q10
    vmov d18, d22
    vmov d19, d24
    vmov q10, q13
    vmov d22, d25

    subs r12, r12, #4  ; h -= 4
    subs r8, r8, #4  ; w -= 4
    bgt loop_vert

    ; outer loop
    add r0, r0, #4
    add r2, r2, #4
    subs r6, r6, #4  ; w -= 4
    bgt loop_vert_h
    mov r8, r10  ; restore w counter
    add r0, r0, r7  ; src += 4 * src_stride - w
    add r2, r2, r12  ; dst += 4 * dst_stride - w
    subs r9, r9, #4  ; h -= 4
    bgt loop_vert

    pop {r4-r8, pc}
    pop {r4-r10, pc}

    ENDP
    END

@@ -66,72 +66,52 @@

    vld1.s16 {q0}, [r5]  ; filter_x

    sub r8, r1, r1, lsl #2  ; -src_stride * 3
    add r8, r8, #4  ; -src_stride * 3 + 4
    add r8, r1, r1, lsl #1  ; src_stride * 3
    add r8, r8, #4  ; src_stride * 3 + 4
    rsb r8, r8, #0  ; reset for src

    sub r4, r3, r3, lsl #2  ; -dst_stride * 3
    add r4, r4, #4  ; -dst_stride * 3 + 4
    add r4, r3, r3, lsl #1  ; dst_stride * 3
    sub r4, r4, #4  ; dst_stride * 3 - 4
    rsb r4, r4, #0  ; reset for dst

    rsb r9, r6, r1, lsl #2  ; reset src for outer loop
    sub r9, r9, #7
    sub r9, r1, #8  ; post increment for src load

    rsb r1, r6, r1, lsl #2  ; reset src for outer loop
    rsb r12, r6, r3, lsl #2  ; reset dst for outer loop

    mov r10, r6  ; w loop counter

loop_horiz_v
    vld1.8 {d24}, [r0], r1
    vld1.8 {d25}, [r0], r1
    vld1.8 {d26}, [r0], r1
    vld1.8 {d27}, [r0], r8
loop_horiz
    vld1.8 {d24}, [r0]!
    vld3.u8 {d28[0], d29[0], d30[0]}, [r0], r9

    vld1.8 {d25}, [r0]!
    vld3.u8 {d28[1], d29[1], d30[1]}, [r0], r9

    vld1.8 {d26}, [r0]!
    vld3.u8 {d28[2], d29[2], d30[2]}, [r0], r9

    vld1.8 {d27}, [r0]!
    vld3.u8 {d28[3], d29[3], d30[3]}, [r0], r8

    vtrn.16 q12, q13
    vtrn.8 d24, d25
    vtrn.8 d26, d27

    pld [r0, r1, lsl #2]

    ; extract to s16
    vmovl.u8 q8, d24
    vmovl.u8 q9, d25
    vmovl.u8 q10, d26
    vmovl.u8 q11, d27

    ; save a few instructions in the inner loop
    vswp d17, d18
    vmov d23, d21

    add r0, r0, #3

loop_horiz
    add r5, r0, #64

    vld1.32 {d28[]}, [r0], r1
    vld1.32 {d29[]}, [r0], r1
    vld1.32 {d31[]}, [r0], r1
    vld1.32 {d30[]}, [r0], r8

    pld [r5]

    vtrn.16 d28, d31
    vtrn.16 d29, d30
    vtrn.8 d28, d29
    vtrn.8 d31, d30

    pld [r5, r1]

    ; extract to s16
    vtrn.32 q14, q15
    vtrn.32 d28, d29  ; only the first half is populated
    vmovl.u8 q12, d28
    vmovl.u8 q13, d29

    pld [r5, r1, lsl #1]
    vmovl.u8 q13, d30

    ; src[] * filter_x
    MULTIPLY_BY_Q0 q1, d16, d17, d20, d22, d18, d19, d23, d24
    MULTIPLY_BY_Q0 q2, d17, d20, d22, d18, d19, d23, d24, d26
    MULTIPLY_BY_Q0 q14, d20, d22, d18, d19, d23, d24, d26, d27
    MULTIPLY_BY_Q0 q15, d22, d18, d19, d23, d24, d26, d27, d25

    pld [r5, -r8]
    MULTIPLY_BY_Q0 q1, d16, d18, d20, d22, d17, d19, d21, d23
    MULTIPLY_BY_Q0 q2, d18, d20, d22, d17, d19, d21, d23, d24
    MULTIPLY_BY_Q0 q14, d20, d22, d17, d19, d21, d23, d24, d25
    MULTIPLY_BY_Q0 q15, d22, d17, d19, d21, d23, d24, d25, d26

    ; += 64 >> 7
    vqrshrun.s32 d2, q1, #7
@@ -148,25 +128,20 @@ loop_horiz
    vtrn.32 d2, d3
    vtrn.8 d2, d3

    vst1.u32 {d2[0]}, [r2@32], r3
    vst1.u32 {d3[0]}, [r2@32], r3
    vst1.u32 {d2[1]}, [r2@32], r3
    vst1.u32 {d3[1]}, [r2@32], r4

    vmov q8, q9
    vmov d20, d23
    vmov q11, q12
    vmov q9, q13
    vst1.u32 {d2[0]}, [r2], r3
    vst1.u32 {d3[0]}, [r2], r3
    vst1.u32 {d2[1]}, [r2], r3
    vst1.u32 {d3[1]}, [r2], r4

    subs r6, r6, #4  ; w -= 4
    bgt loop_horiz

    ; outer loop
    mov r6, r10  ; restore w counter
    add r0, r0, r9  ; src += src_stride * 4 - w
    add r0, r0, r1  ; src += src_stride * 4 - w
    add r2, r2, r12  ; dst += dst_stride * 4 - w
    subs r7, r7, #4  ; h -= 4
    bgt loop_horiz_v
    bgt loop_horiz

    pop {r4-r10, pc}

@@ -177,72 +152,59 @@ loop_horiz
    cmp r12, #16
    bne vp9_convolve8_vert_c

    push {r4-r8, lr}
    push {r4-r10, lr}

    ; adjust for taps
    sub r0, r0, r1
    sub r0, r0, r1, lsl #1

    ldr r4, [sp, #32]  ; filter_y
    ldr r6, [sp, #40]  ; w
    ldr lr, [sp, #44]  ; h
    ldr r7, [sp, #40]  ; filter_y
    ldr r8, [sp, #48]  ; w
    ldr r9, [sp, #52]  ; h

    vld1.s16 {q0}, [r4]  ; filter_y
    vld1.s16 {q0}, [r7]  ; filter_y

    lsl r1, r1, #1
    lsl r3, r3, #1
    mov r5, r1, lsl #1  ; src_stride * 2
    add r5, r5, r1, lsl #3  ; src_stride * 10
    sub r5, r5, #4  ; src_stride * 10 - 4
    rsb r5, r5, #0  ; reset for src

loop_vert_h
    mov r4, r0
    add r7, r0, r1, asr #1
    mov r5, r2
    add r8, r2, r3, asr #1
    mov r12, lr  ; h loop counter
    add r6, r3, r3, lsl #1  ; dst_stride * 3
    sub r6, r6, #4  ; dst_stride * 3 - 4
    rsb r6, r6, #0  ; reset for dst

    vld1.u32 {d16[0]}, [r4], r1
    vld1.u32 {d16[1]}, [r7], r1
    vld1.u32 {d18[0]}, [r4], r1
    vld1.u32 {d18[1]}, [r7], r1
    vld1.u32 {d20[0]}, [r4], r1
    vld1.u32 {d20[1]}, [r7], r1
    vld1.u32 {d22[0]}, [r4], r1
    rsb r7, r8, r1, lsl #2  ; reset src for outer loop
    rsb r12, r8, r3, lsl #2  ; reset dst for outer loop

    mov r10, r8  ; w loop counter

loop_vert
    ; always process a 4x4 block at a time
    vld1.u32 {d16[0]}, [r0], r1
    vld1.u32 {d16[1]}, [r0], r1
    vld1.u32 {d18[0]}, [r0], r1
    vld1.u32 {d18[1]}, [r0], r1
    vld1.u32 {d20[0]}, [r0], r1
    vld1.u32 {d20[1]}, [r0], r1
    vld1.u32 {d22[0]}, [r0], r1
    vld1.u32 {d22[1]}, [r0], r1
    vld1.u32 {d24[0]}, [r0], r1
    vld1.u32 {d24[1]}, [r0], r1
    vld1.u32 {d26[0]}, [r0], r5

    ; extract to s16
    vmovl.u8 q8, d16
    vmovl.u8 q9, d18
    vmovl.u8 q10, d20
    vmovl.u8 q11, d22

loop_vert
    ; always process a 4x4 block at a time
    vld1.u32 {d24[0]}, [r7], r1
    vld1.u32 {d26[0]}, [r4], r1
    vld1.u32 {d26[1]}, [r7], r1
    vld1.u32 {d24[1]}, [r4], r1

    ; extract to s16
    vmovl.u8 q12, d24
    vmovl.u8 q13, d26

    pld [r5]
    pld [r8]

    ; src[] * filter_y
    MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d24

    pld [r5, r3]
    pld [r8, r3]

    MULTIPLY_BY_Q0 q2, d17, d18, d19, d20, d21, d22, d24, d26

    pld [r7]
    pld [r4]

    MULTIPLY_BY_Q0 q14, d18, d19, d20, d21, d22, d24, d26, d27

    pld [r7, r1]
    pld [r4, r1]

    MULTIPLY_BY_Q0 q15, d19, d20, d21, d22, d24, d26, d27, d25
    MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d23
    MULTIPLY_BY_Q0 q2, d17, d18, d19, d20, d21, d22, d23, d24
    MULTIPLY_BY_Q0 q14, d18, d19, d20, d21, d22, d23, d24, d25
    MULTIPLY_BY_Q0 q15, d19, d20, d21, d22, d23, d24, d25, d26

    ; += 64 >> 7
    vqrshrun.s32 d2, q1, #7
@@ -254,27 +216,22 @@ loop_vert
    vqmovn.u16 d2, q1
    vqmovn.u16 d3, q2

    vst1.u32 {d2[0]}, [r5@32], r3
    vst1.u32 {d2[1]}, [r8@32], r3
    vst1.u32 {d3[0]}, [r5@32], r3
    vst1.u32 {d3[1]}, [r8@32], r3
    vst1.u32 {d2[0]}, [r2], r3
    vst1.u32 {d2[1]}, [r2], r3
    vst1.u32 {d3[0]}, [r2], r3
    vst1.u32 {d3[1]}, [r2], r6

    vmov q8, q10
    vmov d18, d22
    vmov d19, d24
    vmov q10, q13
    vmov d22, d25

    subs r12, r12, #4  ; h -= 4
    subs r8, r8, #4  ; w -= 4
    bgt loop_vert

    ; outer loop
    add r0, r0, #4
    add r2, r2, #4
    subs r6, r6, #4  ; w -= 4
    bgt loop_vert_h
    mov r8, r10  ; restore w counter
    add r0, r0, r7  ; src += 4 * src_stride - w
    add r2, r2, r12  ; dst += 4 * dst_stride - w
    subs r9, r9, #4  ; h -= 4
    bgt loop_vert

    pop {r4-r8, pc}
    pop {r4-r10, pc}

    ENDP
    END

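The MULTIPLY_BY_Q0 macro invoked above applies the eight filter taps to eight neighboring samples; the "+= 64 >> 7" comment together with the vqrshrun.s32 #7 narrowing corresponds to rounding by half of 1 << 7 and saturating the result to a byte. A hedged scalar sketch of one output pixel (illustrative name, not the project's C reference code):

#include <stdint.h>

/* Sketch of one 8-tap filter evaluation: multiply eight neighboring
 * samples by the taps, round with (+64) >> 7, then saturate to
 * [0, 255] the way vqrshrun.s32 #7 does. */
static uint8_t filter8_sketch(const uint8_t *src, const int16_t *taps) {
  int sum = 0;
  for (int k = 0; k < 8; ++k)
    sum += src[k] * taps[k];
  sum = (sum + 64) >> 7;
  if (sum < 0) sum = 0;
  if (sum > 255) sum = 255;
  return (uint8_t)sum;
}

The NEON version evaluates four such pixels per MULTIPLY_BY_Q0 and reuses loaded samples across consecutive outputs by shifting the register window, which is what the vmov/vswp shuffles above maintain.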
@@ -10,7 +10,6 @@

#include "./vp9_rtcd.h"
#include "vp9/common/vp9_common.h"
#include "vpx_ports/mem.h"

void vp9_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride,
                        uint8_t *dst, ptrdiff_t dst_stride,
@@ -20,7 +19,7 @@ void vp9_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride,
  /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the
   * maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4).
   */
  DECLARE_ALIGNED_ARRAY(8, uint8_t, temp, 64 * 72);
  uint8_t temp[64 * 72];

  // Account for the vertical phase needing 3 lines prior and 4 lines post
  int intermediate_height = h + 7;
@@ -54,7 +53,7 @@ void vp9_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
                            const int16_t *filter_x, int x_step_q4,
                            const int16_t *filter_y, int y_step_q4,
                            int w, int h) {
  DECLARE_ALIGNED_ARRAY(8, uint8_t, temp, 64 * 72);
  uint8_t temp[64 * 72];
  int intermediate_height = h + 7;

  if (x_step_q4 != 16 || y_step_q4 != 16)

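The "h + 7" above follows directly from the 8-tap structure the comment describes: the vertical phase needs 3 rows before and 4 rows after each output row, so the horizontal pass must deposit h + 7 rows into the 64 x 72 scratch buffer. A small sketch of that sizing arithmetic (assumptions as stated in the file's own comment: 8 taps, blocks no larger than 64x64):

enum { MAX_WH_SKETCH = 64, TAPS_SKETCH = 8 };

/* 3 lines prior + h output lines + 4 lines post = h + 7. With h <= 64
 * this peaks at 71 rows; allocating 64 * 72 leaves one spare row so the
 * total stays divisible by 4, matching the comment above. */
static int intermediate_height_sketch(int h) {
  return h + TAPS_SKETCH - 1;
}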
@@ -1,84 +0,0 @@
;
;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

    EXPORT  |vp9_convolve_copy_neon|
    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

|vp9_convolve_copy_neon| PROC
    push {r4-r5, lr}
    ldrd r4, r5, [sp, #28]

    cmp r4, #32
    bgt copy64
    beq copy32
    cmp r4, #8
    bgt copy16
    beq copy8
    b copy4

copy64
    sub lr, r1, #32
    sub r3, r3, #32
copy64_h
    pld [r0, r1, lsl #1]
    vld1.8 {q0-q1}, [r0]!
    vld1.8 {q2-q3}, [r0], lr
    vst1.8 {q0-q1}, [r2@128]!
    vst1.8 {q2-q3}, [r2@128], r3
    subs r5, r5, #1
    bgt copy64_h
    pop {r4-r5, pc}

copy32
    pld [r0, r1, lsl #1]
    vld1.8 {q0-q1}, [r0], r1
    pld [r0, r1, lsl #1]
    vld1.8 {q2-q3}, [r0], r1
    vst1.8 {q0-q1}, [r2@128], r3
    vst1.8 {q2-q3}, [r2@128], r3
    subs r5, r5, #2
    bgt copy32
    pop {r4-r5, pc}

copy16
    pld [r0, r1, lsl #1]
    vld1.8 {q0}, [r0], r1
    pld [r0, r1, lsl #1]
    vld1.8 {q1}, [r0], r1
    vst1.8 {q0}, [r2@128], r3
    vst1.8 {q1}, [r2@128], r3
    subs r5, r5, #2
    bgt copy16
    pop {r4-r5, pc}

copy8
    pld [r0, r1, lsl #1]
    vld1.8 {d0}, [r0], r1
    pld [r0, r1, lsl #1]
    vld1.8 {d2}, [r0], r1
    vst1.8 {d0}, [r2@64], r3
    vst1.8 {d2}, [r2@64], r3
    subs r5, r5, #2
    bgt copy8
    pop {r4-r5, pc}

copy4
    ldr r12, [r0], r1
    str r12, [r2], r3
    subs r5, r5, #1
    bgt copy4
    pop {r4-r5, pc}
    ENDP

    END
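For reference, the copy kernel above performs a straight block copy; a scalar sketch of what it computes (illustrative name, not the actual C fallback in the library):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Sketch only: each of the h rows is copied unchanged. The NEON paths
 * above merely vectorize the per-row copy for w = 4/8/16/32/64, with
 * the copy4 case falling back to plain 32-bit loads and stores. */
static void convolve_copy_sketch(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 int w, int h) {
  for (int y = 0; y < h; ++y) {
    memcpy(dst, src, (size_t)w);
    src += src_stride;
    dst += dst_stride;
  }
}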
@@ -1,169 +0,0 @@
/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vp9_rtcd.h"
#include "vp9/common/vp9_common.h"

extern void vp9_short_idct16x16_add_neon_pass1(int16_t *input,
                                               int16_t *output,
                                               int output_stride);
extern void vp9_short_idct16x16_add_neon_pass2(int16_t *src,
                                               int16_t *output,
                                               int16_t *pass1Output,
                                               int16_t skip_adding,
                                               uint8_t *dest,
                                               int dest_stride);
extern void vp9_short_idct10_16x16_add_neon_pass1(int16_t *input,
                                                  int16_t *output,
                                                  int output_stride);
extern void vp9_short_idct10_16x16_add_neon_pass2(int16_t *src,
                                                  int16_t *output,
                                                  int16_t *pass1Output,
                                                  int16_t skip_adding,
                                                  uint8_t *dest,
                                                  int dest_stride);
extern void save_neon_registers();
extern void restore_neon_registers();


void vp9_short_idct16x16_add_neon(int16_t *input,
                                  uint8_t *dest, int dest_stride) {
  int16_t pass1_output[16*16] = {0};
  int16_t row_idct_output[16*16] = {0};

  // save d8-d15 register values.
  save_neon_registers();

  /* Parallel idct on the upper 8 rows */
  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
  // stage 6 result in pass1_output.
  vp9_short_idct16x16_add_neon_pass1(input, pass1_output, 8);

  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
  // with result in pass1(pass1_output) to calculate final result in stage 7
  // which will be saved into row_idct_output.
  vp9_short_idct16x16_add_neon_pass2(input+1,
                                     row_idct_output,
                                     pass1_output,
                                     0,
                                     dest,
                                     dest_stride);

  /* Parallel idct on the lower 8 rows */
  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
  // stage 6 result in pass1_output.
  vp9_short_idct16x16_add_neon_pass1(input+8*16, pass1_output, 8);

  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
  // with result in pass1(pass1_output) to calculate final result in stage 7
  // which will be saved into row_idct_output.
  vp9_short_idct16x16_add_neon_pass2(input+8*16+1,
                                     row_idct_output+8,
                                     pass1_output,
                                     0,
                                     dest,
                                     dest_stride);

  /* Parallel idct on the left 8 columns */
  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
  // stage 6 result in pass1_output.
  vp9_short_idct16x16_add_neon_pass1(row_idct_output, pass1_output, 8);

  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
  // with result in pass1(pass1_output) to calculate final result in stage 7.
  // Then add the result to the destination data.
  vp9_short_idct16x16_add_neon_pass2(row_idct_output+1,
                                     row_idct_output,
                                     pass1_output,
                                     1,
                                     dest,
                                     dest_stride);

  /* Parallel idct on the right 8 columns */
  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
  // stage 6 result in pass1_output.
  vp9_short_idct16x16_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);

  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
  // with result in pass1(pass1_output) to calculate final result in stage 7.
  // Then add the result to the destination data.
  vp9_short_idct16x16_add_neon_pass2(row_idct_output+8*16+1,
                                     row_idct_output+8,
                                     pass1_output,
                                     1,
                                     dest+8,
                                     dest_stride);

  // restore d8-d15 register values.
  restore_neon_registers();

  return;
}

void vp9_short_idct10_16x16_add_neon(int16_t *input,
                                     uint8_t *dest, int dest_stride) {
  int16_t pass1_output[16*16] = {0};
  int16_t row_idct_output[16*16] = {0};

  // save d8-d15 register values.
  save_neon_registers();

  /* Parallel idct on the upper 8 rows */
  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
  // stage 6 result in pass1_output.
  vp9_short_idct10_16x16_add_neon_pass1(input, pass1_output, 8);

  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
  // with result in pass1(pass1_output) to calculate final result in stage 7
  // which will be saved into row_idct_output.
  vp9_short_idct10_16x16_add_neon_pass2(input+1,
                                        row_idct_output,
                                        pass1_output,
                                        0,
                                        dest,
                                        dest_stride);

  /* Skip Parallel idct on the lower 8 rows as they are all 0s */

  /* Parallel idct on the left 8 columns */
  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
  // stage 6 result in pass1_output.
  vp9_short_idct16x16_add_neon_pass1(row_idct_output, pass1_output, 8);

  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
  // with result in pass1(pass1_output) to calculate final result in stage 7.
  // Then add the result to the destination data.
  vp9_short_idct16x16_add_neon_pass2(row_idct_output+1,
                                     row_idct_output,
                                     pass1_output,
                                     1,
                                     dest,
                                     dest_stride);

  /* Parallel idct on the right 8 columns */
  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
  // stage 6 result in pass1_output.
  vp9_short_idct16x16_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);

  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
  // with result in pass1(pass1_output) to calculate final result in stage 7.
  // Then add the result to the destination data.
  vp9_short_idct16x16_add_neon_pass2(row_idct_output+8*16+1,
                                     row_idct_output+8,
                                     pass1_output,
                                     1,
                                     dest+8,
                                     dest_stride);

  // restore d8-d15 register values.
  restore_neon_registers();

  return;
}
@@ -1,47 +0,0 @@
/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "vp9/common/vp9_common.h"

// defined in vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm
extern void idct32_transpose_and_transform(int16_t *transpose_buffer,
                                           int16_t *output, int16_t *input);
extern void idct32_combine_add(uint8_t *dest, int16_t *out, int dest_stride);


// defined in vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm
extern void save_neon_registers();
extern void restore_neon_registers();

void vp9_short_idct32x32_add_neon(int16_t *input, uint8_t *dest,
                                  int dest_stride) {
  // TODO(cd): move the creation of these buffers within the ASM file
  // internal buffer used to transpose 8 lines into before transforming them
  int16_t transpose_buffer[32 * 8];
  // results of the first pass (transpose and transform rows)
  int16_t pass1[32 * 32];
  // results of the second pass (transpose and transform columns)
  int16_t pass2[32 * 32];

  // save register we need to preserve
  save_neon_registers();
  // process rows
  idct32_transpose_and_transform(transpose_buffer, pass1, input);
  // process columns
  // TODO(cd): do these two steps/passes within the ASM file
  idct32_transpose_and_transform(transpose_buffer, pass2, pass1);
  // combine and add to dest
  // TODO(cd): integrate this within the last storage step of the second pass
  idct32_combine_add(dest, pass2, dest_stride);
  // restore register we need to preserve
  restore_neon_registers();
}

// TODO(cd): Eliminate this file altogether when everything is in ASM file
@@ -361,6 +361,8 @@ v_end

    vand d16, d20, d19  ; flat && mask
    vmov r5, r6, d16
    orrs r5, r5, r6  ; Check for 0
    orreq r7, r7, #1  ; Only do filter branch

    ; flatmask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7)
    vabd.u8 d22, d3, d7  ; abs(p4 - p0)
@@ -386,11 +388,10 @@ v_end

    vmov.u8 d22, #0x80

    orrs r5, r5, r6  ; Check for 0
    orreq r7, r7, #1  ; Only do filter branch

    vand d17, d18, d16  ; flat2 && flat && mask
    vmov r5, r6, d17
    orrs r5, r5, r6  ; Check for 0
    orreq r7, r7, #2  ; Only do mbfilter branch

    ; mbfilter() function

@@ -404,10 +405,15 @@ v_end
    vmov.u8 d27, #3

    vsub.s8 d28, d23, d24  ; ( qs0 - ps0)

    vqsub.s8 d29, d25, d26  ; filter = clamp(ps1-qs1)

    vmull.s8 q15, d28, d27  ; 3 * ( qs0 - ps0)

    vand d29, d29, d21  ; filter &= hev

    vaddw.s8 q15, q15, d29  ; filter + 3 * (qs0 - ps0)

    vmov.u8 d29, #4

    ; filter = clamp(filter + 3 * ( qs0 - ps0))
@@ -446,37 +452,37 @@ v_end
    vaddl.u8 q15, d7, d8  ; op2 = p0 + q0
    vmlal.u8 q15, d4, d27  ; op2 = p0 + q0 + p3 * 3
    vmlal.u8 q15, d5, d29  ; op2 = p0 + q0 + p3 * 3 + p2 * 2
    vaddl.u8 q10, d4, d5
    vaddw.u8 q15, d6  ; op2=p1 + p0 + q0 + p3 * 3 + p2 *2
    vaddl.u8 q14, d6, d9
    vqrshrn.u16 d18, q15, #3  ; r_op2

    vsub.i16 q15, q10
    vaddl.u8 q10, d4, d6
    vadd.i16 q15, q14
    vaddl.u8 q14, d7, d10
    vsubw.u8 q15, d4  ; op1 = op2 - p3
    vsubw.u8 q15, d5  ; op1 -= p2
    vaddw.u8 q15, d6  ; op1 += p1
    vaddw.u8 q15, d9  ; op1 += q1
    vqrshrn.u16 d19, q15, #3  ; r_op1

    vsub.i16 q15, q10
    vadd.i16 q15, q14
    vaddl.u8 q14, d8, d11
    vsubw.u8 q15, d4  ; op0 = op1 - p3
    vsubw.u8 q15, d6  ; op0 -= p1
    vaddw.u8 q15, d7  ; op0 += p0
    vaddw.u8 q15, d10  ; op0 += q2
    vqrshrn.u16 d20, q15, #3  ; r_op0

    vsubw.u8 q15, d4  ; oq0 = op0 - p3
    vsubw.u8 q15, d7  ; oq0 -= p0
    vadd.i16 q15, q14
    vaddl.u8 q14, d9, d11
    vaddw.u8 q15, d8  ; oq0 += q0
    vaddw.u8 q15, d11  ; oq0 += q3
    vqrshrn.u16 d21, q15, #3  ; r_oq0

    vsubw.u8 q15, d5  ; oq1 = oq0 - p2
    vsubw.u8 q15, d8  ; oq1 -= q0
    vadd.i16 q15, q14
    vaddl.u8 q14, d10, d11
    vaddw.u8 q15, d9  ; oq1 += q1
    vaddw.u8 q15, d11  ; oq1 += q3
    vqrshrn.u16 d22, q15, #3  ; r_oq1

    vsubw.u8 q15, d6  ; oq2 = oq0 - p1
    vsubw.u8 q15, d9  ; oq2 -= q1
    vadd.i16 q15, q14
    vaddw.u8 q15, d10  ; oq2 += q2
    vaddw.u8 q15, d11  ; oq2 += q3
    vqrshrn.u16 d27, q15, #3  ; r_oq2

    ; Filter does not set op2 or oq2, so use p2 and q2.
@@ -495,104 +501,113 @@ v_end
    ; wide_mbfilter flat2 && flat && mask branch
    vmov.u8 d16, #7
    vaddl.u8 q15, d7, d8  ; op6 = p0 + q0
    vaddl.u8 q12, d2, d3
    vaddl.u8 q13, d4, d5
    vaddl.u8 q14, d1, d6
    vmlal.u8 q15, d0, d16  ; op6 += p7 * 3
    vadd.i16 q12, q13
    vadd.i16 q15, q14
    vaddl.u8 q14, d2, d9
    vadd.i16 q15, q12
    vaddl.u8 q12, d0, d1
    vaddw.u8 q15, d1
    vaddl.u8 q13, d0, d2
    vadd.i16 q14, q15, q14
    vmlal.u8 q15, d1, d29  ; op6 += p6 * 2
    vaddw.u8 q15, d2  ; op6 += p5
    vaddw.u8 q15, d3  ; op6 += p4
    vaddw.u8 q15, d4  ; op6 += p3
    vaddw.u8 q15, d5  ; op6 += p2
    vaddw.u8 q15, d6  ; op6 += p1
    vqrshrn.u16 d16, q15, #4  ; w_op6

    vsub.i16 q15, q14, q12
    vaddl.u8 q14, d3, d10
    vsubw.u8 q15, d0  ; op5 = op6 - p7
    vsubw.u8 q15, d1  ; op5 -= p6
    vaddw.u8 q15, d2  ; op5 += p5
    vaddw.u8 q15, d9  ; op5 += q1
    vqrshrn.u16 d24, q15, #4  ; w_op5

    vsub.i16 q15, q13
    vaddl.u8 q13, d0, d3
    vadd.i16 q15, q14
    vaddl.u8 q14, d4, d11
    vsubw.u8 q15, d0  ; op4 = op5 - p7
    vsubw.u8 q15, d2  ; op4 -= p5
    vaddw.u8 q15, d3  ; op4 += p4
    vaddw.u8 q15, d10  ; op4 += q2
    vqrshrn.u16 d25, q15, #4  ; w_op4

    vadd.i16 q15, q14
    vaddl.u8 q14, d0, d4
    vsub.i16 q15, q13
    vsub.i16 q14, q15, q14
    vsubw.u8 q15, d0  ; op3 = op4 - p7
    vsubw.u8 q15, d3  ; op3 -= p4
    vaddw.u8 q15, d4  ; op3 += p3
    vaddw.u8 q15, d11  ; op3 += q3
    vqrshrn.u16 d26, q15, #4  ; w_op3

    vaddw.u8 q15, q14, d5  ; op2 += p2
    vaddl.u8 q14, d0, d5
    vsubw.u8 q15, d0  ; op2 = op3 - p7
    vsubw.u8 q15, d4  ; op2 -= p3
    vaddw.u8 q15, d5  ; op2 += p2
    vaddw.u8 q15, d12  ; op2 += q4
    vbif d26, d4, d17  ; op3 |= p3 & ~(f2 & f & m)
    vqrshrn.u16 d27, q15, #4  ; w_op2

    vsub.i16 q15, q14
    vaddl.u8 q14, d0, d6
    vbif d27, d18, d17  ; op2 |= t_op2 & ~(f2 & f & m)

    vsubw.u8 q15, d0  ; op1 = op2 - p7
    vsubw.u8 q15, d5  ; op1 -= p2
    vaddw.u8 q15, d6  ; op1 += p1
    vaddw.u8 q15, d13  ; op1 += q5
    vbif d27, d18, d17  ; op2 |= t_op2 & ~(f2 & f & m)
    vqrshrn.u16 d18, q15, #4  ; w_op1

    vsub.i16 q15, q14
    vaddl.u8 q14, d0, d7
    vbif d18, d19, d17  ; op1 |= t_op1 & ~(f2 & f & m)

    vsubw.u8 q15, d0  ; op0 = op1 - p7
    vsubw.u8 q15, d6  ; op0 -= p1
    vaddw.u8 q15, d7  ; op0 += p0
    vaddw.u8 q15, d14  ; op0 += q6
    vbif d18, d19, d17  ; op1 |= t_op1 & ~(f2 & f & m)
    vqrshrn.u16 d19, q15, #4  ; w_op0

    vsub.i16 q15, q14
    vaddl.u8 q14, d1, d8
    vbif d19, d20, d17  ; op0 |= t_op0 & ~(f2 & f & m)

    vsubw.u8 q15, d0  ; oq0 = op0 - p7
    vsubw.u8 q15, d7  ; oq0 -= p0
    vaddw.u8 q15, d8  ; oq0 += q0
    vaddw.u8 q15, d15  ; oq0 += q7
    vbif d19, d20, d17  ; op0 |= t_op0 & ~(f2 & f & m)
    vqrshrn.u16 d20, q15, #4  ; w_oq0

    vsub.i16 q15, q14
    vaddl.u8 q14, d2, d9
    vaddw.u8 q15, d9  ; oq1 += q1
    vaddl.u8 q4, d10, d15
    vaddw.u8 q15, d15  ; oq1 += q7
    vbif d20, d21, d17  ; oq0 |= t_oq0 & ~(f2 & f & m)

    vsubw.u8 q15, d1  ; oq1 = oq0 - p6
    vsubw.u8 q15, d8  ; oq1 -= q0
    vaddw.u8 q15, d9  ; oq1 += q1
    vaddw.u8 q15, d15  ; oq1 += q7
    vqrshrn.u16 d21, q15, #4  ; w_oq1

    vsub.i16 q15, q14
    vaddl.u8 q14, d3, d10
    vadd.i16 q15, q4
    vaddl.u8 q4, d11, d15
    vbif d21, d22, d17  ; oq1 |= t_oq1 & ~(f2 & f & m)

    vsubw.u8 q15, d2  ; oq2 = oq1 - p5
    vsubw.u8 q15, d9  ; oq2 -= q1
    vaddw.u8 q15, d10  ; oq2 += q2
    vaddw.u8 q15, d15  ; oq2 += q7
    vqrshrn.u16 d22, q15, #4  ; w_oq2

    vsub.i16 q15, q14
    vaddl.u8 q14, d4, d11
    vadd.i16 q15, q4
    vaddl.u8 q4, d12, d15
    vbif d22, d23, d17  ; oq2 |= t_oq2 & ~(f2 & f & m)

    vsubw.u8 q15, d3  ; oq3 = oq2 - p4
    vsubw.u8 q15, d10  ; oq3 -= q2
    vaddw.u8 q15, d11  ; oq3 += q3
    vaddw.u8 q15, d15  ; oq3 += q7
    vqrshrn.u16 d23, q15, #4  ; w_oq3

    vsub.i16 q15, q14
    vaddl.u8 q14, d5, d12
    vadd.i16 q15, q4
    vaddl.u8 q4, d13, d15
    vbif d16, d1, d17  ; op6 |= p6 & ~(f2 & f & m)

    vsubw.u8 q15, d4  ; oq4 = oq3 - p3
    vsubw.u8 q15, d11  ; oq4 -= q3
    vaddw.u8 q15, d12  ; oq4 += q4
    vaddw.u8 q15, d15  ; oq4 += q7
    vqrshrn.u16 d1, q15, #4  ; w_oq4

    vsub.i16 q15, q14
    vaddl.u8 q14, d6, d13
    vadd.i16 q15, q4
    vaddl.u8 q4, d14, d15
    vbif d24, d2, d17  ; op5 |= p5 & ~(f2 & f & m)

    vsubw.u8 q15, d5  ; oq5 = oq4 - p2
    vsubw.u8 q15, d12  ; oq5 -= q4
    vaddw.u8 q15, d13  ; oq5 += q5
    vaddw.u8 q15, d15  ; oq5 += q7
    vqrshrn.u16 d2, q15, #4  ; w_oq5

    vsub.i16 q15, q14
    vbif d25, d3, d17  ; op4 |= p4 & ~(f2 & f & m)
    vadd.i16 q15, q4
    vbif d23, d11, d17  ; oq3 |= q3 & ~(f2 & f & m)

    vsubw.u8 q15, d6  ; oq6 = oq5 - p1
    vsubw.u8 q15, d13  ; oq6 -= q5
    vaddw.u8 q15, d14  ; oq6 += q6
    vaddw.u8 q15, d15  ; oq6 += q7
    vqrshrn.u16 d3, q15, #4  ; w_oq6

    vbif d26, d4, d17  ; op3 |= p3 & ~(f2 & f & m)
    vbif d23, d11, d17  ; oq3 |= q3 & ~(f2 & f & m)
    vbif d1, d12, d17  ; oq4 |= q4 & ~(f2 & f & m)
    vbif d2, d13, d17  ; oq5 |= q5 & ~(f2 & f & m)
    vbif d3, d14, d17  ; oq6 |= q6 & ~(f2 & f & m)

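The vbif instructions woven through the wide filter above select between the freshly filtered pixels and the original ones under the combined flat2 && flat && mask predicate. A bitwise sketch of that selection, based on VBIF's "insert if false" semantics (the helper name is illustrative, not project code):

#include <stdint.h>

/* vbif dest, src, mask keeps dest where mask bits are set and inserts
 * src where they are clear; per byte that is exactly this blend. */
static uint8_t bif_sketch(uint8_t filtered, uint8_t original, uint8_t mask) {
  return (uint8_t)((filtered & mask) | (original & (uint8_t)~mask));
}

Interleaving the selects with the running-sum arithmetic, as the scheduled code above does, hides their latency behind the vaddl/vsub chains.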
@@ -1,198 +0,0 @@
;
;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license and patent
;  grant that can be found in the LICENSE file in the root of the source
;  tree. All contributing project authors may be found in the AUTHORS
;  file in the root of the source tree.
;


    EXPORT  |vp9_short_idct16x16_1_add_neon|
    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

;void vp9_short_idct16x16_1_add_neon(int16_t *input, uint8_t *dest,
;                                    int dest_stride)
;
; r0  int16_t input
; r1  uint8_t *dest
; r2  int dest_stride)

|vp9_short_idct16x16_1_add_neon| PROC
    ldrsh r0, [r0]

    ; generate cospi_16_64 = 11585
    mov r12, #0x2d00
    add r12, #0x41

    ; out = dct_const_round_shift(input[0] * cospi_16_64)
    mul r0, r0, r12  ; input[0] * cospi_16_64
    add r0, r0, #0x2000  ; +(1 << ((DCT_CONST_BITS) - 1))
    asr r0, r0, #14  ; >> DCT_CONST_BITS

    ; out = dct_const_round_shift(out * cospi_16_64)
    mul r0, r0, r12  ; out * cospi_16_64
    mov r12, r1  ; save dest
    add r0, r0, #0x2000  ; +(1 << ((DCT_CONST_BITS) - 1))
    asr r0, r0, #14  ; >> DCT_CONST_BITS

    ; a1 = ROUND_POWER_OF_TWO(out, 6)
    add r0, r0, #32  ; + (1 <<((6) - 1))
    asr r0, r0, #6  ; >> 6

    vdup.s16 q0, r0  ; duplicate a1
    mov r0, #8
    sub r2, #8

    ; load destination data row0 - row3
    vld1.64 {d2}, [r1], r0
    vld1.64 {d3}, [r1], r2
    vld1.64 {d4}, [r1], r0
    vld1.64 {d5}, [r1], r2
    vld1.64 {d6}, [r1], r0
    vld1.64 {d7}, [r1], r2
    vld1.64 {d16}, [r1], r0
    vld1.64 {d17}, [r1], r2

    vaddw.u8 q9, q0, d2  ; dest[x] + a1
    vaddw.u8 q10, q0, d3  ; dest[x] + a1
    vaddw.u8 q11, q0, d4  ; dest[x] + a1
    vaddw.u8 q12, q0, d5  ; dest[x] + a1
    vqmovun.s16 d2, q9  ; clip_pixel
    vqmovun.s16 d3, q10  ; clip_pixel
    vqmovun.s16 d30, q11  ; clip_pixel
    vqmovun.s16 d31, q12  ; clip_pixel
    vst1.64 {d2}, [r12], r0
    vst1.64 {d3}, [r12], r2
    vst1.64 {d30}, [r12], r0
    vst1.64 {d31}, [r12], r2

    vaddw.u8 q9, q0, d6  ; dest[x] + a1
    vaddw.u8 q10, q0, d7  ; dest[x] + a1
    vaddw.u8 q11, q0, d16  ; dest[x] + a1
    vaddw.u8 q12, q0, d17  ; dest[x] + a1
    vqmovun.s16 d2, q9  ; clip_pixel
    vqmovun.s16 d3, q10  ; clip_pixel
    vqmovun.s16 d30, q11  ; clip_pixel
    vqmovun.s16 d31, q12  ; clip_pixel
    vst1.64 {d2}, [r12], r0
    vst1.64 {d3}, [r12], r2
    vst1.64 {d30}, [r12], r0
    vst1.64 {d31}, [r12], r2

    ; load destination data row4 - row7
    vld1.64 {d2}, [r1], r0
    vld1.64 {d3}, [r1], r2
    vld1.64 {d4}, [r1], r0
    vld1.64 {d5}, [r1], r2
    vld1.64 {d6}, [r1], r0
    vld1.64 {d7}, [r1], r2
    vld1.64 {d16}, [r1], r0
    vld1.64 {d17}, [r1], r2

    vaddw.u8 q9, q0, d2  ; dest[x] + a1
    vaddw.u8 q10, q0, d3  ; dest[x] + a1
    vaddw.u8 q11, q0, d4  ; dest[x] + a1
    vaddw.u8 q12, q0, d5  ; dest[x] + a1
    vqmovun.s16 d2, q9  ; clip_pixel
    vqmovun.s16 d3, q10  ; clip_pixel
    vqmovun.s16 d30, q11  ; clip_pixel
    vqmovun.s16 d31, q12  ; clip_pixel
    vst1.64 {d2}, [r12], r0
    vst1.64 {d3}, [r12], r2
    vst1.64 {d30}, [r12], r0
    vst1.64 {d31}, [r12], r2

    vaddw.u8 q9, q0, d6  ; dest[x] + a1
    vaddw.u8 q10, q0, d7  ; dest[x] + a1
    vaddw.u8 q11, q0, d16  ; dest[x] + a1
    vaddw.u8 q12, q0, d17  ; dest[x] + a1
    vqmovun.s16 d2, q9  ; clip_pixel
    vqmovun.s16 d3, q10  ; clip_pixel
    vqmovun.s16 d30, q11  ; clip_pixel
    vqmovun.s16 d31, q12  ; clip_pixel
    vst1.64 {d2}, [r12], r0
    vst1.64 {d3}, [r12], r2
    vst1.64 {d30}, [r12], r0
    vst1.64 {d31}, [r12], r2

    ; load destination data row8 - row11
    vld1.64 {d2}, [r1], r0
    vld1.64 {d3}, [r1], r2
    vld1.64 {d4}, [r1], r0
    vld1.64 {d5}, [r1], r2
    vld1.64 {d6}, [r1], r0
    vld1.64 {d7}, [r1], r2
    vld1.64 {d16}, [r1], r0
    vld1.64 {d17}, [r1], r2

    vaddw.u8 q9, q0, d2  ; dest[x] + a1
    vaddw.u8 q10, q0, d3  ; dest[x] + a1
    vaddw.u8 q11, q0, d4  ; dest[x] + a1
    vaddw.u8 q12, q0, d5  ; dest[x] + a1
    vqmovun.s16 d2, q9  ; clip_pixel
    vqmovun.s16 d3, q10  ; clip_pixel
    vqmovun.s16 d30, q11  ; clip_pixel
    vqmovun.s16 d31, q12  ; clip_pixel
    vst1.64 {d2}, [r12], r0
    vst1.64 {d3}, [r12], r2
    vst1.64 {d30}, [r12], r0
    vst1.64 {d31}, [r12], r2

    vaddw.u8 q9, q0, d6  ; dest[x] + a1
    vaddw.u8 q10, q0, d7  ; dest[x] + a1
    vaddw.u8 q11, q0, d16  ; dest[x] + a1
    vaddw.u8 q12, q0, d17  ; dest[x] + a1
    vqmovun.s16 d2, q9  ; clip_pixel
    vqmovun.s16 d3, q10  ; clip_pixel
    vqmovun.s16 d30, q11  ; clip_pixel
    vqmovun.s16 d31, q12  ; clip_pixel
    vst1.64 {d2}, [r12], r0
    vst1.64 {d3}, [r12], r2
    vst1.64 {d30}, [r12], r0
    vst1.64 {d31}, [r12], r2

    ; load destination data row12 - row15
    vld1.64 {d2}, [r1], r0
    vld1.64 {d3}, [r1], r2
    vld1.64 {d4}, [r1], r0
    vld1.64 {d5}, [r1], r2
    vld1.64 {d6}, [r1], r0
    vld1.64 {d7}, [r1], r2
    vld1.64 {d16}, [r1], r0
    vld1.64 {d17}, [r1], r2

    vaddw.u8 q9, q0, d2  ; dest[x] + a1
    vaddw.u8 q10, q0, d3  ; dest[x] + a1
    vaddw.u8 q11, q0, d4  ; dest[x] + a1
    vaddw.u8 q12, q0, d5  ; dest[x] + a1
    vqmovun.s16 d2, q9  ; clip_pixel
    vqmovun.s16 d3, q10  ; clip_pixel
    vqmovun.s16 d30, q11  ; clip_pixel
    vqmovun.s16 d31, q12  ; clip_pixel
    vst1.64 {d2}, [r12], r0
    vst1.64 {d3}, [r12], r2
    vst1.64 {d30}, [r12], r0
    vst1.64 {d31}, [r12], r2

    vaddw.u8 q9, q0, d6  ; dest[x] + a1
    vaddw.u8 q10, q0, d7  ; dest[x] + a1
    vaddw.u8 q11, q0, d16  ; dest[x] + a1
    vaddw.u8 q12, q0, d17  ; dest[x] + a1
    vqmovun.s16 d2, q9  ; clip_pixel
    vqmovun.s16 d3, q10  ; clip_pixel
    vqmovun.s16 d30, q11  ; clip_pixel
    vqmovun.s16 d31, q12  ; clip_pixel
    vst1.64 {d2}, [r12], r0
    vst1.64 {d3}, [r12], r2
    vst1.64 {d30}, [r12], r0
    vst1.64 {d31}, [r12], r2

    bx lr
    ENDP  ; |vp9_short_idct16x16_1_add_neon|

    END
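The DC-only path deleted above is simple enough to restate in scalar form. A sketch built from the file's own comments (cospi_16_64 = 11585, DCT_CONST_BITS = 14, ROUND_POWER_OF_TWO(out, 6)); the names are illustrative, not the library's reference implementation:

#include <stdint.h>

#define DCT_CONST_BITS_SKETCH 14
#define COSPI_16_64_SKETCH 11585

static uint8_t clip_pixel_sketch(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* Sketch of the DC-only 16x16 inverse transform: the lone DC
 * coefficient is scaled twice by cospi_16_64 with rounding shifts,
 * rounded once more by ROUND_POWER_OF_TWO(out, 6), then added to
 * every destination pixel with clipping. */
static void idct16x16_1_add_sketch(const int16_t *input, uint8_t *dest,
                                   int dest_stride) {
  int out = (input[0] * COSPI_16_64_SKETCH +
             (1 << (DCT_CONST_BITS_SKETCH - 1))) >> DCT_CONST_BITS_SKETCH;
  out = (out * COSPI_16_64_SKETCH +
         (1 << (DCT_CONST_BITS_SKETCH - 1))) >> DCT_CONST_BITS_SKETCH;
  const int a1 = (out + 32) >> 6;  /* ROUND_POWER_OF_TWO(out, 6) */
  for (int r = 0; r < 16; ++r, dest += dest_stride)
    for (int c = 0; c < 16; ++c)
      dest[c] = clip_pixel_sketch(dest[c] + a1);
}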
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -1,68 +0,0 @@
;
;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license and patent
;  grant that can be found in the LICENSE file in the root of the source
;  tree. All contributing project authors may be found in the AUTHORS
;  file in the root of the source tree.
;


    EXPORT  |vp9_short_idct4x4_1_add_neon|
    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

;void vp9_short_idct4x4_1_add_neon(int16_t *input, uint8_t *dest,
;                                  int dest_stride)
;
; r0  int16_t input
; r1  uint8_t *dest
; r2  int dest_stride)

|vp9_short_idct4x4_1_add_neon| PROC
    ldrsh r0, [r0]

    ; generate cospi_16_64 = 11585
    mov r12, #0x2d00
    add r12, #0x41

    ; out = dct_const_round_shift(input[0] * cospi_16_64)
    mul r0, r0, r12  ; input[0] * cospi_16_64
    add r0, r0, #0x2000  ; +(1 << ((DCT_CONST_BITS) - 1))
    asr r0, r0, #14  ; >> DCT_CONST_BITS

    ; out = dct_const_round_shift(out * cospi_16_64)
    mul r0, r0, r12  ; out * cospi_16_64
    mov r12, r1  ; save dest
    add r0, r0, #0x2000  ; +(1 << ((DCT_CONST_BITS) - 1))
    asr r0, r0, #14  ; >> DCT_CONST_BITS

    ; a1 = ROUND_POWER_OF_TWO(out, 4)
    add r0, r0, #8  ; + (1 <<((4) - 1))
    asr r0, r0, #4  ; >> 4

    vdup.s16 q0, r0  ; duplicate a1

    vld1.32 {d2[0]}, [r1], r2
    vld1.32 {d2[1]}, [r1], r2
    vld1.32 {d4[0]}, [r1], r2
    vld1.32 {d4[1]}, [r1]

    vaddw.u8 q8, q0, d2  ; dest[x] + a1
    vaddw.u8 q9, q0, d4

    vqmovun.s16 d6, q8  ; clip_pixel
    vqmovun.s16 d7, q9

    vst1.32 {d6[0]}, [r12], r2
    vst1.32 {d6[1]}, [r12], r2
    vst1.32 {d7[0]}, [r12], r2
    vst1.32 {d7[1]}, [r12]

    bx lr
    ENDP  ; |vp9_short_idct4x4_1_add_neon|

    END
@@ -1,190 +0,0 @@
;
;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

    EXPORT  |vp9_short_idct4x4_add_neon|
    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

    AREA Block, CODE, READONLY  ; name this block of code
;void vp9_short_idct4x4_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
;
; r0  int16_t input
; r1  uint8_t *dest
; r2  int dest_stride)

|vp9_short_idct4x4_add_neon| PROC

    ; The 2D transform is done with two passes which are actually pretty
    ; similar. We first transform the rows. This is done by transposing
    ; the inputs, doing an SIMD column transform (the columns are the
    ; transposed rows) and then transpose the results (so that it goes back
    ; in normal/row positions). Then, we transform the columns by doing
    ; another SIMD column transform.
    ; So, two passes of a transpose followed by a column transform.

    ; load the inputs into q8-q9, d16-d19
    vld1.s16 {q8,q9}, [r0]!

    ; generate scalar constants
    ; cospi_8_64 = 15137 = 0x3b21
    mov r0, #0x3b00
    add r0, #0x21
    ; cospi_16_64 = 11585 = 0x2d41
    mov r3, #0x2d00
    add r3, #0x41
    ; cospi_24_64 = 6270 = 0x187e
    mov r12, #0x1800
    add r12, #0x7e

    ; transpose the input data
    ; 00 01 02 03   d16
    ; 10 11 12 13   d17
    ; 20 21 22 23   d18
    ; 30 31 32 33   d19
    vtrn.16 d16, d17
    vtrn.16 d18, d19

    ; generate constant vectors
    vdup.16 d20, r0  ; replicate cospi_8_64
    vdup.16 d21, r3  ; replicate cospi_16_64

    ; 00 10 02 12   d16
    ; 01 11 03 13   d17
    ; 20 30 22 32   d18
    ; 21 31 23 33   d19
    vtrn.32 q8, q9
    ; 00 10 20 30   d16
    ; 01 11 21 31   d17
    ; 02 12 22 32   d18
    ; 03 13 23 33   d19

    vdup.16 d22, r12  ; replicate cospi_24_64

    ; do the transform on transposed rows

    ; stage 1
    vadd.s16 d23, d16, d18  ; (input[0] + input[2])
    vsub.s16 d24, d16, d18  ; (input[0] - input[2])

    vmull.s16 q15, d17, d22  ; input[1] * cospi_24_64
    vmull.s16 q1, d17, d20  ; input[1] * cospi_8_64

    ; (input[0] + input[2]) * cospi_16_64;
    ; (input[0] - input[2]) * cospi_16_64;
    vmull.s16 q13, d23, d21
    vmull.s16 q14, d24, d21

    ; input[1] * cospi_24_64 - input[3] * cospi_8_64;
    ; input[1] * cospi_8_64 + input[3] * cospi_24_64;
    vmlsl.s16 q15, d19, d20
    vmlal.s16 q1, d19, d22

    ; dct_const_round_shift
    vqrshrn.s32 d26, q13, #14
    vqrshrn.s32 d27, q14, #14
    vqrshrn.s32 d29, q15, #14
    vqrshrn.s32 d28, q1, #14

    ; stage 2
    ; output[0] = step[0] + step[3];
    ; output[1] = step[1] + step[2];
    ; output[3] = step[0] - step[3];
    ; output[2] = step[1] - step[2];
    vadd.s16 q8, q13, q14
    vsub.s16 q9, q13, q14
    vswp d18, d19

    ; transpose the results
    ; 00 01 02 03   d16
    ; 10 11 12 13   d17
    ; 20 21 22 23   d18
    ; 30 31 32 33   d19
    vtrn.16 d16, d17
    vtrn.16 d18, d19
    ; 00 10 02 12   d16
    ; 01 11 03 13   d17
    ; 20 30 22 32   d18
    ; 21 31 23 33   d19
    vtrn.32 q8, q9
    ; 00 10 20 30   d16
    ; 01 11 21 31   d17
    ; 02 12 22 32   d18
    ; 03 13 23 33   d19

    ; do the transform on columns

    ; stage 1
    vadd.s16 d23, d16, d18  ; (input[0] + input[2])
    vsub.s16 d24, d16, d18  ; (input[0] - input[2])

    vmull.s16 q15, d17, d22  ; input[1] * cospi_24_64
    vmull.s16 q1, d17, d20  ; input[1] * cospi_8_64

    ; (input[0] + input[2]) * cospi_16_64;
    ; (input[0] - input[2]) * cospi_16_64;
    vmull.s16 q13, d23, d21
    vmull.s16 q14, d24, d21

    ; input[1] * cospi_24_64 - input[3] * cospi_8_64;
    ; input[1] * cospi_8_64 + input[3] * cospi_24_64;
    vmlsl.s16 q15, d19, d20
    vmlal.s16 q1, d19, d22

    ; dct_const_round_shift
    vqrshrn.s32 d26, q13, #14
    vqrshrn.s32 d27, q14, #14
    vqrshrn.s32 d29, q15, #14
    vqrshrn.s32 d28, q1, #14

    ; stage 2
    ; output[0] = step[0] + step[3];
    ; output[1] = step[1] + step[2];
    ; output[3] = step[0] - step[3];
    ; output[2] = step[1] - step[2];
    vadd.s16 q8, q13, q14
    vsub.s16 q9, q13, q14

    ; The results are in two registers, one of them being swapped. This will
    ; be taken care of by loading the 'dest' value in a swapped fashion and
    ; also storing them in the same swapped fashion.
    ; temp_out[0, 1] = d16, d17 = q8
    ; temp_out[2, 3] = d19, d18 = q9 swapped

    ; ROUND_POWER_OF_TWO(temp_out[j], 4)
    vrshr.s16 q8, q8, #4
    vrshr.s16 q9, q9, #4

    vld1.32 {d26[0]}, [r1], r2
    vld1.32 {d26[1]}, [r1], r2
    vld1.32 {d27[1]}, [r1], r2
    vld1.32 {d27[0]}, [r1]  ; no post-increment

    ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i]
    vaddw.u8 q8, q8, d26
    vaddw.u8 q9, q9, d27

    ; clip_pixel
    vqmovun.s16 d26, q8
    vqmovun.s16 d27, q9

    ; do the stores in reverse order with negative post-increment, by changing
    ; the sign of the stride
    rsb r2, r2, #0
    vst1.32 {d27[0]}, [r1], r2
    vst1.32 {d27[1]}, [r1], r2
    vst1.32 {d26[1]}, [r1], r2
    vst1.32 {d26[0]}, [r1]  ; no post-increment
    bx lr
    ENDP  ; |vp9_short_idct4x4_add_neon|

    END
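The row and column passes described in the deleted file's opening comment each run the same 4-point transform. A scalar sketch of one such column transform, using the constants from the file's own comments (cospi_8_64 = 15137, cospi_16_64 = 11585, cospi_24_64 = 6270); this is an illustration, not vp9's reference code:

#include <stdint.h>

#define COSPI_8_64_SKETCH 15137
#define COSPI_16_64_SK 11585
#define COSPI_24_64_SKETCH 6270

/* dct_const_round_shift from the comments: round, then shift by 14. */
static int16_t dct_round_shift_sketch(int input) {
  return (int16_t)((input + (1 << 13)) >> 14);
}

/* One 4-point inverse transform as described by the stage 1 / stage 2
 * comments above; the NEON code runs four of these in parallel and
 * transposes between the row and column passes. */
static void idct4_1d_sketch(const int16_t in[4], int16_t out[4]) {
  const int16_t step0 = dct_round_shift_sketch((in[0] + in[2]) * COSPI_16_64_SK);
  const int16_t step1 = dct_round_shift_sketch((in[0] - in[2]) * COSPI_16_64_SK);
  const int16_t step2 = dct_round_shift_sketch(
      in[1] * COSPI_24_64_SKETCH - in[3] * COSPI_8_64_SKETCH);
  const int16_t step3 = dct_round_shift_sketch(
      in[1] * COSPI_8_64_SKETCH + in[3] * COSPI_24_64_SKETCH);
  out[0] = (int16_t)(step0 + step3);
  out[1] = (int16_t)(step1 + step2);
  out[2] = (int16_t)(step1 - step2);
  out[3] = (int16_t)(step0 - step3);
}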
@@ -1,88 +0,0 @@
;
;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license and patent
;  grant that can be found in the LICENSE file in the root of the source
;  tree. All contributing project authors may be found in the AUTHORS
;  file in the root of the source tree.
;


    EXPORT  |vp9_short_idct8x8_1_add_neon|
    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

;void vp9_short_idct8x8_1_add_neon(int16_t *input, uint8_t *dest,
;                                  int dest_stride)
;
; r0  int16_t input
; r1  uint8_t *dest
; r2  int dest_stride)

|vp9_short_idct8x8_1_add_neon| PROC
    ldrsh r0, [r0]

    ; generate cospi_16_64 = 11585
    mov r12, #0x2d00
    add r12, #0x41

    ; out = dct_const_round_shift(input[0] * cospi_16_64)
    mul r0, r0, r12  ; input[0] * cospi_16_64
    add r0, r0, #0x2000  ; +(1 << ((DCT_CONST_BITS) - 1))
    asr r0, r0, #14  ; >> DCT_CONST_BITS

    ; out = dct_const_round_shift(out * cospi_16_64)
    mul r0, r0, r12  ; out * cospi_16_64
    mov r12, r1  ; save dest
    add r0, r0, #0x2000  ; +(1 << ((DCT_CONST_BITS) - 1))
    asr r0, r0, #14  ; >> DCT_CONST_BITS

    ; a1 = ROUND_POWER_OF_TWO(out, 5)
    add r0, r0, #16  ; + (1 <<((5) - 1))
    asr r0, r0, #5  ; >> 5

    vdup.s16 q0, r0  ; duplicate a1

    ; load destination data
    vld1.64 {d2}, [r1], r2
    vld1.64 {d3}, [r1], r2
    vld1.64 {d4}, [r1], r2
    vld1.64 {d5}, [r1], r2
    vld1.64 {d6}, [r1], r2
    vld1.64 {d7}, [r1], r2
    vld1.64 {d16}, [r1], r2
    vld1.64 {d17}, [r1]

    vaddw.u8 q9, q0, d2  ; dest[x] + a1
    vaddw.u8 q10, q0, d3  ; dest[x] + a1
    vaddw.u8 q11, q0, d4  ; dest[x] + a1
    vaddw.u8 q12, q0, d5  ; dest[x] + a1
    vqmovun.s16 d2, q9  ; clip_pixel
    vqmovun.s16 d3, q10  ; clip_pixel
    vqmovun.s16 d30, q11  ; clip_pixel
    vqmovun.s16 d31, q12  ; clip_pixel
    vst1.64 {d2}, [r12], r2
    vst1.64 {d3}, [r12], r2
    vst1.64 {d30}, [r12], r2
    vst1.64 {d31}, [r12], r2

    vaddw.u8 q9, q0, d6  ; dest[x] + a1
    vaddw.u8 q10, q0, d7  ; dest[x] + a1
    vaddw.u8 q11, q0, d16  ; dest[x] + a1
    vaddw.u8 q12, q0, d17  ; dest[x] + a1
    vqmovun.s16 d2, q9  ; clip_pixel
    vqmovun.s16 d3, q10  ; clip_pixel
    vqmovun.s16 d30, q11  ; clip_pixel
    vqmovun.s16 d31, q12  ; clip_pixel
    vst1.64 {d2}, [r12], r2
    vst1.64 {d3}, [r12], r2
    vst1.64 {d30}, [r12], r2
    vst1.64 {d31}, [r12], r2

    bx lr
    ENDP  ; |vp9_short_idct8x8_1_add_neon|

    END
@@ -9,7 +9,6 @@
|
||||
;
|
||||
|
||||
EXPORT |vp9_short_idct8x8_add_neon|
|
||||
EXPORT |vp9_short_idct10_8x8_add_neon|
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
@@ -25,149 +24,191 @@
|
||||
; stage 1
|
||||
vdup.16 d0, r3 ; duplicate cospi_28_64
|
||||
vdup.16 d1, r4 ; duplicate cospi_4_64
|
||||
vdup.16 d2, r5 ; duplicate cospi_12_64
|
||||
vdup.16 d3, r6 ; duplicate cospi_20_64
|
||||
|
||||
; input[1] * cospi_28_64
|
||||
vmull.s16 q2, d18, d0
|
||||
vmull.s16 q3, d19, d0
|
||||
|
||||
; input[5] * cospi_12_64
|
||||
vmull.s16 q5, d26, d2
|
||||
vmull.s16 q6, d27, d2
|
||||
; input[7] * cospi_4_64
|
||||
vmull.s16 q4, d30, d1
|
||||
vmull.s16 q5, d31, d1
|
||||
|
||||
; input[1]*cospi_28_64-input[7]*cospi_4_64
|
||||
vmlsl.s16 q2, d30, d1
|
||||
vmlsl.s16 q3, d31, d1
|
||||
|
||||
; input[5] * cospi_12_64 - input[3] * cospi_20_64
|
||||
vmlsl.s16 q5, d22, d3
|
||||
vmlsl.s16 q6, d23, d3
|
||||
vsub.s32 q6, q2, q4
|
||||
vsub.s32 q7, q3, q5
|
||||
|
||||
; dct_const_round_shift(input_dc * cospi_16_64)
|
||||
vqrshrn.s32 d8, q2, #14 ; >> 14
|
||||
vqrshrn.s32 d9, q3, #14 ; >> 14
|
||||
|
||||
    ; dct_const_round_shift(input_dc * cospi_16_64)
    vqrshrn.s32 d10, q5, #14 ; >> 14
    vqrshrn.s32 d11, q6, #14 ; >> 14
    vqrshrn.s32 d8, q6, #14 ; >> 14
    vqrshrn.s32 d9, q7, #14 ; >> 14

    ; input[1] * cospi_4_64
    vmull.s16 q2, d18, d1
    vmull.s16 q3, d19, d1

    ; input[5] * cospi_20_64
    vmull.s16 q9, d26, d3
    vmull.s16 q13, d27, d3
    ; input[7] * cospi_28_64
    vmull.s16 q1, d30, d0
    vmull.s16 q5, d31, d0

    ; input[1]*cospi_4_64+input[7]*cospi_28_64
    vmlal.s16 q2, d30, d0
    vmlal.s16 q3, d31, d0

    ; input[5] * cospi_20_64 + input[3] * cospi_12_64
    vmlal.s16 q9, d22, d2
    vmlal.s16 q13, d23, d2
    vadd.s32 q2, q2, q1
    vadd.s32 q3, q3, q5

    ; dct_const_round_shift(input_dc * cospi_16_64)
    vqrshrn.s32 d14, q2, #14 ; >> 14
    vqrshrn.s32 d15, q3, #14 ; >> 14

    ; stage 2 & stage 3 - even half
    vdup.16 d0, r7 ; duplicate cospi_16_64
    vdup.16 d0, r5 ; duplicate cospi_12_64
    vdup.16 d1, r6 ; duplicate cospi_20_64

    ; input[5] * cospi_12_64
    vmull.s16 q2, d26, d0
    vmull.s16 q3, d27, d0

    ; input[3] * cospi_20_64
    vmull.s16 q5, d22, d1
    vmull.s16 q6, d23, d1

    ; input[5] * cospi_12_64 - input[3] * cospi_20_64
    vsub.s32 q2, q2, q5
    vsub.s32 q3, q3, q6

    ; dct_const_round_shift(input_dc * cospi_16_64)
    vqrshrn.s32 d12, q9, #14 ; >> 14
    vqrshrn.s32 d13, q13, #14 ; >> 14
    vqrshrn.s32 d10, q2, #14 ; >> 14
    vqrshrn.s32 d11, q3, #14 ; >> 14

    ; input[5] * cospi_20_64
    vmull.s16 q2, d26, d1
    vmull.s16 q3, d27, d1

    ; input[3] * cospi_12_64
    vmull.s16 q9, d22, d0
    vmull.s16 q15, d23, d0

    ; input[5] * cospi_20_64 + input[3] * cospi_12_64
    vadd.s32 q0, q2, q9
    vadd.s32 q1, q3, q15

    ; dct_const_round_shift(input_dc * cospi_16_64)
    vqrshrn.s32 d12, q0, #14 ; >> 14
    vqrshrn.s32 d13, q1, #14 ; >> 14

    ; stage 2 & stage 3 - even half
    vdup.16 d0, r7 ; duplicate cospi_16_64

    ; input[0] * cospi_16_64
    vmull.s16 q2, d16, d0
    vmull.s16 q3, d17, d0

    ; input[0] * cospi_16_64
    vmull.s16 q13, d16, d0
    vmull.s16 q15, d17, d0
    ; input[2] * cospi_16_64
    vmull.s16 q9, d24, d0
    vmull.s16 q11, d25, d0

    ; (input[0] + input[2]) * cospi_16_64
    vmlal.s16 q2, d24, d0
    vmlal.s16 q3, d25, d0
    vadd.s32 q9, q2, q9
    vadd.s32 q11, q3, q11

    ; dct_const_round_shift(input_dc * cospi_16_64)
    vqrshrn.s32 d18, q9, #14 ; >> 14
    vqrshrn.s32 d19, q11, #14 ; >> 14

    ; input[0] * cospi_16_64
    vmull.s16 q2, d16, d0
    vmull.s16 q3, d17, d0

    ; input[2] * cospi_16_64
    vmull.s16 q13, d24, d0
    vmull.s16 q15, d25, d0

    ; (input[0] - input[2]) * cospi_16_64
    vmlsl.s16 q13, d24, d0
    vmlsl.s16 q15, d25, d0
    vsub.s32 q2, q2, q13
    vsub.s32 q3, q3, q15

    ; dct_const_round_shift(input_dc * cospi_16_64)
    vqrshrn.s32 d22, q2, #14 ; >> 14
    vqrshrn.s32 d23, q3, #14 ; >> 14

    ; input[1] * cospi_24_64 - input[3] * cospi_8_64
    vdup.16 d0, r8 ; duplicate cospi_24_64
    vdup.16 d1, r9 ; duplicate cospi_8_64

    ; dct_const_round_shift(input_dc * cospi_16_64)
    vqrshrn.s32 d18, q2, #14 ; >> 14
    vqrshrn.s32 d19, q3, #14 ; >> 14

    ; dct_const_round_shift(input_dc * cospi_16_64)
    vqrshrn.s32 d22, q13, #14 ; >> 14
    vqrshrn.s32 d23, q15, #14 ; >> 14

    ; input[1] * cospi_24_64 - input[3] * cospi_8_64
    ; input[1] * cospi_24_64
    vmull.s16 q2, d20, d0
    vmull.s16 q3, d21, d0

    ; input[1] * cospi_8_64
    vmull.s16 q8, d20, d1
    vmull.s16 q12, d21, d1
    ; input[3] * cospi_8_64
    vmull.s16 q13, d28, d1
    vmull.s16 q15, d29, d1

    ; input[1] * cospi_24_64 - input[3] * cospi_8_64
    vmlsl.s16 q2, d28, d1
    vmlsl.s16 q3, d29, d1

    ; input[1] * cospi_8_64 + input[3] * cospi_24_64
    vmlal.s16 q8, d28, d0
    vmlal.s16 q12, d29, d0
    vsub.s32 q2, q2, q13
    vsub.s32 q3, q3, q15

    ; dct_const_round_shift(input_dc * cospi_16_64)
    vqrshrn.s32 d26, q2, #14 ; >> 14
    vqrshrn.s32 d27, q3, #14 ; >> 14

    ; input[1] * cospi_8_64
    vmull.s16 q2, d20, d1
    vmull.s16 q3, d21, d1

    ; input[3] * cospi_24_64
    vmull.s16 q8, d28, d0
    vmull.s16 q10, d29, d0

    ; input[1] * cospi_8_64 + input[3] * cospi_24_64
    vadd.s32 q0, q2, q8
    vadd.s32 q1, q3, q10

    ; dct_const_round_shift(input_dc * cospi_16_64)
    vqrshrn.s32 d30, q8, #14 ; >> 14
    vqrshrn.s32 d31, q12, #14 ; >> 14
    vqrshrn.s32 d30, q0, #14 ; >> 14
    vqrshrn.s32 d31, q1, #14 ; >> 14

    vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3]
    vadd.s16 q1, q11, q13 ; output[1] = step[1] + step[2]
    vsub.s16 q2, q11, q13 ; output[2] = step[1] - step[2]
    vsub.s16 q3, q9, q15 ; output[3] = step[0] - step[3]

    ; stage 3 - odd half
    vdup.16 d16, r7 ; duplicate cospi_16_64

    ; stage 2 - odd half
    vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5]
    vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5]
    vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7]
    vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7]

    ; stage 3 - odd half
    vdup.16 d16, r7 ; duplicate cospi_16_64

    ; step2[6] * cospi_16_64
    vmull.s16 q9, d28, d16
    vmull.s16 q10, d29, d16

    ; step2[6] * cospi_16_64
    vmull.s16 q11, d28, d16
    vmull.s16 q12, d29, d16
    ; step2[5] * cospi_16_64
    vmull.s16 q11, d26, d16
    vmull.s16 q12, d27, d16

    ; (step2[6] - step2[5]) * cospi_16_64
    vmlsl.s16 q9, d26, d16
    vmlsl.s16 q10, d27, d16

    ; (step2[5] + step2[6]) * cospi_16_64
    vmlal.s16 q11, d26, d16
    vmlal.s16 q12, d27, d16
    vsub.s32 q9, q9, q11
    vsub.s32 q10, q10, q12

    ; dct_const_round_shift(input_dc * cospi_16_64)
    vqrshrn.s32 d10, q9, #14 ; >> 14
    vqrshrn.s32 d11, q10, #14 ; >> 14

    ; step2[6] * cospi_16_64
    vmull.s16 q9, d28, d16
    vmull.s16 q10, d29, d16

    ; step2[5] * cospi_16_64
    vmull.s16 q11, d26, d16
    vmull.s16 q12, d27, d16

    ; (step2[5] + step2[6]) * cospi_16_64
    vadd.s32 q9, q9, q11
    vadd.s32 q10, q10, q12

    ; dct_const_round_shift(input_dc * cospi_16_64)
    vqrshrn.s32 d12, q11, #14 ; >> 14
    vqrshrn.s32 d13, q12, #14 ; >> 14
    vqrshrn.s32 d12, q9, #14 ; >> 14
    vqrshrn.s32 d13, q10, #14 ; >> 14

    ; stage 4
    vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7];
@@ -206,11 +247,14 @@

|vp9_short_idct8x8_add_neon| PROC
    push {r4-r9}
    vpush {d8-d15}
    vld1.s16 {q8,q9}, [r0]!
    vld1.s16 {q10,q11}, [r0]!
    vld1.s16 {q12,q13}, [r0]!
    vld1.s16 {q14,q15}, [r0]!
    vld1.s16 {q8}, [r0]!
    vld1.s16 {q9}, [r0]!
    vld1.s16 {q10}, [r0]!
    vld1.s16 {q11}, [r0]!
    vld1.s16 {q12}, [r0]!
    vld1.s16 {q13}, [r0]!
    vld1.s16 {q14}, [r0]!
    vld1.s16 {q15}, [r0]!

    ; transpose the input data
    TRANSPOSE8X8
@@ -305,215 +349,8 @@
    vst1.64 {d6}, [r0], r2
    vst1.64 {d7}, [r0], r2

    vpop {d8-d15}
    pop {r4-r9}
    bx lr
    ENDP ; |vp9_short_idct8x8_add_neon|

;void vp9_short_idct10_8x8_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
;
; r0  int16_t input
; r1  uint8_t *dest
; r2  int dest_stride

|vp9_short_idct10_8x8_add_neon| PROC
    push {r4-r9}
    vpush {d8-d15}
    vld1.s16 {q8,q9}, [r0]!
    vld1.s16 {q10,q11}, [r0]!
    vld1.s16 {q12,q13}, [r0]!
    vld1.s16 {q14,q15}, [r0]!

    ; transpose the input data
    TRANSPOSE8X8

    ; generate cospi_28_64 = 3196
    mov r3, #0x0c00
    add r3, #0x7c

    ; generate cospi_4_64 = 16069
    mov r4, #0x3e00
    add r4, #0xc5

    ; generate cospi_12_64 = 13623
    mov r5, #0x3500
    add r5, #0x37

    ; generate cospi_20_64 = 9102
    mov r6, #0x2300
    add r6, #0x8e

    ; generate cospi_16_64 = 11585
    mov r7, #0x2d00
    add r7, #0x41

    ; generate cospi_24_64 = 6270
    mov r8, #0x1800
    add r8, #0x7e

    ; generate cospi_8_64 = 15137
    mov r9, #0x3b00
    add r9, #0x21

    ; First transform rows
    ; stage 1
    ; The following instructions use vqrdmulh to do the
    ; dct_const_round_shift(input[1] * cospi_28_64). vqrdmulh does a doubling
    ; multiply and shifts the result by 16 bits instead of 14 bits, so we need
    ; to double the constants before multiplying to compensate.
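The doubling trick in the comment above can be checked in scalar arithmetic. Before saturation, vqrdmulh.s16 computes (2*a*b + 2^15) >> 16, so feeding it a pre-doubled constant reproduces the (a*c + 2^13) >> 14 rounding of dct_const_round_shift. A hedged scalar sketch:

    /* vqrdmulh.s16: saturating rounding doubling multiply, high half.
     *   result = (2*a*b + (1 << 15)) >> 16          (before saturation)
     * With b = 2*c this becomes (4*a*c + (1 << 15)) >> 16
     *                         = (a*c + (1 << 13)) >> 14,
     * i.e. exactly dct_const_round_shift(a * c). */
    static short vqrdmulh_model(short a, short c2) {  /* c2 = 2 * c */
      int p = 2 * a * c2;
      return (short)((p + (1 << 15)) >> 16);
    }
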
    mov r12, r3, lsl #1
    vdup.16 q0, r12 ; duplicate cospi_28_64*2
    mov r12, r4, lsl #1
    vdup.16 q1, r12 ; duplicate cospi_4_64*2

    ; dct_const_round_shift(input[1] * cospi_28_64)
    vqrdmulh.s16 q4, q9, q0

    mov r12, r6, lsl #1
    rsb r12, #0
    vdup.16 q0, r12 ; duplicate -cospi_20_64*2

    ; dct_const_round_shift(input[1] * cospi_4_64)
    vqrdmulh.s16 q7, q9, q1

    mov r12, r5, lsl #1
    vdup.16 q1, r12 ; duplicate cospi_12_64*2

    ; dct_const_round_shift(- input[3] * cospi_20_64)
    vqrdmulh.s16 q5, q11, q0

    mov r12, r7, lsl #1
    vdup.16 q0, r12 ; duplicate cospi_16_64*2

    ; dct_const_round_shift(input[3] * cospi_12_64)
    vqrdmulh.s16 q6, q11, q1

    ; stage 2 & stage 3 - even half
    mov r12, r8, lsl #1
    vdup.16 q1, r12 ; duplicate cospi_24_64*2

    ; dct_const_round_shift(input_dc * cospi_16_64)
    vqrdmulh.s16 q9, q8, q0

    mov r12, r9, lsl #1
    vdup.16 q0, r12 ; duplicate cospi_8_64*2

    ; dct_const_round_shift(input[1] * cospi_24_64)
    vqrdmulh.s16 q13, q10, q1

    ; dct_const_round_shift(input[1] * cospi_8_64)
    vqrdmulh.s16 q15, q10, q0

    ; stage 3 - odd half
    vdup.16 d16, r7 ; duplicate cospi_16_64

    vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3]
    vadd.s16 q1, q9, q13 ; output[1] = step[1] + step[2]
    vsub.s16 q2, q9, q13 ; output[2] = step[1] - step[2]
    vsub.s16 q3, q9, q15 ; output[3] = step[0] - step[3]

    ; stage 2 - odd half
    vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5]
    vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5]
    vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7]
    vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7]

    ; step2[6] * cospi_16_64
    vmull.s16 q9, d28, d16
    vmull.s16 q10, d29, d16

    ; step2[6] * cospi_16_64
    vmull.s16 q11, d28, d16
    vmull.s16 q12, d29, d16

    ; (step2[6] - step2[5]) * cospi_16_64
    vmlsl.s16 q9, d26, d16
    vmlsl.s16 q10, d27, d16

    ; (step2[5] + step2[6]) * cospi_16_64
    vmlal.s16 q11, d26, d16
    vmlal.s16 q12, d27, d16

    ; dct_const_round_shift(input_dc * cospi_16_64)
    vqrshrn.s32 d10, q9, #14 ; >> 14
    vqrshrn.s32 d11, q10, #14 ; >> 14

    ; dct_const_round_shift(input_dc * cospi_16_64)
    vqrshrn.s32 d12, q11, #14 ; >> 14
    vqrshrn.s32 d13, q12, #14 ; >> 14

    ; stage 4
    vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7];
    vadd.s16 q9, q1, q6 ; output[1] = step1[1] + step1[6];
    vadd.s16 q10, q2, q5 ; output[2] = step1[2] + step1[5];
    vadd.s16 q11, q3, q4 ; output[3] = step1[3] + step1[4];
    vsub.s16 q12, q3, q4 ; output[4] = step1[3] - step1[4];
    vsub.s16 q13, q2, q5 ; output[5] = step1[2] - step1[5];
    vsub.s16 q14, q1, q6 ; output[6] = step1[1] - step1[6];
    vsub.s16 q15, q0, q7 ; output[7] = step1[0] - step1[7];

    ; Transpose the matrix
    TRANSPOSE8X8

    ; Then transform columns
    IDCT8x8_1D

    ; ROUND_POWER_OF_TWO(temp_out[j], 5)
    vrshr.s16 q8, q8, #5
    vrshr.s16 q9, q9, #5
    vrshr.s16 q10, q10, #5
    vrshr.s16 q11, q11, #5
    vrshr.s16 q12, q12, #5
    vrshr.s16 q13, q13, #5
    vrshr.s16 q14, q14, #5
    vrshr.s16 q15, q15, #5

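The reconstruction tail that follows mirrors the reference C loop: each vrshr.s16 ..., #5 is ROUND_POWER_OF_TWO(temp_out[j], 5), vaddw.u8 adds the predictor row, and vqmovun.s16 is clip_pixel(). A scalar sketch of one column (names follow the comments in this file):

    static unsigned char clip_pixel(int val) {
      return (unsigned char)(val > 255 ? 255 : (val < 0 ? 0 : val));
    }

    /* dest is the predictor/reconstruction buffer, stride its pitch. */
    for (j = 0; j < 8; ++j)
      dest[j * stride + i] =
          clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * stride + i]);
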
    ; save dest pointer
    mov r0, r1

    ; load destination data
    vld1.64 {d0}, [r1], r2
    vld1.64 {d1}, [r1], r2
    vld1.64 {d2}, [r1], r2
    vld1.64 {d3}, [r1], r2
    vld1.64 {d4}, [r1], r2
    vld1.64 {d5}, [r1], r2
    vld1.64 {d6}, [r1], r2
    vld1.64 {d7}, [r1]

    ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]
    vaddw.u8 q8, q8, d0
    vaddw.u8 q9, q9, d1
    vaddw.u8 q10, q10, d2
    vaddw.u8 q11, q11, d3
    vaddw.u8 q12, q12, d4
    vaddw.u8 q13, q13, d5
    vaddw.u8 q14, q14, d6
    vaddw.u8 q15, q15, d7

    ; clip_pixel
    vqmovun.s16 d0, q8
    vqmovun.s16 d1, q9
    vqmovun.s16 d2, q10
    vqmovun.s16 d3, q11
    vqmovun.s16 d4, q12
    vqmovun.s16 d5, q13
    vqmovun.s16 d6, q14
    vqmovun.s16 d7, q15

    ; store the data
    vst1.64 {d0}, [r0], r2
    vst1.64 {d1}, [r0], r2
    vst1.64 {d2}, [r0], r2
    vst1.64 {d3}, [r0], r2
    vst1.64 {d4}, [r0], r2
    vst1.64 {d5}, [r0], r2
    vst1.64 {d6}, [r0], r2
    vst1.64 {d7}, [r0], r2

    vpop {d8-d15}
    pop {r4-r9}
    bx lr
    ENDP ; |vp9_short_idct10_8x8_add_neon|

    END

@@ -1,237 +0,0 @@
;
;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS. All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

    EXPORT |vp9_short_iht4x4_add_neon|
    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

; Parallel 1D IDCT on all the columns of a 4x4 16-bit data matrix which are
; loaded in d16-d19. d0 must contain cospi_8_64. d1 must contain
; cospi_16_64. d2 must contain cospi_24_64. The output will be stored back
; into d16-d19 registers. This macro will touch q10-q15 registers and use
; them as buffer during calculation.
    MACRO
    IDCT4x4_1D
    ; stage 1
    vadd.s16 d23, d16, d18 ; (input[0] + input[2])
    vsub.s16 d24, d16, d18 ; (input[0] - input[2])

    vmull.s16 q15, d17, d2 ; input[1] * cospi_24_64
    vmull.s16 q10, d17, d0 ; input[1] * cospi_8_64
    vmull.s16 q13, d23, d1 ; (input[0] + input[2]) * cospi_16_64
    vmull.s16 q14, d24, d1 ; (input[0] - input[2]) * cospi_16_64
    vmlsl.s16 q15, d19, d0 ; input[1] * cospi_24_64 - input[3] * cospi_8_64
    vmlal.s16 q10, d19, d2 ; input[1] * cospi_8_64 + input[3] * cospi_24_64

    ; dct_const_round_shift
    vqrshrn.s32 d26, q13, #14
    vqrshrn.s32 d27, q14, #14
    vqrshrn.s32 d29, q15, #14
    vqrshrn.s32 d28, q10, #14

    ; stage 2
    ; output[0] = step[0] + step[3];
    ; output[1] = step[1] + step[2];
    ; output[3] = step[0] - step[3];
    ; output[2] = step[1] - step[2];
    vadd.s16 q8, q13, q14
    vsub.s16 q9, q13, q14
    vswp d18, d19
    MEND

; Parallel 1D IADST on all the columns of a 4x4 16-bit data matrix which is
; loaded in d16-d19. d3 must contain sinpi_1_9. d4 must contain sinpi_2_9.
; d5 must contain sinpi_4_9. d6 must contain sinpi_3_9. The output will be
; stored back into d16-d19 registers. This macro will touch q11, q12, q13,
; q14 and q15 registers and use them as buffer during calculation.
    MACRO
    IADST4x4_1D
    vmull.s16 q10, d3, d16 ; s0 = sinpi_1_9 * x0
    vmull.s16 q11, d4, d16 ; s1 = sinpi_2_9 * x0
    vmull.s16 q12, d6, d17 ; s2 = sinpi_3_9 * x1
    vmull.s16 q13, d5, d18 ; s3 = sinpi_4_9 * x2
    vmull.s16 q14, d3, d18 ; s4 = sinpi_1_9 * x2
    vmovl.s16 q15, d16 ; expand x0 from 16 bit to 32 bit
    vaddw.s16 q15, q15, d19 ; x0 + x3
    vmull.s16 q8, d4, d19 ; s5 = sinpi_2_9 * x3
    vsubw.s16 q15, q15, d18 ; s7 = x0 + x3 - x2
    vmull.s16 q9, d5, d19 ; s6 = sinpi_4_9 * x3

    vadd.s32 q10, q10, q13 ; x0 = s0 + s3 + s5
    vadd.s32 q10, q10, q8
    vsub.s32 q11, q11, q14 ; x1 = s1 - s4 - s6
    vdup.32 q8, r0 ; duplicate sinpi_3_9
    vsub.s32 q11, q11, q9
    vmul.s32 q15, q15, q8 ; x2 = sinpi_3_9 * s7

    vadd.s32 q13, q10, q12 ; s0 = x0 + x3
    vadd.s32 q10, q10, q11 ; x0 + x1
    vadd.s32 q14, q11, q12 ; s1 = x1 + x3
    vsub.s32 q10, q10, q12 ; s3 = x0 + x1 - x3

    ; dct_const_round_shift
    vqrshrn.s32 d16, q13, #14
    vqrshrn.s32 d17, q14, #14
    vqrshrn.s32 d18, q15, #14
    vqrshrn.s32 d19, q10, #14
    MEND

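For reference, the per-column math IADST4x4_1D implements (reading off its own comments) looks like the following in scalar C; the sinpi_*_9 values are the same 14-bit fixed-point constants the surrounding macros duplicate into d3-d6. A sketch, not the upstream implementation:

    static void iadst4_1d_model(const short *input, short *output) {
      int s0 = sinpi_1_9 * input[0];
      int s1 = sinpi_2_9 * input[0];
      int s2 = sinpi_3_9 * input[1];
      int s3 = sinpi_4_9 * input[2];
      int s4 = sinpi_1_9 * input[2];
      int s5 = sinpi_2_9 * input[3];
      int s6 = sinpi_4_9 * input[3];
      int s7 = input[0] + input[3] - input[2];  /* still fits in 16 bits */

      int x0 = s0 + s3 + s5;
      int x1 = s1 - s4 - s6;
      int x2 = sinpi_3_9 * s7;
      int x3 = s2;

      output[0] = dct_const_round_shift(x0 + x3);
      output[1] = dct_const_round_shift(x1 + x3);
      output[2] = dct_const_round_shift(x2);
      output[3] = dct_const_round_shift(x0 + x1 - x3);
    }
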
; Generate cosine constants in d0 - d2 for the IDCT.
    MACRO
    GENERATE_COSINE_CONSTANTS
    ; cospi_8_64 = 15137 = 0x3b21
    mov r0, #0x3b00
    add r0, #0x21
    ; cospi_16_64 = 11585 = 0x2d41
    mov r3, #0x2d00
    add r3, #0x41
    ; cospi_24_64 = 6270 = 0x187e
    mov r12, #0x1800
    add r12, #0x7e

    ; generate constant vectors
    vdup.16 d0, r0 ; duplicate cospi_8_64
    vdup.16 d1, r3 ; duplicate cospi_16_64
    vdup.16 d2, r12 ; duplicate cospi_24_64
    MEND

; Generate sine constants in d3 - d6 for the IADST.
    MACRO
    GENERATE_SINE_CONSTANTS
    ; sinpi_1_9 = 5283 = 0x14A3
    mov r0, #0x1400
    add r0, #0xa3
    ; sinpi_2_9 = 9929 = 0x26C9
    mov r3, #0x2600
    add r3, #0xc9
    ; sinpi_4_9 = 15212 = 0x3B6C
    mov r12, #0x3b00
    add r12, #0x6c

    ; generate constant vectors
    vdup.16 d3, r0 ; duplicate sinpi_1_9

    ; sinpi_3_9 = 13377 = 0x3441
    mov r0, #0x3400
    add r0, #0x41

    vdup.16 d4, r3 ; duplicate sinpi_2_9
    vdup.16 d5, r12 ; duplicate sinpi_4_9
    vdup.16 q3, r0 ; duplicate sinpi_3_9
    MEND

; Transpose a 4x4 16-bit data matrix. Data is loaded in d16-d19.
    MACRO
    TRANSPOSE4X4
    vtrn.16 d16, d17
    vtrn.16 d18, d19
    vtrn.32 q8, q9
    MEND

    AREA Block, CODE, READONLY ; name this block of code
;void vp9_short_iht4x4_add_neon(int16_t *input, uint8_t *dest,
;                               int dest_stride, int tx_type)
;
; r0  int16_t input
; r1  uint8_t *dest
; r2  int dest_stride
; r3  int tx_type
; This function will only handle tx_type of 1,2,3.
|vp9_short_iht4x4_add_neon| PROC

    ; load the inputs into d16-d19
    vld1.s16 {q8,q9}, [r0]!

    ; transpose the input data
    TRANSPOSE4X4

    ; decide the type of transform
    cmp r3, #2
    beq idct_iadst
    cmp r3, #3
    beq iadst_iadst

iadst_idct
    ; generate constants
    GENERATE_COSINE_CONSTANTS
    GENERATE_SINE_CONSTANTS

    ; first transform rows
    IDCT4x4_1D

    ; transpose the matrix
    TRANSPOSE4X4

    ; then transform columns
    IADST4x4_1D

    b end_vp9_short_iht4x4_add_neon

idct_iadst
    ; generate constants
    GENERATE_COSINE_CONSTANTS
    GENERATE_SINE_CONSTANTS

    ; first transform rows
    IADST4x4_1D

    ; transpose the matrix
    TRANSPOSE4X4

    ; then transform columns
    IDCT4x4_1D

    b end_vp9_short_iht4x4_add_neon

iadst_iadst
    ; generate constants
    GENERATE_SINE_CONSTANTS

    ; first transform rows
    IADST4x4_1D

    ; transpose the matrix
    TRANSPOSE4X4

    ; then transform columns
    IADST4x4_1D

end_vp9_short_iht4x4_add_neon
    ; ROUND_POWER_OF_TWO(temp_out[j], 4)
    vrshr.s16 q8, q8, #4
    vrshr.s16 q9, q9, #4

    vld1.32 {d26[0]}, [r1], r2
    vld1.32 {d26[1]}, [r1], r2
    vld1.32 {d27[0]}, [r1], r2
    vld1.32 {d27[1]}, [r1]

    ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i]
    vaddw.u8 q8, q8, d26
    vaddw.u8 q9, q9, d27

    ; clip_pixel
    vqmovun.s16 d26, q8
    vqmovun.s16 d27, q9

    ; do the stores in reverse order with negative post-increment, by changing
    ; the sign of the stride
    rsb r2, r2, #0
    vst1.32 {d27[1]}, [r1], r2
    vst1.32 {d27[0]}, [r1], r2
    vst1.32 {d26[1]}, [r1], r2
    vst1.32 {d26[0]}, [r1] ; no post-increment
    bx lr
    ENDP ; |vp9_short_iht4x4_add_neon|

    END

@@ -1,696 +0,0 @@
;
;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS. All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

    EXPORT |vp9_short_iht8x8_add_neon|
    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

; Generate IADST constants in r0 - r12 for the IADST.
    MACRO
    GENERATE_IADST_CONSTANTS
    ; generate cospi_2_64 = 16305
    mov r0, #0x3f00
    add r0, #0xb1

    ; generate cospi_30_64 = 1606
    mov r1, #0x600
    add r1, #0x46

    ; generate cospi_10_64 = 14449
    mov r2, #0x3800
    add r2, #0x71

    ; generate cospi_22_64 = 7723
    mov r3, #0x1e00
    add r3, #0x2b

    ; generate cospi_18_64 = 10394
    mov r4, #0x2800
    add r4, #0x9a

    ; generate cospi_14_64 = 12665
    mov r5, #0x3100
    add r5, #0x79

    ; generate cospi_26_64 = 4756
    mov r6, #0x1200
    add r6, #0x94

    ; generate cospi_6_64 = 15679
    mov r7, #0x3d00
    add r7, #0x3f

    ; generate cospi_8_64 = 15137
    mov r8, #0x3b00
    add r8, #0x21

    ; generate cospi_24_64 = 6270
    mov r9, #0x1800
    add r9, #0x7e

    ; generate 0
    mov r10, #0

    ; generate cospi_16_64 = 11585
    mov r12, #0x2d00
    add r12, #0x41
    MEND

; Generate IDCT constants in r3 - r9 for the IDCT.
    MACRO
    GENERATE_IDCT_CONSTANTS
    ; generate cospi_28_64 = 3196
    mov r3, #0x0c00
    add r3, #0x7c

    ; generate cospi_4_64 = 16069
    mov r4, #0x3e00
    add r4, #0xc5

    ; generate cospi_12_64 = 13623
    mov r5, #0x3500
    add r5, #0x37

    ; generate cospi_20_64 = 9102
    mov r6, #0x2300
    add r6, #0x8e

    ; generate cospi_16_64 = 11585
    mov r7, #0x2d00
    add r7, #0x41

    ; generate cospi_24_64 = 6270
    mov r8, #0x1800
    add r8, #0x7e

    ; generate cospi_8_64 = 15137
    mov r9, #0x3b00
    add r9, #0x21
    MEND

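Aside on the mov/add pairs in these constant macros: classic ARM data-processing immediates can only encode an 8-bit value with an even rotation, so each 14-bit cosine constant is built as a byte-aligned high part plus a low byte. A compile-time check of a few of them (a sketch; the constant names match the comments above):

    /* Each mov #0xNN00 / add #0xMM pair reassembles the constant: */
    _Static_assert(0x0c00 + 0x7c == 3196,  "cospi_28_64");
    _Static_assert(0x2d00 + 0x41 == 11585, "cospi_16_64");
    _Static_assert(0x3b00 + 0x21 == 15137, "cospi_8_64");
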
; Transpose an 8x8 16-bit data matrix. Data is loaded in q8-q15.
    MACRO
    TRANSPOSE8X8
    vswp d17, d24
    vswp d23, d30
    vswp d21, d28
    vswp d19, d26
    vtrn.32 q8, q10
    vtrn.32 q9, q11
    vtrn.32 q12, q14
    vtrn.32 q13, q15
    vtrn.16 q8, q9
    vtrn.16 q10, q11
    vtrn.16 q12, q13
    vtrn.16 q14, q15
    MEND

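The vswp/vtrn sequence is a register-level transpose: the four vswp instructions exchange the top-right and bottom-left 4x4 quadrants (the 64-bit halves of the q registers), then vtrn.32 and vtrn.16 interleave 2x2 blocks of 32-bit and 16-bit lanes to finish the job. Its net effect is the plain transpose below, a scalar sketch useful for checking (one row per q register):

    /* Scalar reference for what TRANSPOSE8X8 does to the 8x8 block. */
    static void transpose8x8_model(short m[8][8]) {
      int r, c;
      for (r = 0; r < 8; ++r)
        for (c = r + 1; c < 8; ++c) {
          short t = m[r][c];
          m[r][c] = m[c][r];
          m[c][r] = t;
        }
    }
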
; Parallel 1D IDCT on all the columns of an 8x8 16-bit data matrix which are
; loaded in q8-q15. The IDCT constants are loaded in r3 - r9. The output
; will be stored back into q8-q15 registers. This macro will touch q0-q7
; registers and use them as buffer during calculation.
    MACRO
    IDCT8x8_1D
    ; stage 1
    vdup.16 d0, r3 ; duplicate cospi_28_64
    vdup.16 d1, r4 ; duplicate cospi_4_64
    vdup.16 d2, r5 ; duplicate cospi_12_64
    vdup.16 d3, r6 ; duplicate cospi_20_64

    ; input[1] * cospi_28_64
    vmull.s16 q2, d18, d0
    vmull.s16 q3, d19, d0

    ; input[5] * cospi_12_64
    vmull.s16 q5, d26, d2
    vmull.s16 q6, d27, d2

    ; input[1]*cospi_28_64-input[7]*cospi_4_64
    vmlsl.s16 q2, d30, d1
    vmlsl.s16 q3, d31, d1

    ; input[5] * cospi_12_64 - input[3] * cospi_20_64
    vmlsl.s16 q5, d22, d3
    vmlsl.s16 q6, d23, d3

    ; dct_const_round_shift(input_dc * cospi_16_64)
    vqrshrn.s32 d8, q2, #14 ; >> 14
    vqrshrn.s32 d9, q3, #14 ; >> 14

    ; dct_const_round_shift(input_dc * cospi_16_64)
    vqrshrn.s32 d10, q5, #14 ; >> 14
    vqrshrn.s32 d11, q6, #14 ; >> 14

    ; input[1] * cospi_4_64
    vmull.s16 q2, d18, d1
    vmull.s16 q3, d19, d1

    ; input[5] * cospi_20_64
    vmull.s16 q9, d26, d3
    vmull.s16 q13, d27, d3

    ; input[1]*cospi_4_64+input[7]*cospi_28_64
    vmlal.s16 q2, d30, d0
    vmlal.s16 q3, d31, d0

    ; input[5] * cospi_20_64 + input[3] * cospi_12_64
    vmlal.s16 q9, d22, d2
    vmlal.s16 q13, d23, d2

    ; dct_const_round_shift(input_dc * cospi_16_64)
    vqrshrn.s32 d14, q2, #14 ; >> 14
    vqrshrn.s32 d15, q3, #14 ; >> 14

    ; stage 2 & stage 3 - even half
    vdup.16 d0, r7 ; duplicate cospi_16_64

    ; dct_const_round_shift(input_dc * cospi_16_64)
    vqrshrn.s32 d12, q9, #14 ; >> 14
    vqrshrn.s32 d13, q13, #14 ; >> 14

    ; input[0] * cospi_16_64
    vmull.s16 q2, d16, d0
    vmull.s16 q3, d17, d0

    ; input[0] * cospi_16_64
    vmull.s16 q13, d16, d0
    vmull.s16 q15, d17, d0

    ; (input[0] + input[2]) * cospi_16_64
    vmlal.s16 q2, d24, d0
    vmlal.s16 q3, d25, d0

    ; (input[0] - input[2]) * cospi_16_64
    vmlsl.s16 q13, d24, d0
    vmlsl.s16 q15, d25, d0

    vdup.16 d0, r8 ; duplicate cospi_24_64
    vdup.16 d1, r9 ; duplicate cospi_8_64

    ; dct_const_round_shift(input_dc * cospi_16_64)
    vqrshrn.s32 d18, q2, #14 ; >> 14
    vqrshrn.s32 d19, q3, #14 ; >> 14

    ; dct_const_round_shift(input_dc * cospi_16_64)
    vqrshrn.s32 d22, q13, #14 ; >> 14
    vqrshrn.s32 d23, q15, #14 ; >> 14

    ; input[1] * cospi_24_64
    vmull.s16 q2, d20, d0
    vmull.s16 q3, d21, d0

    ; input[1] * cospi_8_64
    vmull.s16 q8, d20, d1
    vmull.s16 q12, d21, d1

    ; input[1] * cospi_24_64 - input[3] * cospi_8_64
    vmlsl.s16 q2, d28, d1
    vmlsl.s16 q3, d29, d1

    ; input[1] * cospi_8_64 + input[3] * cospi_24_64
    vmlal.s16 q8, d28, d0
    vmlal.s16 q12, d29, d0

    ; dct_const_round_shift(input_dc * cospi_16_64)
    vqrshrn.s32 d26, q2, #14 ; >> 14
    vqrshrn.s32 d27, q3, #14 ; >> 14

    ; dct_const_round_shift(input_dc * cospi_16_64)
    vqrshrn.s32 d30, q8, #14 ; >> 14
    vqrshrn.s32 d31, q12, #14 ; >> 14

    vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3]
    vadd.s16 q1, q11, q13 ; output[1] = step[1] + step[2]
    vsub.s16 q2, q11, q13 ; output[2] = step[1] - step[2]
    vsub.s16 q3, q9, q15 ; output[3] = step[0] - step[3]

    ; stage 3 - odd half
    vdup.16 d16, r7 ; duplicate cospi_16_64

    ; stage 2 - odd half
    vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5]
    vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5]
    vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7]
    vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7]

    ; step2[6] * cospi_16_64
    vmull.s16 q9, d28, d16
    vmull.s16 q10, d29, d16

    ; step2[6] * cospi_16_64
    vmull.s16 q11, d28, d16
    vmull.s16 q12, d29, d16

    ; (step2[6] - step2[5]) * cospi_16_64
    vmlsl.s16 q9, d26, d16
    vmlsl.s16 q10, d27, d16

    ; (step2[5] + step2[6]) * cospi_16_64
    vmlal.s16 q11, d26, d16
    vmlal.s16 q12, d27, d16

    ; dct_const_round_shift(input_dc * cospi_16_64)
    vqrshrn.s32 d10, q9, #14 ; >> 14
    vqrshrn.s32 d11, q10, #14 ; >> 14

    ; dct_const_round_shift(input_dc * cospi_16_64)
    vqrshrn.s32 d12, q11, #14 ; >> 14
    vqrshrn.s32 d13, q12, #14 ; >> 14

    ; stage 4
    vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7];
    vadd.s16 q9, q1, q6 ; output[1] = step1[1] + step1[6];
    vadd.s16 q10, q2, q5 ; output[2] = step1[2] + step1[5];
    vadd.s16 q11, q3, q4 ; output[3] = step1[3] + step1[4];
    vsub.s16 q12, q3, q4 ; output[4] = step1[3] - step1[4];
    vsub.s16 q13, q2, q5 ; output[5] = step1[2] - step1[5];
    vsub.s16 q14, q1, q6 ; output[6] = step1[1] - step1[6];
    vsub.s16 q15, q0, q7 ; output[7] = step1[0] - step1[7];
    MEND

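Read as scalar C, one column of IDCT8x8_1D computes the classic even/odd butterfly. Note that the macro's even-half comments renumber the inputs after deinterleaving (its "input[2]" is row 4, its "input[1]" row 2); the sketch below uses the natural row indices and mirrors the stage structure rather than the register allocation:

    static void idct8_1d_model(const short *in, short *out) {
      short step1[8], step2[8];
      /* stage 1: odd half */
      step1[4] = dct_const_round_shift(in[1] * cospi_28_64 - in[7] * cospi_4_64);
      step1[7] = dct_const_round_shift(in[1] * cospi_4_64 + in[7] * cospi_28_64);
      step1[5] = dct_const_round_shift(in[5] * cospi_12_64 - in[3] * cospi_20_64);
      step1[6] = dct_const_round_shift(in[5] * cospi_20_64 + in[3] * cospi_12_64);
      /* stage 2: even half, plus odd-half add/sub */
      step2[0] = dct_const_round_shift((in[0] + in[4]) * cospi_16_64);
      step2[1] = dct_const_round_shift((in[0] - in[4]) * cospi_16_64);
      step2[2] = dct_const_round_shift(in[2] * cospi_24_64 - in[6] * cospi_8_64);
      step2[3] = dct_const_round_shift(in[2] * cospi_8_64 + in[6] * cospi_24_64);
      step2[4] = step1[4] + step1[5];
      step2[5] = step1[4] - step1[5];
      step2[6] = -step1[6] + step1[7];
      step2[7] = step1[6] + step1[7];
      /* stage 3 */
      step1[0] = step2[0] + step2[3];
      step1[1] = step2[1] + step2[2];
      step1[2] = step2[1] - step2[2];
      step1[3] = step2[0] - step2[3];
      step1[4] = step2[4];
      step1[5] = dct_const_round_shift((step2[6] - step2[5]) * cospi_16_64);
      step1[6] = dct_const_round_shift((step2[5] + step2[6]) * cospi_16_64);
      step1[7] = step2[7];
      /* stage 4 */
      out[0] = step1[0] + step1[7];  out[1] = step1[1] + step1[6];
      out[2] = step1[2] + step1[5];  out[3] = step1[3] + step1[4];
      out[4] = step1[3] - step1[4];  out[5] = step1[2] - step1[5];
      out[6] = step1[1] - step1[6];  out[7] = step1[0] - step1[7];
    }
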
; Parallel 1D IADST on all the columns of an 8x8 16-bit data matrix which is
; loaded in q8-q15. IADST constants are loaded in r0 - r12 registers. The
; output will be stored back into q8-q15 registers. This macro will touch
; q0 - q7 registers and use them as buffer during calculation.
    MACRO
    IADST8X8_1D
    vdup.16 d14, r0 ; duplicate cospi_2_64
    vdup.16 d15, r1 ; duplicate cospi_30_64

    ; cospi_2_64 * x0
    vmull.s16 q1, d30, d14
    vmull.s16 q2, d31, d14

    ; cospi_30_64 * x0
    vmull.s16 q3, d30, d15
    vmull.s16 q4, d31, d15

    vdup.16 d30, r4 ; duplicate cospi_18_64
    vdup.16 d31, r5 ; duplicate cospi_14_64

    ; s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
    vmlal.s16 q1, d16, d15
    vmlal.s16 q2, d17, d15

    ; s1 = cospi_30_64 * x0 - cospi_2_64 * x1
    vmlsl.s16 q3, d16, d14
    vmlsl.s16 q4, d17, d14

    ; cospi_18_64 * x4
    vmull.s16 q5, d22, d30
    vmull.s16 q6, d23, d30

    ; cospi_14_64 * x4
    vmull.s16 q7, d22, d31
    vmull.s16 q8, d23, d31

    ; s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
    vmlal.s16 q5, d24, d31
    vmlal.s16 q6, d25, d31

    ; s5 = cospi_14_64 * x4 - cospi_18_64 * x5
    vmlsl.s16 q7, d24, d30
    vmlsl.s16 q8, d25, d30

    ; (s0 + s4)
    vadd.s32 q11, q1, q5
    vadd.s32 q12, q2, q6

    vdup.16 d0, r2 ; duplicate cospi_10_64
    vdup.16 d1, r3 ; duplicate cospi_22_64

    ; (s0 - s4)
    vsub.s32 q1, q1, q5
    vsub.s32 q2, q2, q6

    ; x0 = dct_const_round_shift(s0 + s4);
    vqrshrn.s32 d22, q11, #14 ; >> 14
    vqrshrn.s32 d23, q12, #14 ; >> 14

    ; (s1 + s5)
    vadd.s32 q12, q3, q7
    vadd.s32 q15, q4, q8

    ; (s1 - s5)
    vsub.s32 q3, q3, q7
    vsub.s32 q4, q4, q8

    ; x4 = dct_const_round_shift(s0 - s4);
    vqrshrn.s32 d2, q1, #14 ; >> 14
    vqrshrn.s32 d3, q2, #14 ; >> 14

    ; x1 = dct_const_round_shift(s1 + s5);
    vqrshrn.s32 d24, q12, #14 ; >> 14
    vqrshrn.s32 d25, q15, #14 ; >> 14

    ; x5 = dct_const_round_shift(s1 - s5);
    vqrshrn.s32 d6, q3, #14 ; >> 14
    vqrshrn.s32 d7, q4, #14 ; >> 14

    ; cospi_10_64 * x2
    vmull.s16 q4, d26, d0
    vmull.s16 q5, d27, d0

    ; cospi_22_64 * x2
    vmull.s16 q2, d26, d1
    vmull.s16 q6, d27, d1

    vdup.16 d30, r6 ; duplicate cospi_26_64
    vdup.16 d31, r7 ; duplicate cospi_6_64

    ; s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
    vmlal.s16 q4, d20, d1
    vmlal.s16 q5, d21, d1

    ; s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
    vmlsl.s16 q2, d20, d0
    vmlsl.s16 q6, d21, d0

    ; cospi_26_64 * x6
    vmull.s16 q0, d18, d30
    vmull.s16 q13, d19, d30

    ; s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
    vmlal.s16 q0, d28, d31
    vmlal.s16 q13, d29, d31

    ; cospi_6_64 * x6
    vmull.s16 q10, d18, d31
    vmull.s16 q9, d19, d31

    ; s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
    vmlsl.s16 q10, d28, d30
    vmlsl.s16 q9, d29, d30

    ; (s3 + s7)
    vadd.s32 q14, q2, q10
    vadd.s32 q15, q6, q9

    ; (s3 - s7)
    vsub.s32 q2, q2, q10
    vsub.s32 q6, q6, q9

    ; x3 = dct_const_round_shift(s3 + s7);
    vqrshrn.s32 d28, q14, #14 ; >> 14
    vqrshrn.s32 d29, q15, #14 ; >> 14

    ; x7 = dct_const_round_shift(s3 - s7);
    vqrshrn.s32 d4, q2, #14 ; >> 14
    vqrshrn.s32 d5, q6, #14 ; >> 14

    ; (s2 + s6)
    vadd.s32 q9, q4, q0
    vadd.s32 q10, q5, q13

    ; (s2 - s6)
    vsub.s32 q4, q4, q0
    vsub.s32 q5, q5, q13

    vdup.16 d30, r8 ; duplicate cospi_8_64
    vdup.16 d31, r9 ; duplicate cospi_24_64

    ; x2 = dct_const_round_shift(s2 + s6);
    vqrshrn.s32 d18, q9, #14 ; >> 14
    vqrshrn.s32 d19, q10, #14 ; >> 14

    ; x6 = dct_const_round_shift(s2 - s6);
    vqrshrn.s32 d8, q4, #14 ; >> 14
    vqrshrn.s32 d9, q5, #14 ; >> 14

    ; cospi_8_64 * x4
    vmull.s16 q5, d2, d30
    vmull.s16 q6, d3, d30

    ; cospi_24_64 * x4
    vmull.s16 q7, d2, d31
    vmull.s16 q0, d3, d31

    ; s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
    vmlal.s16 q5, d6, d31
    vmlal.s16 q6, d7, d31

    ; s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
    vmlsl.s16 q7, d6, d30
    vmlsl.s16 q0, d7, d30

    ; cospi_8_64 * x7
    vmull.s16 q1, d4, d30
    vmull.s16 q3, d5, d30

    ; cospi_24_64 * x7
    vmull.s16 q10, d4, d31
    vmull.s16 q2, d5, d31

    ; s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
    vmlsl.s16 q1, d8, d31
    vmlsl.s16 q3, d9, d31

    ; s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
    vmlal.s16 q10, d8, d30
    vmlal.s16 q2, d9, d30

    vadd.s16 q8, q11, q9 ; x0 = s0 + s2;

    vsub.s16 q11, q11, q9 ; x2 = s0 - s2;

    vadd.s16 q4, q12, q14 ; x1 = s1 + s3;

    vsub.s16 q12, q12, q14 ; x3 = s1 - s3;

    ; (s4 + s6)
    vadd.s32 q14, q5, q1
    vadd.s32 q15, q6, q3

    ; (s4 - s6)
    vsub.s32 q5, q5, q1
    vsub.s32 q6, q6, q3

    ; x4 = dct_const_round_shift(s4 + s6);
    vqrshrn.s32 d18, q14, #14 ; >> 14
    vqrshrn.s32 d19, q15, #14 ; >> 14

    ; x6 = dct_const_round_shift(s4 - s6);
    vqrshrn.s32 d10, q5, #14 ; >> 14
    vqrshrn.s32 d11, q6, #14 ; >> 14

    ; (s5 + s7)
    vadd.s32 q1, q7, q10
    vadd.s32 q3, q0, q2

    ; (s5 - s7)
    vsub.s32 q7, q7, q10
    vsub.s32 q0, q0, q2

    ; x5 = dct_const_round_shift(s5 + s7);
    vqrshrn.s32 d28, q1, #14 ; >> 14
    vqrshrn.s32 d29, q3, #14 ; >> 14

    ; x7 = dct_const_round_shift(s5 - s7);
    vqrshrn.s32 d14, q7, #14 ; >> 14
    vqrshrn.s32 d15, q0, #14 ; >> 14

    vdup.16 d30, r12 ; duplicate cospi_16_64

    ; cospi_16_64 * x2
    vmull.s16 q2, d22, d30
    vmull.s16 q3, d23, d30

    ; cospi_16_64 * x2
    vmull.s16 q13, d22, d30
    vmull.s16 q1, d23, d30

    ; cospi_16_64 * x2 + cospi_16_64 * x3;
    vmlal.s16 q2, d24, d30
    vmlal.s16 q3, d25, d30

    ; cospi_16_64 * x2 - cospi_16_64 * x3;
    vmlsl.s16 q13, d24, d30
    vmlsl.s16 q1, d25, d30

    ; x2 = dct_const_round_shift(s2);
    vqrshrn.s32 d4, q2, #14 ; >> 14
    vqrshrn.s32 d5, q3, #14 ; >> 14

    ; x3 = dct_const_round_shift(s3);
    vqrshrn.s32 d24, q13, #14 ; >> 14
    vqrshrn.s32 d25, q1, #14 ; >> 14

    ; cospi_16_64 * x6
    vmull.s16 q13, d10, d30
    vmull.s16 q1, d11, d30

    ; cospi_16_64 * x6
    vmull.s16 q11, d10, d30
    vmull.s16 q0, d11, d30

    ; cospi_16_64 * x6 + cospi_16_64 * x7;
    vmlal.s16 q13, d14, d30
    vmlal.s16 q1, d15, d30

    ; cospi_16_64 * x6 - cospi_16_64 * x7;
    vmlsl.s16 q11, d14, d30
    vmlsl.s16 q0, d15, d30

    ; x6 = dct_const_round_shift(s6);
    vqrshrn.s32 d20, q13, #14 ; >> 14
    vqrshrn.s32 d21, q1, #14 ; >> 14

    ; x7 = dct_const_round_shift(s7);
    vqrshrn.s32 d12, q11, #14 ; >> 14
    vqrshrn.s32 d13, q0, #14 ; >> 14

    vdup.16 q5, r10 ; duplicate 0

    vsub.s16 q9, q5, q9 ; output[1] = -x4;
    vsub.s16 q11, q5, q2 ; output[3] = -x2;
    vsub.s16 q13, q5, q6 ; output[5] = -x7;
    vsub.s16 q15, q5, q4 ; output[7] = -x1;
    MEND

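The macro ends by subtracting half of the results from a zero vector (q5), which yields the standard iadst8 alternating-sign output layout. A compact summary, written as a sketch rather than the upstream code:

    /* Final mapping of IADST8X8_1D results to output rows, per the
     * macro's trailing comments (q5 holds zero, so vsub computes -x): */
    static void iadst8_outputs_model(const int x[8], int output[8]) {
      output[0] =  x[0];  output[1] = -x[4];
      output[2] =  x[6];  output[3] = -x[2];
      output[4] =  x[3];  output[5] = -x[7];
      output[6] =  x[5];  output[7] = -x[1];
    }
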

    AREA Block, CODE, READONLY ; name this block of code
;void vp9_short_iht8x8_add_neon(int16_t *input, uint8_t *dest,
;                               int dest_stride, int tx_type)
;
; r0  int16_t input
; r1  uint8_t *dest
; r2  int dest_stride
; r3  int tx_type
; This function will only handle tx_type of 1,2,3.
|vp9_short_iht8x8_add_neon| PROC

    ; load the inputs into q8-q15
    vld1.s16 {q8,q9}, [r0]!
    vld1.s16 {q10,q11}, [r0]!
    vld1.s16 {q12,q13}, [r0]!
    vld1.s16 {q14,q15}, [r0]!

    push {r0-r10}

    ; transpose the input data
    TRANSPOSE8X8

    ; decide the type of transform
    cmp r3, #2
    beq idct_iadst
    cmp r3, #3
    beq iadst_iadst

iadst_idct
    ; generate IDCT constants
    GENERATE_IDCT_CONSTANTS

    ; first transform rows
    IDCT8x8_1D

    ; transpose the matrix
    TRANSPOSE8X8

    ; generate IADST constants
    GENERATE_IADST_CONSTANTS

    ; then transform columns
    IADST8X8_1D

    b end_vp9_short_iht8x8_add_neon

idct_iadst
    ; generate IADST constants
    GENERATE_IADST_CONSTANTS

    ; first transform rows
    IADST8X8_1D

    ; transpose the matrix
    TRANSPOSE8X8

    ; generate IDCT constants
    GENERATE_IDCT_CONSTANTS

    ; then transform columns
    IDCT8x8_1D

    b end_vp9_short_iht8x8_add_neon

iadst_iadst
    ; generate IADST constants
    GENERATE_IADST_CONSTANTS

    ; first transform rows
    IADST8X8_1D

    ; transpose the matrix
    TRANSPOSE8X8

    ; then transform columns
    IADST8X8_1D

end_vp9_short_iht8x8_add_neon
    pop {r0-r10}

    ; ROUND_POWER_OF_TWO(temp_out[j], 5)
    vrshr.s16 q8, q8, #5
    vrshr.s16 q9, q9, #5
    vrshr.s16 q10, q10, #5
    vrshr.s16 q11, q11, #5
    vrshr.s16 q12, q12, #5
    vrshr.s16 q13, q13, #5
    vrshr.s16 q14, q14, #5
    vrshr.s16 q15, q15, #5

    ; save dest pointer
    mov r0, r1

    ; load destination data
    vld1.64 {d0}, [r1], r2
    vld1.64 {d1}, [r1], r2
    vld1.64 {d2}, [r1], r2
    vld1.64 {d3}, [r1], r2
    vld1.64 {d4}, [r1], r2
    vld1.64 {d5}, [r1], r2
    vld1.64 {d6}, [r1], r2
    vld1.64 {d7}, [r1]

    ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]
    vaddw.u8 q8, q8, d0
    vaddw.u8 q9, q9, d1
    vaddw.u8 q10, q10, d2
    vaddw.u8 q11, q11, d3
    vaddw.u8 q12, q12, d4
    vaddw.u8 q13, q13, d5
    vaddw.u8 q14, q14, d6
    vaddw.u8 q15, q15, d7

    ; clip_pixel
    vqmovun.s16 d0, q8
    vqmovun.s16 d1, q9
    vqmovun.s16 d2, q10
    vqmovun.s16 d3, q11
    vqmovun.s16 d4, q12
    vqmovun.s16 d5, q13
    vqmovun.s16 d6, q14
    vqmovun.s16 d7, q15

    ; store the data
    vst1.64 {d0}, [r0], r2
    vst1.64 {d1}, [r0], r2
    vst1.64 {d2}, [r0], r2
    vst1.64 {d3}, [r0], r2
    vst1.64 {d4}, [r0], r2
    vst1.64 {d5}, [r0], r2
    vst1.64 {d6}, [r0], r2
    vst1.64 {d7}, [r0], r2
    bx lr
    ENDP ; |vp9_short_iht8x8_add_neon|
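The r3 dispatch in the PROC above follows the vp9 TX_TYPE numbering; tx_type 0 (DCT_DCT) is served by the plain idct8x8 path elsewhere, so only 1-3 reach this function. A sketch of the same dispatch in C, using hypothetical helper names for the row/column passes:

    /* 1 = ADST_DCT : IDCT rows,  IADST columns (the fall-through path)
     * 2 = DCT_ADST : IADST rows, IDCT columns
     * 3 = ADST_ADST: IADST rows, IADST columns */
    switch (tx_type) {
      case 2:  iadst_rows(); transpose(); idct_cols();  break;
      case 3:  iadst_rows(); transpose(); iadst_cols(); break;
      default: idct_rows();  transpose(); iadst_cols(); break;
    }
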

    END

@@ -13,7 +13,6 @@
#include "vp9_rtcd.h"
#include "vp9/common/vp9_onyxc_int.h"

void vp9_machine_specific_config(VP9_COMMON *cm) {
  (void)cm;
void vp9_machine_specific_config(VP9_COMMON *ctx) {
  vp9_rtcd();
}

@@ -31,30 +31,40 @@ void vp9_update_mode_info_border(VP9_COMMON *cm, MODE_INFO *mi) {
    vpx_memset(&mi[i * stride], 0, sizeof(MODE_INFO));
}

void vp9_free_frame_buffers(VP9_COMMON *cm) {
void vp9_update_mode_info_in_image(VP9_COMMON *cm, MODE_INFO *mi) {
  int i, j;

  // For each in image mode_info element set the in image flag to 1
  for (i = 0; i < cm->mi_rows; i++) {
    MODE_INFO *ptr = mi;
    for (j = 0; j < cm->mi_cols; j++) {
      ptr->mbmi.mb_in_image = 1;
      ptr++;  // Next element in the row
    }

    // Step over border element at start of next row
    mi += cm->mode_info_stride;
  }
}

void vp9_free_frame_buffers(VP9_COMMON *oci) {
  int i;

  for (i = 0; i < NUM_YV12_BUFFERS; i++)
    vp9_free_frame_buffer(&cm->yv12_fb[i]);
    vp9_free_frame_buffer(&oci->yv12_fb[i]);

  vp9_free_frame_buffer(&cm->post_proc_buffer);
  vp9_free_frame_buffer(&oci->post_proc_buffer);

  vpx_free(cm->mip);
  vpx_free(cm->prev_mip);
  vpx_free(cm->above_seg_context);
  vpx_free(cm->last_frame_seg_map);
  vpx_free(cm->mi_grid_base);
  vpx_free(cm->prev_mi_grid_base);
  vpx_free(oci->mip);
  vpx_free(oci->prev_mip);
  vpx_free(oci->above_seg_context);

  vpx_free(cm->above_context[0]);
  vpx_free(oci->above_context[0]);
  for (i = 0; i < MAX_MB_PLANE; i++)
    cm->above_context[i] = 0;
  cm->mip = NULL;
  cm->prev_mip = NULL;
  cm->above_seg_context = NULL;
  cm->last_frame_seg_map = NULL;
  cm->mi_grid_base = NULL;
  cm->prev_mi_grid_base = NULL;
    oci->above_context[i] = 0;
  oci->mip = NULL;
  oci->prev_mip = NULL;
  oci->above_seg_context = NULL;
}

static void set_mb_mi(VP9_COMMON *cm, int aligned_width, int aligned_height) {
@@ -62,125 +72,112 @@ static void set_mb_mi(VP9_COMMON *cm, int aligned_width, int aligned_height) {
  cm->mb_rows = (aligned_height + 8) >> 4;
  cm->MBs = cm->mb_rows * cm->mb_cols;

  cm->mi_cols = aligned_width >> MI_SIZE_LOG2;
  cm->mi_rows = aligned_height >> MI_SIZE_LOG2;
  cm->mi_cols = aligned_width >> LOG2_MI_SIZE;
  cm->mi_rows = aligned_height >> LOG2_MI_SIZE;
  cm->mode_info_stride = cm->mi_cols + MI_BLOCK_SIZE;
}

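set_mb_mi derives the mode-info grid from frame dimensions that callers have already rounded up to the 8-pixel MI size with ALIGN_POWER_OF_TWO. A sketch of that round-up macro, consistent with its uses in this file (the definition here is illustrative, not quoted):

    #define ALIGN_POWER_OF_TWO(value, n) \
        (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
    /* e.g. width 1280 with MI_SIZE_LOG2 = 3 stays 1280;
     * width 1283 would align up to 1288. */
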
static void setup_mi(VP9_COMMON *cm) {
  cm->mi = cm->mip + cm->mode_info_stride + 1;
  cm->prev_mi = cm->prev_mip + cm->mode_info_stride + 1;
  cm->mi_grid_visible = cm->mi_grid_base + cm->mode_info_stride + 1;
  cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mode_info_stride + 1;

  vpx_memset(cm->mip, 0,
             cm->mode_info_stride * (cm->mi_rows + 1) * sizeof(MODE_INFO));

  vpx_memset(cm->mi_grid_base, 0,
             cm->mode_info_stride * (cm->mi_rows + 1) *
             sizeof(*cm->mi_grid_base));

  vp9_update_mode_info_border(cm, cm->mip);
  vp9_update_mode_info_in_image(cm, cm->mi);

  vp9_update_mode_info_border(cm, cm->prev_mip);
  vp9_update_mode_info_in_image(cm, cm->prev_mi);
}

int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) {
int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height) {
  int i, mi_cols;

  const int aligned_width = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2);
  const int aligned_height = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2);
  const int ss_x = cm->subsampling_x;
  const int ss_y = cm->subsampling_y;
  const int aligned_width = ALIGN_POWER_OF_TWO(width, LOG2_MI_SIZE);
  const int aligned_height = ALIGN_POWER_OF_TWO(height, LOG2_MI_SIZE);
  const int ss_x = oci->subsampling_x;
  const int ss_y = oci->subsampling_y;
  int mi_size;

  vp9_free_frame_buffers(cm);
  vp9_free_frame_buffers(oci);

  for (i = 0; i < NUM_YV12_BUFFERS; i++) {
    cm->fb_idx_ref_cnt[i] = 0;
    if (vp9_alloc_frame_buffer(&cm->yv12_fb[i], width, height, ss_x, ss_y,
    oci->fb_idx_ref_cnt[i] = 0;
    if (vp9_alloc_frame_buffer(&oci->yv12_fb[i], width, height, ss_x, ss_y,
                               VP9BORDERINPIXELS) < 0)
      goto fail;
  }

  cm->new_fb_idx = NUM_YV12_BUFFERS - 1;
  cm->fb_idx_ref_cnt[cm->new_fb_idx] = 1;
  oci->new_fb_idx = NUM_YV12_BUFFERS - 1;
  oci->fb_idx_ref_cnt[oci->new_fb_idx] = 1;

  for (i = 0; i < ALLOWED_REFS_PER_FRAME; i++)
    cm->active_ref_idx[i] = i;
    oci->active_ref_idx[i] = i;

  for (i = 0; i < NUM_REF_FRAMES; i++) {
    cm->ref_frame_map[i] = i;
    cm->fb_idx_ref_cnt[i] = 1;
    oci->ref_frame_map[i] = i;
    oci->fb_idx_ref_cnt[i] = 1;
  }

  if (vp9_alloc_frame_buffer(&cm->post_proc_buffer, width, height, ss_x, ss_y,
  if (vp9_alloc_frame_buffer(&oci->post_proc_buffer, width, height, ss_x, ss_y,
                             VP9BORDERINPIXELS) < 0)
    goto fail;

  set_mb_mi(cm, aligned_width, aligned_height);
  set_mb_mi(oci, aligned_width, aligned_height);

  // Allocation
  mi_size = cm->mode_info_stride * (cm->mi_rows + MI_BLOCK_SIZE);
  mi_size = oci->mode_info_stride * (oci->mi_rows + MI_BLOCK_SIZE);

  cm->mip = vpx_calloc(mi_size, sizeof(MODE_INFO));
  if (!cm->mip)
  oci->mip = vpx_calloc(mi_size, sizeof(MODE_INFO));
  if (!oci->mip)
    goto fail;

  cm->prev_mip = vpx_calloc(mi_size, sizeof(MODE_INFO));
  if (!cm->prev_mip)
  oci->prev_mip = vpx_calloc(mi_size, sizeof(MODE_INFO));
  if (!oci->prev_mip)
    goto fail;

  cm->mi_grid_base = vpx_calloc(mi_size, sizeof(*cm->mi_grid_base));
  if (!cm->mi_grid_base)
    goto fail;

  cm->prev_mi_grid_base = vpx_calloc(mi_size, sizeof(*cm->prev_mi_grid_base));
  if (!cm->prev_mi_grid_base)
    goto fail;

  setup_mi(cm);
  setup_mi(oci);

  // FIXME(jkoleszar): allocate subsampled arrays for U/V once subsampling
  // information is exposed at this level
  mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
  mi_cols = mi_cols_aligned_to_sb(oci->mi_cols);

  // 2 contexts per 'mi unit', so that we have one context per 4x4 txfm
  // block where mi unit size is 8x8.
  cm->above_context[0] = vpx_calloc(sizeof(ENTROPY_CONTEXT) * MAX_MB_PLANE *
                                    (2 * mi_cols), 1);
  if (!cm->above_context[0])
#if CONFIG_ALPHA
  oci->above_context[0] = vpx_calloc(sizeof(ENTROPY_CONTEXT) * 8 * mi_cols, 1);
#else
  oci->above_context[0] = vpx_calloc(sizeof(ENTROPY_CONTEXT) * 6 * mi_cols, 1);
#endif
  if (!oci->above_context[0])
    goto fail;

  cm->above_seg_context = vpx_calloc(sizeof(PARTITION_CONTEXT) * mi_cols, 1);
  if (!cm->above_seg_context)
    goto fail;

  // Create the segmentation map structure and set to 0.
  cm->last_frame_seg_map = vpx_calloc(cm->mi_rows * cm->mi_cols, 1);
  if (!cm->last_frame_seg_map)
  oci->above_seg_context = vpx_calloc(sizeof(PARTITION_CONTEXT) * mi_cols, 1);
  if (!oci->above_seg_context)
    goto fail;

  return 0;

fail:
  vp9_free_frame_buffers(cm);
  vp9_free_frame_buffers(oci);
  return 1;
}

void vp9_create_common(VP9_COMMON *cm) {
  vp9_machine_specific_config(cm);
void vp9_create_common(VP9_COMMON *oci) {
  vp9_machine_specific_config(oci);

  vp9_init_mbmode_probs(cm);
  vp9_init_mbmode_probs(oci);

  cm->tx_mode = ONLY_4X4;
  cm->comp_pred_mode = HYBRID_PREDICTION;
  oci->tx_mode = ONLY_4X4;
  oci->comp_pred_mode = HYBRID_PREDICTION;

  // Initialize reference frame sign bias structure to defaults
  vpx_memset(cm->ref_frame_sign_bias, 0, sizeof(cm->ref_frame_sign_bias));
  vpx_memset(oci->ref_frame_sign_bias, 0, sizeof(oci->ref_frame_sign_bias));
}

void vp9_remove_common(VP9_COMMON *cm) {
  vp9_free_frame_buffers(cm);
void vp9_remove_common(VP9_COMMON *oci) {
  vp9_free_frame_buffers(oci);
}

void vp9_initialize_common() {
@@ -191,8 +188,8 @@ void vp9_initialize_common() {

void vp9_update_frame_size(VP9_COMMON *cm) {
  int i, mi_cols;
  const int aligned_width = ALIGN_POWER_OF_TWO(cm->width, MI_SIZE_LOG2);
  const int aligned_height = ALIGN_POWER_OF_TWO(cm->height, MI_SIZE_LOG2);
  const int aligned_width = ALIGN_POWER_OF_TWO(cm->width, LOG2_MI_SIZE);
  const int aligned_height = ALIGN_POWER_OF_TWO(cm->height, LOG2_MI_SIZE);

  set_mb_mi(cm, aligned_width, aligned_height);
  setup_mi(cm);
@@ -201,8 +198,4 @@ void vp9_update_frame_size(VP9_COMMON *cm) {
  for (i = 1; i < MAX_MB_PLANE; i++)
    cm->above_context[i] =
        cm->above_context[0] + i * sizeof(ENTROPY_CONTEXT) * 2 * mi_cols;

  // Initialize the previous frame segment map to 0.
  if (cm->last_frame_seg_map)
    vpx_memset(cm->last_frame_seg_map, 0, cm->mi_rows * cm->mi_cols);
}

@@ -16,13 +16,14 @@

void vp9_initialize_common();

void vp9_update_mode_info_border(VP9_COMMON *cm, MODE_INFO *mi);
void vp9_update_mode_info_border(VP9_COMMON *cpi, MODE_INFO *mi);
void vp9_update_mode_info_in_image(VP9_COMMON *cpi, MODE_INFO *mi);

void vp9_create_common(VP9_COMMON *cm);
void vp9_remove_common(VP9_COMMON *cm);
void vp9_create_common(VP9_COMMON *oci);
void vp9_remove_common(VP9_COMMON *oci);

int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height);
void vp9_free_frame_buffers(VP9_COMMON *cm);
int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height);
void vp9_free_frame_buffers(VP9_COMMON *oci);

void vp9_update_frame_size(VP9_COMMON *cm);

@@ -19,9 +19,9 @@

#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_common_data.h"
#include "vp9/common/vp9_convolve.h"
#include "vp9/common/vp9_enums.h"
#include "vp9/common/vp9_mv.h"
#include "vp9/common/vp9_scale.h"
#include "vp9/common/vp9_seg_common.h"
#include "vp9/common/vp9_treecoder.h"

@@ -71,7 +71,7 @@ typedef enum {
  D135_PRED,  // Directional 135 deg = 180 - 45
  D117_PRED,  // Directional 117 deg = 180 - 63
  D153_PRED,  // Directional 153 deg = 180 - 27
  D207_PRED,  // Directional 207 deg = 180 + 27
  D27_PRED,   // Directional 27 deg = round(arctan(1/2) * 180/pi)
  D63_PRED,   // Directional 63 deg = round(arctan(2/1) * 180/pi)
  TM_PRED,    // True-motion
  NEARESTMV,
@@ -89,9 +89,18 @@ static INLINE int is_inter_mode(MB_PREDICTION_MODE mode) {
  return mode >= NEARESTMV && mode <= NEWMV;
}

#define INTRA_MODES (TM_PRED + 1)
#if CONFIG_FILTERINTRA
static INLINE int is_filter_allowed(MB_PREDICTION_MODE mode) {
  return mode != DC_PRED &&
         mode != D45_PRED &&
         mode != D27_PRED &&
         mode != D63_PRED;
}
#endif

#define INTER_MODES (1 + NEWMV - NEARESTMV)
#define VP9_INTRA_MODES (TM_PRED + 1)

#define VP9_INTER_MODES (1 + NEWMV - NEARESTMV)

static INLINE int inter_mode_offset(MB_PREDICTION_MODE mode) {
  return (mode - NEARESTMV);
@@ -115,61 +124,162 @@ typedef enum {
  MAX_REF_FRAMES = 4
} MV_REFERENCE_FRAME;

static INLINE int b_width_log2(BLOCK_SIZE sb_type) {
static INLINE int b_width_log2(BLOCK_SIZE_TYPE sb_type) {
  return b_width_log2_lookup[sb_type];
}
static INLINE int b_height_log2(BLOCK_SIZE sb_type) {
static INLINE int b_height_log2(BLOCK_SIZE_TYPE sb_type) {
  return b_height_log2_lookup[sb_type];
}

static INLINE int mi_width_log2(BLOCK_SIZE sb_type) {
static INLINE int mi_width_log2(BLOCK_SIZE_TYPE sb_type) {
  return mi_width_log2_lookup[sb_type];
}

static INLINE int mi_height_log2(BLOCK_SIZE sb_type) {
static INLINE int mi_height_log2(BLOCK_SIZE_TYPE sb_type) {
  return mi_height_log2_lookup[sb_type];
}

// This structure now relates to 8x8 block regions.
#if CONFIG_INTERINTRA
static INLINE TX_SIZE intra_size_log2_for_interintra(int bs) {
  switch (bs) {
    case 4:
      return TX_4X4;
      break;
    case 8:
      return TX_8X8;
      break;
    case 16:
      return TX_16X16;
      break;
    case 32:
      return TX_32X32;
      break;
    default:
      return TX_32X32;
      break;
  }
}

static INLINE int is_interintra_allowed(BLOCK_SIZE_TYPE sb_type) {
  return ((sb_type >= BLOCK_8X8) && (sb_type < BLOCK_64X64));
}

#if CONFIG_MASKED_INTERINTRA
#define MASK_BITS_SML_INTERINTRA 3
#define MASK_BITS_MED_INTERINTRA 4
#define MASK_BITS_BIG_INTERINTRA 5
#define MASK_NONE_INTERINTRA -1
static INLINE int get_mask_bits_interintra(BLOCK_SIZE_TYPE sb_type) {
  if (sb_type == BLOCK_4X4)
    return 0;
  if (sb_type <= BLOCK_8X8)
    return MASK_BITS_SML_INTERINTRA;
  else if (sb_type <= BLOCK_32X32)
    return MASK_BITS_MED_INTERINTRA;
  else
    return MASK_BITS_BIG_INTERINTRA;
}
#endif
#endif

#if CONFIG_MASKED_INTERINTER
#define MASK_BITS_SML 3
#define MASK_BITS_MED 4
#define MASK_BITS_BIG 5
#define MASK_NONE -1

static inline int get_mask_bits(BLOCK_SIZE_TYPE sb_type) {
  if (sb_type == BLOCK_4X4)
    return 0;
  if (sb_type <= BLOCK_8X8)
    return MASK_BITS_SML;
  else if (sb_type <= BLOCK_32X32)
    return MASK_BITS_MED;
  else
    return MASK_BITS_BIG;
}
#endif

typedef struct {
  MB_PREDICTION_MODE mode, uv_mode;
#if CONFIG_INTERINTRA
  MB_PREDICTION_MODE interintra_mode, interintra_uv_mode;
#if CONFIG_MASKED_INTERINTRA
  int interintra_mask_index;
  int interintra_uv_mask_index;
  int use_masked_interintra;
#endif
#endif
#if CONFIG_FILTERINTRA
  int filterbit, uv_filterbit;
#endif
  MV_REFERENCE_FRAME ref_frame[2];
  TX_SIZE tx_size;
  int_mv mv[2];  // for each reference frame used
  TX_SIZE txfm_size;
  int_mv mv[2];  // for each reference frame used
  int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
  int_mv best_mv, best_second_mv;

  uint8_t mode_context[MAX_REF_FRAMES];
  uint8_t mb_mode_context[MAX_REF_FRAMES];

  unsigned char skip_coeff;  // 0=need to decode coeffs, 1=no coefficients
  unsigned char segment_id;  // Segment id for this block.
  unsigned char mb_skip_coeff; /* does this mb has coefficients at all, 1=no coefficients, 0=need decode tokens */
  unsigned char segment_id;  // Segment id for current frame

  // Flags used for prediction status of various bit-stream signals
  // Flags used for prediction status of various bistream signals
  unsigned char seg_id_predicted;

  // Indicates if the mb is part of the image (1) vs border (0)
  // This can be useful in determining whether the MB provides
  // a valid predictor
  unsigned char mb_in_image;

  INTERPOLATIONFILTERTYPE interp_filter;

  BLOCK_SIZE sb_type;
  BLOCK_SIZE_TYPE sb_type;

#if CONFIG_MASKED_INTERINTER
  int use_masked_compound;
  int mask_index;
#endif
} MB_MODE_INFO;

typedef struct {
  MB_MODE_INFO mbmi;
#if CONFIG_FILTERINTRA
  int b_filter_info[4];
#endif
  union b_mode_info bmi[4];
} MODE_INFO;

static INLINE int is_inter_block(const MB_MODE_INFO *mbmi) {
static int is_inter_block(const MB_MODE_INFO *mbmi) {
  return mbmi->ref_frame[0] > INTRA_FRAME;
}

static INLINE int has_second_ref(const MB_MODE_INFO *mbmi) {
  return mbmi->ref_frame[1] > INTRA_FRAME;
}

enum mv_precision {
  MV_PRECISION_Q3,
  MV_PRECISION_Q4
};

#define VP9_REF_SCALE_SHIFT 14
#define VP9_REF_NO_SCALE (1 << VP9_REF_SCALE_SHIFT)

struct scale_factors {
  int x_scale_fp;  // horizontal fixed point scale factor
  int y_scale_fp;  // vertical fixed point scale factor
  int x_offset_q4;
  int x_step_q4;
  int y_offset_q4;
  int y_step_q4;

  int (*scale_value_x)(int val, const struct scale_factors *scale);
  int (*scale_value_y)(int val, const struct scale_factors *scale);
  void (*set_scaled_offsets)(struct scale_factors *scale, int row, int col);
  MV32 (*scale_mv_q3_to_q4)(const MV *mv, const struct scale_factors *scale);
  MV32 (*scale_mv_q4)(const MV *mv, const struct scale_factors *scale);

  convolve_fn_t predict[2][2][2];  // horiz, vert, avg
};

#if CONFIG_ALPHA
enum { MAX_MB_PLANE = 4 };
#else
@@ -195,27 +305,45 @@ struct macroblockd_plane {
  ENTROPY_CONTEXT *left_context;
};

#define BLOCK_OFFSET(x, i) ((x) + (i) * 16)
#define BLOCK_OFFSET(x, i, n) ((x) + (i) * (n))

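The BLOCK_OFFSET change replaces the hard-coded 16-coefficient (4x4) stride with an explicit per-block coefficient count supplied by the caller. A hedged usage sketch (the qcoeff field name is illustrative, not taken from this diff):

    /* old form: always steps in units of 16 coefficients (4x4 blocks) */
    int16_t *q_old = BLOCK_OFFSET(plane->qcoeff, block);      /* + block * 16 */
    /* new form: caller passes the coefficient count explicitly */
    int16_t *q_new = BLOCK_OFFSET(plane->qcoeff, block, 16);  /* same layout */
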

#define MAX_REF_LF_DELTAS 4
#define MAX_MODE_LF_DELTAS 2

struct loopfilter {
  int filter_level;

  int sharpness_level;
  int last_sharpness_level;

  uint8_t mode_ref_delta_enabled;
  uint8_t mode_ref_delta_update;

  // 0 = Intra, Last, GF, ARF
  signed char ref_deltas[MAX_REF_LF_DELTAS];
  signed char last_ref_deltas[MAX_REF_LF_DELTAS];

  // 0 = ZERO_MV, MV
  signed char mode_deltas[MAX_MODE_LF_DELTAS];
  signed char last_mode_deltas[MAX_MODE_LF_DELTAS];
};

typedef struct macroblockd {
  struct macroblockd_plane plane[MAX_MB_PLANE];

  struct scale_factors scale_factor[2];

  MODE_INFO *last_mi;
  MODE_INFO *this_mi;
  MODE_INFO *prev_mode_info_context;
  MODE_INFO *mode_info_context;
  int mode_info_stride;

  MODE_INFO *mic_stream_ptr;

  // A NULL indicates that the 8x8 is not part of the image
  MODE_INFO **mi_8x8;
  MODE_INFO **prev_mi_8x8;

  int up_available;
  int left_available;
  int right_available;

  struct segmentation seg;
  struct loopfilter lf;

  // partition contexts
  PARTITION_CONTEXT *above_seg_context;
  PARTITION_CONTEXT *left_seg_context;
@@ -247,7 +375,7 @@ typedef struct macroblockd {

} MACROBLOCKD;

static INLINE unsigned char *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE subsize) {
static INLINE unsigned char *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE_TYPE subsize) {
  switch (subsize) {
    case BLOCK_64X64:
    case BLOCK_64X32:
@@ -272,8 +400,9 @@ static INLINE unsigned char *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE subsize) {
  }
}

static INLINE void update_partition_context(MACROBLOCKD *xd, BLOCK_SIZE sb_type,
                                            BLOCK_SIZE sb_size) {
static INLINE void update_partition_context(MACROBLOCKD *xd,
                                            BLOCK_SIZE_TYPE sb_type,
                                            BLOCK_SIZE_TYPE sb_size) {
  const int bsl = b_width_log2(sb_size), bs = (1 << bsl) / 2;
  const int bwl = b_width_log2(sb_type);
  const int bhl = b_height_log2(sb_type);
@@ -291,7 +420,8 @@ static INLINE void update_partition_context(MACROBLOCKD *xd, BLOCK_SIZE sb_type,
  vpx_memset(xd->left_seg_context, pcvalue[bhl == bsl], bs);
}

static INLINE int partition_plane_context(MACROBLOCKD *xd, BLOCK_SIZE sb_type) {
static INLINE int partition_plane_context(MACROBLOCKD *xd,
                                          BLOCK_SIZE_TYPE sb_type) {
  int bsl = mi_width_log2(sb_type), bs = 1 << bsl;
  int above = 0, left = 0, i;
  int boffset = mi_width_log2(BLOCK_64X64) - bsl;
@@ -311,9 +441,10 @@ static INLINE int partition_plane_context(MACROBLOCKD *xd, BLOCK_SIZE sb_type) {
  return (left * 2 + above) + bsl * PARTITION_PLOFFSET;
}

static BLOCK_SIZE get_subsize(BLOCK_SIZE bsize, PARTITION_TYPE partition) {
  const BLOCK_SIZE subsize = subsize_lookup[partition][bsize];
  assert(subsize < BLOCK_SIZES);
static BLOCK_SIZE_TYPE get_subsize(BLOCK_SIZE_TYPE bsize,
                                   PARTITION_TYPE partition) {
  BLOCK_SIZE_TYPE subsize = subsize_lookup[partition][bsize];
  assert(subsize != BLOCK_SIZE_TYPES);
  return subsize;
}

@@ -321,7 +452,7 @@ extern const TX_TYPE mode2txfm_map[MB_MODE_COUNT];

static INLINE TX_TYPE get_tx_type_4x4(PLANE_TYPE plane_type,
                                      const MACROBLOCKD *xd, int ib) {
  const MODE_INFO *const mi = xd->this_mi;
  const MODE_INFO *const mi = xd->mode_info_context;
|
||||
const MB_MODE_INFO *const mbmi = &mi->mbmi;
|
||||
|
||||
if (plane_type != PLANE_TYPE_Y_WITH_DC ||
|
||||
@@ -336,13 +467,13 @@ static INLINE TX_TYPE get_tx_type_4x4(PLANE_TYPE plane_type,
|
||||
static INLINE TX_TYPE get_tx_type_8x8(PLANE_TYPE plane_type,
|
||||
const MACROBLOCKD *xd) {
|
||||
return plane_type == PLANE_TYPE_Y_WITH_DC ?
|
||||
mode2txfm_map[xd->this_mi->mbmi.mode] : DCT_DCT;
|
||||
mode2txfm_map[xd->mode_info_context->mbmi.mode] : DCT_DCT;
|
||||
}
|
||||
|
||||
static INLINE TX_TYPE get_tx_type_16x16(PLANE_TYPE plane_type,
|
||||
const MACROBLOCKD *xd) {
|
||||
return plane_type == PLANE_TYPE_Y_WITH_DC ?
|
||||
mode2txfm_map[xd->this_mi->mbmi.mode] : DCT_DCT;
|
||||
mode2txfm_map[xd->mode_info_context->mbmi.mode] : DCT_DCT;
|
||||
}
|
||||
|
||||
static void setup_block_dptrs(MACROBLOCKD *xd, int ss_x, int ss_y) {
|
||||
@@ -362,147 +493,296 @@ static void setup_block_dptrs(MACROBLOCKD *xd, int ss_x, int ss_y) {
|
||||
|
||||
|
||||
static INLINE TX_SIZE get_uv_tx_size(const MB_MODE_INFO *mbmi) {
|
||||
return MIN(mbmi->tx_size, max_uv_txsize_lookup[mbmi->sb_type]);
|
||||
return MIN(mbmi->txfm_size, max_uv_txsize_lookup[mbmi->sb_type]);
|
||||
}
|
||||
|
||||
static BLOCK_SIZE get_plane_block_size(BLOCK_SIZE bsize,
|
||||
const struct macroblockd_plane *pd) {
|
||||
BLOCK_SIZE bs = ss_size_lookup[bsize][pd->subsampling_x][pd->subsampling_y];
|
||||
assert(bs < BLOCK_SIZES);
|
||||
return bs;
|
||||
struct plane_block_idx {
|
||||
int plane;
|
||||
int block;
|
||||
};
|
||||
|
||||
// TODO(jkoleszar): returning a struct so it can be used in a const context,
|
||||
// expect to refactor this further later.
|
||||
static INLINE struct plane_block_idx plane_block_idx(int y_blocks,
|
||||
int b_idx) {
|
||||
const int v_offset = y_blocks * 5 / 4;
|
||||
struct plane_block_idx res;
|
||||
|
||||
if (b_idx < y_blocks) {
|
||||
res.plane = 0;
|
||||
res.block = b_idx;
|
||||
} else if (b_idx < v_offset) {
|
||||
res.plane = 1;
|
||||
res.block = b_idx - y_blocks;
|
||||
} else {
|
||||
assert(b_idx < y_blocks * 3 / 2);
|
||||
res.plane = 2;
|
||||
res.block = b_idx - v_offset;
|
||||
}
|
||||
return res;
|
||||
}
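
// Usage sketch (illustrative only; the helper below is hypothetical, not
// part of the tree): for a 16x16 block with 4:2:0 subsampling, y_blocks is
// 16, so the V plane starts at 16 * 5 / 4 == 20 and index 18 lands in U.
static INLINE void plane_block_idx_example(void) {
  const struct plane_block_idx res = plane_block_idx(16, 18);
  assert(res.plane == 1);  // U plane
  assert(res.block == 2);  // 18 - y_blocks
}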

static INLINE int plane_block_width(BLOCK_SIZE bsize,
static INLINE int plane_block_width(BLOCK_SIZE_TYPE bsize,
                                    const struct macroblockd_plane* plane) {
  return 4 << (b_width_log2(bsize) - plane->subsampling_x);
}

static INLINE int plane_block_height(BLOCK_SIZE bsize,
static INLINE int plane_block_height(BLOCK_SIZE_TYPE bsize,
                                     const struct macroblockd_plane* plane) {
  return 4 << (b_height_log2(bsize) - plane->subsampling_y);
}

static INLINE int plane_block_width_log2by4(
    BLOCK_SIZE_TYPE bsize, const struct macroblockd_plane* plane) {
  return (b_width_log2(bsize) - plane->subsampling_x);
}

static INLINE int plane_block_height_log2by4(
    BLOCK_SIZE_TYPE bsize, const struct macroblockd_plane* plane) {
  return (b_height_log2(bsize) - plane->subsampling_y);
}

typedef void (*foreach_transformed_block_visitor)(int plane, int block,
                                                  BLOCK_SIZE plane_bsize,
                                                  TX_SIZE tx_size,
                                                  BLOCK_SIZE_TYPE bsize,
                                                  int ss_txfrm_size,
                                                  void *arg);

static INLINE void foreach_transformed_block_in_plane(
    const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane,
    const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize, int plane,
    foreach_transformed_block_visitor visit, void *arg) {
  const struct macroblockd_plane *const pd = &xd->plane[plane];
  const MB_MODE_INFO* mbmi = &xd->this_mi->mbmi;
  const int bw = b_width_log2(bsize), bh = b_height_log2(bsize);

  // block and transform sizes, in number of 4x4 blocks log 2 ("*_b")
  // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8
  // transform size varies per plane, look it up in a common way.
  const MB_MODE_INFO* mbmi = &xd->mode_info_context->mbmi;
  const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi)
                                : mbmi->tx_size;
  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
  const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
  const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
  const int step = 1 << (tx_size << 1);
                                : mbmi->txfm_size;
  const int block_size_b = bw + bh;
  const int txfrm_size_b = tx_size * 2;

  // subsampled size of the block
  const int ss_sum = xd->plane[plane].subsampling_x
                     + xd->plane[plane].subsampling_y;
  const int ss_block_size = block_size_b - ss_sum;

  const int step = 1 << txfrm_size_b;

  int i;

  assert(txfrm_size_b <= block_size_b);
  assert(txfrm_size_b <= ss_block_size);

  // If mb_to_right_edge is < 0 we are in a situation in which
  // the current block size extends into the UMV and we won't
  // visit the sub blocks that are wholly within the UMV.
  if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) {
    int r, c;

    int max_blocks_wide = num_4x4_w;
    int max_blocks_high = num_4x4_h;
    const int sw = bw - xd->plane[plane].subsampling_x;
    const int sh = bh - xd->plane[plane].subsampling_y;
    int max_blocks_wide = 1 << sw;
    int max_blocks_high = 1 << sh;

    // xd->mb_to_right_edge is in units of pixels * 8. This converts
    // it to 4x4 block sizes.
    if (xd->mb_to_right_edge < 0)
      max_blocks_wide += (xd->mb_to_right_edge >> (5 + pd->subsampling_x));
      max_blocks_wide +=
          (xd->mb_to_right_edge >> (5 + xd->plane[plane].subsampling_x));

    if (xd->mb_to_bottom_edge < 0)
      max_blocks_high += (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
      max_blocks_high +=
          (xd->mb_to_bottom_edge >> (5 + xd->plane[plane].subsampling_y));

    i = 0;
    // Unlike the normal case - in here we have to keep track of the
    // row and column of the blocks we use so that we know if we are in
    // the unrestricted motion border.
    for (r = 0; r < num_4x4_h; r += (1 << tx_size)) {
      for (c = 0; c < num_4x4_w; c += (1 << tx_size)) {
    for (r = 0; r < (1 << sh); r += (1 << tx_size)) {
      for (c = 0; c < (1 << sw); c += (1 << tx_size)) {
        if (r < max_blocks_high && c < max_blocks_wide)
          visit(plane, i, plane_bsize, tx_size, arg);
          visit(plane, i, bsize, txfrm_size_b, arg);
        i += step;
      }
    }
  } else {
    for (i = 0; i < num_4x4_w * num_4x4_h; i += step)
      visit(plane, i, plane_bsize, tx_size, arg);
    for (i = 0; i < (1 << ss_block_size); i += step) {
      visit(plane, i, bsize, txfrm_size_b, arg);
    }
  }
}

static INLINE void foreach_transformed_block(
    const MACROBLOCKD* const xd, BLOCK_SIZE bsize,
    const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize,
    foreach_transformed_block_visitor visit, void *arg) {
  int plane;

  for (plane = 0; plane < MAX_MB_PLANE; plane++)
    foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg);
  for (plane = 0; plane < MAX_MB_PLANE; plane++) {
    foreach_transformed_block_in_plane(xd, bsize, plane,
                                       visit, arg);
  }
}

static INLINE void foreach_transformed_block_uv(
    const MACROBLOCKD* const xd, BLOCK_SIZE bsize,
    const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize,
    foreach_transformed_block_visitor visit, void *arg) {
  int plane;

  for (plane = 1; plane < MAX_MB_PLANE; plane++)
    foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg);
  for (plane = 1; plane < MAX_MB_PLANE; plane++) {
    foreach_transformed_block_in_plane(xd, bsize, plane,
                                       visit, arg);
  }
}

static int raster_block_offset(BLOCK_SIZE plane_bsize,
                               int raster_block, int stride) {
  const int bw = b_width_log2(plane_bsize);
  const int y = 4 * (raster_block >> bw);
  const int x = 4 * (raster_block & ((1 << bw) - 1));
// TODO(jkoleszar): In principle, pred_w, pred_h are unnecessary, as we could
// calculate the subsampled BLOCK_SIZE_TYPE, but that type isn't defined for
// sizes smaller than 16x16 yet.
typedef void (*foreach_predicted_block_visitor)(int plane, int block,
                                                BLOCK_SIZE_TYPE bsize,
                                                int pred_w, int pred_h,
                                                void *arg);
static INLINE void foreach_predicted_block_in_plane(
    const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize, int plane,
    foreach_predicted_block_visitor visit, void *arg) {
  int i, x, y;

  // block sizes in number of 4x4 blocks log 2 ("*_b")
  // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8
  // subsampled size of the block
  const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
  const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y;

  // size of the predictor to use.
  int pred_w, pred_h;

  if (xd->mode_info_context->mbmi.sb_type < BLOCK_8X8) {
    assert(bsize == BLOCK_8X8);
    pred_w = 0;
    pred_h = 0;
  } else {
    pred_w = bwl;
    pred_h = bhl;
  }
  assert(pred_w <= bwl);
  assert(pred_h <= bhl);

  // visit each subblock in raster order
  i = 0;
  for (y = 0; y < 1 << bhl; y += 1 << pred_h) {
    for (x = 0; x < 1 << bwl; x += 1 << pred_w) {
      visit(plane, i, bsize, pred_w, pred_h, arg);
      i += 1 << pred_w;
    }
    i += (1 << (bwl + pred_h)) - (1 << bwl);
  }
}
static INLINE void foreach_predicted_block(
    const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize,
    foreach_predicted_block_visitor visit, void *arg) {
  int plane;

  for (plane = 0; plane < MAX_MB_PLANE; plane++) {
    foreach_predicted_block_in_plane(xd, bsize, plane, visit, arg);
  }
}
static INLINE void foreach_predicted_block_uv(
    const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize,
    foreach_predicted_block_visitor visit, void *arg) {
  int plane;

  for (plane = 1; plane < MAX_MB_PLANE; plane++) {
    foreach_predicted_block_in_plane(xd, bsize, plane, visit, arg);
  }
}
static int raster_block_offset(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize,
                               int plane, int block, int stride) {
  const int bw = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
  const int y = 4 * (block >> bw), x = 4 * (block & ((1 << bw) - 1));
  return y * stride + x;
}
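
// Worked example (illustrative): for an unsubsampled 8x8 block, bw == 1, so
// block 3 maps to x == 4 * (3 & 1) == 4 and y == 4 * (3 >> 1) == 4, i.e. the
// bottom-right 4x4 unit at offset 4 * stride + 4.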
static int16_t* raster_block_offset_int16(BLOCK_SIZE plane_bsize,
                                          int raster_block, int16_t *base) {
  const int stride = 4 << b_width_log2(plane_bsize);
  return base + raster_block_offset(plane_bsize, raster_block, stride);
static int16_t* raster_block_offset_int16(MACROBLOCKD *xd,
                                          BLOCK_SIZE_TYPE bsize,
                                          int plane, int block, int16_t *base) {
  const int stride = plane_block_width(bsize, &xd->plane[plane]);
  return base + raster_block_offset(xd, bsize, plane, block, stride);
}
static uint8_t* raster_block_offset_uint8(BLOCK_SIZE plane_bsize,
                                          int raster_block, uint8_t *base,
                                          int stride) {
  return base + raster_block_offset(plane_bsize, raster_block, stride);
static uint8_t* raster_block_offset_uint8(MACROBLOCKD *xd,
                                          BLOCK_SIZE_TYPE bsize,
                                          int plane, int block,
                                          uint8_t *base, int stride) {
  return base + raster_block_offset(xd, bsize, plane, block, stride);
}

static int txfrm_block_to_raster_block(BLOCK_SIZE plane_bsize,
                                       TX_SIZE tx_size, int block) {
  const int bwl = b_width_log2(plane_bsize);
  const int tx_cols_log2 = bwl - tx_size;
static int txfrm_block_to_raster_block(MACROBLOCKD *xd,
                                       BLOCK_SIZE_TYPE bsize,
                                       int plane, int block,
                                       int ss_txfrm_size) {
  const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
  const int txwl = ss_txfrm_size / 2;
  const int tx_cols_log2 = bwl - txwl;
  const int tx_cols = 1 << tx_cols_log2;
  const int raster_mb = block >> (tx_size << 1);
  const int x = (raster_mb & (tx_cols - 1)) << tx_size;
  const int y = (raster_mb >> tx_cols_log2) << tx_size;
  const int raster_mb = block >> ss_txfrm_size;
  const int x = (raster_mb & (tx_cols - 1)) << (txwl);
  const int y = raster_mb >> tx_cols_log2 << (txwl);
  return x + (y << bwl);
}
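
// Worked example (illustrative): a 16x16 luma plane (bwl == 2) with 8x8
// transforms (ss_txfrm_size == 2, txwl == 1) has tx_cols == 2. The second
// transform, block == 4, gives raster_mb == 1, x == 2, y == 0, so it starts
// at raster 4x4 index 2, the top-right 8x8.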

static void txfrm_block_to_raster_xy(BLOCK_SIZE plane_bsize,
                                     TX_SIZE tx_size, int block,
static void txfrm_block_to_raster_xy(MACROBLOCKD *xd,
                                     BLOCK_SIZE_TYPE bsize,
                                     int plane, int block,
                                     int ss_txfrm_size,
                                     int *x, int *y) {
  const int bwl = b_width_log2(plane_bsize);
  const int tx_cols_log2 = bwl - tx_size;
  const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
  const int txwl = ss_txfrm_size / 2;
  const int tx_cols_log2 = bwl - txwl;
  const int tx_cols = 1 << tx_cols_log2;
  const int raster_mb = block >> (tx_size << 1);
  *x = (raster_mb & (tx_cols - 1)) << tx_size;
  *y = (raster_mb >> tx_cols_log2) << tx_size;
  const int raster_mb = block >> ss_txfrm_size;
  *x = (raster_mb & (tx_cols - 1)) << (txwl);
  *y = raster_mb >> tx_cols_log2 << (txwl);
}

static void extend_for_intra(MACROBLOCKD* const xd, BLOCK_SIZE plane_bsize,
                             int plane, int block, TX_SIZE tx_size) {
  struct macroblockd_plane *const pd = &xd->plane[plane];
  uint8_t *const buf = pd->dst.buf;
  const int stride = pd->dst.stride;
#if CONFIG_INTERINTRA
static void extend_for_interintra(MACROBLOCKD* const xd,
                                  BLOCK_SIZE_TYPE bsize) {
  int bh = 4 << b_height_log2(bsize), bw = 4 << b_width_log2(bsize);
  int ystride = xd->plane[0].dst.stride, uvstride = xd->plane[1].dst.stride;
  uint8_t *pixel_y, *pixel_u, *pixel_v;
  int ymargin, uvmargin;
  if (xd->mb_to_bottom_edge < 0) {
    int r;
    ymargin = 0 - xd->mb_to_bottom_edge / 8;
    uvmargin = 0 - xd->mb_to_bottom_edge / 16;
    pixel_y = xd->plane[0].dst.buf - 1 + (bh - ymargin - 1) * ystride;
    pixel_u = xd->plane[1].dst.buf - 1 + (bh / 2 - uvmargin - 1) * uvstride;
    pixel_v = xd->plane[2].dst.buf - 1 + (bh / 2 - uvmargin - 1) * uvstride;
    for (r = 0; r < ymargin; r++)
      xd->plane[0].dst.buf[-1 + (bh - r - 1) * ystride] = *pixel_y;
    for (r = 0; r < uvmargin; r++) {
      xd->plane[1].dst.buf[-1 + (bh / 2 - r - 1) * uvstride] = *pixel_u;
      xd->plane[2].dst.buf[-1 + (bh / 2 - r - 1) * uvstride] = *pixel_v;
    }
  }
  if (xd->mb_to_right_edge < 0) {
    ymargin = 0 - xd->mb_to_right_edge / 8;
    uvmargin = 0 - xd->mb_to_right_edge / 16;
    pixel_y = xd->plane[0].dst.buf + bw - ymargin - 1 - ystride;
    pixel_u = xd->plane[1].dst.buf + bw / 2 - uvmargin - 1 - uvstride;
    pixel_v = xd->plane[2].dst.buf + bw / 2 - uvmargin - 1 - uvstride;
    vpx_memset(xd->plane[0].dst.buf + bw - ymargin - ystride,
               *pixel_y, ymargin);
    vpx_memset(xd->plane[1].dst.buf + bw / 2 - uvmargin - uvstride,
               *pixel_u, uvmargin);
    vpx_memset(xd->plane[2].dst.buf + bw / 2 - uvmargin - uvstride,
               *pixel_v, uvmargin);
  }
}
#endif

static void extend_for_intra(MACROBLOCKD* const xd, int plane, int block,
                             BLOCK_SIZE_TYPE bsize, int ss_txfrm_size) {
  const int bw = plane_block_width(bsize, &xd->plane[plane]);
  const int bh = plane_block_height(bsize, &xd->plane[plane]);
  int x, y;
  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y);
  txfrm_block_to_raster_xy(xd, bsize, plane, block, ss_txfrm_size, &x, &y);
  x = x * 4 - 1;
  y = y * 4 - 1;
  // Copy a pixel into the umv if we are in a situation where the block size
@@ -510,38 +790,41 @@ static void extend_for_intra(MACROBLOCKD* const xd, BLOCK_SIZE plane_bsize,
  // TODO(JBB): Should be able to do the full extend in place so we don't have
  // to do this multiple times.
  if (xd->mb_to_right_edge < 0) {
    const int bw = 4 << b_width_log2(plane_bsize);
    const int umv_border_start = bw + (xd->mb_to_right_edge >>
                                       (3 + pd->subsampling_x));
    int umv_border_start = bw
        + (xd->mb_to_right_edge >> (3 + xd->plane[plane].subsampling_x));

    if (x + bw > umv_border_start)
      vpx_memset(&buf[y * stride + umv_border_start],
                 buf[y * stride + umv_border_start - 1], bw);
      vpx_memset(
          xd->plane[plane].dst.buf + y * xd->plane[plane].dst.stride
              + umv_border_start,
          *(xd->plane[plane].dst.buf + y * xd->plane[plane].dst.stride
              + umv_border_start - 1),
          bw);
  }

  if (xd->mb_to_bottom_edge < 0) {
    const int bh = 4 << b_height_log2(plane_bsize);
    const int umv_border_start = bh + (xd->mb_to_bottom_edge >>
                                       (3 + pd->subsampling_y));
    int umv_border_start = bh
        + (xd->mb_to_bottom_edge >> (3 + xd->plane[plane].subsampling_y));
    int i;
    const uint8_t c = buf[(umv_border_start - 1) * stride + x];
    uint8_t *d = &buf[umv_border_start * stride + x];
    uint8_t c = *(xd->plane[plane].dst.buf
        + (umv_border_start - 1) * xd->plane[plane].dst.stride + x);

    uint8_t *d = xd->plane[plane].dst.buf
        + umv_border_start * xd->plane[plane].dst.stride + x;

    if (y + bh > umv_border_start)
      for (i = 0; i < bh; ++i, d += stride)
      for (i = 0; i < bh; i++, d += xd->plane[plane].dst.stride)
        *d = c;
  }
}
static void set_contexts_on_border(MACROBLOCKD *xd,
                                   struct macroblockd_plane *pd,
                                   BLOCK_SIZE plane_bsize,
                                   int tx_size_in_blocks, int has_eob,
                                   int aoff, int loff,
static void set_contexts_on_border(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize,
                                   int plane, int tx_size_in_blocks,
                                   int eob, int aoff, int loff,
                                   ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L) {
  int mi_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize];
  int mi_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
  struct macroblockd_plane *pd = &xd->plane[plane];
  int above_contexts = tx_size_in_blocks;
  int left_contexts = tx_size_in_blocks;
  int mi_blocks_wide = 1 << plane_block_width_log2by4(bsize, pd);
  int mi_blocks_high = 1 << plane_block_height_log2by4(bsize, pd);
  int pt;

  // xd->mb_to_right_edge is in units of pixels * 8. This converts
@@ -549,47 +832,26 @@ static void set_contexts_on_border(MACROBLOCKD *xd,
  if (xd->mb_to_right_edge < 0)
    mi_blocks_wide += (xd->mb_to_right_edge >> (5 + pd->subsampling_x));

  if (xd->mb_to_bottom_edge < 0)
    mi_blocks_high += (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));

  // this code attempts to avoid copying into contexts that are outside
  // our border. Any blocks that do are set to 0...
  if (above_contexts + aoff > mi_blocks_wide)
    above_contexts = mi_blocks_wide - aoff;

  if (xd->mb_to_bottom_edge < 0)
    mi_blocks_high += (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));

  if (left_contexts + loff > mi_blocks_high)
    left_contexts = mi_blocks_high - loff;

  for (pt = 0; pt < above_contexts; pt++)
    A[pt] = has_eob;
    A[pt] = eob > 0;
  for (pt = above_contexts; pt < tx_size_in_blocks; pt++)
    A[pt] = 0;
  for (pt = 0; pt < left_contexts; pt++)
    L[pt] = has_eob;
    L[pt] = eob > 0;
  for (pt = left_contexts; pt < tx_size_in_blocks; pt++)
    L[pt] = 0;
}

static void set_contexts(MACROBLOCKD *xd, struct macroblockd_plane *pd,
                         BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
                         int has_eob, int aoff, int loff) {
  ENTROPY_CONTEXT *const A = pd->above_context + aoff;
  ENTROPY_CONTEXT *const L = pd->left_context + loff;
  const int tx_size_in_blocks = 1 << tx_size;

  if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) {
    set_contexts_on_border(xd, pd, plane_bsize, tx_size_in_blocks, has_eob,
                           aoff, loff, A, L);
  } else {
    vpx_memset(A, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
    vpx_memset(L, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
  }
}

static int get_tx_eob(struct segmentation *seg, int segment_id,
                      TX_SIZE tx_size) {
  const int eob_max = 16 << (tx_size << 1);
  return vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max;
}
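
// Sanity sketch (illustrative; the helper name is hypothetical): without an
// active SEG_LVL_SKIP feature the full coefficient count comes back, e.g.
// 16 << (1 << 1) == 64 for TX_8X8.
static INLINE void get_tx_eob_example(struct segmentation *seg) {
  if (!vp9_segfeature_active(seg, 0, SEG_LVL_SKIP)) {
    assert(get_tx_eob(seg, 0, TX_4X4) == 16);
    assert(get_tx_eob(seg, 0, TX_32X32) == 1024);
  }
}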

#endif // VP9_COMMON_VP9_BLOCKD_H_

@@ -13,33 +13,33 @@

#include "vp9/common/vp9_common_data.h"

// Log 2 conversion lookup tables for block width and height
const int b_width_log2_lookup[BLOCK_SIZES] =
const int b_width_log2_lookup[BLOCK_SIZE_TYPES] =
{0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4};
const int b_height_log2_lookup[BLOCK_SIZES] =
const int b_height_log2_lookup[BLOCK_SIZE_TYPES] =
{0, 1, 0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4};
const int num_4x4_blocks_wide_lookup[BLOCK_SIZES] =
const int num_4x4_blocks_wide_lookup[BLOCK_SIZE_TYPES] =
{1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 8, 16, 16};
const int num_4x4_blocks_high_lookup[BLOCK_SIZES] =
const int num_4x4_blocks_high_lookup[BLOCK_SIZE_TYPES] =
{1, 2, 1, 2, 4, 2, 4, 8, 4, 8, 16, 8, 16};
// Log 2 conversion lookup tables for modeinfo width and height
const int mi_width_log2_lookup[BLOCK_SIZES] =
const int mi_width_log2_lookup[BLOCK_SIZE_TYPES] =
{0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3};
const int num_8x8_blocks_wide_lookup[BLOCK_SIZES] =
const int num_8x8_blocks_wide_lookup[BLOCK_SIZE_TYPES] =
{1, 1, 1, 1, 1, 2, 2, 2, 4, 4, 4, 8, 8};
const int mi_height_log2_lookup[BLOCK_SIZES] =
const int mi_height_log2_lookup[BLOCK_SIZE_TYPES] =
{0, 0, 0, 0, 1, 0, 1, 2, 1, 2, 3, 2, 3};
const int num_8x8_blocks_high_lookup[BLOCK_SIZES] =
const int num_8x8_blocks_high_lookup[BLOCK_SIZE_TYPES] =
{1, 1, 1, 1, 2, 1, 2, 4, 2, 4, 8, 4, 8};

// MIN(3, MIN(b_width_log2(bsize), b_height_log2(bsize)))
const int size_group_lookup[BLOCK_SIZES] =
const int size_group_lookup[BLOCK_SIZE_TYPES] =
{0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3};

const int num_pels_log2_lookup[BLOCK_SIZES] =
const int num_pels_log2_lookup[BLOCK_SIZE_TYPES] =
{4, 5, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12};
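
// Sanity check (illustrative): BLOCK_4X4 covers 16 pels (log2 == 4) and
// BLOCK_64X64 covers 4096 (log2 == 12), matching the first and last entries.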

const PARTITION_TYPE partition_lookup[][BLOCK_SIZES] = {
const PARTITION_TYPE partition_lookup[][BLOCK_SIZE_TYPES] = {
  { // 4X4
    // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
    PARTITION_NONE, PARTITION_INVALID, PARTITION_INVALID,
@@ -74,62 +74,51 @@ const PARTITION_TYPE partition_lookup[][BLOCK_SIZES] = {
  }
};

const BLOCK_SIZE subsize_lookup[PARTITION_TYPES][BLOCK_SIZES] = {
const BLOCK_SIZE_TYPE subsize_lookup[PARTITION_TYPES][BLOCK_SIZE_TYPES] = {
  { // PARTITION_NONE
    BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
    BLOCK_8X8, BLOCK_8X16, BLOCK_16X8,
    BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
    BLOCK_8X8, BLOCK_8X16, BLOCK_16X8,
    BLOCK_16X16, BLOCK_16X32, BLOCK_32X16,
    BLOCK_32X32, BLOCK_32X64, BLOCK_64X32,
    BLOCK_64X64,
  }, { // PARTITION_HORZ
    BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
    BLOCK_8X4, BLOCK_INVALID, BLOCK_INVALID,
    BLOCK_16X8, BLOCK_INVALID, BLOCK_INVALID,
    BLOCK_32X16, BLOCK_INVALID, BLOCK_INVALID,
    BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
    BLOCK_8X4, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
    BLOCK_16X8, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
    BLOCK_32X16, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
    BLOCK_64X32,
  }, { // PARTITION_VERT
    BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
    BLOCK_4X8, BLOCK_INVALID, BLOCK_INVALID,
    BLOCK_8X16, BLOCK_INVALID, BLOCK_INVALID,
    BLOCK_16X32, BLOCK_INVALID, BLOCK_INVALID,
    BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
    BLOCK_4X8, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
    BLOCK_8X16, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
    BLOCK_16X32, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
    BLOCK_32X64,
  }, { // PARTITION_SPLIT
    BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
    BLOCK_4X4, BLOCK_INVALID, BLOCK_INVALID,
    BLOCK_8X8, BLOCK_INVALID, BLOCK_INVALID,
    BLOCK_16X16, BLOCK_INVALID, BLOCK_INVALID,
    BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
    BLOCK_4X4, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
    BLOCK_8X8, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
    BLOCK_16X16, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
    BLOCK_32X32,
  }
};
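
// Illustrative read of the table above: splitting a 64x64 horizontally
// yields two 64x32 halves, i.e.
// subsize_lookup[PARTITION_HORZ][BLOCK_64X64] == BLOCK_64X32.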

const TX_SIZE max_txsize_lookup[BLOCK_SIZES] = {
  TX_4X4, TX_4X4, TX_4X4,
  TX_8X8, TX_8X8, TX_8X8,
const TX_SIZE max_txsize_lookup[BLOCK_SIZE_TYPES] = {
  TX_4X4, TX_4X4, TX_4X4,
  TX_8X8, TX_8X8, TX_8X8,
  TX_16X16, TX_16X16, TX_16X16,
  TX_32X32, TX_32X32, TX_32X32, TX_32X32
};
const TX_SIZE max_uv_txsize_lookup[BLOCK_SIZES] = {
  TX_4X4, TX_4X4, TX_4X4,
  TX_4X4, TX_4X4, TX_4X4,
  TX_8X8, TX_8X8, TX_8X8,
const TX_SIZE max_uv_txsize_lookup[BLOCK_SIZE_TYPES] = {
  TX_4X4, TX_4X4, TX_4X4,
  TX_4X4, TX_4X4, TX_4X4,
  TX_8X8, TX_8X8, TX_8X8,
  TX_16X16, TX_16X16, TX_16X16, TX_32X32
};

const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2] = {
  //  ss_x == 0      ss_x == 0      ss_x == 1      ss_x == 1
  //  ss_y == 0      ss_y == 1      ss_y == 0      ss_y == 1
  {{BLOCK_4X4, BLOCK_INVALID}, {BLOCK_INVALID, BLOCK_INVALID}},
  {{BLOCK_4X8, BLOCK_4X4}, {BLOCK_INVALID, BLOCK_INVALID}},
  {{BLOCK_8X4, BLOCK_INVALID}, {BLOCK_4X4, BLOCK_INVALID}},
  {{BLOCK_8X8, BLOCK_8X4}, {BLOCK_4X8, BLOCK_4X4}},
  {{BLOCK_8X16, BLOCK_8X8}, {BLOCK_INVALID, BLOCK_4X8}},
  {{BLOCK_16X8, BLOCK_INVALID}, {BLOCK_8X8, BLOCK_8X4}},
  {{BLOCK_16X16, BLOCK_16X8}, {BLOCK_8X16, BLOCK_8X8}},
  {{BLOCK_16X32, BLOCK_16X16}, {BLOCK_INVALID, BLOCK_8X16}},
  {{BLOCK_32X16, BLOCK_INVALID}, {BLOCK_16X16, BLOCK_16X8}},
  {{BLOCK_32X32, BLOCK_32X16}, {BLOCK_16X32, BLOCK_16X16}},
  {{BLOCK_32X64, BLOCK_32X32}, {BLOCK_INVALID, BLOCK_16X32}},
  {{BLOCK_64X32, BLOCK_INVALID}, {BLOCK_32X32, BLOCK_32X16}},
  {{BLOCK_64X64, BLOCK_64X32}, {BLOCK_32X64, BLOCK_32X32}},
const BLOCK_SIZE_TYPE bsize_from_dim_lookup[5][5] = {
  { BLOCK_4X4, BLOCK_4X8, BLOCK_4X8, BLOCK_4X8, BLOCK_4X8 },
  { BLOCK_8X4, BLOCK_8X8, BLOCK_8X16, BLOCK_8X16, BLOCK_8X16 },
  { BLOCK_16X8, BLOCK_16X8, BLOCK_16X16, BLOCK_16X32, BLOCK_16X32 },
  { BLOCK_32X16, BLOCK_32X16, BLOCK_32X16, BLOCK_32X32, BLOCK_32X64 },
  { BLOCK_64X32, BLOCK_64X32, BLOCK_64X32, BLOCK_64X32, BLOCK_64X64 }
};


@@ -13,20 +13,20 @@

#include "vp9/common/vp9_enums.h"

extern const int b_width_log2_lookup[BLOCK_SIZES];
extern const int b_height_log2_lookup[BLOCK_SIZES];
extern const int mi_width_log2_lookup[BLOCK_SIZES];
extern const int mi_height_log2_lookup[BLOCK_SIZES];
extern const int num_8x8_blocks_wide_lookup[BLOCK_SIZES];
extern const int num_8x8_blocks_high_lookup[BLOCK_SIZES];
extern const int num_4x4_blocks_high_lookup[BLOCK_SIZES];
extern const int num_4x4_blocks_wide_lookup[BLOCK_SIZES];
extern const int size_group_lookup[BLOCK_SIZES];
extern const int num_pels_log2_lookup[BLOCK_SIZES];
extern const PARTITION_TYPE partition_lookup[][BLOCK_SIZES];
extern const BLOCK_SIZE subsize_lookup[PARTITION_TYPES][BLOCK_SIZES];
extern const TX_SIZE max_txsize_lookup[BLOCK_SIZES];
extern const TX_SIZE max_uv_txsize_lookup[BLOCK_SIZES];
extern const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2];
extern const int b_width_log2_lookup[BLOCK_SIZE_TYPES];
extern const int b_height_log2_lookup[BLOCK_SIZE_TYPES];
extern const int mi_width_log2_lookup[BLOCK_SIZE_TYPES];
extern const int mi_height_log2_lookup[BLOCK_SIZE_TYPES];
extern const int num_8x8_blocks_wide_lookup[BLOCK_SIZE_TYPES];
extern const int num_8x8_blocks_high_lookup[BLOCK_SIZE_TYPES];
extern const int num_4x4_blocks_high_lookup[BLOCK_SIZE_TYPES];
extern const int num_4x4_blocks_wide_lookup[BLOCK_SIZE_TYPES];
extern const int size_group_lookup[BLOCK_SIZE_TYPES];
extern const int num_pels_log2_lookup[BLOCK_SIZE_TYPES];
extern const PARTITION_TYPE partition_lookup[][BLOCK_SIZE_TYPES];
extern const BLOCK_SIZE_TYPE subsize_lookup[PARTITION_TYPES][BLOCK_SIZE_TYPES];
extern const TX_SIZE max_txsize_lookup[BLOCK_SIZE_TYPES];
extern const TX_SIZE max_uv_txsize_lookup[BLOCK_SIZE_TYPES];
extern const BLOCK_SIZE_TYPE bsize_from_dim_lookup[5][5];

#endif // VP9_COMMON_VP9_COMMON_DATA_H

@@ -14,45 +14,66 @@
#include "./vpx_config.h"
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_filter.h"
#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"

#define VP9_FILTER_WEIGHT 128
#define VP9_FILTER_SHIFT 7

/* Assume a bank of 16 filters to choose from. There are three possible
 * implementations for filter wrapping behavior, since we want to be able to
 * pick which filter to start with. We could either:
 *
 * 1) make filter_ a pointer to the base of the filter array, and then add an
 *    additional offset parameter, to choose the starting filter.
 * 2) use a pointer to 2 periods worth of filters, so that even if the original
 *    phase offset is at 15/16, we'll have valid data to read. The filter
 *    tables become [32][8], and the second half is duplicated.
 * 3) fix the alignment of the filter tables, so that we know the 0/16 is
 *    always 256 byte aligned.
 *
 * Implementations 2 and 3 are likely preferable, as they avoid an extra 2
 * parameters, and switching between them is trivial, with the
 * ALIGN_FILTERS_256 macro, below.
 */
#define ALIGN_FILTERS_256 1
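
/* Worked sketch (illustrative): with 8-tap int16_t filters one phase is
 * 16 bytes, so a 16-phase bank spans exactly 256 bytes. When the bank is
 * 256-byte aligned, masking off the low 8 address bits recovers the bank
 * base, and the distance from the base encodes the starting phase:
 *
 *   const int16_t *base =
 *       (const int16_t *)(((intptr_t)filter) & ~(intptr_t)0xff);
 *   int phase = (filter - base) / 8;  // 0..15
 *
 * This is the same arithmetic the convolve routines below rely on. */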

static void convolve_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x0, int x_step_q4,
                             const int16_t *filter_y, int y_step_q4,
                             int w, int h, int taps) {
  int x, y, k;
  int x, y, k, sum;
  const int16_t *filter_x_base = filter_x0;

  /* NOTE: This assumes that the filter table is 256-byte aligned. */
  /* TODO(agrange) Modify to make independent of table alignment. */
  const int16_t *const filter_x_base =
      (const int16_t *)(((intptr_t)filter_x0) & ~(intptr_t)0xff);
#if ALIGN_FILTERS_256
  filter_x_base = (const int16_t *)(((intptr_t)filter_x0) & ~(intptr_t)0xff);
#endif

  /* Adjust base pointer address for this source line */
  src -= taps / 2 - 1;

  for (y = 0; y < h; ++y) {
    /* Pointer to filter to use */
    const int16_t *filter_x = filter_x0;

    /* Initial phase offset */
    int x_q4 = (filter_x0 - filter_x_base) / taps;
    int x0_q4 = (filter_x - filter_x_base) / taps;
    int x_q4 = x0_q4;

    for (x = 0; x < w; ++x) {
      /* Per-pixel src offset */
      const int src_x = x_q4 >> SUBPEL_BITS;
      int sum = 0;
      int src_x = (x_q4 - x0_q4) >> 4;

      /* Pointer to filter to use */
      const int16_t *const filter_x = filter_x_base +
          (x_q4 & SUBPEL_MASK) * taps;

      for (k = 0; k < taps; ++k)
      for (sum = 0, k = 0; k < taps; ++k) {
        sum += src[src_x + k] * filter_x[k];
      }
      sum += (VP9_FILTER_WEIGHT >> 1);
      dst[x] = clip_pixel(sum >> VP9_FILTER_SHIFT);

      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));

      /* Move to the next source pixel */
      /* Adjust source and filter to use for the next pixel */
      x_q4 += x_step_q4;
      filter_x = filter_x_base + (x_q4 & 0xf) * taps;
    }
    src += src_stride;
    dst += dst_stride;
@@ -64,37 +85,37 @@ static void convolve_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                                 const int16_t *filter_x0, int x_step_q4,
                                 const int16_t *filter_y, int y_step_q4,
                                 int w, int h, int taps) {
  int x, y, k;
  int x, y, k, sum;
  const int16_t *filter_x_base = filter_x0;

  /* NOTE: This assumes that the filter table is 256-byte aligned. */
  /* TODO(agrange) Modify to make independent of table alignment. */
  const int16_t *const filter_x_base =
      (const int16_t *)(((intptr_t)filter_x0) & ~(intptr_t)0xff);
#if ALIGN_FILTERS_256
  filter_x_base = (const int16_t *)(((intptr_t)filter_x0) & ~(intptr_t)0xff);
#endif

  /* Adjust base pointer address for this source line */
  src -= taps / 2 - 1;

  for (y = 0; y < h; ++y) {
    /* Pointer to filter to use */
    const int16_t *filter_x = filter_x0;

    /* Initial phase offset */
    int x_q4 = (filter_x0 - filter_x_base) / taps;
    int x0_q4 = (filter_x - filter_x_base) / taps;
    int x_q4 = x0_q4;

    for (x = 0; x < w; ++x) {
      /* Per-pixel src offset */
      const int src_x = x_q4 >> SUBPEL_BITS;
      int sum = 0;
      int src_x = (x_q4 - x0_q4) >> 4;

      /* Pointer to filter to use */
      const int16_t *const filter_x = filter_x_base +
          (x_q4 & SUBPEL_MASK) * taps;

      for (k = 0; k < taps; ++k)
      for (sum = 0, k = 0; k < taps; ++k) {
        sum += src[src_x + k] * filter_x[k];
      }
      sum += (VP9_FILTER_WEIGHT >> 1);
      dst[x] = (dst[x] + clip_pixel(sum >> VP9_FILTER_SHIFT) + 1) >> 1;

      dst[x] = ROUND_POWER_OF_TWO(dst[x] +
          clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);

      /* Move to the next source pixel */
      /* Adjust source and filter to use for the next pixel */
      x_q4 += x_step_q4;
      filter_x = filter_x_base + (x_q4 & 0xf) * taps;
    }
    src += src_stride;
    dst += dst_stride;
@@ -106,37 +127,37 @@ static void convolve_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                            const int16_t *filter_x, int x_step_q4,
                            const int16_t *filter_y0, int y_step_q4,
                            int w, int h, int taps) {
  int x, y, k;
  int x, y, k, sum;

  /* NOTE: This assumes that the filter table is 256-byte aligned. */
  /* TODO(agrange) Modify to make independent of table alignment. */
  const int16_t *const filter_y_base =
      (const int16_t *)(((intptr_t)filter_y0) & ~(intptr_t)0xff);
  const int16_t *filter_y_base = filter_y0;

#if ALIGN_FILTERS_256
  filter_y_base = (const int16_t *)(((intptr_t)filter_y0) & ~(intptr_t)0xff);
#endif

  /* Adjust base pointer address for this source column */
  src -= src_stride * (taps / 2 - 1);

  for (x = 0; x < w; ++x) {
    /* Pointer to filter to use */
    const int16_t *filter_y = filter_y0;

    /* Initial phase offset */
    int y_q4 = (filter_y0 - filter_y_base) / taps;
    int y0_q4 = (filter_y - filter_y_base) / taps;
    int y_q4 = y0_q4;

    for (y = 0; y < h; ++y) {
      /* Per-pixel src offset */
      const int src_y = y_q4 >> SUBPEL_BITS;
      int sum = 0;
      int src_y = (y_q4 - y0_q4) >> 4;

      /* Pointer to filter to use */
      const int16_t *const filter_y = filter_y_base +
          (y_q4 & SUBPEL_MASK) * taps;

      for (k = 0; k < taps; ++k)
      for (sum = 0, k = 0; k < taps; ++k) {
        sum += src[(src_y + k) * src_stride] * filter_y[k];
      }
      sum += (VP9_FILTER_WEIGHT >> 1);
      dst[y * dst_stride] = clip_pixel(sum >> VP9_FILTER_SHIFT);

      dst[y * dst_stride] =
          clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));

      /* Move to the next source pixel */
      /* Adjust source and filter to use for the next pixel */
      y_q4 += y_step_q4;
      filter_y = filter_y_base + (y_q4 & 0xf) * taps;
    }
    ++src;
    ++dst;
@@ -148,37 +169,38 @@ static void convolve_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                                const int16_t *filter_x, int x_step_q4,
                                const int16_t *filter_y0, int y_step_q4,
                                int w, int h, int taps) {
  int x, y, k;
  int x, y, k, sum;

  /* NOTE: This assumes that the filter table is 256-byte aligned. */
  /* TODO(agrange) Modify to make independent of table alignment. */
  const int16_t *const filter_y_base =
      (const int16_t *)(((intptr_t)filter_y0) & ~(intptr_t)0xff);
  const int16_t *filter_y_base = filter_y0;

#if ALIGN_FILTERS_256
  filter_y_base = (const int16_t *)(((intptr_t)filter_y0) & ~(intptr_t)0xff);
#endif

  /* Adjust base pointer address for this source column */
  src -= src_stride * (taps / 2 - 1);

  for (x = 0; x < w; ++x) {
    /* Pointer to filter to use */
    const int16_t *filter_y = filter_y0;

    /* Initial phase offset */
    int y_q4 = (filter_y0 - filter_y_base) / taps;
    int y0_q4 = (filter_y - filter_y_base) / taps;
    int y_q4 = y0_q4;

    for (y = 0; y < h; ++y) {
      /* Per-pixel src offset */
      const int src_y = y_q4 >> SUBPEL_BITS;
      int sum = 0;
      int src_y = (y_q4 - y0_q4) >> 4;

      /* Pointer to filter to use */
      const int16_t *const filter_y = filter_y_base +
          (y_q4 & SUBPEL_MASK) * taps;

      for (k = 0; k < taps; ++k)
      for (sum = 0, k = 0; k < taps; ++k) {
        sum += src[(src_y + k) * src_stride] * filter_y[k];
      }
      sum += (VP9_FILTER_WEIGHT >> 1);
      dst[y * dst_stride] =
          (dst[y * dst_stride] + clip_pixel(sum >> VP9_FILTER_SHIFT) + 1) >> 1;

      dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] +
          clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);

      /* Move to the next source pixel */
      /* Adjust source and filter to use for the next pixel */
      y_q4 += y_step_q4;
      filter_y = filter_y_base + (y_q4 & 0xf) * taps;
    }
    ++src;
    ++dst;
@@ -191,27 +213,58 @@ static void convolve_c(const uint8_t *src, ptrdiff_t src_stride,
                       const int16_t *filter_y, int y_step_q4,
                       int w, int h, int taps) {
  /* Fixed size intermediate buffer places limits on parameters.
   * Maximum intermediate_height is 324, for y_step_q4 == 80,
   * Maximum intermediate_height is 135, for y_step_q4 == 32,
   * h == 64, taps == 8.
   * y_step_q4 of 80 allows for 1/10 scale for 5 layer svc
   */
  uint8_t temp[64 * 324];
  int intermediate_height = (((h - 1) * y_step_q4 + 15) >> 4) + taps;
  uint8_t temp[64 * 135];
  int intermediate_height = MAX(((h * y_step_q4) >> 4), 1) + taps - 1;
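
  /* Worked check (illustrative): at the limits h == 64, y_step_q4 == 32,
   * taps == 8 this evaluates to MAX((64 * 32) >> 4, 1) + 8 - 1 == 135,
   * exactly the temp[] bound above. */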

  assert(w <= 64);
  assert(h <= 64);
  assert(taps <= 8);
  assert(y_step_q4 <= 80);
  assert(x_step_q4 <= 80);
  assert(y_step_q4 <= 32);
  assert(x_step_q4 <= 32);

  if (intermediate_height < h)
    intermediate_height = h;

  convolve_horiz_c(src - src_stride * (taps / 2 - 1), src_stride, temp, 64,
                   filter_x, x_step_q4, filter_y, y_step_q4, w,
                   intermediate_height, taps);
  convolve_vert_c(temp + 64 * (taps / 2 - 1), 64, dst, dst_stride, filter_x,
                  x_step_q4, filter_y, y_step_q4, w, h, taps);
  convolve_horiz_c(src - src_stride * (taps / 2 - 1), src_stride,
                   temp, 64,
                   filter_x, x_step_q4, filter_y, y_step_q4,
                   w, intermediate_height, taps);
  convolve_vert_c(temp + 64 * (taps / 2 - 1), 64, dst, dst_stride,
                  filter_x, x_step_q4, filter_y, y_step_q4,
                  w, h, taps);
}

static void convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const int16_t *filter_x, int x_step_q4,
                           const int16_t *filter_y, int y_step_q4,
                           int w, int h, int taps) {
  /* Fixed size intermediate buffer places limits on parameters.
   * Maximum intermediate_height is 135, for y_step_q4 == 32,
   * h == 64, taps == 8.
   */
  uint8_t temp[64 * 135];
  int intermediate_height = MAX(((h * y_step_q4) >> 4), 1) + taps - 1;

  assert(w <= 64);
  assert(h <= 64);
  assert(taps <= 8);
  assert(y_step_q4 <= 32);
  assert(x_step_q4 <= 32);

  if (intermediate_height < h)
    intermediate_height = h;

  convolve_horiz_c(src - src_stride * (taps / 2 - 1), src_stride,
                   temp, 64,
                   filter_x, x_step_q4, filter_y, y_step_q4,
                   w, intermediate_height, taps);
  convolve_avg_vert_c(temp + 64 * (taps / 2 - 1), 64, dst, dst_stride,
                      filter_x, x_step_q4, filter_y, y_step_q4,
                      w, h, taps);
}

void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
@@ -220,7 +273,8 @@ void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                           const int16_t *filter_y, int y_step_q4,
                           int w, int h) {
  convolve_horiz_c(src, src_stride, dst, dst_stride,
                   filter_x, x_step_q4, filter_y, y_step_q4, w, h, 8);
                   filter_x, x_step_q4, filter_y, y_step_q4,
                   w, h, 8);
}

void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
@@ -229,7 +283,8 @@ void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                               const int16_t *filter_y, int y_step_q4,
                               int w, int h) {
  convolve_avg_horiz_c(src, src_stride, dst, dst_stride,
                       filter_x, x_step_q4, filter_y, y_step_q4, w, h, 8);
                       filter_x, x_step_q4, filter_y, y_step_q4,
                       w, h, 8);
}

void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
@@ -238,7 +293,8 @@ void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                          const int16_t *filter_y, int y_step_q4,
                          int w, int h) {
  convolve_vert_c(src, src_stride, dst, dst_stride,
                  filter_x, x_step_q4, filter_y, y_step_q4, w, h, 8);
                  filter_x, x_step_q4, filter_y, y_step_q4,
                  w, h, 8);
}

void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
@@ -247,7 +303,8 @@ void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                              const int16_t *filter_y, int y_step_q4,
                              int w, int h) {
  convolve_avg_vert_c(src, src_stride, dst, dst_stride,
                      filter_x, x_step_q4, filter_y, y_step_q4, w, h, 8);
                      filter_x, x_step_q4, filter_y, y_step_q4,
                      w, h, 8);
}

void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
@@ -256,7 +313,8 @@ void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
                     const int16_t *filter_y, int y_step_q4,
                     int w, int h) {
  convolve_c(src, src_stride, dst, dst_stride,
             filter_x, x_step_q4, filter_y, y_step_q4, w, h, 8);
             filter_x, x_step_q4, filter_y, y_step_q4,
             w, h, 8);
}

void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
@@ -269,9 +327,16 @@ void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
  assert(w <= 64);
  assert(h <= 64);

  vp9_convolve8(src, src_stride, temp, 64,
                filter_x, x_step_q4, filter_y, y_step_q4, w, h);
  vp9_convolve_avg(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h);
  vp9_convolve8(src, src_stride,
                temp, 64,
                filter_x, x_step_q4,
                filter_y, y_step_q4,
                w, h);
  vp9_convolve_avg(temp, 64,
                   dst, dst_stride,
                   NULL, 0, /* These unused parameters should be removed! */
                   NULL, 0, /* These unused parameters should be removed! */
                   w, h);
}

void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride,
@@ -296,9 +361,9 @@ void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride,
  int x, y;

  for (y = 0; y < h; ++y) {
    for (x = 0; x < w; ++x)
      dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1);

    for (x = 0; x < w; ++x) {
      dst[x] = (dst[x] + src[x] + 1) >> 1;
    }
    src += src_stride;
    dst += dst_stride;
  }

@@ -13,8 +13,6 @@
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"

#define FILTER_BITS 7

typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x, int x_step_q4,

@@ -22,24 +22,23 @@ static void log_frame_info(VP9_COMMON *cm, const char *str, FILE *f) {
 * and uses the passed-in member offset to print the value of an integer
 * member of each mbmi in the mi structure.
 */
static void print_mi_data(VP9_COMMON *cm, FILE *file, char *descriptor,
static void print_mi_data(VP9_COMMON *common, FILE *file, char *descriptor,
                          size_t member_offset) {
  int mi_row;
  int mi_col;
  int mi_index = 0;
  MODE_INFO **mi_8x8 = cm->mi_grid_visible;
  int rows = cm->mi_rows;
  int cols = cm->mi_cols;
  MODE_INFO *mi = common->mi;
  int rows = common->mi_rows;
  int cols = common->mi_cols;
  char prefix = descriptor[0];

  log_frame_info(cm, descriptor, file);
  log_frame_info(common, descriptor, file);
  mi_index = 0;
  for (mi_row = 0; mi_row < rows; mi_row++) {
    fprintf(file, "%c ", prefix);
    for (mi_col = 0; mi_col < cols; mi_col++) {
      fprintf(file, "%2d ",
              *((int*) ((char *) (&mi_8x8[mi_index]->mbmi) +
                        member_offset)));
              *((int*) ((char *) (&mi[mi_index].mbmi) + member_offset)));
      mi_index++;
    }
    fprintf(file, "\n");
@@ -52,23 +51,23 @@ void vp9_print_modes_and_motion_vectors(VP9_COMMON *cm, char *file) {
  int mi_col;
  int mi_index = 0;
  FILE *mvs = fopen(file, "a");
  MODE_INFO **mi_8x8 = cm->mi_grid_visible;
  MODE_INFO *mi = cm->mi;
  int rows = cm->mi_rows;
  int cols = cm->mi_cols;

  print_mi_data(cm, mvs, "Partitions:", offsetof(MB_MODE_INFO, sb_type));
  print_mi_data(cm, mvs, "Modes:", offsetof(MB_MODE_INFO, mode));
  print_mi_data(cm, mvs, "Skips:", offsetof(MB_MODE_INFO, skip_coeff));
  print_mi_data(cm, mvs, "Skips:", offsetof(MB_MODE_INFO, mb_skip_coeff));
  print_mi_data(cm, mvs, "Ref frame:", offsetof(MB_MODE_INFO, ref_frame[0]));
  print_mi_data(cm, mvs, "Transform:", offsetof(MB_MODE_INFO, tx_size));
  print_mi_data(cm, mvs, "Transform:", offsetof(MB_MODE_INFO, txfm_size));
  print_mi_data(cm, mvs, "UV Modes:", offsetof(MB_MODE_INFO, uv_mode));

  log_frame_info(cm, "Vectors ", mvs);
  for (mi_row = 0; mi_row < rows; mi_row++) {
    fprintf(mvs, "V ");
    for (mi_col = 0; mi_col < cols; mi_col++) {
      fprintf(mvs, "%4d:%4d ", mi_8x8[mi_index]->mbmi.mv[0].as_mv.row,
              mi_8x8[mi_index]->mbmi.mv[0].as_mv.col);
      fprintf(mvs, "%4d:%4d ", mi[mi_index].mbmi.mv[0].as_mv.row,
              mi[mi_index].mbmi.mv[0].as_mv.col);
      mi_index++;
    }
    fprintf(mvs, "\n");

@@ -377,7 +377,7 @@ static const vp9_prob modelcoefprobs_pareto8[COEFPROB_MODELS][MODEL_NODES] = {

static void extend_model_to_full_distribution(vp9_prob p,
                                              vp9_prob *tree_probs) {
  const int l = (p - 1) / 2;
  const int l = ((p - 1) / 2);
  const vp9_prob (*model)[MODEL_NODES] = modelcoefprobs_pareto8;
  if (p & 1) {
    vpx_memcpy(tree_probs + UNCONSTRAINED_NODES,
@@ -436,11 +436,11 @@ const vp9_extra_bit vp9_extra_bits[12] = {

#include "vp9/common/vp9_default_coef_probs.h"

void vp9_default_coef_probs(VP9_COMMON *cm) {
  vp9_copy(cm->fc.coef_probs[TX_4X4], default_coef_probs_4x4);
  vp9_copy(cm->fc.coef_probs[TX_8X8], default_coef_probs_8x8);
  vp9_copy(cm->fc.coef_probs[TX_16X16], default_coef_probs_16x16);
  vp9_copy(cm->fc.coef_probs[TX_32X32], default_coef_probs_32x32);
void vp9_default_coef_probs(VP9_COMMON *pc) {
  vp9_copy(pc->fc.coef_probs[TX_4X4], default_coef_probs_4x4);
  vp9_copy(pc->fc.coef_probs[TX_8X8], default_coef_probs_8x8);
  vp9_copy(pc->fc.coef_probs[TX_16X16], default_coef_probs_16x16);
  vp9_copy(pc->fc.coef_probs[TX_32X32], default_coef_probs_32x32);
}

// Neighborhood 5-tuples for various scans and blocksizes,
@@ -622,6 +622,7 @@ static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE tx_size,
  int t, i, j, k, l;
  unsigned int branch_ct[UNCONSTRAINED_NODES][2];
  vp9_prob coef_probs[UNCONSTRAINED_NODES];
  int entropy_nodes_adapt = UNCONSTRAINED_NODES;

  for (i = 0; i < BLOCK_TYPES; ++i)
    for (j = 0; j < REF_TYPES; ++j)
@@ -634,7 +635,7 @@ static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE tx_size,
                                 0);
        branch_ct[0][1] = eob_branch_count[i][j][k][l] - branch_ct[0][0];
        coef_probs[0] = get_binary_prob(branch_ct[0][0], branch_ct[0][1]);
        for (t = 0; t < UNCONSTRAINED_NODES; ++t)
        for (t = 0; t < entropy_nodes_adapt; ++t)
          dst_coef_probs[i][j][k][l][t] = merge_probs(
              pre_coef_probs[i][j][k][l][t], coef_probs[t],
              branch_ct[t], count_sat, update_factor);

@@ -95,7 +95,7 @@ typedef vp9_prob vp9_coeff_probs[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS]
#define MODULUS_PARAM 13 /* Modulus parameter */

struct VP9Common;
void vp9_default_coef_probs(struct VP9Common *cm);
void vp9_default_coef_probs(struct VP9Common *);
extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_4x4[16]);

extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_4x4[16]);
@@ -154,17 +154,19 @@ extern DECLARE_ALIGNED(16, int16_t,
                       vp9_default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]);

void vp9_coef_tree_initialize(void);
void vp9_adapt_coef_probs(struct VP9Common *cm);
void vp9_adapt_coef_probs(struct VP9Common *);

static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE bsize) {
static INLINE void vp9_reset_sb_tokens_context(MACROBLOCKD* const xd,
                                               BLOCK_SIZE_TYPE bsize) {
  /* Clear entropy contexts */
  const int bw = 1 << b_width_log2(bsize);
  const int bh = 1 << b_height_log2(bsize);
  int i;
  for (i = 0; i < MAX_MB_PLANE; i++) {
    struct macroblockd_plane *const pd = &xd->plane[i];
    const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
    vpx_memset(pd->above_context, 0, sizeof(ENTROPY_CONTEXT) *
                   num_4x4_blocks_wide_lookup[plane_bsize]);
    vpx_memset(pd->left_context, 0, sizeof(ENTROPY_CONTEXT) *
                   num_4x4_blocks_high_lookup[plane_bsize]);
    vpx_memset(xd->plane[i].above_context, 0,
               sizeof(ENTROPY_CONTEXT) * bw >> xd->plane[i].subsampling_x);
    vpx_memset(xd->plane[i].left_context, 0,
               sizeof(ENTROPY_CONTEXT) * bh >> xd->plane[i].subsampling_y);
  }
}

@@ -336,45 +338,6 @@ static INLINE const int16_t* get_iscan_16x16(TX_TYPE tx_type) {
  }
}

static int get_entropy_context(const MACROBLOCKD *xd, TX_SIZE tx_size,
                               PLANE_TYPE type, int block_idx,
                               ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L,
                               const int16_t **scan,
                               const uint8_t **band_translate) {
  ENTROPY_CONTEXT above_ec = 0, left_ec = 0;

  switch (tx_size) {
    case TX_4X4:
      *scan = get_scan_4x4(get_tx_type_4x4(type, xd, block_idx));
      *band_translate = vp9_coefband_trans_4x4;
      above_ec = A[0] != 0;
      left_ec = L[0] != 0;
      break;
    case TX_8X8:
      *scan = get_scan_8x8(get_tx_type_8x8(type, xd));
      *band_translate = vp9_coefband_trans_8x8plus;
      above_ec = !!*(uint16_t *)A;
      left_ec = !!*(uint16_t *)L;
      break;
    case TX_16X16:
      *scan = get_scan_16x16(get_tx_type_16x16(type, xd));
      *band_translate = vp9_coefband_trans_8x8plus;
      above_ec = !!*(uint32_t *)A;
      left_ec = !!*(uint32_t *)L;
      break;
    case TX_32X32:
      *scan = vp9_default_scan_32x32;
      *band_translate = vp9_coefband_trans_8x8plus;
      above_ec = !!*(uint64_t *)A;
      left_ec = !!*(uint64_t *)L;
      break;
    default:
      assert(!"Invalid transform size.");
  }

  return combine_entropy_contexts(above_ec, left_ec);
}
|
||||
|
||||
enum { VP9_COEF_UPDATE_PROB = 252 };
|
||||
|
||||
#endif // VP9_COMMON_VP9_ENTROPY_H_
|
||||
|
||||
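In the removed get_entropy_context above, each ENTROPY_CONTEXT entry covers one 4x4 column, so larger transforms test several entries with a single wide load. A minimal sketch of the 8x8 case under that assumption (any_nonzero_8x8 is illustrative only):

#include <stdint.h>
typedef int8_t ENTROPY_CONTEXT_SKETCH;  /* one byte per 4x4 unit, as above */

static int any_nonzero_8x8(const ENTROPY_CONTEXT_SKETCH *a) {
  /* Equivalent to (a[0] != 0) || (a[1] != 0), done as one 16-bit test,
   * matching the !!*(uint16_t *)A idiom in the function above. */
  return !!*(const uint16_t *)a;
}
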
@@ -14,8 +14,8 @@
#include "vp9/common/vp9_onyxc_int.h"
#include "vp9/common/vp9_seg_common.h"

const vp9_prob vp9_kf_uv_mode_prob[INTRA_MODES]
[INTRA_MODES - 1] = {
const vp9_prob vp9_kf_uv_mode_prob[VP9_INTRA_MODES]
[VP9_INTRA_MODES - 1] = {
{ 144, 11, 54, 157, 195, 130, 46, 58, 108 } /* y = dc */,
{ 118, 15, 123, 148, 131, 101, 44, 93, 131 } /* y = v */,
{ 113, 12, 23, 188, 226, 142, 26, 32, 125 } /* y = h */,
@@ -23,21 +23,21 @@ const vp9_prob vp9_kf_uv_mode_prob[INTRA_MODES]
{ 113, 9, 36, 155, 111, 157, 32, 44, 161 } /* y = d135 */,
{ 116, 9, 55, 176, 76, 96, 37, 61, 149 } /* y = d117 */,
{ 115, 9, 28, 141, 161, 167, 21, 25, 193 } /* y = d153 */,
{ 120, 12, 32, 145, 195, 142, 32, 38, 86 } /* y = d207 */,
{ 120, 12, 32, 145, 195, 142, 32, 38, 86 } /* y = d27 */,
{ 116, 12, 64, 120, 140, 125, 49, 115, 121 } /* y = d63 */,
{ 102, 19, 66, 162, 182, 122, 35, 59, 128 } /* y = tm */
};

static const vp9_prob default_if_y_probs[BLOCK_SIZE_GROUPS]
[INTRA_MODES - 1] = {
[VP9_INTRA_MODES - 1] = {
{ 65, 32, 18, 144, 162, 194, 41, 51, 98 } /* block_size < 8x8 */,
{ 132, 68, 18, 165, 217, 196, 45, 40, 78 } /* block_size < 16x16 */,
{ 173, 80, 19, 176, 240, 193, 64, 35, 46 } /* block_size < 32x32 */,
{ 221, 135, 38, 194, 248, 121, 96, 85, 29 } /* block_size >= 32x32 */
};

static const vp9_prob default_if_uv_probs[INTRA_MODES]
[INTRA_MODES - 1] = {
static const vp9_prob default_if_uv_probs[VP9_INTRA_MODES]
[VP9_INTRA_MODES - 1] = {
{ 120, 7, 76, 176, 208, 126, 28, 54, 103 } /* y = dc */,
{ 48, 12, 154, 155, 139, 90, 34, 117, 119 } /* y = v */,
{ 67, 6, 25, 204, 243, 158, 13, 21, 96 } /* y = h */,
@@ -45,7 +45,7 @@ static const vp9_prob default_if_uv_probs[INTRA_MODES]
{ 83, 5, 42, 156, 111, 152, 26, 49, 152 } /* y = d135 */,
{ 80, 5, 58, 178, 74, 83, 33, 62, 145 } /* y = d117 */,
{ 86, 5, 32, 154, 192, 168, 14, 22, 163 } /* y = d153 */,
{ 85, 5, 32, 156, 216, 148, 19, 29, 73 } /* y = d207 */,
{ 85, 5, 32, 156, 216, 148, 19, 29, 73 } /* y = d27 */,
{ 77, 7, 64, 116, 132, 122, 37, 126, 120 } /* y = d63 */,
{ 101, 21, 107, 181, 192, 103, 19, 67, 125 } /* y = tm */
};
@@ -98,9 +98,9 @@ static const vp9_prob default_partition_probs[NUM_FRAME_TYPES]
}
};

const vp9_prob vp9_kf_y_mode_prob[INTRA_MODES]
[INTRA_MODES]
[INTRA_MODES - 1] = {
const vp9_prob vp9_kf_y_mode_prob[VP9_INTRA_MODES]
[VP9_INTRA_MODES]
[VP9_INTRA_MODES - 1] = {
{ /* above = dc */
{ 137, 30, 42, 148, 151, 207, 70, 52, 91 } /* left = dc */,
{ 92, 45, 102, 136, 116, 180, 74, 90, 100 } /* left = v */,
@@ -109,7 +109,7 @@ const vp9_prob vp9_kf_y_mode_prob[INTRA_MODES]
{ 72, 35, 36, 149, 68, 206, 68, 63, 105 } /* left = d135 */,
{ 73, 31, 28, 138, 57, 124, 55, 122, 151 } /* left = d117 */,
{ 67, 23, 21, 140, 126, 197, 40, 37, 171 } /* left = d153 */,
{ 86, 27, 28, 128, 154, 212, 45, 43, 53 } /* left = d207 */,
{ 86, 27, 28, 128, 154, 212, 45, 43, 53 } /* left = d27 */,
{ 74, 32, 27, 107, 86, 160, 63, 134, 102 } /* left = d63 */,
{ 59, 67, 44, 140, 161, 202, 78, 67, 119 } /* left = tm */
}, { /* above = v */
@@ -120,7 +120,7 @@ const vp9_prob vp9_kf_y_mode_prob[INTRA_MODES]
{ 46, 41, 76, 140, 63, 184, 69, 112, 57 } /* left = d135 */,
{ 38, 32, 85, 140, 46, 112, 54, 151, 133 } /* left = d117 */,
{ 39, 27, 61, 131, 110, 175, 44, 75, 136 } /* left = d153 */,
{ 52, 30, 74, 113, 130, 175, 51, 64, 58 } /* left = d207 */,
{ 52, 30, 74, 113, 130, 175, 51, 64, 58 } /* left = d27 */,
{ 47, 35, 80, 100, 74, 143, 64, 163, 74 } /* left = d63 */,
{ 36, 61, 116, 114, 128, 162, 80, 125, 82 } /* left = tm */
}, { /* above = h */
@@ -131,7 +131,7 @@ const vp9_prob vp9_kf_y_mode_prob[INTRA_MODES]
{ 58, 50, 25, 139, 115, 232, 39, 52, 118 } /* left = d135 */,
{ 50, 35, 33, 153, 104, 162, 64, 59, 131 } /* left = d117 */,
{ 44, 24, 16, 150, 177, 202, 33, 19, 156 } /* left = d153 */,
{ 55, 27, 12, 153, 203, 218, 26, 27, 49 } /* left = d207 */,
{ 55, 27, 12, 153, 203, 218, 26, 27, 49 } /* left = d27 */,
{ 53, 49, 21, 110, 116, 168, 59, 80, 76 } /* left = d63 */,
{ 38, 72, 19, 168, 203, 212, 50, 50, 107 } /* left = tm */
}, { /* above = d45 */
@@ -142,7 +142,7 @@ const vp9_prob vp9_kf_y_mode_prob[INTRA_MODES]
{ 60, 32, 33, 112, 71, 220, 64, 89, 104 } /* left = d135 */,
{ 53, 26, 34, 130, 56, 149, 84, 120, 103 } /* left = d117 */,
{ 53, 21, 23, 133, 109, 210, 56, 77, 172 } /* left = d153 */,
{ 77, 19, 29, 112, 142, 228, 55, 66, 36 } /* left = d207 */,
{ 77, 19, 29, 112, 142, 228, 55, 66, 36 } /* left = d27 */,
{ 61, 29, 29, 93, 97, 165, 83, 175, 162 } /* left = d63 */,
{ 47, 47, 43, 114, 137, 181, 100, 99, 95 } /* left = tm */
}, { /* above = d135 */
@@ -153,7 +153,7 @@ const vp9_prob vp9_kf_y_mode_prob[INTRA_MODES]
{ 52, 31, 22, 158, 40, 209, 58, 62, 89 } /* left = d135 */,
{ 44, 31, 29, 147, 46, 158, 56, 102, 198 } /* left = d117 */,
{ 35, 19, 12, 135, 87, 209, 41, 45, 167 } /* left = d153 */,
{ 55, 25, 21, 118, 95, 215, 38, 39, 66 } /* left = d207 */,
{ 55, 25, 21, 118, 95, 215, 38, 39, 66 } /* left = d27 */,
{ 51, 38, 25, 113, 58, 164, 70, 93, 97 } /* left = d63 */,
{ 47, 54, 34, 146, 108, 203, 72, 103, 151 } /* left = tm */
}, { /* above = d117 */
@@ -164,7 +164,7 @@ const vp9_prob vp9_kf_y_mode_prob[INTRA_MODES]
{ 40, 26, 35, 154, 40, 185, 51, 97, 123 } /* left = d135 */,
{ 35, 19, 34, 179, 19, 97, 48, 129, 124 } /* left = d117 */,
{ 36, 20, 26, 136, 62, 164, 33, 77, 154 } /* left = d153 */,
{ 45, 18, 32, 130, 90, 157, 40, 79, 91 } /* left = d207 */,
{ 45, 18, 32, 130, 90, 157, 40, 79, 91 } /* left = d27 */,
{ 45, 26, 28, 129, 45, 129, 49, 147, 123 } /* left = d63 */,
{ 38, 44, 51, 136, 74, 162, 57, 97, 121 } /* left = tm */
}, { /* above = d153 */
@@ -175,10 +175,10 @@ const vp9_prob vp9_kf_y_mode_prob[INTRA_MODES]
{ 47, 29, 17, 153, 64, 220, 59, 51, 114 } /* left = d135 */,
{ 46, 16, 24, 136, 76, 147, 41, 64, 172 } /* left = d117 */,
{ 34, 17, 11, 108, 152, 187, 13, 15, 209 } /* left = d153 */,
{ 51, 24, 14, 115, 133, 209, 32, 26, 104 } /* left = d207 */,
{ 51, 24, 14, 115, 133, 209, 32, 26, 104 } /* left = d27 */,
{ 55, 30, 18, 122, 79, 179, 44, 88, 116 } /* left = d63 */,
{ 37, 49, 25, 129, 168, 164, 41, 54, 148 } /* left = tm */
}, { /* above = d207 */
}, { /* above = d27 */
{ 82, 22, 32, 127, 143, 213, 39, 41, 70 } /* left = dc */,
{ 62, 44, 61, 123, 105, 189, 48, 57, 64 } /* left = v */,
{ 47, 25, 17, 175, 222, 220, 24, 30, 86 } /* left = h */,
@@ -186,7 +186,7 @@ const vp9_prob vp9_kf_y_mode_prob[INTRA_MODES]
{ 57, 39, 23, 151, 68, 216, 55, 63, 58 } /* left = d135 */,
{ 49, 30, 35, 141, 70, 168, 82, 40, 115 } /* left = d117 */,
{ 51, 25, 15, 136, 129, 202, 38, 35, 139 } /* left = d153 */,
{ 68, 26, 16, 111, 141, 215, 29, 28, 28 } /* left = d207 */,
{ 68, 26, 16, 111, 141, 215, 29, 28, 28 } /* left = d27 */,
{ 59, 39, 19, 114, 75, 180, 77, 104, 42 } /* left = d63 */,
{ 40, 61, 26, 126, 152, 206, 61, 59, 93 } /* left = tm */
}, { /* above = d63 */
@@ -197,7 +197,7 @@ const vp9_prob vp9_kf_y_mode_prob[INTRA_MODES]
{ 48, 31, 27, 114, 63, 183, 82, 116, 56 } /* left = d135 */,
{ 43, 28, 37, 121, 63, 123, 61, 192, 169 } /* left = d117 */,
{ 42, 17, 24, 109, 97, 177, 56, 76, 122 } /* left = d153 */,
{ 58, 18, 28, 105, 139, 182, 70, 92, 63 } /* left = d207 */,
{ 58, 18, 28, 105, 139, 182, 70, 92, 63 } /* left = d27 */,
{ 46, 23, 32, 74, 86, 150, 67, 183, 88 } /* left = d63 */,
{ 36, 38, 48, 92, 122, 165, 88, 137, 91 } /* left = tm */
}, { /* above = tm */
@@ -208,14 +208,24 @@ const vp9_prob vp9_kf_y_mode_prob[INTRA_MODES]
{ 49, 50, 35, 144, 95, 205, 63, 78, 59 } /* left = d135 */,
{ 41, 53, 52, 148, 71, 142, 65, 128, 51 } /* left = d117 */,
{ 40, 36, 28, 143, 143, 202, 40, 55, 137 } /* left = d153 */,
{ 52, 34, 29, 129, 183, 227, 42, 35, 43 } /* left = d207 */,
{ 52, 34, 29, 129, 183, 227, 42, 35, 43 } /* left = d27 */,
{ 42, 44, 44, 104, 105, 164, 64, 130, 80 } /* left = d63 */,
{ 43, 81, 53, 140, 169, 204, 68, 84, 72 } /* left = tm */
}
};

#if CONFIG_FILTERINTRA
const vp9_prob vp9_default_filterintra_prob[TX_SIZES][VP9_INTRA_MODES] = {
// DC V H D45 D135 D117 D153 D27 D63 TM
{160, 153, 171, 160, 140, 117, 115, 160, 160, 116}, // TX_4X4
{180, 151, 191, 180, 118, 66, 97, 180, 180, 120}, // TX_8X8
{200, 200, 200, 200, 200, 200, 200, 200, 200, 200}, // TX_16X16
{220, 220, 220, 220, 220, 220, 220, 220, 220, 220}, // TX_32X32
};
#endif

static const vp9_prob default_inter_mode_probs[INTER_MODE_CONTEXTS]
[INTER_MODES - 1] = {
[VP9_INTER_MODES - 1] = {
{2, 173, 34}, // 0 = both zero mv
{7, 145, 85}, // 1 = one zero mv + one a predicted mv
{7, 166, 63}, // 2 = two predicted mvs
@@ -226,7 +236,7 @@ static const vp9_prob default_inter_mode_probs[INTER_MODE_CONTEXTS]
};

/* Array indices are identical to previously-existing INTRAMODECONTEXTNODES. */
const vp9_tree_index vp9_intra_mode_tree[INTRA_MODES * 2 - 2] = {
const vp9_tree_index vp9_intra_mode_tree[VP9_INTRA_MODES * 2 - 2] = {
-DC_PRED, 2, /* 0 = DC_NODE */
-TM_PRED, 4, /* 1 = TM_NODE */
-V_PRED, 6, /* 2 = V_NODE */
@@ -235,7 +245,7 @@ const vp9_tree_index vp9_intra_mode_tree[INTRA_MODES * 2 - 2] = {
-D135_PRED, -D117_PRED, /* 5 = D135_NODE */
-D45_PRED, 14, /* 6 = D45_NODE */
-D63_PRED, 16, /* 7 = D63_NODE */
-D153_PRED, -D207_PRED /* 8 = D153_NODE */
-D153_PRED, -D27_PRED /* 8 = D153_NODE */
};

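For reference, a vp9_tree_index array such as vp9_intra_mode_tree encodes a binary tree flat in memory: a non-negative entry is the index of the next node pair, and a negative entry is a leaf storing the negated symbol. A hedged sketch of the decode-side walk (read_bit is a hypothetical stand-in for the boolean decoder):

static int read_tree_symbol(const vp9_tree_index *tree,
                            int (*read_bit)(void)) {
  vp9_tree_index i = 0;
  while ((i = tree[i + read_bit()]) > 0)  /* follow the 0/1 links down */
    continue;
  return -i;  /* leaves store the negated symbol, e.g. -DC_PRED */
}
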
const vp9_tree_index vp9_inter_mode_tree[6] = {
@@ -250,8 +260,8 @@ const vp9_tree_index vp9_partition_tree[6] = {
-PARTITION_VERT, -PARTITION_SPLIT
};

struct vp9_token vp9_intra_mode_encodings[INTRA_MODES];
struct vp9_token vp9_inter_mode_encodings[INTER_MODES];
struct vp9_token vp9_intra_mode_encodings[VP9_INTRA_MODES];
struct vp9_token vp9_inter_mode_encodings[VP9_INTER_MODES];

struct vp9_token vp9_partition_encodings[PARTITION_TYPES];

@@ -317,14 +327,32 @@ static const vp9_prob default_mbskip_probs[MBSKIP_CONTEXTS] = {
192, 128, 64
};

static const vp9_prob default_switchable_interp_prob[SWITCHABLE_FILTERS+1]
[SWITCHABLE_FILTERS-1] = {
static const vp9_prob default_switchable_interp_prob[VP9_SWITCHABLE_FILTERS+1]
[VP9_SWITCHABLE_FILTERS-1] = {
{ 235, 162, },
{ 36, 255, },
{ 34, 3, },
{ 149, 144, },
};

#if CONFIG_INTERINTRA
static const vp9_prob default_interintra_prob[BLOCK_SIZE_TYPES] = {
192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192
};
#if CONFIG_MASKED_INTERINTRA
static const vp9_prob default_masked_interintra_prob[BLOCK_SIZE_TYPES] = {
// 180, 180, 180, 180, 180, 180, 180, 180, 180, 180, 180, 180, 180
192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192
};
#endif
#endif

#if CONFIG_MASKED_INTERINTER
static const vp9_prob default_masked_interinter_prob[BLOCK_SIZE_TYPES] = {
192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192
};
#endif

void vp9_init_mbmode_probs(VP9_COMMON *cm) {
vp9_copy(cm->fc.uv_mode_prob, default_if_uv_probs);
vp9_copy(cm->fc.y_mode_prob, default_if_y_probs);
@@ -336,13 +364,25 @@ void vp9_init_mbmode_probs(VP9_COMMON *cm) {
vp9_copy(cm->fc.single_ref_prob, default_single_ref_p);
cm->fc.tx_probs = default_tx_probs;
vp9_copy(cm->fc.mbskip_probs, default_mbskip_probs);
#if CONFIG_INTERINTRA
vp9_copy(cm->fc.interintra_prob, default_interintra_prob);
#if CONFIG_MASKED_INTERINTRA
vp9_copy(cm->fc.masked_interintra_prob, default_masked_interintra_prob);
#endif
#endif
#if CONFIG_FILTERINTRA
vp9_copy(cm->fc.filterintra_prob, vp9_default_filterintra_prob);
#endif
#if CONFIG_MASKED_INTERINTER
vp9_copy(cm->fc.masked_compound_prob, default_masked_interinter_prob);
#endif
}

const vp9_tree_index vp9_switchable_interp_tree[SWITCHABLE_FILTERS*2-2] = {
const vp9_tree_index vp9_switchable_interp_tree[VP9_SWITCHABLE_FILTERS*2-2] = {
-EIGHTTAP, 2,
-EIGHTTAP_SMOOTH, -EIGHTTAP_SHARP
};
struct vp9_token vp9_switchable_interp_encodings[SWITCHABLE_FILTERS];
struct vp9_token vp9_switchable_interp_encodings[VP9_SWITCHABLE_FILTERS];

void vp9_entropy_mode_init() {
vp9_tokens_from_tree(vp9_intra_mode_encodings, vp9_intra_mode_tree);
@@ -400,17 +440,17 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) {
counts->single_ref[i][j]);

for (i = 0; i < INTER_MODE_CONTEXTS; i++)
update_mode_probs(INTER_MODES, vp9_inter_mode_tree,
update_mode_probs(VP9_INTER_MODES, vp9_inter_mode_tree,
counts->inter_mode[i], pre_fc->inter_mode_probs[i],
fc->inter_mode_probs[i], NEARESTMV);

for (i = 0; i < BLOCK_SIZE_GROUPS; i++)
update_mode_probs(INTRA_MODES, vp9_intra_mode_tree,
update_mode_probs(VP9_INTRA_MODES, vp9_intra_mode_tree,
counts->y_mode[i], pre_fc->y_mode_prob[i],
fc->y_mode_prob[i], 0);

for (i = 0; i < INTRA_MODES; ++i)
update_mode_probs(INTRA_MODES, vp9_intra_mode_tree,
for (i = 0; i < VP9_INTRA_MODES; ++i)
update_mode_probs(VP9_INTRA_MODES, vp9_intra_mode_tree,
counts->uv_mode[i], pre_fc->uv_mode_prob[i],
fc->uv_mode_prob[i], 0);

@@ -421,8 +461,8 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) {
fc->partition_prob[INTER_FRAME][i], 0);

if (cm->mcomp_filter_type == SWITCHABLE) {
for (i = 0; i <= SWITCHABLE_FILTERS; i++)
update_mode_probs(SWITCHABLE_FILTERS, vp9_switchable_interp_tree,
for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++)
update_mode_probs(VP9_SWITCHABLE_FILTERS, vp9_switchable_interp_tree,
counts->switchable_interp[i],
pre_fc->switchable_interp_prob[i],
fc->switchable_interp_prob[i], 0);
@@ -440,12 +480,14 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) {
fc->tx_probs.p8x8[i][j] = update_ct2(pre_fc->tx_probs.p8x8[i][j],
branch_ct_8x8p[j]);

tx_counts_to_branch_counts_16x16(counts->tx.p16x16[i], branch_ct_16x16p);
tx_counts_to_branch_counts_16x16(counts->tx.p16x16[i],
branch_ct_16x16p);
for (j = 0; j < TX_SIZES - 2; ++j)
fc->tx_probs.p16x16[i][j] = update_ct2(pre_fc->tx_probs.p16x16[i][j],
branch_ct_16x16p[j]);

tx_counts_to_branch_counts_32x32(counts->tx.p32x32[i], branch_ct_32x32p);
tx_counts_to_branch_counts_32x32(counts->tx.p32x32[i],
branch_ct_32x32p);
for (j = 0; j < TX_SIZES - 1; ++j)
fc->tx_probs.p32x32[i][j] = update_ct2(pre_fc->tx_probs.p32x32[i][j],
branch_ct_32x32p[j]);
@@ -455,6 +497,42 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) {
for (i = 0; i < MBSKIP_CONTEXTS; ++i)
fc->mbskip_probs[i] = update_ct2(pre_fc->mbskip_probs[i],
counts->mbskip[i]);

#if CONFIG_INTERINTRA
if (cm->use_interintra) {
for (i = 0; i < BLOCK_SIZE_TYPES; ++i) {
if (is_interintra_allowed(i))
fc->interintra_prob[i] = update_ct2(pre_fc->interintra_prob[i],
counts->interintra[i]);
}
#if CONFIG_MASKED_INTERINTRA
if (cm->use_masked_interintra) {
for (i = 0; i < BLOCK_SIZE_TYPES; ++i) {
if (is_interintra_allowed(i) && get_mask_bits_interintra(i))
fc->masked_interintra_prob[i] = update_ct2(
pre_fc->masked_interintra_prob[i],
counts->masked_interintra[i]);
}
}
#endif
}
#endif
#if CONFIG_FILTERINTRA
for (i = 0; i < TX_SIZES; ++i)
for (j = 0; j < VP9_INTRA_MODES; ++j)
fc->filterintra_prob[i][j] = update_ct2(pre_fc->filterintra_prob[i][j],
counts->filterintra[i][j]);
#endif
#if CONFIG_MASKED_INTERINTER
if (cm->use_masked_compound) {
for (i = 0; i < BLOCK_SIZE_TYPES; ++i) {
if (get_mask_bits(i))
fc->masked_compound_prob[i] = update_ct2
(pre_fc->masked_compound_prob[i],
counts->masked_compound[i]);
}
}
#endif
}

static void set_default_lf_deltas(struct loopfilter *lf) {
@@ -470,14 +548,14 @@ static void set_default_lf_deltas(struct loopfilter *lf) {
lf->mode_deltas[1] = 0;
}

void vp9_setup_past_independence(VP9_COMMON *cm) {
void vp9_setup_past_independence(VP9_COMMON *cm, MACROBLOCKD *xd) {
// Reset the segment feature data to the default stats:
// Features disabled, 0, with delta coding (Default state).
struct loopfilter *const lf = &cm->lf;
struct loopfilter *const lf = &xd->lf;

int i;
vp9_clearall_segfeatures(&cm->seg);
cm->seg.abs_delta = SEGMENT_DELTADATA;
vp9_clearall_segfeatures(&xd->seg);
xd->seg.abs_delta = SEGMENT_DELTADATA;
if (cm->last_frame_seg_map)
vpx_memset(cm->last_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols));

@@ -510,7 +588,10 @@ void vp9_setup_past_independence(VP9_COMMON *cm) {
cm->mode_info_stride * (cm->mi_rows + 1) * sizeof(MODE_INFO));

vp9_update_mode_info_border(cm, cm->mip);
vp9_update_mode_info_in_image(cm, cm->mi);

vp9_update_mode_info_border(cm, cm->prev_mip);
vp9_update_mode_info_in_image(cm, cm->prev_mi);

vp9_zero(cm->ref_frame_sign_bias);

@@ -16,8 +16,20 @@

#define SUBMVREF_COUNT 5
#define TX_SIZE_CONTEXTS 2
#define MODE_UPDATE_PROB 252
#define SWITCHABLE_FILTERS 3 // number of switchable filters
#define VP9_MODE_UPDATE_PROB 252
#define VP9_SWITCHABLE_FILTERS 3 // number of switchable filters

#if CONFIG_INTERINTRA
#define VP9_UPD_INTERINTRA_PROB 248
#define SEPARATE_INTERINTRA_UV 0
#if CONFIG_MASKED_INTERINTRA
#define VP9_UPD_MASKED_INTERINTRA_PROB 248
#endif
#endif

#if CONFIG_MASKED_INTERINTER
#define VP9_UPD_MASKED_COMPOUND_PROB 248
#endif

// #define MODE_STATS

@@ -35,32 +47,32 @@ struct tx_counts {
unsigned int p8x8[TX_SIZE_CONTEXTS][TX_SIZES - 2];
};

extern const vp9_prob vp9_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
extern const vp9_prob vp9_kf_y_mode_prob[INTRA_MODES][INTRA_MODES]
[INTRA_MODES - 1];
extern const vp9_prob vp9_kf_uv_mode_prob[VP9_INTRA_MODES][VP9_INTRA_MODES - 1];
extern const vp9_prob vp9_kf_y_mode_prob[VP9_INTRA_MODES][VP9_INTRA_MODES]
[VP9_INTRA_MODES - 1];

extern const vp9_tree_index vp9_intra_mode_tree[];
extern const vp9_tree_index vp9_inter_mode_tree[];

extern struct vp9_token vp9_intra_mode_encodings[INTRA_MODES];
extern struct vp9_token vp9_inter_mode_encodings[INTER_MODES];
extern struct vp9_token vp9_intra_mode_encodings[VP9_INTRA_MODES];
extern struct vp9_token vp9_inter_mode_encodings[VP9_INTER_MODES];

// probability models for partition information
extern const vp9_tree_index vp9_partition_tree[];
extern struct vp9_token vp9_partition_encodings[PARTITION_TYPES];

extern const vp9_tree_index vp9_switchable_interp_tree
[2 * (SWITCHABLE_FILTERS - 1)];
[2 * (VP9_SWITCHABLE_FILTERS - 1)];

extern struct vp9_token vp9_switchable_interp_encodings[SWITCHABLE_FILTERS];
extern struct vp9_token vp9_switchable_interp_encodings[VP9_SWITCHABLE_FILTERS];

void vp9_entropy_mode_init();

void vp9_setup_past_independence(struct VP9Common *cm);
void vp9_setup_past_independence(struct VP9Common *cm, MACROBLOCKD *xd);

void vp9_init_mbmode_probs(struct VP9Common *cm);
void vp9_init_mbmode_probs(struct VP9Common *x);

void vp9_adapt_mode_probs(struct VP9Common *cm);
void vp9_adapt_mode_probs(struct VP9Common *);

void tx_counts_to_branch_counts_32x32(unsigned int *tx_count_32x32p,
unsigned int (*ct_32x32p)[2]);

@@ -79,59 +79,20 @@ static const nmv_context default_nmv_context = {

#define mv_class_base(c) ((c) ? (CLASS0_SIZE << (c + 2)) : 0)

static const uint8_t log_in_base_2[] = {
0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10
};

MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset) {
MV_CLASS_TYPE c = MV_CLASS_0;
if (z >= CLASS0_SIZE * 4096)
c = MV_CLASS_10;
else
c = log_in_base_2[z >> 3];

if (z < CLASS0_SIZE * 8) c = MV_CLASS_0;
else if (z < CLASS0_SIZE * 16) c = MV_CLASS_1;
else if (z < CLASS0_SIZE * 32) c = MV_CLASS_2;
else if (z < CLASS0_SIZE * 64) c = MV_CLASS_3;
else if (z < CLASS0_SIZE * 128) c = MV_CLASS_4;
else if (z < CLASS0_SIZE * 256) c = MV_CLASS_5;
else if (z < CLASS0_SIZE * 512) c = MV_CLASS_6;
else if (z < CLASS0_SIZE * 1024) c = MV_CLASS_7;
else if (z < CLASS0_SIZE * 2048) c = MV_CLASS_8;
else if (z < CLASS0_SIZE * 4096) c = MV_CLASS_9;
else if (z < CLASS0_SIZE * 8192) c = MV_CLASS_10;
else assert(0);
if (offset)
*offset = z - mv_class_base(c);
return c;
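A worked check of the classification above, under the assumption that CLASS0_SIZE == 2 (VP9's value): for z = 100, z >> 3 == 12 and log_in_base_2[12] == 3, which agrees with the comparison chain being removed (64 <= 100 < 128, i.e. between CLASS0_SIZE * 32 and CLASS0_SIZE * 64). The offset is then 100 - mv_class_base(3) = 100 - 64 = 36. Illustrative assertions only:

#include <assert.h>
/* assumes CLASS0_SIZE == 2; vp9_get_mv_class and mv_class_base are above */
assert(vp9_get_mv_class(100, NULL) == MV_CLASS_3);
assert(100 - mv_class_base(MV_CLASS_3) == 36);  /* base: 2 << (3 + 2) == 64 */
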
@@ -149,6 +110,8 @@ int vp9_get_mv_mag(MV_CLASS_TYPE c, int offset) {
static void inc_mv_component(int v, nmv_component_counts *comp_counts,
int incr, int usehp) {
int s, z, c, o, d, e, f;
if (!incr)
return;
assert (v != 0); /* should not be zero */
s = v < 0;
comp_counts->sign[s] += incr;
@@ -160,39 +123,61 @@ static void inc_mv_component(int v, nmv_component_counts *comp_counts,
d = (o >> 3); /* int mv data */
f = (o >> 1) & 3; /* fractional pel mv data */
e = (o & 1); /* high precision mv data */

if (c == MV_CLASS_0) {
comp_counts->class0[d] += incr;
comp_counts->class0_fp[d][f] += incr;
comp_counts->class0_hp[e] += usehp * incr;
} else {
int i;
int b = c + CLASS0_BITS - 1; // number of bits
for (i = 0; i < b; ++i)
comp_counts->bits[i][((d >> i) & 1)] += incr;
}

/* Code the fractional pel bits */
if (c == MV_CLASS_0) {
comp_counts->class0_fp[d][f] += incr;
} else {
comp_counts->fp[f] += incr;
comp_counts->hp[e] += usehp * incr;
}

/* Code the high precision bit */
if (usehp) {
if (c == MV_CLASS_0) {
comp_counts->class0_hp[e] += incr;
} else {
comp_counts->hp[e] += incr;
}
}
}

static void counts_to_context(nmv_component_counts *mvcomp, int usehp) {
int v;
vpx_memset(mvcomp->sign, 0, sizeof(nmv_component_counts) - sizeof(mvcomp->mvcount));
for (v = 1; v <= MV_MAX; v++) {
inc_mv_component(-v, mvcomp, mvcomp->mvcount[MV_MAX - v], usehp);
inc_mv_component( v, mvcomp, mvcomp->mvcount[MV_MAX + v], usehp);
}
}

void vp9_inc_mv(const MV *mv, nmv_context_counts *counts) {
const MV_JOINT_TYPE j = vp9_get_mv_joint(mv);
++counts->joints[j];

if (mv_joint_vertical(j)) {
inc_mv_component(mv->row, &counts->comps[0], 1, 1);
}
if (mv_joint_vertical(j))
++counts->comps[0].mvcount[MV_MAX + mv->row];

if (mv_joint_horizontal(j)) {
inc_mv_component(mv->col, &counts->comps[1], 1, 1);
}
if (mv_joint_horizontal(j))
++counts->comps[1].mvcount[MV_MAX + mv->col];
}

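The rewrite above turns per-mv accounting into a plain histogram bump (mvcount), deferring the sign/class/bit breakdown to counts_to_context, which runs once per frame via vp9_counts_process. A hedged sketch of the resulting flow (count_mvs_for_frame is illustrative, not part of the patch):

static void count_mvs_for_frame(const MV *mvs, int n,
                                nmv_context_counts *counts, int allow_hp) {
  int i;
  for (i = 0; i < n; ++i)
    vp9_inc_mv(&mvs[i], counts);         /* cheap: joint + mvcount[] only */
  vp9_counts_process(counts, allow_hp);  /* expand histogram into counts */
}
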
static vp9_prob adapt_prob(vp9_prob prep, const unsigned int ct[2]) {
return merge_probs2(prep, ct, MV_COUNT_SAT, MV_MAX_UPDATE_FACTOR);
}

void vp9_counts_process(nmv_context_counts *nmv_count, int usehp) {
counts_to_context(&nmv_count->comps[0], usehp);
counts_to_context(&nmv_count->comps[1], usehp);
}

static unsigned int adapt_probs(unsigned int i,
vp9_tree tree,
vp9_prob this_probs[],
@@ -222,6 +207,8 @@ void vp9_adapt_mv_probs(VP9_COMMON *cm, int allow_hp) {
nmv_context *pre_ctx = &pre_fc->nmvc;
nmv_context_counts *cts = &cm->counts.mv;

vp9_counts_process(cts, allow_hp);

adapt_probs(0, vp9_mv_joint_tree, ctx->joints, pre_ctx->joints, cts->joints);

for (i = 0; i < 2; ++i) {

@@ -24,7 +24,7 @@ void vp9_init_mv_probs(struct VP9Common *cm);
void vp9_adapt_mv_probs(struct VP9Common *cm, int usehp);
int vp9_use_mv_hp(const MV *ref);

#define NMV_UPDATE_PROB 252
#define VP9_NMV_UPDATE_PROB 252

/* Symbols for coding which components are zero jointly */
#define MV_JOINTS 4
@@ -126,4 +126,6 @@ typedef struct {

void vp9_inc_mv(const MV *mv, nmv_context_counts *mvctx);

void vp9_counts_process(nmv_context_counts *NMVcount, int usehp);

#endif // VP9_COMMON_VP9_ENTROPYMV_H_

@@ -13,16 +13,16 @@

#include "./vpx_config.h"

#define MI_SIZE_LOG2 3
#define MI_BLOCK_SIZE_LOG2 (6 - MI_SIZE_LOG2) // 64 = 2^6
#define LOG2_MI_SIZE 3
#define LOG2_MI_BLOCK_SIZE (6 - LOG2_MI_SIZE) // 64 = 2^6

#define MI_SIZE (1 << MI_SIZE_LOG2) // pixels per mi-unit
#define MI_BLOCK_SIZE (1 << MI_BLOCK_SIZE_LOG2) // mi-units per max block
#define MAX_BLOCK_SIZE (1 << 6) // max block size in pixel
#define MI_SIZE (1 << LOG2_MI_SIZE) // pixels per mi-unit
#define MI_BLOCK_SIZE (1 << LOG2_MI_BLOCK_SIZE) // mi-units per max block

#define MI_MASK (MI_BLOCK_SIZE - 1)


typedef enum BLOCK_SIZE {
typedef enum BLOCK_SIZE_TYPE {
BLOCK_4X4,
BLOCK_4X8,
BLOCK_8X4,
@@ -36,17 +36,15 @@ typedef enum BLOCK_SIZE {
BLOCK_32X64,
BLOCK_64X32,
BLOCK_64X64,
BLOCK_SIZES,
BLOCK_INVALID = BLOCK_SIZES
} BLOCK_SIZE;
BLOCK_SIZE_TYPES
} BLOCK_SIZE_TYPE;

typedef enum PARTITION_TYPE {
PARTITION_NONE,
PARTITION_HORZ,
PARTITION_VERT,
PARTITION_SPLIT,
PARTITION_TYPES,
PARTITION_INVALID = PARTITION_TYPES
PARTITION_TYPES, PARTITION_INVALID = PARTITION_TYPES
} PARTITION_TYPE;

#define PARTITION_PLOFFSET 4 // number of probability models per block size

@@ -57,23 +57,15 @@ static void copy_and_extend_plane(const uint8_t *src, int src_pitch,

void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src,
YV12_BUFFER_CONFIG *dst) {
// Extend src frame in buffer
// Altref filtering assumes 16 pixel extension
const int et_y = 16;
const int el_y = 16;
// Motion estimation may use src block variance with the block size up
// to 64x64, so the right and bottom need to be extended to a 64 multiple
// or up to 16, whichever is greater.
const int eb_y = MAX(ALIGN_POWER_OF_TWO(src->y_height, 6) - src->y_height,
16);
const int er_y = MAX(ALIGN_POWER_OF_TWO(src->y_width, 6) - src->y_width,
16);
const int uv_width_subsampling = (src->uv_width != src->y_width);
const int uv_height_subsampling = (src->uv_height != src->y_height);
const int et_uv = et_y >> uv_height_subsampling;
const int el_uv = el_y >> uv_width_subsampling;
const int eb_uv = eb_y >> uv_height_subsampling;
const int er_uv = er_y >> uv_width_subsampling;
const int et_y = dst->border;
const int el_y = dst->border;
const int eb_y = dst->border + dst->y_height - src->y_height;
const int er_y = dst->border + dst->y_width - src->y_width;

const int et_uv = dst->border >> (dst->uv_height != dst->y_height);
const int el_uv = dst->border >> (dst->uv_width != dst->y_width);
const int eb_uv = et_uv + dst->uv_height - src->uv_height;
const int er_uv = el_uv + dst->uv_width - src->uv_width;

#if CONFIG_ALPHA
const int et_a = dst->border >> (dst->alpha_height != dst->y_height);

@@ -8,12 +8,14 @@
* be found in the AUTHORS file in the root of the source tree.
*/

#include "vpx_ports/mem.h"

#include <stdlib.h>
#include "vp9/common/vp9_filter.h"
#include "vpx_ports/mem.h"
#include "vp9_rtcd.h"
#include "vp9/common/vp9_common.h"

DECLARE_ALIGNED(256, const int16_t,
vp9_bilinear_filters[SUBPEL_SHIFTS][SUBPEL_TAPS]) = {
DECLARE_ALIGNED(256, const int16_t, vp9_bilinear_filters[SUBPEL_SHIFTS][8]) = {
{ 0, 0, 0, 128, 0, 0, 0, 0 },
{ 0, 0, 0, 120, 8, 0, 0, 0 },
{ 0, 0, 0, 112, 16, 0, 0, 0 },
@@ -32,9 +34,8 @@ DECLARE_ALIGNED(256, const int16_t,
{ 0, 0, 0, 8, 120, 0, 0, 0 }
};

// Lagrangian interpolation filter
DECLARE_ALIGNED(256, const int16_t,
vp9_sub_pel_filters_8[SUBPEL_SHIFTS][SUBPEL_TAPS]) = {
DECLARE_ALIGNED(256, const int16_t, vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8]) = {
/* Lagrangian interpolation filter */
{ 0, 0, 0, 128, 0, 0, 0, 0},
{ 0, 1, -5, 126, 8, -3, 1, 0},
{ -1, 3, -10, 122, 18, -6, 2, 0},
@@ -53,9 +54,9 @@ DECLARE_ALIGNED(256, const int16_t,
{ 0, 1, -3, 8, 126, -5, 1, 0}
};

// DCT based filter
DECLARE_ALIGNED(256, const int16_t,
vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][SUBPEL_TAPS]) = {
DECLARE_ALIGNED(256, const int16_t, vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8])
= {
/* dct based filter */
{0, 0, 0, 128, 0, 0, 0, 0},
{-1, 3, -7, 127, 8, -3, 1, 0},
{-2, 5, -13, 125, 17, -6, 3, -1},
@@ -74,9 +75,9 @@ DECLARE_ALIGNED(256, const int16_t,
{0, 1, -3, 8, 127, -7, 3, -1}
};

// freqmultiplier = 0.5
DECLARE_ALIGNED(256, const int16_t,
vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS][SUBPEL_TAPS]) = {
vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS][8]) = {
/* freqmultiplier = 0.5 */
{ 0, 0, 0, 128, 0, 0, 0, 0},
{-3, -1, 32, 64, 38, 1, -3, 0},
{-2, -2, 29, 63, 41, 2, -3, 0},

@@ -12,22 +12,26 @@
#define VP9_COMMON_VP9_FILTER_H_

#include "vpx_config.h"
#include "vpx_scale/yv12config.h"
#include "vpx/vpx_integer.h"

#define SUBPEL_BITS 4
#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
#define SUBPEL_SHIFTS (1 << SUBPEL_BITS)
#define SUBPEL_TAPS 8
#define BLOCK_HEIGHT_WIDTH 4
#define VP9_FILTER_WEIGHT 128
#define VP9_FILTER_SHIFT 7

extern const int16_t vp9_bilinear_filters[SUBPEL_SHIFTS][SUBPEL_TAPS];
extern const int16_t vp9_sub_pel_filters_6[SUBPEL_SHIFTS][SUBPEL_TAPS];
extern const int16_t vp9_sub_pel_filters_8[SUBPEL_SHIFTS][SUBPEL_TAPS];
extern const int16_t vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][SUBPEL_TAPS];
extern const int16_t vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS][SUBPEL_TAPS];
#define SUBPEL_SHIFTS 16

extern const int16_t vp9_bilinear_filters[SUBPEL_SHIFTS][8];
extern const int16_t vp9_sub_pel_filters_6[SUBPEL_SHIFTS][8];
extern const int16_t vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8];
extern const int16_t vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8];
extern const int16_t vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS][8];

// The VP9_BILINEAR_FILTERS_2TAP macro returns a pointer to the bilinear
// filter kernel as a 2 tap filter.
#define BILINEAR_FILTERS_2TAP(x) \
(vp9_bilinear_filters[(x)] + SUBPEL_TAPS/2 - 1)
#define BF_LENGTH (sizeof(vp9_bilinear_filters[0]) / \
sizeof(vp9_bilinear_filters[0][0]))
#define BF_OFFSET (BF_LENGTH / 2 - 1)
#define VP9_BILINEAR_FILTERS_2TAP(x) (vp9_bilinear_filters[x] + BF_OFFSET)

#endif // VP9_COMMON_VP9_FILTER_H_

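Usage sketch for the macro above: since BF_OFFSET evaluates to 3 for the 8-tap rows, VP9_BILINEAR_FILTERS_2TAP(x) points at the two nonzero center taps; for x = 4 the row is { 0, 0, 0, 96, 32, 0, 0, 0 }, so the pair is { 96, 32 } and always sums to VP9_FILTER_WEIGHT. Illustrative helper, not part of this header:

static int16_t bilinear_2tap_sum(int x) {
  const int16_t *taps = VP9_BILINEAR_FILTERS_2TAP(x & (SUBPEL_SHIFTS - 1));
  return taps[0] + taps[1];  /* == 128 (VP9_FILTER_WEIGHT) for every x */
}
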
@@ -8,8 +8,11 @@
* be found in the AUTHORS file in the root of the source tree.
*/

#include <limits.h>

#include "vp9/common/vp9_findnearmv.h"
#include "vp9/common/vp9_mvref_common.h"
#include "vp9/common/vp9_sadmxn.h"

static void lower_mv_precision(MV *mv, int allow_hp) {
const int use_hp = allow_hp && vp9_use_mv_hp(mv);
@@ -43,14 +46,17 @@ void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
int mi_row, int mi_col) {
int_mv dst_list[MAX_MV_REF_CANDIDATES];
int_mv mv_list[MAX_MV_REF_CANDIDATES];
MODE_INFO *const mi = xd->this_mi;
MODE_INFO *mi = xd->mode_info_context;
MB_MODE_INFO *const mbmi = &mi->mbmi;

assert(ref_idx == 0 || ref_idx == 1);
assert(MAX_MV_REF_CANDIDATES == 2); // makes code here slightly easier

vp9_find_mv_refs_idx(cm, xd, mi, xd->last_mi,
mi->mbmi.ref_frame[ref_idx],
mv_list, block_idx, mi_row, mi_col);
vp9_find_mv_refs_idx(cm, xd, xd->mode_info_context,
xd->prev_mode_info_context,
mbmi->ref_frame[ref_idx],
mv_list, cm->ref_frame_sign_bias, block_idx,
mi_row, mi_col);

dst_list[1].as_int = 0;
if (block_idx == 0) {

@@ -36,57 +36,48 @@ static void clamp_mv2(MV *mv, const MACROBLOCKD *xd) {
xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
}

void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm,
void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *pc,
MACROBLOCKD *xd,
int_mv *dst_nearest,
int_mv *dst_near,
int block_idx, int ref_idx,
int mi_row, int mi_col);

static MB_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb,
const MODE_INFO *left_mb, int b) {
static MB_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, int b) {
// FIXME(rbultje, jingning): temporary hack because jenkins doesn't
// understand this condition. This will go away soon.
const MODE_INFO *mi = cur_mb;

if (b == 0 || b == 2) {
/* On L edge, get from MB to left of us */
mi = left_mb;
if (!mi)
return DC_PRED;
--cur_mb;

if (mi->mbmi.ref_frame[0] != INTRA_FRAME) {
if (is_inter_block(&cur_mb->mbmi)) {
return DC_PRED;
} else if (mi->mbmi.sb_type < BLOCK_8X8) {
return ((mi->bmi + 1 + b)->as_mode);
} else if (cur_mb->mbmi.sb_type < BLOCK_8X8) {
return (cur_mb->bmi + 1 + b)->as_mode;
} else {
return mi->mbmi.mode;
return cur_mb->mbmi.mode;
}
}
assert(b == 1 || b == 3);
return (mi->bmi + b - 1)->as_mode;
return (cur_mb->bmi + b - 1)->as_mode;
}

static MB_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb,
const MODE_INFO *above_mb, int b) {
const MODE_INFO *mi = cur_mb;

int b, int mi_stride) {
if (!(b >> 1)) {
/* On top edge, get from MB above us */
mi = above_mb;
if (!mi)
return DC_PRED;
cur_mb -= mi_stride;

if (mi->mbmi.ref_frame[0] != INTRA_FRAME) {
if (is_inter_block(&cur_mb->mbmi)) {
return DC_PRED;
} else if (mi->mbmi.sb_type < BLOCK_8X8) {
return ((mi->bmi + 2 + b)->as_mode);
} else if (cur_mb->mbmi.sb_type < BLOCK_8X8) {
return (cur_mb->bmi + 2 + b)->as_mode;
} else {
return mi->mbmi.mode;
return cur_mb->mbmi.mode;
}
}

return (mi->bmi + b - 2)->as_mode;
return (cur_mb->bmi + b - 2)->as_mode;
}

#endif // VP9_COMMON_VP9_FINDNEARMV_H_

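The refactor above drops the explicitly passed left/above MODE_INFO pointers in favor of pointer arithmetic on the row-major mode-info grid. A sketch of the addressing it relies on (variable names are illustrative):

/* cur_mb points into a row-major MODE_INFO array with pitch mi_stride. */
const MODE_INFO *left  = cur_mb - 1;          /* same row, previous column */
const MODE_INFO *above = cur_mb - mi_stride;  /* previous row, same column */
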
@@ -27,9 +27,6 @@
#define pair_set_epi16(a, b) \
_mm_set1_epi32(((uint16_t)(a)) + (((uint16_t)(b)) << 16))

#define pair_set_epi32(a, b) \
_mm_set_epi32(b, a, b, a)

// Constants:
// for (int i = 1; i< 32; ++i)
// printf("static const int cospi_%d_64 = %.0f;\n", i,

@@ -22,217 +22,13 @@ struct loop_filter_info {
const uint8_t *hev_thr;
};

// This structure holds bit masks for all 8x8 blocks in a 64x64 region.
// Each 1 bit represents a position in which we want to apply the loop filter.
// Left_ entries refer to whether we apply a filter on the border to the
// left of the block. Above_ entries refer to whether or not to apply a
// filter on the above border. Int_ entries refer to whether or not to
// apply borders on the 4x4 edges within the 8x8 block that each bit
// represents.
// Since each transform is accompanied by a potentially different type of
// loop filter there is a different entry in the array for each transform size.
typedef struct {
uint64_t left_y[TX_SIZES];
uint64_t above_y[TX_SIZES];
uint64_t int_4x4_y;
uint16_t left_uv[TX_SIZES];
uint16_t above_uv[TX_SIZES];
uint16_t int_4x4_uv;
} LOOP_FILTER_MASK;

// 64 bit masks for left transform size. Each 1 represents a position where
// we should apply a loop filter across the left border of an 8x8 block
// boundary.
//
// In the case of TX_16X16 -> ( low order byte first ) we end up with
// a mask that looks like this
//
// 10101010
// 10101010
// 10101010
// 10101010
// 10101010
// 10101010
// 10101010
// 10101010
//
// A loopfilter should be applied to every other 8x8 horizontally.
static const uint64_t left_64x64_txform_mask[TX_SIZES]= {
0xffffffffffffffff, // TX_4X4
0xffffffffffffffff, // TX_8x8
0x5555555555555555, // TX_16x16
0x1111111111111111, // TX_32x32
};

// 64 bit masks for above transform size. Each 1 represents a position where
// we should apply a loop filter across the top border of an 8x8 block
// boundary.
//
// In the case of TX_32x32 -> ( low order byte first ) we end up with
// a mask that looks like this
//
// 11111111
// 00000000
// 00000000
// 00000000
// 11111111
// 00000000
// 00000000
// 00000000
//
// A loopfilter should be applied every fourth row vertically.
static const uint64_t above_64x64_txform_mask[TX_SIZES]= {
0xffffffffffffffff, // TX_4X4
0xffffffffffffffff, // TX_8x8
0x00ff00ff00ff00ff, // TX_16x16
0x000000ff000000ff, // TX_32x32
};

// 64 bit masks for prediction sizes (left). Each 1 represents a position
// at the left border of an 8x8 block. These are aligned to the right most
// appropriate bit, and then shifted into place.
//
// In the case of TX_16x32 -> ( low order byte first ) we end up with
// a mask that looks like this :
//
// 10000000
// 10000000
// 10000000
// 10000000
// 00000000
// 00000000
// 00000000
// 00000000
static const uint64_t left_prediction_mask[BLOCK_SIZES] = {
0x0000000000000001, // BLOCK_4X4,
0x0000000000000001, // BLOCK_4X8,
0x0000000000000001, // BLOCK_8X4,
0x0000000000000001, // BLOCK_8X8,
0x0000000000000101, // BLOCK_8X16,
0x0000000000000001, // BLOCK_16X8,
0x0000000000000101, // BLOCK_16X16,
0x0000000001010101, // BLOCK_16X32,
0x0000000000000101, // BLOCK_32X16,
0x0000000001010101, // BLOCK_32X32,
0x0101010101010101, // BLOCK_32X64,
0x0000000001010101, // BLOCK_64X32,
0x0101010101010101, // BLOCK_64X64
};

// 64 bit mask to shift and set for each prediction size.
static const uint64_t above_prediction_mask[BLOCK_SIZES] = {
0x0000000000000001, // BLOCK_4X4
0x0000000000000001, // BLOCK_4X8
0x0000000000000001, // BLOCK_8X4
0x0000000000000001, // BLOCK_8X8
0x0000000000000001, // BLOCK_8X16,
0x0000000000000003, // BLOCK_16X8
0x0000000000000003, // BLOCK_16X16
0x0000000000000003, // BLOCK_16X32,
0x000000000000000f, // BLOCK_32X16,
0x000000000000000f, // BLOCK_32X32,
0x000000000000000f, // BLOCK_32X64,
0x00000000000000ff, // BLOCK_64X32,
0x00000000000000ff, // BLOCK_64X64
};
// 64 bit mask to shift and set for each prediction size. A bit is set for
// each 8x8 block that would be in the left most block of the given block
// size in the 64x64 block.
static const uint64_t size_mask[BLOCK_SIZES] = {
0x0000000000000001, // BLOCK_4X4
0x0000000000000001, // BLOCK_4X8
0x0000000000000001, // BLOCK_8X4
0x0000000000000001, // BLOCK_8X8
0x0000000000000101, // BLOCK_8X16,
0x0000000000000003, // BLOCK_16X8
0x0000000000000303, // BLOCK_16X16
0x0000000003030303, // BLOCK_16X32,
0x0000000000000f0f, // BLOCK_32X16,
0x000000000f0f0f0f, // BLOCK_32X32,
0x0f0f0f0f0f0f0f0f, // BLOCK_32X64,
0x00000000ffffffff, // BLOCK_64X32,
0xffffffffffffffff, // BLOCK_64X64
};

// These are used for masking the left and above borders.
static const uint64_t left_border = 0x1111111111111111;
static const uint64_t above_border = 0x000000ff000000ff;

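A hedged sketch of how these tables compose: the per-size bit pattern is shifted to the block's position inside the 64x64 region, one bit per 8x8 unit and eight bits per row, which is exactly what the shift arguments in build_masks below encode. left_mask_16x16_at is illustrative, not part of the file:

#include <stdint.h>

static uint64_t left_mask_16x16_at(int row8, int col8) {
  const uint64_t pattern = 0x0000000000000101;  /* left_prediction_mask[BLOCK_16X16] */
  return pattern << (row8 * 8 + col8);          /* position in the 64x64 grid */
}
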
// 16 bit masks for uv transform sizes.
static const uint16_t left_64x64_txform_mask_uv[TX_SIZES]= {
0xffff, // TX_4X4
0xffff, // TX_8x8
0x5555, // TX_16x16
0x1111, // TX_32x32
};

static const uint16_t above_64x64_txform_mask_uv[TX_SIZES]= {
0xffff, // TX_4X4
0xffff, // TX_8x8
0x0f0f, // TX_16x16
0x000f, // TX_32x32
};

// 16 bit left mask to shift and set for each uv prediction size.
static const uint16_t left_prediction_mask_uv[BLOCK_SIZES] = {
0x0001, // BLOCK_4X4,
0x0001, // BLOCK_4X8,
0x0001, // BLOCK_8X4,
0x0001, // BLOCK_8X8,
0x0001, // BLOCK_8X16,
0x0001, // BLOCK_16X8,
0x0001, // BLOCK_16X16,
0x0011, // BLOCK_16X32,
0x0001, // BLOCK_32X16,
0x0011, // BLOCK_32X32,
0x1111, // BLOCK_32X64
0x0011, // BLOCK_64X32,
0x1111, // BLOCK_64X64
};
// 16 bit above mask to shift and set for uv each prediction size.
static const uint16_t above_prediction_mask_uv[BLOCK_SIZES] = {
0x0001, // BLOCK_4X4
0x0001, // BLOCK_4X8
0x0001, // BLOCK_8X4
0x0001, // BLOCK_8X8
0x0001, // BLOCK_8X16,
0x0001, // BLOCK_16X8
0x0001, // BLOCK_16X16
0x0001, // BLOCK_16X32,
0x0003, // BLOCK_32X16,
0x0003, // BLOCK_32X32,
0x0003, // BLOCK_32X64,
0x000f, // BLOCK_64X32,
0x000f, // BLOCK_64X64
};

// 16 bit mask to shift and set for each uv prediction size
static const uint16_t size_mask_uv[BLOCK_SIZES] = {
0x0001, // BLOCK_4X4
0x0001, // BLOCK_4X8
0x0001, // BLOCK_8X4
0x0001, // BLOCK_8X8
0x0001, // BLOCK_8X16,
0x0001, // BLOCK_16X8
0x0001, // BLOCK_16X16
0x0011, // BLOCK_16X32,
0x0003, // BLOCK_32X16,
0x0033, // BLOCK_32X32,
0x3333, // BLOCK_32X64,
0x00ff, // BLOCK_64X32,
0xffff, // BLOCK_64X64
};
static const uint16_t left_border_uv = 0x1111;
static const uint16_t above_border_uv = 0x000f;


static void lf_init_lut(loop_filter_info_n *lfi) {
lfi->mode_lf_lut[DC_PRED] = 0;
lfi->mode_lf_lut[D45_PRED] = 0;
lfi->mode_lf_lut[D135_PRED] = 0;
lfi->mode_lf_lut[D117_PRED] = 0;
lfi->mode_lf_lut[D153_PRED] = 0;
lfi->mode_lf_lut[D207_PRED] = 0;
lfi->mode_lf_lut[D27_PRED] = 0;
lfi->mode_lf_lut[D63_PRED] = 0;
lfi->mode_lf_lut[V_PRED] = 0;
lfi->mode_lf_lut[H_PRED] = 0;
@@ -243,7 +39,7 @@ static void lf_init_lut(loop_filter_info_n *lfi) {
lfi->mode_lf_lut[NEWMV] = 1;
}

static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) {
static void update_sharpness(loop_filter_info_n *const lfi, int sharpness_lvl) {
int lvl;

// For each possible value for the loop filter fill out limits
@@ -265,9 +61,8 @@ static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) {
}
}

void vp9_loop_filter_init(VP9_COMMON *cm) {
void vp9_loop_filter_init(VP9_COMMON *cm, struct loopfilter *lf) {
loop_filter_info_n *lfi = &cm->lf_info;
struct loopfilter *lf = &cm->lf;
int i;

// init limits for given sharpness
@@ -282,15 +77,16 @@ void vp9_loop_filter_init(VP9_COMMON *cm) {
vpx_memset(lfi->hev_thr[i], i, SIMD_WIDTH);
}

void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) {
void vp9_loop_filter_frame_init(VP9_COMMON *const cm, MACROBLOCKD *const xd,
int default_filt_lvl) {
int seg_id;
// n_shift is the multiplier for lf_deltas
// the multiplier is 1 for when filter_lvl is between 0 and 31;
// 2 when filter_lvl is between 32 and 63
const int n_shift = default_filt_lvl >> 5;
loop_filter_info_n *const lfi = &cm->lf_info;
struct loopfilter *const lf = &cm->lf;
struct segmentation *const seg = &cm->seg;
struct loopfilter *const lf = &xd->lf;
struct segmentation *const seg = &xd->seg;

// update limits if sharpness has changed
if (lf->last_sharpness_level != lf->sharpness_level) {
@@ -302,7 +98,7 @@ void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) {
int lvl_seg = default_filt_lvl, ref, mode, intra_lvl;

// Set the baseline filter values for each segment
if (vp9_segfeature_active(seg, seg_id, SEG_LVL_ALT_LF)) {
if (vp9_segfeature_active(&xd->seg, seg_id, SEG_LVL_ALT_LF)) {
const int data = vp9_get_segdata(seg, seg_id, SEG_LVL_ALT_LF);
lvl_seg = seg->abs_delta == SEGMENT_ABSDATA
? data
@@ -312,7 +108,7 @@ void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) {
if (!lf->mode_ref_delta_enabled) {
// we could get rid of this if we assume that deltas are set to
// zero when not in use; encoder always uses deltas
vpx_memset(lfi->lvl[seg_id], lvl_seg, sizeof(lfi->lvl[seg_id]));
vpx_memset(lfi->lvl[seg_id][0], lvl_seg, 4 * 4);
continue;
}

@@ -328,9 +124,9 @@ void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) {
}
}

static int build_lfi(const loop_filter_info_n *lfi_n,
const MB_MODE_INFO *mbmi,
struct loop_filter_info *lfi) {
static int build_lfi(const loop_filter_info_n *const lfi_n,
const MB_MODE_INFO *const mbmi,
struct loop_filter_info *const lfi) {
const int seg = mbmi->segment_id;
const int ref = mbmi->ref_frame[0];
const int mode = lfi_n->mode_lf_lut[mbmi->mode];
@@ -440,360 +236,10 @@ static void filter_selectively_horiz(uint8_t *s, int pitch,
}
}

// This function ors into the current lfm structure, where to do loop
// filters for the specific mi we are looking at. It uses information
// including the block_size_type (32x16, 32x32, etc), the transform size,
// whether there were any coefficients encoded, and the loop filter strength
// of the block we are currently looking at. Shift is used to position the
// 1's we produce.
// TODO(JBB) Need another function for different resolution color..
static void build_masks(const loop_filter_info_n *const lfi_n,
const MODE_INFO *mi, const int shift_y,
const int shift_uv,
LOOP_FILTER_MASK *lfm) {
const BLOCK_SIZE block_size = mi->mbmi.sb_type;
const TX_SIZE tx_size_y = mi->mbmi.tx_size;
const TX_SIZE tx_size_uv = get_uv_tx_size(&mi->mbmi);
const int skip = mi->mbmi.skip_coeff;
const int seg = mi->mbmi.segment_id;
const int ref = mi->mbmi.ref_frame[0];
const int mode = lfi_n->mode_lf_lut[mi->mbmi.mode];
const int filter_level = lfi_n->lvl[seg][ref][mode];
uint64_t *left_y = &lfm->left_y[tx_size_y];
uint64_t *above_y = &lfm->above_y[tx_size_y];
uint64_t *int_4x4_y = &lfm->int_4x4_y;
uint16_t *left_uv = &lfm->left_uv[tx_size_uv];
uint16_t *above_uv = &lfm->above_uv[tx_size_uv];
uint16_t *int_4x4_uv = &lfm->int_4x4_uv;

// If filter level is 0 we don't loop filter.
if (!filter_level)
return;

// These set 1 in the current block size for the block size edges.
// For instance if the block size is 32x16, we'll set :
// above = 1111
// 0000
// and
// left = 1000
// = 1000
// NOTE : In this example the low bit is left most ( 1000 ) is stored as
// 1, not 8...
//
// U and v set things on a 16 bit scale.
//
*above_y |= above_prediction_mask[block_size] << shift_y;
*above_uv |= above_prediction_mask_uv[block_size] << shift_uv;
*left_y |= left_prediction_mask[block_size] << shift_y;
*left_uv |= left_prediction_mask_uv[block_size] << shift_uv;

// If the block has no coefficients and is not intra we skip applying
// the loop filter on block edges.
if (skip && ref > INTRA_FRAME)
return;

// Here we are adding a mask for the transform size. The transform
// size mask is set to be correct for a 64x64 prediction block size. We
// mask to match the size of the block we are working on and then shift it
// into place..
*above_y |= (size_mask[block_size] &
above_64x64_txform_mask[tx_size_y]) << shift_y;
*above_uv |= (size_mask_uv[block_size] &
above_64x64_txform_mask_uv[tx_size_uv]) << shift_uv;

*left_y |= (size_mask[block_size] &
left_64x64_txform_mask[tx_size_y]) << shift_y;
*left_uv |= (size_mask_uv[block_size] &
left_64x64_txform_mask_uv[tx_size_uv]) << shift_uv;

// Here we are trying to determine what to do with the internal 4x4 block
// boundaries. These differ from the 4x4 boundaries on the outside edge of
// an 8x8 in that the internal ones can be skipped and don't depend on
// the prediction block size.
if (tx_size_y == TX_4X4) {
*int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffff) << shift_y;
}
if (tx_size_uv == TX_4X4) {
*int_4x4_uv |= (size_mask_uv[block_size] & 0xffff) << shift_uv;
}
}

// This function does the same thing as the one above with the exception that
// it only affects the y masks. It exists because for blocks < 16x16 in size,
// we only update u and v masks on the first block.
static void build_y_mask(const loop_filter_info_n *const lfi_n,
const MODE_INFO *mi, const int shift_y,
LOOP_FILTER_MASK *lfm) {
const BLOCK_SIZE block_size = mi->mbmi.sb_type;
const TX_SIZE tx_size_y = mi->mbmi.tx_size;
const int skip = mi->mbmi.skip_coeff;
const int seg = mi->mbmi.segment_id;
const int ref = mi->mbmi.ref_frame[0];
const int mode = lfi_n->mode_lf_lut[mi->mbmi.mode];
const int filter_level = lfi_n->lvl[seg][ref][mode];
uint64_t *left_y = &lfm->left_y[tx_size_y];
uint64_t *above_y = &lfm->above_y[tx_size_y];
uint64_t *int_4x4_y = &lfm->int_4x4_y;

if (!filter_level)
return;

*above_y |= above_prediction_mask[block_size] << shift_y;
*left_y |= left_prediction_mask[block_size] << shift_y;

if (skip && ref > INTRA_FRAME)
return;

*above_y |= (size_mask[block_size] &
above_64x64_txform_mask[tx_size_y]) << shift_y;

*left_y |= (size_mask[block_size] &
left_64x64_txform_mask[tx_size_y]) << shift_y;

if (tx_size_y == TX_4X4) {
*int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffff) << shift_y;
}
}

// This function sets up the bit masks for the entire 64x64 region represented
// by mi_row, mi_col.
// TODO(JBB): This function only works for yv12.
static void setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
                       MODE_INFO **mi_8x8, const int mode_info_stride,
                       LOOP_FILTER_MASK *lfm) {
  int idx_32, idx_16, idx_8;
  const loop_filter_info_n *const lfi_n = &cm->lf_info;
  MODE_INFO **mip = mi_8x8;
  MODE_INFO **mip2 = mi_8x8;

  // These are offsets to the next mi in the 64x64 block. It is what gets
  // added to the mi ptr as we go through each loop. It saves us from
  // setting up special row and column counters for each index. The last step
  // brings us back to the starting position.
  const int offset_32[] = {4, (mode_info_stride << 2) - 4, 4,
                           -(mode_info_stride << 2) - 4};
  const int offset_16[] = {2, (mode_info_stride << 1) - 2, 2,
                           -(mode_info_stride << 1) - 2};
  const int offset[] = {1, mode_info_stride - 1, 1, -mode_info_stride - 1};

  // The following variables represent shifts used to position the current
  // block mask over the appropriate block. A left shift of 36 will move the
  // bits for the final 32 by 32 block in the 64x64 down 4 rows and over
  // 4 columns to the appropriate spot.
  const int shift_32_y[] = {0, 4, 32, 36};
  const int shift_16_y[] = {0, 2, 16, 18};
  const int shift_8_y[] = {0, 1, 8, 9};
  const int shift_32_uv[] = {0, 2, 8, 10};
  const int shift_16_uv[] = {0, 1, 4, 5};
  int i;
  const int max_rows = (mi_row + MI_BLOCK_SIZE > cm->mi_rows ?
                        cm->mi_rows - mi_row : MI_BLOCK_SIZE);
  const int max_cols = (mi_col + MI_BLOCK_SIZE > cm->mi_cols ?
                        cm->mi_cols - mi_col : MI_BLOCK_SIZE);

  vp9_zero(*lfm);
  // TODO(jimbankoski): Try moving most of the following code into decode
  // loop and storing lfm in the mbmi structure so that we don't have to go
  // through the recursive loop structure multiple times.
  switch (mip[0]->mbmi.sb_type) {
    case BLOCK_64X64:
      build_masks(lfi_n, mip[0], 0, 0, lfm);
      break;
    case BLOCK_64X32:
      build_masks(lfi_n, mip[0], 0, 0, lfm);
      mip2 = mip + mode_info_stride * 4;
      if (4 >= max_rows)
        break;
      build_masks(lfi_n, mip2[0], 32, 8, lfm);
      break;
    case BLOCK_32X64:
      build_masks(lfi_n, mip[0], 0, 0, lfm);
      mip2 = mip + 4;
      if (4 >= max_cols)
        break;
      build_masks(lfi_n, mip2[0], 4, 2, lfm);
      break;
    default:
      for (idx_32 = 0; idx_32 < 4; mip += offset_32[idx_32], ++idx_32) {
        const int shift_y = shift_32_y[idx_32];
        const int shift_uv = shift_32_uv[idx_32];
        const int mi_32_col_offset = ((idx_32 & 1) << 2);
        const int mi_32_row_offset = ((idx_32 >> 1) << 2);
        if (mi_32_col_offset >= max_cols || mi_32_row_offset >= max_rows)
          continue;
        switch (mip[0]->mbmi.sb_type) {
          case BLOCK_32X32:
            build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
            break;
          case BLOCK_32X16:
            build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
            if (mi_32_row_offset + 2 >= max_rows)
              continue;
            mip2 = mip + mode_info_stride * 2;
            build_masks(lfi_n, mip2[0], shift_y + 16, shift_uv + 4, lfm);
            break;
          case BLOCK_16X32:
            build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
            if (mi_32_col_offset + 2 >= max_cols)
              continue;
            mip2 = mip + 2;
            build_masks(lfi_n, mip2[0], shift_y + 2, shift_uv + 1, lfm);
            break;
          default:
            for (idx_16 = 0; idx_16 < 4; mip += offset_16[idx_16], ++idx_16) {
              const int shift_y = shift_32_y[idx_32] + shift_16_y[idx_16];
              const int shift_uv = shift_32_uv[idx_32] + shift_16_uv[idx_16];
              const int mi_16_col_offset = mi_32_col_offset +
                                           ((idx_16 & 1) << 1);
              const int mi_16_row_offset = mi_32_row_offset +
                                           ((idx_16 >> 1) << 1);

              if (mi_16_col_offset >= max_cols || mi_16_row_offset >= max_rows)
                continue;

              switch (mip[0]->mbmi.sb_type) {
                case BLOCK_16X16:
                  build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
                  break;
                case BLOCK_16X8:
                  build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
                  if (mi_16_row_offset + 1 >= max_rows)
                    continue;
                  mip2 = mip + mode_info_stride;
                  build_y_mask(lfi_n, mip2[0], shift_y + 8, lfm);
                  break;
                case BLOCK_8X16:
                  build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
                  if (mi_16_col_offset + 1 >= max_cols)
                    continue;
                  mip2 = mip + 1;
                  build_y_mask(lfi_n, mip2[0], shift_y + 1, lfm);
                  break;
                default: {
                  const int shift_y = shift_32_y[idx_32] +
                                      shift_16_y[idx_16] +
                                      shift_8_y[0];
                  build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
                  mip += offset[0];
                  for (idx_8 = 1; idx_8 < 4; mip += offset[idx_8], ++idx_8) {
                    const int shift_y = shift_32_y[idx_32] +
                                        shift_16_y[idx_16] +
                                        shift_8_y[idx_8];
                    const int mi_8_col_offset = mi_16_col_offset +
                                                ((idx_8 & 1));
                    const int mi_8_row_offset = mi_16_row_offset +
                                                ((idx_8 >> 1));

                    if (mi_8_col_offset >= max_cols ||
                        mi_8_row_offset >= max_rows)
                      continue;
                    build_y_mask(lfi_n, mip[0], shift_y, lfm);
                  }
                  break;
                }
              }
            }
            break;
        }
      }
      break;
  }
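The offset tables driving the loops above encode a Z-order walk over the four sub-blocks at each level: step right, step down and back left, step right again, and finally return to the start. The same construction can be checked on a plain linear index; the stride value and the printing here are assumptions for illustration only.

#include <stdio.h>

int main(void) {
  const int stride = 8;  // assumed mode_info_stride, 8 mi units per row
  // Same construction as offset_32[]: 4 right, 4 down and 4 back, 4
  // right again, then 4 up and 4 back to the starting position.
  const int offset_32[] = {4, (stride << 2) - 4, 4, -(stride << 2) - 4};
  int pos = 0;  // linear index into the mi grid
  int idx_32;
  for (idx_32 = 0; idx_32 < 4; pos += offset_32[idx_32], ++idx_32)
    printf("quadrant %d at (row %d, col %d)\n",
           idx_32, pos / stride, pos % stride);
  // Prints (0,0), (0,4), (4,0), (4,4): the four 32x32 quadrants in
  // Z-order. The final offset returns pos to 0.
  return 0;
}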
  // The largest loopfilter we have is 16x16 so we use the 16x16 mask
  // for 32x32 transforms as well.
  lfm->left_y[TX_16X16] |= lfm->left_y[TX_32X32];
  lfm->above_y[TX_16X16] |= lfm->above_y[TX_32X32];
  lfm->left_uv[TX_16X16] |= lfm->left_uv[TX_32X32];
  lfm->above_uv[TX_16X16] |= lfm->above_uv[TX_32X32];

  // We do at least an 8 tap filter on every 32x32 even if the transform size
  // is 4x4. So if the 4x4 is set on a border pixel add it to the 8x8 and
  // remove it from the 4x4.
  lfm->left_y[TX_8X8] |= lfm->left_y[TX_4X4] & left_border;
  lfm->left_y[TX_4X4] &= ~left_border;
  lfm->above_y[TX_8X8] |= lfm->above_y[TX_4X4] & above_border;
  lfm->above_y[TX_4X4] &= ~above_border;
  lfm->left_uv[TX_8X8] |= lfm->left_uv[TX_4X4] & left_border_uv;
  lfm->left_uv[TX_4X4] &= ~left_border_uv;
  lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_4X4] & above_border_uv;
  lfm->above_uv[TX_4X4] &= ~above_border_uv;

  // We do some special edge handling.
  if (mi_row + MI_BLOCK_SIZE > cm->mi_rows) {
    const uint64_t rows = cm->mi_rows - mi_row;

    // Each pixel inside the border gets a 1.
    const uint64_t mask_y = (((uint64_t) 1 << (rows << 3)) - 1);
    const uint16_t mask_uv = (((uint16_t) 1 << (((rows + 1) >> 1) << 2)) - 1);

    // Remove values completely outside our border.
    for (i = 0; i < TX_32X32; i++) {
      lfm->left_y[i] &= mask_y;
      lfm->above_y[i] &= mask_y;
      lfm->left_uv[i] &= mask_uv;
      lfm->above_uv[i] &= mask_uv;
    }
    lfm->int_4x4_y &= mask_y;
    lfm->int_4x4_uv &= mask_uv;

    // We don't apply a wide loop filter on the last uv block row. If set
    // apply the shorter one instead.
    if (rows == 1) {
      lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_16X16];
      lfm->above_uv[TX_16X16] = 0;
    }
    if (rows == 5) {
      lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_16X16] & 0xff00;
      lfm->above_uv[TX_16X16] &= ~(lfm->above_uv[TX_16X16] & 0xff00);
    }
  }

  if (mi_col + MI_BLOCK_SIZE > cm->mi_cols) {
    const uint64_t columns = cm->mi_cols - mi_col;

    // Each pixel inside the border gets a 1, the multiply copies the border
    // to where we need it.
    const uint64_t mask_y = (((1 << columns) - 1)) * 0x0101010101010101;
    const uint16_t mask_uv = ((1 << ((columns + 1) >> 1)) - 1) * 0x1111;

    // Internal edges are not applied on the last column of the image so
    // we mask one more column for the internal edges.
    const uint16_t mask_uv_int = ((1 << (columns >> 1)) - 1) * 0x1111;

    // Remove the bits outside the image edge.
    for (i = 0; i < TX_32X32; i++) {
      lfm->left_y[i] &= mask_y;
      lfm->above_y[i] &= mask_y;
      lfm->left_uv[i] &= mask_uv;
      lfm->above_uv[i] &= mask_uv;
    }
    lfm->int_4x4_y &= mask_y;
    lfm->int_4x4_uv &= mask_uv_int;

    // We don't apply a wide loop filter on the last uv column. If set
    // apply the shorter one instead.
    if (columns == 1) {
      lfm->left_uv[TX_8X8] |= lfm->left_uv[TX_16X16];
      lfm->left_uv[TX_16X16] = 0;
    }
    if (columns == 5) {
      lfm->left_uv[TX_8X8] |= (lfm->left_uv[TX_16X16] & 0xcccc);
      lfm->left_uv[TX_16X16] &= ~(lfm->left_uv[TX_16X16] & 0xcccc);
    }
  }
  // We don't apply a loop filter on the first column in the image. Mask that
  // out.
  if (mi_col == 0) {
    for (i = 0; i < TX_32X32; i++) {
      lfm->left_y[i] &= 0xfefefefefefefefe;
      lfm->left_uv[i] &= 0xeeee;
    }
  }
}
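The multiply in the column-border case above is a broadcast trick: (1 << columns) - 1 builds one row's worth of border bits, and multiplying by 0x0101010101010101 copies that row into all eight rows of the 64-bit mask. A quick standalone check of the idea, with an illustrative column count:

#include <stdint.h>
#include <stdio.h>

int main(void) {
  const uint64_t columns = 3;  // e.g. 3 mi columns remain inside the image
  // One row of border bits, broadcast to all 8 rows: every byte of the
  // product holds the same 00000111 pattern.
  const uint64_t mask_y = ((1ULL << columns) - 1) * 0x0101010101010101ULL;
  printf("%016llx\n", (unsigned long long)mask_y);  // 0707070707070707
  return 0;
}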
#if CONFIG_NON420
static void filter_block_plane_non420(VP9_COMMON *cm,
                                      struct macroblockd_plane *plane,
                                      MODE_INFO **mi_8x8,
                                      int mi_row, int mi_col) {
static void filter_block_plane(VP9_COMMON *const cm,
                               struct macroblockd_plane *const plane,
                               const MODE_INFO *mi,
                               int mi_row, int mi_col) {
  const int ss_x = plane->subsampling_x;
  const int ss_y = plane->subsampling_y;
  const int row_step = 1 << ss_x;
@@ -816,25 +262,24 @@ static void filter_block_plane_non420(VP9_COMMON *cm,

    // Determine the vertical edges that need filtering
    for (c = 0; c < MI_BLOCK_SIZE && mi_col + c < cm->mi_cols; c += col_step) {
      const MODE_INFO *mi = mi_8x8[c];
      const int skip_this = mi[0].mbmi.skip_coeff
                            && is_inter_block(&mi[0].mbmi);
      const int skip_this = mi[c].mbmi.mb_skip_coeff
                            && is_inter_block(&mi[c].mbmi);
      // left edge of current unit is block/partition edge -> no skip
      const int block_edge_left = b_width_log2(mi[0].mbmi.sb_type) ?
          !(c & ((1 << (b_width_log2(mi[0].mbmi.sb_type) - 1)) - 1)) : 1;
      const int block_edge_left = b_width_log2(mi[c].mbmi.sb_type) ?
          !(c & ((1 << (b_width_log2(mi[c].mbmi.sb_type) - 1)) - 1)) : 1;
      const int skip_this_c = skip_this && !block_edge_left;
      // top edge of current unit is block/partition edge -> no skip
      const int block_edge_above = b_height_log2(mi[0].mbmi.sb_type) ?
          !(r & ((1 << (b_height_log2(mi[0].mbmi.sb_type) - 1)) - 1)) : 1;
      const int block_edge_above = b_height_log2(mi[c].mbmi.sb_type) ?
          !(r & ((1 << (b_height_log2(mi[c].mbmi.sb_type) - 1)) - 1)) : 1;
      const int skip_this_r = skip_this && !block_edge_above;
      const TX_SIZE tx_size = (plane->plane_type == PLANE_TYPE_UV)
                                  ? get_uv_tx_size(&mi[0].mbmi)
                                  : mi[0].mbmi.tx_size;
                                  ? get_uv_tx_size(&mi[c].mbmi)
                                  : mi[c].mbmi.txfm_size;
      const int skip_border_4x4_c = ss_x && mi_col + c == cm->mi_cols - 1;
      const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;

      // Filter level can vary per MI
      if (!build_lfi(&cm->lf_info, &mi[0].mbmi, lfi[r] + (c >> ss_x)))
      if (!build_lfi(&cm->lf_info, &mi[c].mbmi, lfi[r] + (c >> ss_x)))
        continue;

      // Build masks based on the transform size of each block
@@ -893,7 +338,7 @@ static void filter_block_plane_non420(VP9_COMMON *cm,
                             mask_4x4_c & border_mask,
                             mask_4x4_int[r], lfi[r]);
    dst->buf += 8 * dst->stride;
    mi_8x8 += row_step_stride;
    mi += row_step_stride;
  }

  // Now do horizontal pass
@@ -910,146 +355,33 @@ static void filter_block_plane_non420(VP9_COMMON *cm,
    dst->buf += 8 * dst->stride;
  }
}
#endif
static void filter_block_plane(VP9_COMMON *const cm,
                               struct macroblockd_plane *const plane,
                               MODE_INFO **mi_8x8,
                               int mi_row, int mi_col,
                               LOOP_FILTER_MASK *lfm) {
  const int ss_x = plane->subsampling_x;
  const int ss_y = plane->subsampling_y;
  const int row_step = 1 << ss_x;
  const int col_step = 1 << ss_y;
  const int row_step_stride = cm->mode_info_stride * row_step;
  struct buf_2d *const dst = &plane->dst;
  uint8_t *const dst0 = dst->buf;
  unsigned int mask_4x4_int[MI_BLOCK_SIZE] = {0};
  struct loop_filter_info lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE];
  int r, c;
  int row_shift = 3 - ss_x;
  int row_mask = 0xff >> (ss_x << 2);

#define MASK_ROW(value) ((value >> (r_sampled << row_shift)) & row_mask)

  for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) {
    int r_sampled = r >> ss_x;

    // Determine the vertical edges that need filtering
    for (c = 0; c < MI_BLOCK_SIZE && mi_col + c < cm->mi_cols; c += col_step) {
      const MODE_INFO *mi = mi_8x8[c];
      if (!build_lfi(&cm->lf_info, &mi[0].mbmi, lfi[r] + (c >> ss_x)))
        continue;
    }
    if (!plane->plane_type) {
      mask_4x4_int[r] = MASK_ROW(lfm->int_4x4_y);
      // Disable filtering on the leftmost column
      filter_selectively_vert(dst->buf, dst->stride,
                              MASK_ROW(lfm->left_y[TX_16X16]),
                              MASK_ROW(lfm->left_y[TX_8X8]),
                              MASK_ROW(lfm->left_y[TX_4X4]),
                              MASK_ROW(lfm->int_4x4_y),
                              lfi[r]);
    } else {
      mask_4x4_int[r] = MASK_ROW(lfm->int_4x4_uv);
      // Disable filtering on the leftmost column
      filter_selectively_vert(dst->buf, dst->stride,
                              MASK_ROW(lfm->left_uv[TX_16X16]),
                              MASK_ROW(lfm->left_uv[TX_8X8]),
                              MASK_ROW(lfm->left_uv[TX_4X4]),
                              MASK_ROW(lfm->int_4x4_uv),
                              lfi[r]);
    }
    dst->buf += 8 * dst->stride;
    mi_8x8 += row_step_stride;
  }

  // Now do horizontal pass
  dst->buf = dst0;
  for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) {
    const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;
    const unsigned int mask_4x4_int_r = skip_border_4x4_r ? 0 : mask_4x4_int[r];
    int r_sampled = r >> ss_x;

    if (!plane->plane_type) {
      filter_selectively_horiz(dst->buf, dst->stride,
                               MASK_ROW(lfm->above_y[TX_16X16]),
                               MASK_ROW(lfm->above_y[TX_8X8]),
                               MASK_ROW(lfm->above_y[TX_4X4]),
                               MASK_ROW(lfm->int_4x4_y),
                               mi_row + r == 0, lfi[r]);
    } else {
      filter_selectively_horiz(dst->buf, dst->stride,
                               MASK_ROW(lfm->above_uv[TX_16X16]),
                               MASK_ROW(lfm->above_uv[TX_8X8]),
                               MASK_ROW(lfm->above_uv[TX_4X4]),
                               mask_4x4_int_r,
                               mi_row + r == 0, lfi[r]);
    }
    dst->buf += 8 * dst->stride;
  }
#undef MASK_ROW
}
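MASK_ROW above slices one row of blocks out of the packed mask: for luma, row r of the grid occupies bits 8r through 8r + 7, so row_shift is 3 and row_mask is 0xff. A standalone sketch of the luma case; the mask value is an assumption chosen to make the output easy to check.

#include <stdint.h>
#include <stdio.h>

int main(void) {
  // Luma: ss_x == 0, so row_shift == 3 and row_mask == 0xff.
  const int row_shift = 3;
  const int row_mask = 0xff;
  // Pretend mask: row 4 of the 8x8 grid has its four left columns set.
  const uint64_t left_y = 0x0fULL << 32;
  int r_sampled;
  for (r_sampled = 0; r_sampled < 8; ++r_sampled) {
    const unsigned row =
        (unsigned)((left_y >> (r_sampled << row_shift)) & row_mask);
    printf("row %d: %02x\n", r_sampled, row);  // 0f at row 4, 00 elsewhere
  }
  return 0;
}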
void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer,
                          VP9_COMMON *cm, MACROBLOCKD *xd,
                          int start, int stop, int y_only) {
  const int num_planes = y_only ? 1 : MAX_MB_PLANE;
  int mi_row, mi_col;
  LOOP_FILTER_MASK lfm;
#if CONFIG_NON420
  int use_420 = y_only || (xd->plane[1].subsampling_y == 1 &&
                           xd->plane[1].subsampling_x == 1);
#endif

  for (mi_row = start; mi_row < stop; mi_row += MI_BLOCK_SIZE) {
    MODE_INFO **mi_8x8 = cm->mi_grid_visible + mi_row * cm->mode_info_stride;
    MODE_INFO *const mi = cm->mi + mi_row * cm->mode_info_stride;

    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {
      int plane;

      setup_dst_planes(xd, frame_buffer, mi_row, mi_col);

      // TODO(JBB): Make setup_mask work for non 420.
#if CONFIG_NON420
      if (use_420)
#endif
        setup_mask(cm, mi_row, mi_col, mi_8x8 + mi_col, cm->mode_info_stride,
                   &lfm);

      for (plane = 0; plane < num_planes; ++plane) {
#if CONFIG_NON420
        if (use_420)
#endif
          filter_block_plane(cm, &xd->plane[plane], mi_8x8 + mi_col, mi_row,
                             mi_col, &lfm);
#if CONFIG_NON420
        else
          filter_block_plane_non420(cm, &xd->plane[plane], mi_8x8 + mi_col,
                                    mi_row, mi_col);
#endif
        filter_block_plane(cm, &xd->plane[plane], mi + mi_col, mi_row, mi_col);
      }
    }
  }
}
void vp9_loop_filter_frame(VP9_COMMON *cm, MACROBLOCKD *xd,
                           int frame_filter_level,
                           int y_only, int partial) {
  int start_mi_row, end_mi_row, mi_rows_to_filter;
                           int frame_filter_level, int y_only) {
  if (!frame_filter_level) return;
  start_mi_row = 0;
  mi_rows_to_filter = cm->mi_rows;
  if (partial && cm->mi_rows > 8) {
    start_mi_row = cm->mi_rows >> 1;
    start_mi_row &= 0xfffffff8;
    mi_rows_to_filter = MAX(cm->mi_rows / 8, 8);
  }
  end_mi_row = start_mi_row + mi_rows_to_filter;
  vp9_loop_filter_frame_init(cm, frame_filter_level);
  vp9_loop_filter_frame_init(cm, xd, frame_filter_level);
  vp9_loop_filter_rows(cm->frame_to_show, cm, xd,
                       start_mi_row, end_mi_row,
                       y_only);
                       0, cm->mi_rows, y_only);
}
int vp9_loop_filter_worker(void *arg1, void *arg2) {

@@ -22,27 +22,6 @@

#define SIMD_WIDTH 16

#define MAX_REF_LF_DELTAS 4
#define MAX_MODE_LF_DELTAS 2

struct loopfilter {
  int filter_level;

  int sharpness_level;
  int last_sharpness_level;

  uint8_t mode_ref_delta_enabled;
  uint8_t mode_ref_delta_update;

  // 0 = Intra, Last, GF, ARF
  signed char ref_deltas[MAX_REF_LF_DELTAS];
  signed char last_ref_deltas[MAX_REF_LF_DELTAS];

  // 0 = ZERO_MV, MV
  signed char mode_deltas[MAX_MODE_LF_DELTAS];
  signed char last_mode_deltas[MAX_MODE_LF_DELTAS];
};

// Need to align this structure so when it is declared and
// passed it can be loaded into vector registers.
typedef struct {
@@ -60,17 +39,19 @@ typedef struct {
struct VP9Common;
struct macroblockd;

void vp9_loop_filter_init(struct VP9Common *cm);
void vp9_loop_filter_init(struct VP9Common *cm, struct loopfilter *lf);

// Update the loop filter for the current frame.
// This should be called before vp9_loop_filter_rows();
// vp9_loop_filter_frame() calls this function directly.
void vp9_loop_filter_frame_init(struct VP9Common *cm, int default_filt_lvl);
void vp9_loop_filter_frame_init(struct VP9Common *const cm,
                                struct macroblockd *const xd,
                                int default_filt_lvl);

void vp9_loop_filter_frame(struct VP9Common *cm,
                           struct macroblockd *mbd,
                           int filter_level,
                           int y_only, int partial);
                           int y_only);

// Apply the loop filter to [start, stop) macro block rows in frame_buffer.
void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer,
@@ -1,4 +1,3 @@

/*
 * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
@@ -37,7 +36,7 @@ static const int mode_2_counter[MB_MODE_COUNT] = {
  9,  // D135_PRED
  9,  // D117_PRED
  9,  // D153_PRED
  9,  // D207_PRED
  9,  // D27_PRED
  9,  // D63_PRED
  9,  // TM_PRED
  0,  // NEARESTMV
@@ -71,33 +70,33 @@ static const int counter_to_context[19] = {
  BOTH_INTRA  // 18
};

static const MV mv_ref_blocks[BLOCK_SIZES][MVREF_NEIGHBOURS] = {
static const int mv_ref_blocks[BLOCK_SIZE_TYPES][MVREF_NEIGHBOURS][2] = {
  // 4X4
  {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}},
  {{0, -1}, {-1, 0}, {-1, -1}, {0, -2}, {-2, 0}, {-1, -2}, {-2, -1}, {-2, -2}},
  // 4X8
  {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}},
  {{0, -1}, {-1, 0}, {-1, -1}, {0, -2}, {-2, 0}, {-1, -2}, {-2, -1}, {-2, -2}},
  // 8X4
  {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}},
  {{0, -1}, {-1, 0}, {-1, -1}, {0, -2}, {-2, 0}, {-1, -2}, {-2, -1}, {-2, -2}},
  // 8X8
  {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}},
  {{0, -1}, {-1, 0}, {-1, -1}, {0, -2}, {-2, 0}, {-1, -2}, {-2, -1}, {-2, -2}},
  // 8X16
  {{0, -1}, {-1, 0}, {1, -1}, {-1, -1}, {0, -2}, {-2, 0}, {-2, -1}, {-1, -2}},
  // 16X8
  {{-1, 0}, {0, -1}, {-1, 1}, {-1, -1}, {-2, 0}, {0, -2}, {-1, -2}, {-2, -1}},
  // 16X8
  {{0, -1}, {-1, 0}, {1, -1}, {-1, -1}, {0, -2}, {-2, 0}, {-2, -1}, {-1, -2}},
  // 16X16
  {{-1, 0}, {0, -1}, {-1, 1}, {1, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-3, -3}},
  {{0, -1}, {-1, 0}, {1, -1}, {-1, 1}, {-1, -1}, {0, -3}, {-3, 0}, {-3, -3}},
  // 16X32
  {{0, -1}, {-1, 0}, {2, -1}, {-1, -1}, {-1, 1}, {0, -3}, {-3, 0}, {-3, -3}},
  // 32X16
  {{-1, 0}, {0, -1}, {-1, 2}, {-1, -1}, {1, -1}, {-3, 0}, {0, -3}, {-3, -3}},
  // 32X16
  {{0, -1}, {-1, 0}, {2, -1}, {-1, -1}, {-1, 1}, {0, -3}, {-3, 0}, {-3, -3}},
  // 32X32
  {{-1, 1}, {1, -1}, {-1, 2}, {2, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-3, -3}},
  {{1, -1}, {-1, 1}, {2, -1}, {-1, 2}, {-1, -1}, {0, -3}, {-3, 0}, {-3, -3}},
  // 32X64
  {{0, -1}, {-1, 0}, {4, -1}, {-1, 2}, {-1, -1}, {0, -3}, {-3, 0}, {2, -1}},
  // 64X32
  {{-1, 0}, {0, -1}, {-1, 4}, {2, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-1, 2}},
  // 64X32
  {{0, -1}, {-1, 0}, {4, -1}, {-1, 2}, {-1, -1}, {0, -3}, {-3, 0}, {2, -1}},
  // 64X64
  {{-1, 3}, {3, -1}, {-1, 4}, {4, -1}, {-1, -1}, {-1, 0}, {0, -1}, {-1, 6}}
  {{3, -1}, {-1, 3}, {4, -1}, {-1, 4}, {-1, -1}, {0, -1}, {-1, 0}, {6, -1}}
};
static const int idx_n_column_to_subblock[4][2] = {
@@ -122,75 +121,78 @@ static void clamp_mv_ref(MV *mv, const MACROBLOCKD *xd) {
static INLINE int_mv get_sub_block_mv(const MODE_INFO *candidate,
                                      int check_sub_blocks, int which_mv,
                                      int search_col, int block_idx) {
  return check_sub_blocks && candidate->mbmi.sb_type < BLOCK_8X8
  return (check_sub_blocks && candidate->mbmi.sb_type < BLOCK_8X8
      ? candidate->bmi[idx_n_column_to_subblock[block_idx][search_col == 0]]
            .as_mv[which_mv]
      : candidate->mbmi.mv[which_mv];
      : candidate->mbmi.mv[which_mv]);
}


// Performs mv sign inversion if indicated by the reference frame combination.
static INLINE int_mv scale_mv(const MB_MODE_INFO *mbmi, int ref,
static INLINE int_mv scale_mv(const MODE_INFO *candidate, const int which_mv,
                              const MV_REFERENCE_FRAME this_ref_frame,
                              const int *ref_sign_bias) {
  int_mv mv = mbmi->mv[ref];
  if (ref_sign_bias[mbmi->ref_frame[ref]] != ref_sign_bias[this_ref_frame]) {
    mv.as_mv.row *= -1;
    mv.as_mv.col *= -1;
  int_mv return_mv = candidate->mbmi.mv[which_mv];

  // Sign inversion where appropriate.
  if (ref_sign_bias[candidate->mbmi.ref_frame[which_mv]] !=
      ref_sign_bias[this_ref_frame]) {
    return_mv.as_mv.row *= -1;
    return_mv.as_mv.col *= -1;
  }
  return mv;
  return return_mv;
}
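Both versions of scale_mv apply the same rule: when the candidate's reference frame carries the opposite sign bias from the target reference, the candidate points across the current frame in the other temporal direction, so its vector is negated before reuse. A sketch of just that rule, using plain structs and illustrative bias values rather than the library's types:

#include <stdio.h>

typedef struct { int row, col; } PLAIN_MV;

// Negate the candidate mv when its reference frame's sign bias differs
// from the target reference frame's sign bias.
static PLAIN_MV scale_mv_sketch(PLAIN_MV mv, int cand_ref, int this_ref,
                                const int *ref_sign_bias) {
  if (ref_sign_bias[cand_ref] != ref_sign_bias[this_ref]) {
    mv.row *= -1;
    mv.col *= -1;
  }
  return mv;
}

int main(void) {
  // Illustrative biases: refs 0..2 on one temporal side, ref 3 on the other.
  const int ref_sign_bias[4] = {0, 0, 0, 1};
  const PLAIN_MV cand = {4, -6};
  const PLAIN_MV out = scale_mv_sketch(cand, 3, 1, ref_sign_bias);
  printf("(%d, %d)\n", out.row, out.col);  // biases differ: prints (-4, 6)
  return 0;
}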
// This macro is used to add a motion vector to the mv_ref list if it isn't
// already in the list. If it's the second motion vector it will also
// skip all additional processing and jump to Done!
#define ADD_MV_REF_LIST(MV) \
  do { \
    if (refmv_count) { \
      if ((MV).as_int != mv_ref_list[0].as_int) { \
        mv_ref_list[refmv_count] = (MV); \
        goto Done; \
      } \
    } else { \
      mv_ref_list[refmv_count++] = (MV); \
  if (refmv_count) { \
    if ((MV).as_int != mv_ref_list[0].as_int) { \
      mv_ref_list[refmv_count] = (MV); \
      goto Done; \
    } \
  } while (0)
  } else { \
    mv_ref_list[refmv_count++] = (MV); \
  }

// If either reference frame is different, not INTRA, and they
// are different from each other scale and add the mv to our list.
#define IF_DIFF_REF_FRAME_ADD_MV(CANDIDATE) \
  do { \
    if ((CANDIDATE)->ref_frame[0] != ref_frame) \
      ADD_MV_REF_LIST(scale_mv((CANDIDATE), 0, ref_frame, ref_sign_bias)); \
    if ((CANDIDATE)->ref_frame[1] != ref_frame && \
        has_second_ref(CANDIDATE) && \
        (CANDIDATE)->mv[1].as_int != (CANDIDATE)->mv[0].as_int) \
      ADD_MV_REF_LIST(scale_mv((CANDIDATE), 1, ref_frame, ref_sign_bias)); \
  } while (0)

  if ((CANDIDATE)->mbmi.ref_frame[0] != ref_frame) { \
    ADD_MV_REF_LIST(scale_mv((CANDIDATE), 0, ref_frame, ref_sign_bias)); \
  } \
  if ((CANDIDATE)->mbmi.ref_frame[1] != ref_frame && \
      (CANDIDATE)->mbmi.ref_frame[1] > INTRA_FRAME && \
      (CANDIDATE)->mbmi.mv[1].as_int != (CANDIDATE)->mbmi.mv[0].as_int) { \
    ADD_MV_REF_LIST(scale_mv((CANDIDATE), 1, ref_frame, ref_sign_bias)); \
  }
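The macro's control flow is easier to see as a plain function: the first candidate is stored unconditionally, and a second, distinct candidate fills the two-entry list and ends the search, which is the role the goto Done plays above. A function-style sketch under that reading; the names and the driver are illustrative.

#include <stdint.h>

enum { MAX_MV_REF_CANDIDATES_SKETCH = 2 };

// Returns 1 when the list is full and the search should stop (the role
// played by "goto Done" in the macro above).
static int add_mv_ref(uint32_t mv_as_int, uint32_t *mv_ref_list,
                      int *refmv_count) {
  if (*refmv_count) {
    if (mv_as_int != mv_ref_list[0]) {
      mv_ref_list[*refmv_count] = mv_as_int;  // second, distinct mv
      return 1;                               // done: list is full
    }
  } else {
    mv_ref_list[(*refmv_count)++] = mv_as_int;  // first mv
  }
  return 0;  // keep searching
}

int main(void) {
  uint32_t list[MAX_MV_REF_CANDIDATES_SKETCH] = {0, 0};
  int count = 0;
  int done = add_mv_ref(0x00010002u, list, &count);  // stored, done == 0
  done = add_mv_ref(0x00010002u, list, &count);      // duplicate, done == 0
  done = add_mv_ref(0x00030004u, list, &count);      // distinct, done == 1
  return done ? 0 : 1;
}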
// Checks that the given mi_row, mi_col and search point
// are inside the borders of the tile.
static INLINE int is_inside(const VP9_COMMON *cm, int mi_col, int mi_row,
                            const MV *mv) {
  return !(mi_row + mv->row < 0 ||
           mi_col + mv->col < cm->cur_tile_mi_col_start ||
           mi_row + mv->row >= cm->mi_rows ||
           mi_col + mv->col >= cm->cur_tile_mi_col_end);
static INLINE int is_inside(int mi_col, int mi_row, int cur_tile_mi_col_start,
                            const int mv_ref[2]) {
  // Check that the candidate is within the border. We only need to check
  // the left side and the top because the positive right-side offsets only
  // occur for blocks large enough to contain them within their own border.
  return !(mi_row + mv_ref[1] < 0 ||
           mi_col + mv_ref[0] < cur_tile_mi_col_start);
}
// This function searches the neighbourhood of a given MB/SB
// to try and find candidate reference vectors.
void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
                          MODE_INFO *mi, const MODE_INFO *prev_mi,
                          MV_REFERENCE_FRAME ref_frame,
                          int_mv *mv_ref_list,
                          int block_idx,
                          int mi_row, int mi_col) {
  const int *ref_sign_bias = cm->ref_frame_sign_bias;
  int i, refmv_count = 0;
  const MV *const mv_ref_search = mv_ref_blocks[mi->mbmi.sb_type];
  const MB_MODE_INFO *const prev_mbmi = prev_mi ? &prev_mi->mbmi : NULL;
void vp9_find_mv_refs_idx(VP9_COMMON *cm, MACROBLOCKD *xd, MODE_INFO *here,
                          const MODE_INFO *lf_here,
                          const MV_REFERENCE_FRAME ref_frame,
                          int_mv *mv_ref_list, const int *ref_sign_bias,
                          const int block_idx,
                          const int mi_row, const int mi_col) {
  int idx;
  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
  int refmv_count = 0;
  const int (*mv_ref_search)[2] = mv_ref_blocks[mbmi->sb_type];
  const MODE_INFO *candidate;
  const int check_sub_blocks = block_idx >= 0;
  int different_ref_found = 0;
  int context_counter = 0;
@@ -200,27 +202,28 @@ void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
  // The nearest 2 blocks are treated differently:
  // if the size < 8x8 we get the mv from the bmi substructure,
  // and we also need to keep a mode count.
  for (i = 0; i < 2; ++i) {
    const MV *const mv_ref = &mv_ref_search[i];
    if (is_inside(cm, mi_col, mi_row, mv_ref)) {
      const int check_sub_blocks = block_idx >= 0;
      const MODE_INFO *const candidate_mi = xd->mi_8x8[mv_ref->col + mv_ref->row
                                                       * xd->mode_info_stride];
      const MB_MODE_INFO *const candidate = &candidate_mi->mbmi;
      // Keep counts for entropy encoding.
      context_counter += mode_2_counter[candidate->mode];
  for (idx = 0; idx < 2; ++idx) {
    const int *mv_ref = mv_ref_search[idx];

      // Check if the candidate comes from the same reference frame.
      if (candidate->ref_frame[0] == ref_frame) {
        ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, check_sub_blocks, 0,
                                         mv_ref->col, block_idx));
        different_ref_found = candidate->ref_frame[1] != ref_frame;
      } else {
        if (candidate->ref_frame[1] == ref_frame)
          // Add second motion vector if it has the same ref_frame.
          ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, check_sub_blocks, 1,
                                           mv_ref->col, block_idx));
        different_ref_found = 1;
    if (!is_inside(mi_col, mi_row, cm->cur_tile_mi_col_start, mv_ref))
      continue;

    candidate = here + mv_ref[0] + mv_ref[1] * xd->mode_info_stride;

    // Keep counts for entropy encoding.
    context_counter += mode_2_counter[candidate->mbmi.mode];

    // Check if the candidate comes from the same reference frame.
    if (candidate->mbmi.ref_frame[0] == ref_frame) {
      ADD_MV_REF_LIST(get_sub_block_mv(candidate, check_sub_blocks, 0,
                                       mv_ref[0], block_idx));
      different_ref_found = candidate->mbmi.ref_frame[1] != ref_frame;
    } else {
      different_ref_found = 1;
      if (candidate->mbmi.ref_frame[1] == ref_frame) {
        // Add second motion vector if it has the same ref_frame.
        ADD_MV_REF_LIST(get_sub_block_mv(candidate, check_sub_blocks, 1,
                                         mv_ref[0], block_idx));
      }
    }
  }
@@ -228,59 +231,65 @@ void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
  // Check the rest of the neighbors in much the same way
  // as before except we don't need to keep track of sub blocks or
  // mode counts.
  for (; i < MVREF_NEIGHBOURS; ++i) {
    const MV *const mv_ref = &mv_ref_search[i];
    if (is_inside(cm, mi_col, mi_row, mv_ref)) {
      const MB_MODE_INFO *const candidate = &xd->mi_8x8[mv_ref->col +
                                                        mv_ref->row
                                                        * xd->mode_info_stride]->mbmi;
  for (; idx < MVREF_NEIGHBOURS; ++idx) {
    const int *mv_ref = mv_ref_search[idx];
    if (!is_inside(mi_col, mi_row, cm->cur_tile_mi_col_start, mv_ref))
      continue;

      if (candidate->ref_frame[0] == ref_frame) {
        ADD_MV_REF_LIST(candidate->mv[0]);
        different_ref_found = candidate->ref_frame[1] != ref_frame;
      } else {
        if (candidate->ref_frame[1] == ref_frame)
          ADD_MV_REF_LIST(candidate->mv[1]);
        different_ref_found = 1;
    candidate = here + mv_ref[0] + mv_ref[1] * xd->mode_info_stride;

    if (candidate->mbmi.ref_frame[0] == ref_frame) {
      ADD_MV_REF_LIST(candidate->mbmi.mv[0]);
      different_ref_found = candidate->mbmi.ref_frame[1] != ref_frame;
    } else {
      different_ref_found = 1;
      if (candidate->mbmi.ref_frame[1] == ref_frame) {
        ADD_MV_REF_LIST(candidate->mbmi.mv[1]);
      }
    }
  }

  // Check the last frame's mode and mv info.
  if (prev_mbmi) {
    if (prev_mbmi->ref_frame[0] == ref_frame)
      ADD_MV_REF_LIST(prev_mbmi->mv[0]);
    else if (prev_mbmi->ref_frame[1] == ref_frame)
      ADD_MV_REF_LIST(prev_mbmi->mv[1]);
  if (lf_here != NULL) {
    if (lf_here->mbmi.ref_frame[0] == ref_frame) {
      ADD_MV_REF_LIST(lf_here->mbmi.mv[0]);
    } else if (lf_here->mbmi.ref_frame[1] == ref_frame) {
      ADD_MV_REF_LIST(lf_here->mbmi.mv[1]);
    }
  }

  // Since we couldn't find 2 mvs from the same reference frame
  // go back through the neighbors and find motion vectors from
  // different reference frames.
  if (different_ref_found) {
    for (i = 0; i < MVREF_NEIGHBOURS; ++i) {
      const MV *mv_ref = &mv_ref_search[i];
      if (is_inside(cm, mi_col, mi_row, mv_ref)) {
        const MB_MODE_INFO *const candidate = &xd->mi_8x8[mv_ref->col +
                                                          mv_ref->row
                                                          * xd->mode_info_stride]->mbmi;
    for (idx = 0; idx < MVREF_NEIGHBOURS; ++idx) {
      const int *mv_ref = mv_ref_search[idx];
      if (!is_inside(mi_col, mi_row, cm->cur_tile_mi_col_start, mv_ref))
        continue;

        // If the candidate is INTRA we don't want to consider its mv.
        if (is_inter_block(candidate))
          IF_DIFF_REF_FRAME_ADD_MV(candidate);
      }
      candidate = here + mv_ref[0] + mv_ref[1] * xd->mode_info_stride;

      // If the candidate is INTRA we don't want to consider its mv.
      if (!is_inter_block(&candidate->mbmi))
        continue;

      IF_DIFF_REF_FRAME_ADD_MV(candidate);
    }
  }

  // Since we still don't have a candidate we'll try the last frame.
  if (prev_mbmi && is_inter_block(prev_mbmi))
    IF_DIFF_REF_FRAME_ADD_MV(prev_mbmi);
  if (lf_here != NULL && is_inter_block(&lf_here->mbmi)) {
    IF_DIFF_REF_FRAME_ADD_MV(lf_here);
  }

 Done:

  mi->mbmi.mode_context[ref_frame] = counter_to_context[context_counter];
  mbmi->mb_mode_context[ref_frame] = counter_to_context[context_counter];

  // Clamp vectors
  for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i)
    clamp_mv_ref(&mv_ref_list[i].as_mv, xd);
  for (idx = 0; idx < MAX_MV_REF_CANDIDATES; ++idx)
    clamp_mv_ref(&mv_ref_list[idx].as_mv, xd);
}

#undef ADD_MV_REF_LIST
#undef IF_DIFF_REF_FRAME_ADD_MV
@@ -14,20 +14,27 @@
#ifndef VP9_COMMON_VP9_MVREF_COMMON_H_
#define VP9_COMMON_VP9_MVREF_COMMON_H_

void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
                          MODE_INFO *mi, const MODE_INFO *prev_mi,
                          MV_REFERENCE_FRAME ref_frame,
void vp9_find_mv_refs_idx(VP9_COMMON *cm,
                          MACROBLOCKD *xd,
                          MODE_INFO *here,
                          const MODE_INFO *lf_here,
                          const MV_REFERENCE_FRAME ref_frame,
                          int_mv *mv_ref_list,
                          int block_idx,
                          int mi_row, int mi_col);
                          const int *ref_sign_bias,
                          const int block_idx,
                          const int mi_row,
                          const int mi_col);

static INLINE void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd,
                                    MODE_INFO *mi, const MODE_INFO *prev_mi,
static INLINE void vp9_find_mv_refs(VP9_COMMON *cm,
                                    MACROBLOCKD *xd,
                                    MODE_INFO *here,
                                    MODE_INFO *lf_here,
                                    MV_REFERENCE_FRAME ref_frame,
                                    int_mv *mv_ref_list,
                                    int *ref_sign_bias,
                                    int mi_row, int mi_col) {
  vp9_find_mv_refs_idx(cm, xd, mi, prev_mi, ref_frame,
                       mv_ref_list, -1, mi_row, mi_col);
  vp9_find_mv_refs_idx(cm, xd, here, lf_here, ref_frame,
                       mv_ref_list, ref_sign_bias, -1, mi_row, mi_col);
}

#endif  // VP9_COMMON_VP9_MVREF_COMMON_H_
@@ -46,8 +46,7 @@ extern "C"
  typedef enum {
    USAGE_STREAM_FROM_SERVER = 0x0,
    USAGE_LOCAL_FILE_PLAYBACK = 0x1,
    USAGE_CONSTRAINED_QUALITY = 0x2,
    USAGE_CONSTANT_QUALITY = 0x3,
    USAGE_CONSTRAINED_QUALITY = 0x2
  } END_USAGE;


@@ -131,8 +130,6 @@ extern "C"
    // END DATARATE CONTROL OPTIONS
    // ----------------------------------------------------------------

    // Spatial scalability
    int ss_number_layers;

    // These parameters aren't to be used in the final build; don't use!!!
    int play_alternate;
@@ -213,13 +210,6 @@ extern "C"
  int vp9_set_internal_size(VP9_PTR comp,
                            VPX_SCALING horiz_mode, VPX_SCALING vert_mode);

  int vp9_set_size_literal(VP9_PTR comp, unsigned int width,
                           unsigned int height);

  int vp9_switch_layer(VP9_PTR comp, int layer);

  void vp9_set_svc(VP9_PTR comp, int use_svc);

  int vp9_get_quantizer(VP9_PTR c);

#ifdef __cplusplus
@@ -20,7 +20,7 @@
#include "vp9/common/vp9_entropymode.h"
#include "vp9/common/vp9_quant_common.h"

#if CONFIG_VP9_POSTPROC
#if CONFIG_POSTPROC
#include "vp9/common/vp9_postproc.h"
#endif

@@ -38,14 +38,14 @@
#define NUM_FRAME_CONTEXTS (1 << NUM_FRAME_CONTEXTS_LOG2)

typedef struct frame_contexts {
  vp9_prob y_mode_prob[BLOCK_SIZE_GROUPS][INTRA_MODES - 1];
  vp9_prob uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
  vp9_prob y_mode_prob[BLOCK_SIZE_GROUPS][VP9_INTRA_MODES - 1];
  vp9_prob uv_mode_prob[VP9_INTRA_MODES][VP9_INTRA_MODES - 1];
  vp9_prob partition_prob[NUM_FRAME_TYPES][NUM_PARTITION_CONTEXTS]
                         [PARTITION_TYPES - 1];
  vp9_coeff_probs_model coef_probs[TX_SIZES][BLOCK_TYPES];
  vp9_prob switchable_interp_prob[SWITCHABLE_FILTERS + 1]
                                 [SWITCHABLE_FILTERS - 1];
  vp9_prob inter_mode_probs[INTER_MODE_CONTEXTS][INTER_MODES - 1];
  vp9_prob switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1]
                                 [VP9_SWITCHABLE_FILTERS - 1];
  vp9_prob inter_mode_probs[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1];
  vp9_prob intra_inter_prob[INTRA_INTER_CONTEXTS];
  vp9_prob comp_inter_prob[COMP_INTER_CONTEXTS];
  vp9_prob single_ref_prob[REF_CONTEXTS][2];
@@ -53,18 +53,30 @@ typedef struct frame_contexts {
  struct tx_probs tx_probs;
  vp9_prob mbskip_probs[MBSKIP_CONTEXTS];
  nmv_context nmvc;
#if CONFIG_INTERINTRA
  vp9_prob interintra_prob[BLOCK_SIZE_TYPES];
#if CONFIG_MASKED_INTERINTRA
  vp9_prob masked_interintra_prob[BLOCK_SIZE_TYPES];
#endif
#endif
#if CONFIG_FILTERINTRA
  vp9_prob filterintra_prob[TX_SIZES][VP9_INTRA_MODES];
#endif
#if CONFIG_MASKED_INTERINTER
  vp9_prob masked_compound_prob[BLOCK_SIZE_TYPES];
#endif
} FRAME_CONTEXT;

typedef struct {
  unsigned int y_mode[BLOCK_SIZE_GROUPS][INTRA_MODES];
  unsigned int uv_mode[INTRA_MODES][INTRA_MODES];
  unsigned int y_mode[BLOCK_SIZE_GROUPS][VP9_INTRA_MODES];
  unsigned int uv_mode[VP9_INTRA_MODES][VP9_INTRA_MODES];
  unsigned int partition[NUM_PARTITION_CONTEXTS][PARTITION_TYPES];
  vp9_coeff_count_model coef[TX_SIZES][BLOCK_TYPES];
  unsigned int eob_branch[TX_SIZES][BLOCK_TYPES][REF_TYPES]
                         [COEF_BANDS][PREV_COEF_CONTEXTS];
  unsigned int switchable_interp[SWITCHABLE_FILTERS + 1]
                                [SWITCHABLE_FILTERS];
  unsigned int inter_mode[INTER_MODE_CONTEXTS][INTER_MODES];
  unsigned int switchable_interp[VP9_SWITCHABLE_FILTERS + 1]
                                [VP9_SWITCHABLE_FILTERS];
  unsigned int inter_mode[INTER_MODE_CONTEXTS][VP9_INTER_MODES];
  unsigned int intra_inter[INTRA_INTER_CONTEXTS][2];
  unsigned int comp_inter[COMP_INTER_CONTEXTS][2];
  unsigned int single_ref[REF_CONTEXTS][2][2];
@@ -72,6 +84,18 @@ typedef struct {
  struct tx_counts tx;
  unsigned int mbskip[MBSKIP_CONTEXTS][2];
  nmv_context_counts mv;
#if CONFIG_INTERINTRA
  unsigned int interintra[BLOCK_SIZE_TYPES][2];
#if CONFIG_MASKED_INTERINTRA
  unsigned int masked_interintra[BLOCK_SIZE_TYPES][2];
#endif
#endif
#if CONFIG_FILTERINTRA
  unsigned int filterintra[TX_SIZES][VP9_INTRA_MODES][2];
#endif
#if CONFIG_MASKED_INTERINTER
  unsigned int masked_compound[BLOCK_SIZE_TYPES][2];
#endif
} FRAME_COUNTS;

@@ -164,10 +188,6 @@ typedef struct VP9Common {
  MODE_INFO *prev_mip;  /* MODE_INFO array 'mip' from last decoded frame */
  MODE_INFO *prev_mi;   /* 'mi' from last frame (points into prev_mip) */

  MODE_INFO **mi_grid_base;
  MODE_INFO **mi_grid_visible;
  MODE_INFO **prev_mi_grid_base;
  MODE_INFO **prev_mi_grid_visible;

  // Persistent mb segment id map used in prediction.
  unsigned char *last_frame_seg_map;
@@ -180,9 +200,6 @@ typedef struct VP9Common {

  int ref_frame_sign_bias[MAX_REF_FRAMES];  /* Two state 0, 1 */

  struct loopfilter lf;
  struct segmentation seg;

  /* Y,U,V */
  ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
  ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16];
@@ -205,10 +222,21 @@ typedef struct VP9Common {
  unsigned int current_video_frame;
  int version;

#if CONFIG_VP9_POSTPROC
#if CONFIG_POSTPROC
  struct postproc_state postproc_state;
#endif

#if CONFIG_INTERINTRA
  int use_interintra;
#if CONFIG_MASKED_INTERINTRA
  int use_masked_interintra;
#endif
#endif

#if CONFIG_MASKED_INTERINTER
  int use_masked_compound;
#endif

  int error_resilient_mode;
  int frame_parallel_decoding_mode;

@@ -238,19 +266,7 @@ static void ref_cnt_fb(int *buf, int *idx, int new_idx) {
}

static int mi_cols_aligned_to_sb(int n_mis) {
  return ALIGN_POWER_OF_TWO(n_mis, MI_BLOCK_SIZE_LOG2);
}

static INLINE void set_skip_context(VP9_COMMON *cm, MACROBLOCKD *xd,
                                    int mi_row, int mi_col) {
  const int above_idx = mi_col * 2;
  const int left_idx = (mi_row * 2) & 15;
  int i;
  for (i = 0; i < MAX_MB_PLANE; i++) {
    struct macroblockd_plane *const pd = &xd->plane[i];
    pd->above_context = cm->above_context[i] + (above_idx >> pd->subsampling_x);
    pd->left_context = cm->left_context[i] + (left_idx >> pd->subsampling_y);
  }
  return ALIGN_POWER_OF_TWO(n_mis, LOG2_MI_BLOCK_SIZE);
}

static INLINE void set_partition_seg_context(VP9_COMMON *cm, MACROBLOCKD *xd,
@@ -259,20 +275,25 @@ static INLINE void set_partition_seg_context(VP9_COMMON *cm, MACROBLOCKD *xd,
  xd->left_seg_context = cm->left_seg_context + (mi_row & MI_MASK);
}
// return the node index in the prob tree for binary coding
static int check_bsize_coverage(int bs, int mi_rows, int mi_cols,
                                int mi_row, int mi_col) {
  const int r = (mi_row + bs < mi_rows);
  const int c = (mi_col + bs < mi_cols);
static int check_bsize_coverage(VP9_COMMON *cm, int mi_row, int mi_col,
                                BLOCK_SIZE_TYPE bsize) {
  int bsl = mi_width_log2(bsize), bs = 1 << bsl;
  int ms = bs / 2;

  if (r && c)
  if ((mi_row + ms < cm->mi_rows) && (mi_col + ms < cm->mi_cols))
    return 0;

  if (c && !r)
    return 1;  // only allow horizontal/split partition types
  // frame width/height are multiples of 8, hence 8x8 block should always
  // pass the above check
  assert(bsize > BLOCK_8X8);

  if (r && !c)
    return 2;  // only allow vertical/split partition types
  // return the node index in the prob tree for binary coding
  // only allow horizontal/split partition types
  if ((mi_col + ms < cm->mi_cols) && (mi_row + ms >= cm->mi_rows))
    return 1;
  // only allow vertical/split partition types
  if ((mi_row + ms < cm->mi_rows) && (mi_col + ms >= cm->mi_cols))
    return 2;

  return -1;
}
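Both versions of check_bsize_coverage implement one decision: return 0 when the block fits entirely inside the frame, 1 when it overhangs the bottom only (so only horizontal or split partitions remain legal), 2 when it overhangs the right only (vertical or split), and -1 when it overhangs both (split is forced). A standalone sketch of the rule; the frame bounds and block size are illustrative.

#include <stdio.h>

// Same decision rule as check_bsize_coverage: ms is half the block size
// in mi units, mi_rows/mi_cols are the frame bounds in mi units.
static int coverage(int mi_row, int mi_col, int ms, int mi_rows,
                    int mi_cols) {
  const int fits_rows = mi_row + ms < mi_rows;
  const int fits_cols = mi_col + ms < mi_cols;
  if (fits_rows && fits_cols) return 0;  // all partition types allowed
  if (fits_cols) return 1;               // horizontal/split only
  if (fits_rows) return 2;               // vertical/split only
  return -1;                             // split forced
}

int main(void) {
  // 64x64 block (ms = 4) near the bottom-right of a 30x17 mi frame.
  printf("%d %d %d %d\n",
         coverage(0, 0, 4, 30, 17),     // 0: fully inside
         coverage(28, 0, 4, 30, 17),    // 1: overhangs the bottom
         coverage(0, 16, 4, 30, 17),    // 2: overhangs the right
         coverage(28, 16, 4, 30, 17));  // -1: overhangs both
  return 0;
}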
@@ -53,7 +53,7 @@ static const unsigned char MB_PREDICTION_MODE_colors[MB_MODE_COUNT][3] = {
  { RGB_TO_YUV(0xCC33FF) },  /* Magenta */
};

static const unsigned char B_PREDICTION_MODE_colors[INTRA_MODES][3] = {
static const unsigned char B_PREDICTION_MODE_colors[VP9_INTRA_MODES][3] = {
  { RGB_TO_YUV(0x6633ff) },  /* Purple */
  { RGB_TO_YUV(0xcc33ff) },  /* Magenta */
  { RGB_TO_YUV(0xff33cc) },  /* Pink */
@@ -630,21 +630,23 @@ static void constrain_line(int x0, int *x1, int y0, int *y1,
  }
}

int vp9_post_proc_frame(struct VP9Common *cm,
                        YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *ppflags) {
  int q = cm->lf.filter_level * 10 / 6;
int vp9_post_proc_frame(struct VP9Common *oci,
                        struct loopfilter *lf,
                        YV12_BUFFER_CONFIG *dest,
                        vp9_ppflags_t *ppflags) {
  int q = lf->filter_level * 10 / 6;
  int flags = ppflags->post_proc_flag;
  int deblock_level = ppflags->deblocking_level;
  int noise_level = ppflags->noise_level;

  if (!cm->frame_to_show)
  if (!oci->frame_to_show)
    return -1;

  if (q > 63)
    q = 63;

  if (!flags) {
    *dest = *cm->frame_to_show;
    *dest = *oci->frame_to_show;
    return 0;
  }

@@ -653,52 +655,52 @@ int vp9_post_proc_frame(struct VP9Common *cm,
#endif

  if (flags & VP9D_DEMACROBLOCK) {
    deblock_and_de_macro_block(cm->frame_to_show, &cm->post_proc_buffer,
    deblock_and_de_macro_block(oci->frame_to_show, &oci->post_proc_buffer,
                               q + (deblock_level - 5) * 10, 1, 0);
  } else if (flags & VP9D_DEBLOCK) {
    vp9_deblock(cm->frame_to_show, &cm->post_proc_buffer, q);
    vp9_deblock(oci->frame_to_show, &oci->post_proc_buffer, q);
  } else {
    vp8_yv12_copy_frame(cm->frame_to_show, &cm->post_proc_buffer);
    vp8_yv12_copy_frame(oci->frame_to_show, &oci->post_proc_buffer);
  }

  if (flags & VP9D_ADDNOISE) {
    if (cm->postproc_state.last_q != q
        || cm->postproc_state.last_noise != noise_level) {
      fillrd(&cm->postproc_state, 63 - q, noise_level);
    if (oci->postproc_state.last_q != q
        || oci->postproc_state.last_noise != noise_level) {
      fillrd(&oci->postproc_state, 63 - q, noise_level);
    }

    vp9_plane_add_noise(cm->post_proc_buffer.y_buffer,
                        cm->postproc_state.noise,
                        cm->postproc_state.blackclamp,
                        cm->postproc_state.whiteclamp,
                        cm->postproc_state.bothclamp,
                        cm->post_proc_buffer.y_width,
                        cm->post_proc_buffer.y_height,
                        cm->post_proc_buffer.y_stride);
    vp9_plane_add_noise(oci->post_proc_buffer.y_buffer,
                        oci->postproc_state.noise,
                        oci->postproc_state.blackclamp,
                        oci->postproc_state.whiteclamp,
                        oci->postproc_state.bothclamp,
                        oci->post_proc_buffer.y_width,
                        oci->post_proc_buffer.y_height,
                        oci->post_proc_buffer.y_stride);
  }

#if 0 && CONFIG_POSTPROC_VISUALIZER
  if (flags & VP9D_DEBUG_TXT_FRAME_INFO) {
    char message[512];
    sprintf(message, "F%1dG%1dQ%3dF%3dP%d_s%dx%d",
            (cm->frame_type == KEY_FRAME),
            cm->refresh_golden_frame,
            cm->base_qindex,
            cm->filter_level,
            (oci->frame_type == KEY_FRAME),
            oci->refresh_golden_frame,
            oci->base_qindex,
            oci->filter_level,
            flags,
            cm->mb_cols, cm->mb_rows);
    vp9_blit_text(message, cm->post_proc_buffer.y_buffer,
                  cm->post_proc_buffer.y_stride);
            oci->mb_cols, oci->mb_rows);
    vp9_blit_text(message, oci->post_proc_buffer.y_buffer,
                  oci->post_proc_buffer.y_stride);
  }

  if (flags & VP9D_DEBUG_TXT_MBLK_MODES) {
    int i, j;
    uint8_t *y_ptr;
    YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer;
    YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
    int mb_rows = post->y_height >> 4;
    int mb_cols = post->y_width >> 4;
    int mb_index = 0;
    MODE_INFO *mi = cm->mi;
    MODE_INFO *mi = oci->mi;

    y_ptr = post->y_buffer + 4 * post->y_stride + 4;

@@ -723,11 +725,11 @@ int vp9_post_proc_frame(struct VP9Common *cm,
  if (flags & VP9D_DEBUG_TXT_DC_DIFF) {
    int i, j;
    uint8_t *y_ptr;
    YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer;
    YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
    int mb_rows = post->y_height >> 4;
    int mb_cols = post->y_width >> 4;
    int mb_index = 0;
    MODE_INFO *mi = cm->mi;
    MODE_INFO *mi = oci->mi;

    y_ptr = post->y_buffer + 4 * post->y_stride + 4;

@@ -737,9 +739,9 @@ int vp9_post_proc_frame(struct VP9Common *cm,
        char zz[4];
        int dc_diff = !(mi[mb_index].mbmi.mode != I4X4_PRED &&
                        mi[mb_index].mbmi.mode != SPLITMV &&
                        mi[mb_index].mbmi.skip_coeff);
                        mi[mb_index].mbmi.mb_skip_coeff);

        if (cm->frame_type == KEY_FRAME)
        if (oci->frame_type == KEY_FRAME)
          sprintf(zz, "a");
        else
          sprintf(zz, "%c", dc_diff + '0');
@@ -759,19 +761,19 @@ int vp9_post_proc_frame(struct VP9Common *cm,
    char message[512];
    snprintf(message, sizeof(message),
             "Bitrate: %10.2f framerate: %10.2f ",
             cm->bitrate, cm->framerate);
    vp9_blit_text(message, cm->post_proc_buffer.y_buffer,
                  cm->post_proc_buffer.y_stride);
             oci->bitrate, oci->framerate);
    vp9_blit_text(message, oci->post_proc_buffer.y_buffer,
                  oci->post_proc_buffer.y_stride);
  }

  /* Draw motion vectors */
  if ((flags & VP9D_DEBUG_DRAW_MV) && ppflags->display_mv_flag) {
    YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer;
    YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
    int width = post->y_width;
    int height = post->y_height;
    uint8_t *y_buffer = cm->post_proc_buffer.y_buffer;
    int y_stride = cm->post_proc_buffer.y_stride;
    MODE_INFO *mi = cm->mi;
    uint8_t *y_buffer = oci->post_proc_buffer.y_buffer;
    int y_stride = oci->post_proc_buffer.y_stride;
    MODE_INFO *mi = oci->mi;
    int x0, y0;

    for (y0 = 0; y0 < height; y0 += 16) {
@@ -880,7 +882,7 @@ int vp9_post_proc_frame(struct VP9Common *cm,
            }
          }
        }
      } else if (is_inter_mode(mi->mbmi.mode)) {
      } else if (mi->mbmi.mode >= NEARESTMV) {
        MV *mv = &mi->mbmi.mv.as_mv;
        const int lx0 = x0 + 8;
        const int ly0 = y0 + 8;
@@ -908,14 +910,14 @@ int vp9_post_proc_frame(struct VP9Common *cm,
  if ((flags & VP9D_DEBUG_CLR_BLK_MODES)
      && (ppflags->display_mb_modes_flag || ppflags->display_b_modes_flag)) {
    int y, x;
    YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer;
    YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
    int width = post->y_width;
    int height = post->y_height;
    uint8_t *y_ptr = cm->post_proc_buffer.y_buffer;
    uint8_t *u_ptr = cm->post_proc_buffer.u_buffer;
    uint8_t *v_ptr = cm->post_proc_buffer.v_buffer;
    int y_stride = cm->post_proc_buffer.y_stride;
    MODE_INFO *mi = cm->mi;
    uint8_t *y_ptr = oci->post_proc_buffer.y_buffer;
    uint8_t *u_ptr = oci->post_proc_buffer.u_buffer;
    uint8_t *v_ptr = oci->post_proc_buffer.v_buffer;
    int y_stride = oci->post_proc_buffer.y_stride;
    MODE_INFO *mi = oci->mi;

    for (y = 0; y < height; y += 16) {
      for (x = 0; x < width; x += 16) {
@@ -973,14 +975,14 @@ int vp9_post_proc_frame(struct VP9Common *cm,
  if ((flags & VP9D_DEBUG_CLR_FRM_REF_BLKS) &&
      ppflags->display_ref_frame_flag) {
    int y, x;
    YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer;
    YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
    int width = post->y_width;
    int height = post->y_height;
    uint8_t *y_ptr = cm->post_proc_buffer.y_buffer;
    uint8_t *u_ptr = cm->post_proc_buffer.u_buffer;
    uint8_t *v_ptr = cm->post_proc_buffer.v_buffer;
    int y_stride = cm->post_proc_buffer.y_stride;
    MODE_INFO *mi = cm->mi;
    uint8_t *y_ptr = oci->post_proc_buffer.y_buffer;
    uint8_t *u_ptr = oci->post_proc_buffer.u_buffer;
    uint8_t *v_ptr = oci->post_proc_buffer.v_buffer;
    int y_stride = oci->post_proc_buffer.y_stride;
    MODE_INFO *mi = oci->mi;

    for (y = 0; y < height; y += 16) {
      for (x = 0; x < width; x += 16) {
@@ -1006,13 +1008,12 @@ int vp9_post_proc_frame(struct VP9Common *cm,
  }
#endif

  *dest = cm->post_proc_buffer;
  *dest = oci->post_proc_buffer;

  /* handle problem with extending borders */
  dest->y_width = cm->width;
  dest->y_height = cm->height;
  dest->uv_width = dest->y_width >> cm->subsampling_x;
  dest->uv_height = dest->y_height >> cm->subsampling_y;
  dest->y_width = oci->width;
  dest->y_height = oci->height;
  dest->uv_height = dest->y_height / 2;

  return 0;
}
@@ -26,7 +26,7 @@ struct postproc_state {
#include "vp9/common/vp9_onyxc_int.h"
#include "vp9/common/vp9_ppflags.h"

int vp9_post_proc_frame(struct VP9Common *cm,
int vp9_post_proc_frame(struct VP9Common *oci, struct loopfilter *lf,
                        YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *flags);

void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q);
@@ -18,49 +18,48 @@

// Returns a context number for the given MB prediction signal
unsigned char vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd) {
  const MODE_INFO *const above_mi = xd->mi_8x8[-xd->mode_info_stride];
  const MODE_INFO *const left_mi = xd->mi_8x8[-1];
  const int left_in_image = xd->left_available && left_mi;
  const int above_in_image = xd->up_available && above_mi;
  const MODE_INFO *const mi = xd->mode_info_context;
  const MB_MODE_INFO *const above_mbmi = &mi[-xd->mode_info_stride].mbmi;
  const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi;
  const int left_in_image = xd->left_available && left_mbmi->mb_in_image;
  const int above_in_image = xd->up_available && above_mbmi->mb_in_image;
  // Note:
  // The mode info data structure has a one element border above and to the
  // left of the entries corresponding to real macroblocks.
  // The prediction flags in these dummy entries are initialised to 0.
  // left
  const int left_mv_pred = left_in_image ? is_inter_mode(left_mi->mbmi.mode)
                                         : 0;
  const int left_mv_pred = is_inter_mode(left_mbmi->mode);
  const int left_interp = left_in_image && left_mv_pred
                              ? left_mi->mbmi.interp_filter
                              : SWITCHABLE_FILTERS;
                              ? left_mbmi->interp_filter
                              : VP9_SWITCHABLE_FILTERS;

  // above
  const int above_mv_pred = above_in_image ? is_inter_mode(above_mi->mbmi.mode)
                                           : 0;
  const int above_mv_pred = is_inter_mode(above_mbmi->mode);
  const int above_interp = above_in_image && above_mv_pred
                               ? above_mi->mbmi.interp_filter
                               : SWITCHABLE_FILTERS;
                               ? above_mbmi->interp_filter
                               : VP9_SWITCHABLE_FILTERS;


  if (left_interp == above_interp)
    return left_interp;
  else if (left_interp == SWITCHABLE_FILTERS &&
           above_interp != SWITCHABLE_FILTERS)
  else if (left_interp == VP9_SWITCHABLE_FILTERS &&
           above_interp != VP9_SWITCHABLE_FILTERS)
    return above_interp;
  else if (left_interp != SWITCHABLE_FILTERS &&
           above_interp == SWITCHABLE_FILTERS)
  else if (left_interp != VP9_SWITCHABLE_FILTERS &&
           above_interp == VP9_SWITCHABLE_FILTERS)
    return left_interp;
  else
    return SWITCHABLE_FILTERS;
    return VP9_SWITCHABLE_FILTERS;
}
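The filter-context rule above reads: if the left and above neighbors agree, use that filter as the context; if exactly one of them carries no information (coded as the SWITCHABLE_FILTERS sentinel), use the other; otherwise fall back to the neutral context. A standalone sketch of the decision; the enum values are illustrative.

#include <stdio.h>

enum {
  EIGHTTAP_SKETCH = 0,
  EIGHTTAP_SMOOTH_SKETCH = 1,
  EIGHTTAP_SHARP_SKETCH = 2,
  SWITCHABLE_SENTINEL = 3  // "no information" from that neighbor
};

static int interp_context(int left_interp, int above_interp) {
  if (left_interp == above_interp)
    return left_interp;        // neighbors agree
  if (left_interp == SWITCHABLE_SENTINEL)
    return above_interp;       // only the above edge is informative
  if (above_interp == SWITCHABLE_SENTINEL)
    return left_interp;        // only the left edge is informative
  return SWITCHABLE_SENTINEL;  // neighbors disagree: neutral context
}

int main(void) {
  printf("%d\n", interp_context(EIGHTTAP_SKETCH, EIGHTTAP_SKETCH));  // 0
  printf("%d\n", interp_context(SWITCHABLE_SENTINEL,
                                EIGHTTAP_SHARP_SKETCH));             // 2
  printf("%d\n", interp_context(EIGHTTAP_SKETCH,
                                EIGHTTAP_SMOOTH_SKETCH));            // 3
  return 0;
}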
// Returns a context number for the given MB prediction signal
unsigned char vp9_get_pred_context_intra_inter(const MACROBLOCKD *xd) {
  const MODE_INFO *const above_mi = xd->mi_8x8[-xd->mode_info_stride];
  const MODE_INFO *const left_mi = xd->mi_8x8[-1];
  const MB_MODE_INFO *const above_mbmi = above_mi ? &above_mi->mbmi : 0;
  const MB_MODE_INFO *const left_mbmi = left_mi ? &left_mi->mbmi : 0;
  const int left_in_image = xd->left_available && left_mi;
  const int above_in_image = xd->up_available && above_mi;
  const int left_intra = left_in_image ? !is_inter_block(left_mbmi) : 1;
  const int above_intra = above_in_image ? !is_inter_block(above_mbmi) : 1;
  const MODE_INFO *const mi = xd->mode_info_context;
  const MB_MODE_INFO *const above_mbmi = &mi[-xd->mode_info_stride].mbmi;
  const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi;
  const int left_in_image = xd->left_available && left_mbmi->mb_in_image;
  const int above_in_image = xd->up_available && above_mbmi->mb_in_image;
  const int left_intra = !is_inter_block(left_mbmi);
  const int above_intra = !is_inter_block(above_mbmi);

  // The mode info data structure has a one element border above and to the
  // left of the entries corresponding to real macroblocks.
@@ -81,35 +80,35 @@ unsigned char vp9_get_pred_context_intra_inter(const MACROBLOCKD *xd) {
unsigned char vp9_get_pred_context_comp_inter_inter(const VP9_COMMON *cm,
                                                    const MACROBLOCKD *xd) {
  int pred_context;
  const MODE_INFO *const above_mi = xd->mi_8x8[-xd->mode_info_stride];
  const MODE_INFO *const left_mi = xd->mi_8x8[-1];
  const MB_MODE_INFO *const above_mbmi = above_mi ? &above_mi->mbmi : 0;
  const MB_MODE_INFO *const left_mbmi = left_mi ? &left_mi->mbmi : 0;
  const int left_in_image = xd->left_available && left_mi;
  const int above_in_image = xd->up_available && above_mi;
  const MODE_INFO *const mi = xd->mode_info_context;
  const MB_MODE_INFO *const above_mbmi = &mi[-cm->mode_info_stride].mbmi;
  const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi;
  const int left_in_image = xd->left_available && left_mbmi->mb_in_image;
  const int above_in_image = xd->up_available && above_mbmi->mb_in_image;
  // Note:
  // The mode info data structure has a one element border above and to the
  // left of the entries corresponding to real macroblocks.
  // The prediction flags in these dummy entries are initialised to 0.
  if (above_in_image && left_in_image) {  // both edges available
    if (!has_second_ref(above_mbmi) && !has_second_ref(left_mbmi))
    if (above_mbmi->ref_frame[1] <= INTRA_FRAME &&
        left_mbmi->ref_frame[1] <= INTRA_FRAME)
      // neither edge uses comp pred (0/1)
      pred_context = (above_mbmi->ref_frame[0] == cm->comp_fixed_ref) ^
                     (left_mbmi->ref_frame[0] == cm->comp_fixed_ref);
    else if (!has_second_ref(above_mbmi))
    else if (above_mbmi->ref_frame[1] <= INTRA_FRAME)
      // one of two edges uses comp pred (2/3)
      pred_context = 2 + (above_mbmi->ref_frame[0] == cm->comp_fixed_ref ||
                          !is_inter_block(above_mbmi));
    else if (!has_second_ref(left_mbmi))
                          above_mbmi->ref_frame[0] == INTRA_FRAME);
    else if (left_mbmi->ref_frame[1] <= INTRA_FRAME)
      // one of two edges uses comp pred (2/3)
      pred_context = 2 + (left_mbmi->ref_frame[0] == cm->comp_fixed_ref ||
                          !is_inter_block(left_mbmi));
                          left_mbmi->ref_frame[0] == INTRA_FRAME);
    else  // both edges use comp pred (4)
      pred_context = 4;
  } else if (above_in_image || left_in_image) {  // one edge available
    const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;

    if (!has_second_ref(edge_mbmi))
    if (edge_mbmi->ref_frame[1] <= INTRA_FRAME)
      // edge does not use comp pred (0/1)
      pred_context = edge_mbmi->ref_frame[0] == cm->comp_fixed_ref;
    else
@@ -126,14 +125,11 @@ unsigned char vp9_get_pred_context_comp_inter_inter(const VP9_COMMON *cm,
unsigned char vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm,
                                              const MACROBLOCKD *xd) {
  int pred_context;
  const MODE_INFO *const above_mi = xd->mi_8x8[-cm->mode_info_stride];
  const MODE_INFO *const left_mi = xd->mi_8x8[-1];
|
||||
const MB_MODE_INFO *const above_mbmi = above_mi ? &above_mi->mbmi : 0;
|
||||
const MB_MODE_INFO *const left_mbmi = left_mi ? &left_mi->mbmi : 0;
|
||||
const int left_in_image = xd->left_available && left_mi;
|
||||
const int above_in_image = xd->up_available && above_mi;
|
||||
const int left_intra = left_in_image ? !is_inter_block(left_mbmi) : 1;
|
||||
const int above_intra = above_in_image ? !is_inter_block(above_mbmi) : 1;
|
||||
const MODE_INFO *const mi = xd->mode_info_context;
|
||||
const MB_MODE_INFO *const above_mbmi = &mi[-cm->mode_info_stride].mbmi;
|
||||
const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi;
|
||||
const int left_in_image = xd->left_available && left_mbmi->mb_in_image;
|
||||
const int above_in_image = xd->up_available && above_mbmi->mb_in_image;
|
||||
// Note:
|
||||
// The mode info data structure has a one element border above and to the
|
||||
// left of the entries correpsonding to real macroblocks.
|
||||
@@ -142,19 +138,22 @@ unsigned char vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm,
|
||||
const int var_ref_idx = !fix_ref_idx;
|
||||
|
||||
if (above_in_image && left_in_image) { // both edges available
|
||||
if (above_intra && left_intra) { // intra/intra (2)
|
||||
if (above_mbmi->ref_frame[0] == INTRA_FRAME &&
|
||||
left_mbmi->ref_frame[0] == INTRA_FRAME) { // intra/intra (2)
|
||||
pred_context = 2;
|
||||
} else if (above_intra || left_intra) { // intra/inter
|
||||
const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
|
||||
} else if (above_mbmi->ref_frame[0] == INTRA_FRAME ||
|
||||
left_mbmi->ref_frame[0] == INTRA_FRAME) { // intra/inter
|
||||
const MB_MODE_INFO *edge_mbmi = above_mbmi->ref_frame[0] == INTRA_FRAME ?
|
||||
left_mbmi : above_mbmi;
|
||||
|
||||
if (!has_second_ref(edge_mbmi)) // single pred (1/3)
|
||||
if (edge_mbmi->ref_frame[1] <= INTRA_FRAME) // single pred (1/3)
|
||||
pred_context = 1 + 2 * (edge_mbmi->ref_frame[0] != cm->comp_var_ref[1]);
|
||||
else // comp pred (1/3)
|
||||
pred_context = 1 + 2 * (edge_mbmi->ref_frame[var_ref_idx]
|
||||
!= cm->comp_var_ref[1]);
|
||||
} else { // inter/inter
|
||||
const int l_sg = !has_second_ref(left_mbmi);
|
||||
const int a_sg = !has_second_ref(above_mbmi);
|
||||
int l_sg = left_mbmi->ref_frame[1] <= INTRA_FRAME;
|
||||
int a_sg = above_mbmi->ref_frame[1] <= INTRA_FRAME;
|
||||
MV_REFERENCE_FRAME vrfa = a_sg ? above_mbmi->ref_frame[0]
|
||||
: above_mbmi->ref_frame[var_ref_idx];
|
||||
MV_REFERENCE_FRAME vrfl = l_sg ? left_mbmi->ref_frame[0]
|
||||
@@ -188,15 +187,13 @@ unsigned char vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm,
|
||||
} else if (above_in_image || left_in_image) { // one edge available
|
||||
const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
|
||||
|
||||
if (!is_inter_block(edge_mbmi)) {
|
||||
if (edge_mbmi->ref_frame[0] == INTRA_FRAME)
|
||||
pred_context = 2;
|
||||
} else {
|
||||
if (has_second_ref(edge_mbmi))
|
||||
pred_context = 4 * (edge_mbmi->ref_frame[var_ref_idx]
|
||||
else if (edge_mbmi->ref_frame[1] > INTRA_FRAME)
|
||||
pred_context = 4 * (edge_mbmi->ref_frame[var_ref_idx]
|
||||
!= cm->comp_var_ref[1]);
|
||||
else
|
||||
pred_context = 3 * (edge_mbmi->ref_frame[0] != cm->comp_var_ref[1]);
|
||||
}
|
||||
else
|
||||
pred_context = 3 * (edge_mbmi->ref_frame[0] != cm->comp_var_ref[1]);
|
||||
} else { // no edges available (2)
|
||||
pred_context = 2;
|
||||
}
|
||||
@@ -206,91 +203,91 @@ unsigned char vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm,
|
||||
}
|
||||
unsigned char vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) {
|
||||
int pred_context;
|
||||
const MODE_INFO * const above_mi = xd->mi_8x8[-xd->mode_info_stride];
|
||||
const MODE_INFO * const left_mi = xd->mi_8x8[-1];
|
||||
const MB_MODE_INFO *const above_mbmi = above_mi ? &above_mi->mbmi : 0;
|
||||
const MB_MODE_INFO *const left_mbmi = left_mi ? &left_mi->mbmi : 0;
|
||||
const int left_in_image = xd->left_available && left_mi;
|
||||
const int above_in_image = xd->up_available && above_mi;
|
||||
const int left_intra = left_in_image ? !is_inter_block(left_mbmi) : 1;
|
||||
const int above_intra = above_in_image ? !is_inter_block(above_mbmi) : 1;
|
||||
const MODE_INFO *const mi = xd->mode_info_context;
|
||||
const MB_MODE_INFO *const above_mbmi = &mi[-xd->mode_info_stride].mbmi;
|
||||
const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi;
|
||||
const int left_in_image = xd->left_available && left_mbmi->mb_in_image;
|
||||
const int above_in_image = xd->up_available && above_mbmi->mb_in_image;
|
||||
// Note:
|
||||
// The mode info data structure has a one element border above and to the
|
||||
// left of the entries correpsonding to real macroblocks.
|
||||
// The prediction flags in these dummy entries are initialised to 0.
|
||||
if (above_in_image && left_in_image) { // both edges available
|
||||
if (above_intra && left_intra) { // intra/intra
|
||||
if (above_mbmi->ref_frame[0] == INTRA_FRAME &&
|
||||
left_mbmi->ref_frame[0] == INTRA_FRAME) {
|
||||
pred_context = 2;
|
||||
} else if (above_intra || left_intra) { // intra/inter or inter/intra
|
||||
const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
|
||||
if (!has_second_ref(edge_mbmi))
|
||||
} else if (above_mbmi->ref_frame[0] == INTRA_FRAME ||
|
||||
left_mbmi->ref_frame[0] == INTRA_FRAME) {
|
||||
const MB_MODE_INFO *edge_mbmi = above_mbmi->ref_frame[0] == INTRA_FRAME ?
|
||||
left_mbmi : above_mbmi;
|
||||
|
||||
if (edge_mbmi->ref_frame[1] <= INTRA_FRAME)
|
||||
pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST_FRAME);
|
||||
else
|
||||
pred_context = 1 + (edge_mbmi->ref_frame[0] == LAST_FRAME ||
|
||||
edge_mbmi->ref_frame[1] == LAST_FRAME);
|
||||
} else { // inter/inter
|
||||
if (!has_second_ref(above_mbmi) && !has_second_ref(left_mbmi)) {
|
||||
pred_context = 2 * (above_mbmi->ref_frame[0] == LAST_FRAME) +
|
||||
2 * (left_mbmi->ref_frame[0] == LAST_FRAME);
|
||||
} else if (has_second_ref(above_mbmi) && has_second_ref(left_mbmi)) {
|
||||
pred_context = 1 + (above_mbmi->ref_frame[0] == LAST_FRAME ||
|
||||
above_mbmi->ref_frame[1] == LAST_FRAME ||
|
||||
left_mbmi->ref_frame[0] == LAST_FRAME ||
|
||||
left_mbmi->ref_frame[1] == LAST_FRAME);
|
||||
} else {
|
||||
const MV_REFERENCE_FRAME rfs = !has_second_ref(above_mbmi) ?
|
||||
above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0];
|
||||
const MV_REFERENCE_FRAME crf1 = has_second_ref(above_mbmi) ?
|
||||
above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0];
|
||||
const MV_REFERENCE_FRAME crf2 = has_second_ref(above_mbmi) ?
|
||||
above_mbmi->ref_frame[1] : left_mbmi->ref_frame[1];
|
||||
} else if (above_mbmi->ref_frame[1] <= INTRA_FRAME &&
|
||||
left_mbmi->ref_frame[1] <= INTRA_FRAME) {
|
||||
pred_context = 2 * (above_mbmi->ref_frame[0] == LAST_FRAME) +
|
||||
2 * (left_mbmi->ref_frame[0] == LAST_FRAME);
|
||||
} else if (above_mbmi->ref_frame[1] > INTRA_FRAME &&
|
||||
left_mbmi->ref_frame[1] > INTRA_FRAME) {
|
||||
pred_context = 1 + (above_mbmi->ref_frame[0] == LAST_FRAME ||
|
||||
above_mbmi->ref_frame[1] == LAST_FRAME ||
|
||||
left_mbmi->ref_frame[0] == LAST_FRAME ||
|
||||
left_mbmi->ref_frame[1] == LAST_FRAME);
|
||||
} else {
|
||||
MV_REFERENCE_FRAME rfs = above_mbmi->ref_frame[1] <= INTRA_FRAME ?
|
||||
above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0];
|
||||
MV_REFERENCE_FRAME crf1 = above_mbmi->ref_frame[1] > INTRA_FRAME ?
|
||||
above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0];
|
||||
MV_REFERENCE_FRAME crf2 = above_mbmi->ref_frame[1] > INTRA_FRAME ?
|
||||
above_mbmi->ref_frame[1] : left_mbmi->ref_frame[1];
|
||||
|
||||
if (rfs == LAST_FRAME)
|
||||
pred_context = 3 + (crf1 == LAST_FRAME || crf2 == LAST_FRAME);
|
||||
else
|
||||
pred_context = crf1 == LAST_FRAME || crf2 == LAST_FRAME;
|
||||
}
|
||||
if (rfs == LAST_FRAME)
|
||||
pred_context = 3 + (crf1 == LAST_FRAME || crf2 == LAST_FRAME);
|
||||
else
|
||||
pred_context = crf1 == LAST_FRAME || crf2 == LAST_FRAME;
|
||||
}
|
||||
} else if (above_in_image || left_in_image) { // one edge available
|
||||
const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
|
||||
if (!is_inter_block(edge_mbmi)) { // intra
|
||||
|
||||
if (edge_mbmi->ref_frame[0] == INTRA_FRAME)
|
||||
pred_context = 2;
|
||||
} else { // inter
|
||||
if (!has_second_ref(edge_mbmi))
|
||||
pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST_FRAME);
|
||||
else
|
||||
pred_context = 1 + (edge_mbmi->ref_frame[0] == LAST_FRAME ||
|
||||
edge_mbmi->ref_frame[1] == LAST_FRAME);
|
||||
}
|
||||
} else { // no edges available
|
||||
else if (edge_mbmi->ref_frame[1] <= INTRA_FRAME)
|
||||
pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST_FRAME);
|
||||
else
|
||||
pred_context = 1 + (edge_mbmi->ref_frame[0] == LAST_FRAME ||
|
||||
edge_mbmi->ref_frame[1] == LAST_FRAME);
|
||||
} else { // no edges available (2)
|
||||
pred_context = 2;
|
||||
}
|
||||
|
||||
assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
|
||||
return pred_context;
|
||||
}
|
||||
|
||||
unsigned char vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) {
|
||||
int pred_context;
|
||||
const MODE_INFO * const above_mi = xd->mi_8x8[-xd->mode_info_stride];
|
||||
const MODE_INFO * const left_mi = xd->mi_8x8[-1];
|
||||
const MB_MODE_INFO *const above_mbmi = above_mi ? &above_mi->mbmi : 0;
|
||||
const MB_MODE_INFO *const left_mbmi = left_mi ? &left_mi->mbmi : 0;
|
||||
const int left_in_image = xd->left_available && left_mi;
|
||||
const int above_in_image = xd->up_available && above_mi;
|
||||
const int left_intra = left_in_image ? !is_inter_block(left_mbmi) : 1;
|
||||
const int above_intra = above_in_image ? !is_inter_block(above_mbmi) : 1;
|
||||
const MODE_INFO *const mi = xd->mode_info_context;
|
||||
const MB_MODE_INFO *const above_mbmi = &mi[-xd->mode_info_stride].mbmi;
|
||||
const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi;
|
||||
const int left_in_image = xd->left_available && left_mbmi->mb_in_image;
|
||||
const int above_in_image = xd->up_available && above_mbmi->mb_in_image;
|
||||
|
||||
// Note:
|
||||
// The mode info data structure has a one element border above and to the
|
||||
// left of the entries correpsonding to real macroblocks.
|
||||
// The prediction flags in these dummy entries are initialised to 0.
|
||||
if (above_in_image && left_in_image) { // both edges available
|
||||
if (above_intra && left_intra) { // intra/intra
|
||||
if (above_mbmi->ref_frame[0] == INTRA_FRAME &&
|
||||
left_mbmi->ref_frame[0] == INTRA_FRAME) {
|
||||
pred_context = 2;
|
||||
} else if (above_intra || left_intra) { // intra/inter or inter/intra
|
||||
const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
|
||||
if (!has_second_ref(edge_mbmi)) {
|
||||
} else if (above_mbmi->ref_frame[0] == INTRA_FRAME ||
|
||||
left_mbmi->ref_frame[0] == INTRA_FRAME) {
|
||||
const MB_MODE_INFO *edge_mbmi = above_mbmi->ref_frame[0] == INTRA_FRAME ?
|
||||
left_mbmi : above_mbmi;
|
||||
|
||||
if (edge_mbmi->ref_frame[1] <= INTRA_FRAME) {
|
||||
if (edge_mbmi->ref_frame[0] == LAST_FRAME)
|
||||
pred_context = 3;
|
||||
else
|
||||
@@ -299,53 +296,54 @@ unsigned char vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) {
|
||||
pred_context = 1 + 2 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME ||
|
||||
edge_mbmi->ref_frame[1] == GOLDEN_FRAME);
|
||||
}
|
||||
} else { // inter/inter
|
||||
if (!has_second_ref(above_mbmi) && !has_second_ref(left_mbmi)) {
|
||||
if (above_mbmi->ref_frame[0] == LAST_FRAME &&
|
||||
left_mbmi->ref_frame[0] == LAST_FRAME) {
|
||||
pred_context = 3;
|
||||
} else if (above_mbmi->ref_frame[0] == LAST_FRAME ||
|
||||
left_mbmi->ref_frame[0] == LAST_FRAME) {
|
||||
const MB_MODE_INFO *edge_mbmi =
|
||||
above_mbmi->ref_frame[0] == LAST_FRAME ? left_mbmi : above_mbmi;
|
||||
} else if (above_mbmi->ref_frame[1] <= INTRA_FRAME &&
|
||||
left_mbmi->ref_frame[1] <= INTRA_FRAME) {
|
||||
if (above_mbmi->ref_frame[0] == LAST_FRAME &&
|
||||
left_mbmi->ref_frame[0] == LAST_FRAME) {
|
||||
pred_context = 3;
|
||||
} else if (above_mbmi->ref_frame[0] == LAST_FRAME ||
|
||||
left_mbmi->ref_frame[0] == LAST_FRAME) {
|
||||
const MB_MODE_INFO *edge_mbmi = above_mbmi->ref_frame[0] == LAST_FRAME ?
|
||||
left_mbmi : above_mbmi;
|
||||
|
||||
pred_context = 4 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME);
|
||||
} else {
|
||||
pred_context = 2 * (above_mbmi->ref_frame[0] == GOLDEN_FRAME) +
|
||||
2 * (left_mbmi->ref_frame[0] == GOLDEN_FRAME);
|
||||
}
|
||||
} else if (has_second_ref(above_mbmi) && has_second_ref(left_mbmi)) {
|
||||
if (above_mbmi->ref_frame[0] == left_mbmi->ref_frame[0] &&
|
||||
above_mbmi->ref_frame[1] == left_mbmi->ref_frame[1])
|
||||
pred_context = 3 * (above_mbmi->ref_frame[0] == GOLDEN_FRAME ||
|
||||
above_mbmi->ref_frame[1] == GOLDEN_FRAME ||
|
||||
left_mbmi->ref_frame[0] == GOLDEN_FRAME ||
|
||||
left_mbmi->ref_frame[1] == GOLDEN_FRAME);
|
||||
else
|
||||
pred_context = 2;
|
||||
pred_context = 4 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME);
|
||||
} else {
|
||||
const MV_REFERENCE_FRAME rfs = !has_second_ref(above_mbmi) ?
|
||||
above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0];
|
||||
const MV_REFERENCE_FRAME crf1 = has_second_ref(above_mbmi) ?
|
||||
above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0];
|
||||
const MV_REFERENCE_FRAME crf2 = has_second_ref(above_mbmi) ?
|
||||
above_mbmi->ref_frame[1] : left_mbmi->ref_frame[1];
|
||||
|
||||
if (rfs == GOLDEN_FRAME)
|
||||
pred_context = 3 + (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME);
|
||||
else if (rfs == ALTREF_FRAME)
|
||||
pred_context = crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME;
|
||||
else
|
||||
pred_context = 1 + 2 * (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME);
|
||||
pred_context = 2 * (above_mbmi->ref_frame[0] == GOLDEN_FRAME) +
|
||||
2 * (left_mbmi->ref_frame[0] == GOLDEN_FRAME);
|
||||
}
|
||||
} else if (above_mbmi->ref_frame[1] > INTRA_FRAME &&
|
||||
left_mbmi->ref_frame[1] > INTRA_FRAME) {
|
||||
if (above_mbmi->ref_frame[0] == left_mbmi->ref_frame[0] &&
|
||||
above_mbmi->ref_frame[1] == left_mbmi->ref_frame[1])
|
||||
pred_context = 3 * (above_mbmi->ref_frame[0] == GOLDEN_FRAME ||
|
||||
above_mbmi->ref_frame[1] == GOLDEN_FRAME ||
|
||||
left_mbmi->ref_frame[0] == GOLDEN_FRAME ||
|
||||
left_mbmi->ref_frame[1] == GOLDEN_FRAME);
|
||||
else
|
||||
pred_context = 2;
|
||||
} else {
|
||||
MV_REFERENCE_FRAME rfs = above_mbmi->ref_frame[1] <= INTRA_FRAME ?
|
||||
above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0];
|
||||
MV_REFERENCE_FRAME crf1 = above_mbmi->ref_frame[1] > INTRA_FRAME ?
|
||||
above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0];
|
||||
MV_REFERENCE_FRAME crf2 = above_mbmi->ref_frame[1] > INTRA_FRAME ?
|
||||
above_mbmi->ref_frame[1] : left_mbmi->ref_frame[1];
|
||||
|
||||
if (rfs == GOLDEN_FRAME)
|
||||
pred_context = 3 + (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME);
|
||||
else if (rfs == ALTREF_FRAME)
|
||||
pred_context = crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME;
|
||||
else
|
||||
pred_context = 1 + 2 * (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME);
|
||||
}
|
||||
} else if (above_in_image || left_in_image) { // one edge available
|
||||
const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
|
||||
|
||||
if (!is_inter_block(edge_mbmi) ||
|
||||
(edge_mbmi->ref_frame[0] == LAST_FRAME && !has_second_ref(edge_mbmi)))
|
||||
if (edge_mbmi->ref_frame[0] == INTRA_FRAME ||
|
||||
(edge_mbmi->ref_frame[0] == LAST_FRAME &&
|
||||
edge_mbmi->ref_frame[1] <= INTRA_FRAME))
|
||||
pred_context = 2;
|
||||
else if (!has_second_ref(edge_mbmi))
|
||||
else if (edge_mbmi->ref_frame[1] <= INTRA_FRAME)
|
||||
pred_context = 4 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME);
|
||||
else
|
||||
pred_context = 3 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME ||
|
||||
@@ -361,23 +359,22 @@ unsigned char vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) {
|
||||
// left of the entries corresponding to real blocks.
|
||||
// The prediction flags in these dummy entries are initialized to 0.
|
||||
unsigned char vp9_get_pred_context_tx_size(const MACROBLOCKD *xd) {
|
||||
const MODE_INFO * const above_mi = xd->mi_8x8[-xd->mode_info_stride];
|
||||
const MODE_INFO * const left_mi = xd->mi_8x8[-1];
|
||||
const MB_MODE_INFO *const above_mbmi = above_mi ? &above_mi->mbmi : 0;
|
||||
const MB_MODE_INFO *const left_mbmi = left_mi ? &left_mi->mbmi : 0;
|
||||
const int left_in_image = xd->left_available && left_mi;
|
||||
const int above_in_image = xd->up_available && above_mi;
|
||||
const int max_tx_size = max_txsize_lookup[xd->mi_8x8[0]->mbmi.sb_type];
|
||||
const MODE_INFO *const mi = xd->mode_info_context;
|
||||
const MB_MODE_INFO *const above_mbmi = &mi[-xd->mode_info_stride].mbmi;
|
||||
const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi;
|
||||
const int left_in_image = xd->left_available && left_mbmi->mb_in_image;
|
||||
const int above_in_image = xd->up_available && above_mbmi->mb_in_image;
|
||||
const int max_tx_size = max_txsize_lookup[mi->mbmi.sb_type];
|
||||
int above_context = max_tx_size;
|
||||
int left_context = max_tx_size;
|
||||
|
||||
if (above_in_image)
|
||||
above_context = above_mbmi->skip_coeff ? max_tx_size
|
||||
: above_mbmi->tx_size;
|
||||
above_context = above_mbmi->mb_skip_coeff ? max_tx_size
|
||||
: above_mbmi->txfm_size;
|
||||
|
||||
if (left_in_image)
|
||||
left_context = left_mbmi->skip_coeff ? max_tx_size
|
||||
: left_mbmi->tx_size;
|
||||
left_context = left_mbmi->mb_skip_coeff ? max_tx_size
|
||||
: left_mbmi->txfm_size;
|
||||
|
||||
if (!left_in_image)
|
||||
left_context = above_context;
|
||||
@@ -388,17 +385,36 @@ unsigned char vp9_get_pred_context_tx_size(const MACROBLOCKD *xd) {
|
||||
return above_context + left_context > max_tx_size;
|
||||
}
|
||||
|
||||
void vp9_set_pred_flag_seg_id(MACROBLOCKD *xd, uint8_t pred_flag) {
|
||||
xd->this_mi->mbmi.seg_id_predicted = pred_flag;
|
||||
void vp9_set_pred_flag_seg_id(VP9_COMMON *cm, BLOCK_SIZE_TYPE bsize,
|
||||
int mi_row, int mi_col, uint8_t pred_flag) {
|
||||
MODE_INFO *mi = &cm->mi[mi_row * cm->mode_info_stride + mi_col];
|
||||
const int bw = 1 << mi_width_log2(bsize);
|
||||
const int bh = 1 << mi_height_log2(bsize);
|
||||
const int xmis = MIN(cm->mi_cols - mi_col, bw);
|
||||
const int ymis = MIN(cm->mi_rows - mi_row, bh);
|
||||
int x, y;
|
||||
|
||||
for (y = 0; y < ymis; y++)
|
||||
for (x = 0; x < xmis; x++)
|
||||
mi[y * cm->mode_info_stride + x].mbmi.seg_id_predicted = pred_flag;
|
||||
}
|
||||
|
||||
void vp9_set_pred_flag_mbskip(MACROBLOCKD *xd, BLOCK_SIZE bsize,
|
||||
uint8_t pred_flag) {
|
||||
xd->this_mi->mbmi.skip_coeff = pred_flag;
|
||||
void vp9_set_pred_flag_mbskip(VP9_COMMON *cm, BLOCK_SIZE_TYPE bsize,
|
||||
int mi_row, int mi_col, uint8_t pred_flag) {
|
||||
MODE_INFO *mi = &cm->mi[mi_row * cm->mode_info_stride + mi_col];
|
||||
const int bw = 1 << mi_width_log2(bsize);
|
||||
const int bh = 1 << mi_height_log2(bsize);
|
||||
const int xmis = MIN(cm->mi_cols - mi_col, bw);
|
||||
const int ymis = MIN(cm->mi_rows - mi_row, bh);
|
||||
int x, y;
|
||||
|
||||
for (y = 0; y < ymis; y++)
|
||||
for (x = 0; x < xmis; x++)
|
||||
mi[y * cm->mode_info_stride + x].mbmi.mb_skip_coeff = pred_flag;
|
||||
}
|
||||
|
||||
int vp9_get_segment_id(VP9_COMMON *cm, const uint8_t *segment_ids,
|
||||
BLOCK_SIZE bsize, int mi_row, int mi_col) {
|
||||
BLOCK_SIZE_TYPE bsize, int mi_row, int mi_col) {
|
||||
const int mi_offset = mi_row * cm->mi_cols + mi_col;
|
||||
const int bw = 1 << mi_width_log2(bsize);
|
||||
const int bh = 1 << mi_height_log2(bsize);
|
||||
|
||||
@@ -15,32 +15,32 @@
|
||||
#include "vp9/common/vp9_onyxc_int.h"
|
||||
|
||||
int vp9_get_segment_id(VP9_COMMON *cm, const uint8_t *segment_ids,
|
||||
BLOCK_SIZE bsize, int mi_row, int mi_col);
|
||||
BLOCK_SIZE_TYPE bsize, int mi_row, int mi_col);
|
||||
|
||||
|
||||
static INLINE int vp9_get_pred_context_seg_id(const MACROBLOCKD *xd) {
|
||||
const MODE_INFO * const above_mi = xd->mi_8x8[-xd->mode_info_stride];
|
||||
const MODE_INFO * const left_mi = xd->mi_8x8[-1];
|
||||
const int above_sip = above_mi ? above_mi->mbmi.seg_id_predicted : 0;
|
||||
const int left_sip = left_mi ? left_mi->mbmi.seg_id_predicted : 0;
|
||||
const MODE_INFO *const mi = xd->mode_info_context;
|
||||
const MB_MODE_INFO *const above_mbmi = &mi[-xd->mode_info_stride].mbmi;
|
||||
const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi;
|
||||
|
||||
return above_sip + (xd->left_available ? left_sip : 0);
|
||||
return above_mbmi->seg_id_predicted +
|
||||
(xd->left_available ? left_mbmi->seg_id_predicted : 0);
|
||||
}
|
||||
|
||||
static INLINE vp9_prob vp9_get_pred_prob_seg_id(struct segmentation *seg,
|
||||
const MACROBLOCKD *xd) {
|
||||
return seg->pred_probs[vp9_get_pred_context_seg_id(xd)];
|
||||
static INLINE vp9_prob vp9_get_pred_prob_seg_id(const MACROBLOCKD *xd) {
|
||||
return xd->seg.pred_probs[vp9_get_pred_context_seg_id(xd)];
|
||||
}
|
||||
|
||||
void vp9_set_pred_flag_seg_id(MACROBLOCKD *xd, uint8_t pred_flag);
|
||||
void vp9_set_pred_flag_seg_id(VP9_COMMON *cm, BLOCK_SIZE_TYPE bsize,
|
||||
int mi_row, int mi_col, uint8_t pred_flag);
|
||||
|
||||
static INLINE int vp9_get_pred_context_mbskip(const MACROBLOCKD *xd) {
|
||||
const MODE_INFO * const above_mi = xd->mi_8x8[-xd->mode_info_stride];
|
||||
const MODE_INFO * const left_mi = xd->mi_8x8[-1];
|
||||
const int above_skip_coeff = above_mi ? above_mi->mbmi.skip_coeff : 0;
|
||||
const int left_skip_coeff = left_mi ? left_mi->mbmi.skip_coeff : 0;
|
||||
const MODE_INFO *const mi = xd->mode_info_context;
|
||||
const MB_MODE_INFO *const above_mbmi = &mi[-xd->mode_info_stride].mbmi;
|
||||
const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi;
|
||||
|
||||
return above_skip_coeff + (xd->left_available ? left_skip_coeff : 0);
|
||||
return above_mbmi->mb_skip_coeff +
|
||||
(xd->left_available ? left_mbmi->mb_skip_coeff : 0);
|
||||
}
|
||||
|
||||
static INLINE vp9_prob vp9_get_pred_prob_mbskip(const VP9_COMMON *cm,
|
||||
@@ -49,11 +49,11 @@ static INLINE vp9_prob vp9_get_pred_prob_mbskip(const VP9_COMMON *cm,
|
||||
}
|
||||
|
||||
static INLINE unsigned char vp9_get_pred_flag_mbskip(const MACROBLOCKD *xd) {
|
||||
return xd->this_mi->mbmi.skip_coeff;
|
||||
return xd->mode_info_context->mbmi.mb_skip_coeff;
|
||||
}
|
||||
|
||||
void vp9_set_pred_flag_mbskip(MACROBLOCKD *xd, BLOCK_SIZE bsize,
|
||||
uint8_t pred_flag);
|
||||
void vp9_set_pred_flag_mbskip(VP9_COMMON *cm, BLOCK_SIZE_TYPE bsize,
|
||||
int mi_row, int mi_col, uint8_t pred_flag);
|
||||
|
||||
unsigned char vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd);
|
||||
|
||||
@@ -102,7 +102,7 @@ static INLINE vp9_prob vp9_get_pred_prob_single_ref_p2(const VP9_COMMON *cm,
|
||||
|
||||
unsigned char vp9_get_pred_context_tx_size(const MACROBLOCKD *xd);
|
||||
|
||||
static const vp9_prob *get_tx_probs(BLOCK_SIZE bsize, uint8_t context,
|
||||
static const vp9_prob *get_tx_probs(BLOCK_SIZE_TYPE bsize, uint8_t context,
|
||||
const struct tx_probs *tx_probs) {
|
||||
if (bsize < BLOCK_16X16)
|
||||
return tx_probs->p8x8[context];
|
||||
@@ -113,14 +113,13 @@ static const vp9_prob *get_tx_probs(BLOCK_SIZE bsize, uint8_t context,
|
||||
}
|
||||
|
||||
static const vp9_prob *get_tx_probs2(const MACROBLOCKD *xd,
|
||||
const struct tx_probs *tx_probs,
|
||||
const MODE_INFO *m) {
|
||||
const BLOCK_SIZE bsize = m->mbmi.sb_type;
|
||||
const struct tx_probs *tx_probs) {
|
||||
const BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;
|
||||
const int context = vp9_get_pred_context_tx_size(xd);
|
||||
return get_tx_probs(bsize, context, tx_probs);
|
||||
}
|
||||
|
||||
static void update_tx_counts(BLOCK_SIZE bsize, uint8_t context,
|
||||
static void update_tx_counts(BLOCK_SIZE_TYPE bsize, uint8_t context,
|
||||
TX_SIZE tx_size, struct tx_counts *tx_counts) {
|
||||
if (bsize >= BLOCK_32X32)
|
||||
tx_counts->p32x32[context][tx_size]++;
|
||||
|
||||
@@ -130,12 +130,12 @@ int16_t vp9_ac_quant(int qindex, int delta) {
|
||||
}
|
||||
|
||||
|
||||
int vp9_get_qindex(struct segmentation *seg, int segment_id, int base_qindex) {
|
||||
if (vp9_segfeature_active(seg, segment_id, SEG_LVL_ALT_Q)) {
|
||||
const int data = vp9_get_segdata(seg, segment_id, SEG_LVL_ALT_Q);
|
||||
return seg->abs_delta == SEGMENT_ABSDATA ?
|
||||
data : // Abs value
|
||||
clamp(base_qindex + data, 0, MAXQ); // Delta value
|
||||
int vp9_get_qindex(MACROBLOCKD *xd, int segment_id, int base_qindex) {
|
||||
if (vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_ALT_Q)) {
|
||||
const int data = vp9_get_segdata(&xd->seg, segment_id, SEG_LVL_ALT_Q);
|
||||
return xd->seg.abs_delta == SEGMENT_ABSDATA ?
|
||||
data : // Abs value
|
||||
clamp(base_qindex + data, 0, MAXQ); // Delta value
|
||||
} else {
|
||||
return base_qindex;
|
||||
}
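
For illustration only, the SEG_LVL_ALT_Q selection above reduces to the following self-contained sketch (hypothetical helper name; assumes MAXQ == 255 and that seg_data carries the segment's SEG_LVL_ALT_Q value):

/* Sketch of the qindex selection: absolute segment data replaces the base
 * qindex outright; delta data is added to it and clamped to [0, MAXQ]. */
static int sketch_get_qindex(int feature_active, int abs_delta,
                             int seg_data, int base_qindex) {
  if (!feature_active)
    return base_qindex;              /* segment feature disabled */
  if (abs_delta)                     /* SEGMENT_ABSDATA: data is the qindex */
    return seg_data;
  seg_data += base_qindex;           /* delta against the base index */
  return seg_data < 0 ? 0 : (seg_data > 255 ? 255 : seg_data);
}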

@@ -23,6 +23,6 @@ void vp9_init_quant_tables();
int16_t vp9_dc_quant(int qindex, int delta);
int16_t vp9_ac_quant(int qindex, int delta);

int vp9_get_qindex(struct segmentation *seg, int segment_id, int base_qindex);
int vp9_get_qindex(MACROBLOCKD *mb, int segment_id, int base_qindex);

#endif  // VP9_COMMON_VP9_QUANT_COMMON_H_

@@ -10,27 +10,171 @@

#include <assert.h>

#include "./vpx_scale_rtcd.h"
#include "./vpx_config.h"

#include "vpx/vpx_integer.h"

#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_filter.h"
#include "vp9/common/vp9_reconinter.h"
#include "vp9/common/vp9_reconintra.h"
#include "./vpx_scale_rtcd.h"

static int scale_value_x_with_scaling(int val,
                                      const struct scale_factors *scale) {
  return (val * scale->x_scale_fp >> VP9_REF_SCALE_SHIFT);
}

static int scale_value_y_with_scaling(int val,
                                      const struct scale_factors *scale) {
  return (val * scale->y_scale_fp >> VP9_REF_SCALE_SHIFT);
}

static int unscaled_value(int val, const struct scale_factors *scale) {
  (void) scale;
  return val;
}

static MV32 mv_q3_to_q4_with_scaling(const MV *mv,
                                     const struct scale_factors *scale) {
  const MV32 res = {
    ((mv->row << 1) * scale->y_scale_fp >> VP9_REF_SCALE_SHIFT)
        + scale->y_offset_q4,
    ((mv->col << 1) * scale->x_scale_fp >> VP9_REF_SCALE_SHIFT)
        + scale->x_offset_q4
  };
  return res;
}

static MV32 mv_q3_to_q4_without_scaling(const MV *mv,
                                        const struct scale_factors *scale) {
  const MV32 res = {
    mv->row << 1,
    mv->col << 1
  };
  return res;
}

static MV32 mv_q4_with_scaling(const MV *mv,
                               const struct scale_factors *scale) {
  const MV32 res = {
    (mv->row * scale->y_scale_fp >> VP9_REF_SCALE_SHIFT) + scale->y_offset_q4,
    (mv->col * scale->x_scale_fp >> VP9_REF_SCALE_SHIFT) + scale->x_offset_q4
  };
  return res;
}

static MV32 mv_q4_without_scaling(const MV *mv,
                                  const struct scale_factors *scale) {
  const MV32 res = {
    mv->row,
    mv->col
  };
  return res;
}
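
A quick arithmetic check on the conversions above (q3 motion vectors are in 1/8-pel units and q4 in 1/16-pel units, hence the << 1; the factor value is an assumed example):

/* With y_scale_fp = 24576 (1.5x, assuming VP9_REF_SCALE_SHIFT == 14) and a
 * zero offset, mv_q3_to_q4_with_scaling() maps row 3 (q3) to
 * ((3 << 1) * 24576) >> 14 = 9 (q4); without scaling, row 3 is simply
 * 3 << 1 = 6. */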

static void set_offsets_with_scaling(struct scale_factors *scale,
                                     int row, int col) {
  const int x_q4 = 16 * col;
  const int y_q4 = 16 * row;

  scale->x_offset_q4 = (x_q4 * scale->x_scale_fp >> VP9_REF_SCALE_SHIFT) & 0xf;
  scale->y_offset_q4 = (y_q4 * scale->y_scale_fp >> VP9_REF_SCALE_SHIFT) & 0xf;
}

static void set_offsets_without_scaling(struct scale_factors *scale,
                                        int row, int col) {
  scale->x_offset_q4 = 0;
  scale->y_offset_q4 = 0;
}

static int get_fixed_point_scale_factor(int other_size, int this_size) {
  // Calculate scaling factor once for each reference frame
  // and use fixed point scaling factors in decoding and encoding routines.
  // Hardware implementations can calculate scale factor in device driver
  // and use multiplication and shifting on hardware instead of division.
  return (other_size << VP9_REF_SCALE_SHIFT) / this_size;
}
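
A worked example of the fixed-point factor above, assuming VP9_REF_SCALE_SHIFT == 14 as used with this struct:

/* Scaling a 1920-wide reference to a 1280-wide frame:
 * x_scale_fp = (1920 << 14) / 1280 = 24576, i.e. 1.5 in Q14, so each
 * coordinate scales with one multiply and one shift:
 * (100 * 24576) >> 14 = 150. */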

void vp9_setup_scale_factors_for_frame(struct scale_factors *scale,
                                       int other_w, int other_h,
                                       int this_w, int this_h) {
  scale->x_scale_fp = get_fixed_point_scale_factor(other_w, this_w);
  scale->x_offset_q4 = 0;  // calculated per-mb
  scale->x_step_q4 = (16 * scale->x_scale_fp >> VP9_REF_SCALE_SHIFT);

  scale->y_scale_fp = get_fixed_point_scale_factor(other_h, this_h);
  scale->y_offset_q4 = 0;  // calculated per-mb
  scale->y_step_q4 = (16 * scale->y_scale_fp >> VP9_REF_SCALE_SHIFT);

  if ((other_w == this_w) && (other_h == this_h)) {
    scale->scale_value_x = unscaled_value;
    scale->scale_value_y = unscaled_value;
    scale->set_scaled_offsets = set_offsets_without_scaling;
    scale->scale_mv_q3_to_q4 = mv_q3_to_q4_without_scaling;
    scale->scale_mv_q4 = mv_q4_without_scaling;
  } else {
    scale->scale_value_x = scale_value_x_with_scaling;
    scale->scale_value_y = scale_value_y_with_scaling;
    scale->set_scaled_offsets = set_offsets_with_scaling;
    scale->scale_mv_q3_to_q4 = mv_q3_to_q4_with_scaling;
    scale->scale_mv_q4 = mv_q4_with_scaling;
  }

  // TODO(agrange): Investigate the best choice of functions to use here
  // for EIGHTTAP_SMOOTH. Since it is not interpolating, need to choose what
  // to do at full-pel offsets. The current selection, where the filter is
  // applied in one direction only, and not at all for 0,0, seems to give the
  // best quality, but it may be worth trying an additional mode that does
  // do the filtering on full-pel.
  if (scale->x_step_q4 == 16) {
    if (scale->y_step_q4 == 16) {
      // No scaling in either direction.
      scale->predict[0][0][0] = vp9_convolve_copy;
      scale->predict[0][0][1] = vp9_convolve_avg;
      scale->predict[0][1][0] = vp9_convolve8_vert;
      scale->predict[0][1][1] = vp9_convolve8_avg_vert;
      scale->predict[1][0][0] = vp9_convolve8_horiz;
      scale->predict[1][0][1] = vp9_convolve8_avg_horiz;
    } else {
      // No scaling in x direction. Must always scale in the y direction.
      scale->predict[0][0][0] = vp9_convolve8_vert;
      scale->predict[0][0][1] = vp9_convolve8_avg_vert;
      scale->predict[0][1][0] = vp9_convolve8_vert;
      scale->predict[0][1][1] = vp9_convolve8_avg_vert;
      scale->predict[1][0][0] = vp9_convolve8;
      scale->predict[1][0][1] = vp9_convolve8_avg;
    }
  } else {
    if (scale->y_step_q4 == 16) {
      // No scaling in the y direction. Must always scale in the x direction.
      scale->predict[0][0][0] = vp9_convolve8_horiz;
      scale->predict[0][0][1] = vp9_convolve8_avg_horiz;
      scale->predict[0][1][0] = vp9_convolve8;
      scale->predict[0][1][1] = vp9_convolve8_avg;
      scale->predict[1][0][0] = vp9_convolve8_horiz;
      scale->predict[1][0][1] = vp9_convolve8_avg_horiz;
    } else {
      // Must always scale in both directions.
      scale->predict[0][0][0] = vp9_convolve8;
      scale->predict[0][0][1] = vp9_convolve8_avg;
      scale->predict[0][1][0] = vp9_convolve8;
      scale->predict[0][1][1] = vp9_convolve8_avg;
      scale->predict[1][0][0] = vp9_convolve8;
      scale->predict[1][0][1] = vp9_convolve8_avg;
    }
  }
  // 2D subpel motion always gets filtered in both directions
  scale->predict[1][1][0] = vp9_convolve8;
  scale->predict[1][1][1] = vp9_convolve8_avg;
}

void vp9_setup_interp_filters(MACROBLOCKD *xd,
                              INTERPOLATIONFILTERTYPE mcomp_filter_type,
                              VP9_COMMON *cm) {
  if (xd->mi_8x8 && xd->this_mi) {
    MB_MODE_INFO * mbmi = &xd->this_mi->mbmi;
  if (xd->mode_info_context) {
    MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;

    set_scale_factors(xd, mbmi->ref_frame[0] - 1, mbmi->ref_frame[1] - 1,
                      cm->active_ref_scale);
  } else {
    set_scale_factors(xd, -1, -1, cm->active_ref_scale);
  }

  switch (mcomp_filter_type) {
@@ -55,18 +199,17 @@ void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
                               uint8_t *dst, int dst_stride,
                               const MV *src_mv,
                               const struct scale_factors *scale,
                               int w, int h, int ref,
                               int w, int h, int weight,
                               const struct subpix_fn_table *subpix,
                               enum mv_precision precision) {
  const int is_q4 = precision == MV_PRECISION_Q4;
  const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row << 1,
                     is_q4 ? src_mv->col : src_mv->col << 1 };
  const MV32 mv = scale->scale_mv(&mv_q4, scale);
  const int subpel_x = mv.col & SUBPEL_MASK;
  const int subpel_y = mv.row & SUBPEL_MASK;
  const MV32 mv = precision == MV_PRECISION_Q4
      ? scale->scale_mv_q4(src_mv, scale)
      : scale->scale_mv_q3_to_q4(src_mv, scale);
  const int subpel_x = mv.col & 15;
  const int subpel_y = mv.row & 15;

  src += (mv.row >> SUBPEL_BITS) * src_stride + (mv.col >> SUBPEL_BITS);
  scale->predict[subpel_x != 0][subpel_y != 0][ref](
  src += (mv.row >> 4) * src_stride + (mv.col >> 4);
  scale->predict[!!subpel_x][!!subpel_y][weight](
      src, src_stride, dst, dst_stride,
      subpix->filter_x[subpel_x], scale->x_step_q4,
      subpix->filter_y[subpel_y], scale->y_step_q4,
@@ -89,16 +232,20 @@ static MV mi_mv_pred_q4(const MODE_INFO *mi, int idx) {
  return res;
}



// TODO(jkoleszar): yet another mv clamping function :-(
MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, const MV *src_mv,
                             int bw, int bh, int ss_x, int ss_y) {
MV clamp_mv_to_umv_border_sb(const MV *src_mv,
                             int bwl, int bhl, int ss_x, int ss_y,
                             int mb_to_left_edge, int mb_to_top_edge,
                             int mb_to_right_edge, int mb_to_bottom_edge) {
  // If the MV points so far into the UMV border that no visible pixels
  // are used for reconstruction, the subpel part of the MV can be
  // discarded and the MV limited to 16 pixels with equivalent results.
  const int spel_left = (VP9_INTERP_EXTEND + bw) << SUBPEL_BITS;
  const int spel_right = spel_left - SUBPEL_SHIFTS;
  const int spel_top = (VP9_INTERP_EXTEND + bh) << SUBPEL_BITS;
  const int spel_bottom = spel_top - SUBPEL_SHIFTS;
  const int spel_left = (VP9_INTERP_EXTEND + (4 << bwl)) << 4;
  const int spel_right = spel_left - (1 << 4);
  const int spel_top = (VP9_INTERP_EXTEND + (4 << bhl)) << 4;
  const int spel_bottom = spel_top - (1 << 4);
  MV clamped_mv = {
    src_mv->row << (1 - ss_y),
    src_mv->col << (1 - ss_x)
@@ -106,143 +253,461 @@ MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, const MV *src_mv,
  assert(ss_x <= 1);
  assert(ss_y <= 1);

  clamp_mv(&clamped_mv, (xd->mb_to_left_edge << (1 - ss_x)) - spel_left,
           (xd->mb_to_right_edge << (1 - ss_x)) + spel_right,
           (xd->mb_to_top_edge << (1 - ss_y)) - spel_top,
           (xd->mb_to_bottom_edge << (1 - ss_y)) + spel_bottom);
  clamp_mv(&clamped_mv, (mb_to_left_edge << (1 - ss_x)) - spel_left,
           (mb_to_right_edge << (1 - ss_x)) + spel_right,
           (mb_to_top_edge << (1 - ss_y)) - spel_top,
           (mb_to_bottom_edge << (1 - ss_y)) + spel_bottom);

  return clamped_mv;
}

struct build_inter_predictors_args {
  MACROBLOCKD *xd;
  int x, y;
#if CONFIG_MASKED_INTERINTER
#define MASK_WEIGHT_BITS 6

static int get_masked_weight(int m) {
#define SMOOTHER_LEN 32
  static const uint8_t smoothfn[2 * SMOOTHER_LEN + 1] = {
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 1, 1, 1,
    1, 1, 2, 2, 3, 4, 5, 6,
    8, 9, 12, 14, 17, 21, 24, 28,
    32,
    36, 40, 43, 47, 50, 52, 55, 56,
    58, 59, 60, 61, 62, 62, 63, 63,
    63, 63, 63, 64, 64, 64, 64, 64,
    64, 64, 64, 64, 64, 64, 64, 64,
  };
  if (m < -SMOOTHER_LEN)
    return 0;
  else if (m > SMOOTHER_LEN)
    return (1 << MASK_WEIGHT_BITS);
  else
    return smoothfn[m + SMOOTHER_LEN];
}

static int get_hard_mask(int m) {
  return m > 0;
}

// Equation of line: f(x, y) = a[0]*(x - a[2]*w/4) + a[1]*(y - a[3]*h/4) = 0
// The soft mask is obtained by computing f(x, y) and then calling
// get_masked_weight(f(x, y)).
static const int mask_params_sml[1 << MASK_BITS_SML][4] = {
  {-1, 2, 2, 2},
  { 1, -2, 2, 2},
  {-2, 1, 2, 2},
  { 2, -1, 2, 2},
  { 2, 1, 2, 2},
  {-2, -1, 2, 2},
  { 1, 2, 2, 2},
  {-1, -2, 2, 2},
};

static void build_inter_predictors(int plane, int block, BLOCK_SIZE bsize,
static const int mask_params_med_hgtw[1 << MASK_BITS_MED][4] = {
  {-1, 2, 2, 2},
  { 1, -2, 2, 2},
  {-2, 1, 2, 2},
  { 2, -1, 2, 2},
  { 2, 1, 2, 2},
  {-2, -1, 2, 2},
  { 1, 2, 2, 2},
  {-1, -2, 2, 2},

  {-1, 2, 2, 1},
  { 1, -2, 2, 1},
  {-1, 2, 2, 3},
  { 1, -2, 2, 3},
  { 1, 2, 2, 1},
  {-1, -2, 2, 1},
  { 1, 2, 2, 3},
  {-1, -2, 2, 3},
};

static const int mask_params_med_hltw[1 << MASK_BITS_MED][4] = {
  {-1, 2, 2, 2},
  { 1, -2, 2, 2},
  {-2, 1, 2, 2},
  { 2, -1, 2, 2},
  { 2, 1, 2, 2},
  {-2, -1, 2, 2},
  { 1, 2, 2, 2},
  {-1, -2, 2, 2},

  {-2, 1, 1, 2},
  { 2, -1, 1, 2},
  {-2, 1, 3, 2},
  { 2, -1, 3, 2},
  { 2, 1, 1, 2},
  {-2, -1, 1, 2},
  { 2, 1, 3, 2},
  {-2, -1, 3, 2},
};

static const int mask_params_med_heqw[1 << MASK_BITS_MED][4] = {
  {-1, 2, 2, 2},
  { 1, -2, 2, 2},
  {-2, 1, 2, 2},
  { 2, -1, 2, 2},
  { 2, 1, 2, 2},
  {-2, -1, 2, 2},
  { 1, 2, 2, 2},
  {-1, -2, 2, 2},

  { 0, 2, 0, 1},
  { 0, -2, 0, 1},
  { 0, 2, 0, 3},
  { 0, -2, 0, 3},
  { 2, 0, 1, 0},
  {-2, 0, 1, 0},
  { 2, 0, 3, 0},
  {-2, 0, 3, 0},
};

static const int mask_params_big_hgtw[1 << MASK_BITS_BIG][4] = {
  {-1, 2, 2, 2},
  { 1, -2, 2, 2},
  {-2, 1, 2, 2},
  { 2, -1, 2, 2},
  { 2, 1, 2, 2},
  {-2, -1, 2, 2},
  { 1, 2, 2, 2},
  {-1, -2, 2, 2},

  {-1, 2, 2, 1},
  { 1, -2, 2, 1},
  {-1, 2, 2, 3},
  { 1, -2, 2, 3},
  { 1, 2, 2, 1},
  {-1, -2, 2, 1},
  { 1, 2, 2, 3},
  {-1, -2, 2, 3},

  {-2, 1, 1, 2},
  { 2, -1, 1, 2},
  {-2, 1, 3, 2},
  { 2, -1, 3, 2},
  { 2, 1, 1, 2},
  {-2, -1, 1, 2},
  { 2, 1, 3, 2},
  {-2, -1, 3, 2},

  { 0, 2, 0, 1},
  { 0, -2, 0, 1},
  { 0, 2, 0, 2},
  { 0, -2, 0, 2},
  { 0, 2, 0, 3},
  { 0, -2, 0, 3},
  { 2, 0, 2, 0},
  {-2, 0, 2, 0},
};

static const int mask_params_big_hltw[1 << MASK_BITS_BIG][4] = {
  {-1, 2, 2, 2},
  { 1, -2, 2, 2},
  {-2, 1, 2, 2},
  { 2, -1, 2, 2},
  { 2, 1, 2, 2},
  {-2, -1, 2, 2},
  { 1, 2, 2, 2},
  {-1, -2, 2, 2},

  {-1, 2, 2, 1},
  { 1, -2, 2, 1},
  {-1, 2, 2, 3},
  { 1, -2, 2, 3},
  { 1, 2, 2, 1},
  {-1, -2, 2, 1},
  { 1, 2, 2, 3},
  {-1, -2, 2, 3},

  {-2, 1, 1, 2},
  { 2, -1, 1, 2},
  {-2, 1, 3, 2},
  { 2, -1, 3, 2},
  { 2, 1, 1, 2},
  {-2, -1, 1, 2},
  { 2, 1, 3, 2},
  {-2, -1, 3, 2},

  { 0, 2, 0, 2},
  { 0, -2, 0, 2},
  { 2, 0, 1, 0},
  {-2, 0, 1, 0},
  { 2, 0, 2, 0},
  {-2, 0, 2, 0},
  { 2, 0, 3, 0},
  {-2, 0, 3, 0},
};

static const int mask_params_big_heqw[1 << MASK_BITS_BIG][4] = {
  {-1, 2, 2, 2},
  { 1, -2, 2, 2},
  {-2, 1, 2, 2},
  { 2, -1, 2, 2},
  { 2, 1, 2, 2},
  {-2, -1, 2, 2},
  { 1, 2, 2, 2},
  {-1, -2, 2, 2},

  {-1, 2, 2, 1},
  { 1, -2, 2, 1},
  {-1, 2, 2, 3},
  { 1, -2, 2, 3},
  { 1, 2, 2, 1},
  {-1, -2, 2, 1},
  { 1, 2, 2, 3},
  {-1, -2, 2, 3},

  {-2, 1, 1, 2},
  { 2, -1, 1, 2},
  {-2, 1, 3, 2},
  { 2, -1, 3, 2},
  { 2, 1, 1, 2},
  {-2, -1, 1, 2},
  { 2, 1, 3, 2},
  {-2, -1, 3, 2},

  { 0, 2, 0, 1},
  { 0, -2, 0, 1},
  { 0, 2, 0, 3},
  { 0, -2, 0, 3},
  { 2, 0, 1, 0},
  {-2, 0, 1, 0},
  { 2, 0, 3, 0},
  {-2, 0, 3, 0},
};

static const int *get_mask_params(int mask_index,
                                  BLOCK_SIZE_TYPE sb_type,
                                  int h, int w) {
  const int *a;
  const int mask_bits = get_mask_bits(sb_type);

  if (mask_index == MASK_NONE)
    return NULL;

  if (mask_bits == MASK_BITS_SML) {
    a = mask_params_sml[mask_index];
  } else if (mask_bits == MASK_BITS_MED) {
    if (h > w)
      a = mask_params_med_hgtw[mask_index];
    else if (h < w)
      a = mask_params_med_hltw[mask_index];
    else
      a = mask_params_med_heqw[mask_index];
  } else if (mask_bits == MASK_BITS_BIG) {
    if (h > w)
      a = mask_params_big_hgtw[mask_index];
    else if (h < w)
      a = mask_params_big_hltw[mask_index];
    else
      a = mask_params_big_heqw[mask_index];
  } else {
    assert(0);
  }
  return a;
}

void vp9_generate_masked_weight(int mask_index,
                                BLOCK_SIZE_TYPE sb_type,
                                int h, int w,
                                uint8_t *mask, int stride) {
  int i, j;
  const int *a = get_mask_params(mask_index, sb_type, h, w);
  if (!a) return;
  for (i = 0; i < h; ++i)
    for (j = 0; j < w; ++j) {
      int x = (j - (a[2] * w) / 4);
      int y = (i - (a[3] * h) / 4);
      int m = a[0] * x + a[1] * y;
      mask[i * stride + j] = get_masked_weight(m);
    }
}
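
A worked example of the soft mask above (a sketch; (i, j) match the loop indices and get_masked_weight() is the function defined earlier):

/* With params {-1, 2, 2, 2} on an 8x8 block, the loop evaluates the line
 * f = -1 * (j - 4) + 2 * (i - 4). At (i, j) = (6, 1) this gives
 * m = 3 + 4 = 7, and get_masked_weight(7) = smoothfn[7 + 32] = 55,
 * i.e. a weight of 55/64 for that pixel. */
static int sketch_mask_weight_8x8(int i, int j) {
  return get_masked_weight(-1 * (j - 4) + 2 * (i - 4));
}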

void vp9_generate_hard_mask(int mask_index, BLOCK_SIZE_TYPE sb_type,
                            int h, int w, uint8_t *mask, int stride) {
  int i, j;
  const int *a = get_mask_params(mask_index, sb_type, h, w);
  if (!a) return;
  for (i = 0; i < h; ++i)
    for (j = 0; j < w; ++j) {
      int x = (j - (a[2] * w) / 4);
      int y = (i - (a[3] * h) / 4);
      int m = a[0] * x + a[1] * y;
      mask[i * stride + j] = get_hard_mask(m);
    }
}

static void build_masked_compound(uint8_t *dst, int dst_stride,
                                  uint8_t *dst2, int dst2_stride,
                                  int mask_index, BLOCK_SIZE_TYPE sb_type,
                                  int h, int w) {
  int i, j;
  uint8_t mask[4096];
  vp9_generate_masked_weight(mask_index, sb_type, h, w, mask, 64);
  for (i = 0; i < h; ++i)
    for (j = 0; j < w; ++j) {
      int m = mask[i * 64 + j];
      dst[i * dst_stride + j] = (dst[i * dst_stride + j] * m +
                                 dst2[i * dst2_stride + j] *
                                 ((1 << MASK_WEIGHT_BITS) - m) +
                                 (1 << (MASK_WEIGHT_BITS - 1))) >>
                                MASK_WEIGHT_BITS;
    }
}
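
For reference, the per-pixel blend above reduces to this self-contained helper (hypothetical name; with MASK_WEIGHT_BITS == 6 the weights live in [0, 64]):

/* Rounding blend of two predictors: m = 55, p0 = 200, p1 = 100 gives
 * (200 * 55 + 100 * 9 + 32) >> 6 = 186. */
static INLINE uint8_t sketch_masked_blend(uint8_t p0, uint8_t p1, int m) {
  return (uint8_t)((p0 * m + p1 * (64 - m) + 32) >> 6);
}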
#endif

struct build_inter_predictors_args {
  MACROBLOCKD *xd;
  int x;
  int y;
  uint8_t* dst[MAX_MB_PLANE];
  int dst_stride[MAX_MB_PLANE];
  uint8_t* pre[2][MAX_MB_PLANE];
  int pre_stride[2][MAX_MB_PLANE];
};
static void build_inter_predictors(int plane, int block,
                                   BLOCK_SIZE_TYPE bsize,
                                   int pred_w, int pred_h,
                                   void *argv) {
  const struct build_inter_predictors_args* const arg = argv;
  MACROBLOCKD *const xd = arg->xd;
  struct macroblockd_plane *const pd = &xd->plane[plane];
  const int bwl = b_width_log2(bsize) - pd->subsampling_x;
  const int bw = 4 << bwl;
  const int bh = plane_block_height(bsize, pd);
  const int x = 4 * (block & ((1 << bwl) - 1));
  const int y = 4 * (block >> bwl);
  const MODE_INFO *mi = xd->this_mi;
  MACROBLOCKD * const xd = arg->xd;
  const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
  const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y;
  const int x = 4 * (block & ((1 << bwl) - 1)), y = 4 * (block >> bwl);
  const MODE_INFO *const mi = xd->mode_info_context;
  const int use_second_ref = mi->mbmi.ref_frame[1] > 0;
  int ref;
  int which_mv;

  assert(x < bw);
  assert(y < bh);
  assert(mi->mbmi.sb_type < BLOCK_8X8 || 4 << pred_w == bw);
  assert(mi->mbmi.sb_type < BLOCK_8X8 || 4 << pred_h == bh);
  assert(x < (4 << bwl));
  assert(y < (4 << bhl));
  assert(mi->mbmi.sb_type < BLOCK_8X8 || 4 << pred_w == (4 << bwl));
  assert(mi->mbmi.sb_type < BLOCK_8X8 || 4 << pred_h == (4 << bhl));

  for (ref = 0; ref < 1 + use_second_ref; ++ref) {
    struct scale_factors *const scale = &xd->scale_factor[ref];
    struct buf_2d *const pre_buf = &pd->pre[ref];
    struct buf_2d *const dst_buf = &pd->dst;
  for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {
    // source
    const uint8_t * const base_pre = arg->pre[which_mv][plane];
    const int pre_stride = arg->pre_stride[which_mv][plane];
    const uint8_t *const pre = base_pre +
        scaled_buffer_offset(x, y, pre_stride, &xd->scale_factor[which_mv]);
    struct scale_factors * const scale = &xd->scale_factor[which_mv];

    const uint8_t *const pre = pre_buf->buf + scaled_buffer_offset(x, y,
                                   pre_buf->stride, scale);

    uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
    // dest
    uint8_t *const dst = arg->dst[plane] + arg->dst_stride[plane] * y + x;

    // TODO(jkoleszar): All chroma MVs in SPLITMV mode are taken as the
    // same MV (the average of the 4 luma MVs) but we could do something
    // smarter for non-4:2:0. Just punt for now, pending the changes to get
    // rid of SPLITMV mode entirely.
    const MV mv = mi->mbmi.sb_type < BLOCK_8X8
        ? (plane == 0 ? mi->bmi[block].as_mv[ref].as_mv
                      : mi_mv_pred_q4(mi, ref))
        : mi->mbmi.mv[ref].as_mv;
        ? (plane == 0 ? mi->bmi[block].as_mv[which_mv].as_mv
                      : mi_mv_pred_q4(mi, which_mv))
        : mi->mbmi.mv[which_mv].as_mv;

    // TODO(jkoleszar): This clamping is done in the incorrect place for the
    // scaling case. It needs to be done on the scaled MV, not the pre-scaling
    // MV. Note however that it performs the subsampling aware scaling so
    // that the result is always q4.
    const MV res_mv = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh,
                                                pd->subsampling_x,
                                                pd->subsampling_y);

    const MV res_mv = clamp_mv_to_umv_border_sb(&mv, bwl, bhl,
                                                xd->plane[plane].subsampling_x,
                                                xd->plane[plane].subsampling_y,
                                                xd->mb_to_left_edge,
                                                xd->mb_to_top_edge,
                                                xd->mb_to_right_edge,
                                                xd->mb_to_bottom_edge);
    scale->set_scaled_offsets(scale, arg->y + y, arg->x + x);
    vp9_build_inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
                              &res_mv, scale,
                              4 << pred_w, 4 << pred_h, ref,

#if CONFIG_MASKED_INTERINTER
    if (which_mv && xd->mode_info_context->mbmi.use_masked_compound) {
      uint8_t tmp_dst[4096];
      vp9_build_inter_predictor(pre, pre_stride,
                                tmp_dst, 64,
                                &res_mv, &xd->scale_factor[which_mv],
                                4 << pred_w, 4 << pred_h, 0,
                                &xd->subpix, MV_PRECISION_Q4);
      build_masked_compound(dst, arg->dst_stride[plane],
                            tmp_dst, 64,
                            xd->mode_info_context->mbmi.mask_index,
                            xd->mode_info_context->mbmi.sb_type,
                            (4 << pred_h), (4 << pred_w));

    } else {
#endif
      vp9_build_inter_predictor(pre, pre_stride,
                                dst, arg->dst_stride[plane],
                                &res_mv, &xd->scale_factor[which_mv],
                                4 << pred_w, 4 << pred_h, which_mv,
                                &xd->subpix, MV_PRECISION_Q4);
    }
  }

// TODO(jkoleszar): In principle, pred_w, pred_h are unnecessary, as we could
// calculate the subsampled BLOCK_SIZE, but that type isn't defined for
// sizes smaller than 16x16 yet.
typedef void (*foreach_predicted_block_visitor)(int plane, int block,
                                                BLOCK_SIZE bsize,
                                                int pred_w, int pred_h,
                                                void *arg);
static INLINE void foreach_predicted_block_in_plane(
    const MACROBLOCKD* const xd, BLOCK_SIZE bsize, int plane,
    foreach_predicted_block_visitor visit, void *arg) {
  int i, x, y;

  // block sizes in number of 4x4 blocks log 2 ("*_b")
  // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8
  // subsampled size of the block
  const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
  const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y;

  // size of the predictor to use.
  int pred_w, pred_h;

  if (xd->this_mi->mbmi.sb_type < BLOCK_8X8) {
    assert(bsize == BLOCK_8X8);
    pred_w = 0;
    pred_h = 0;
  } else {
    pred_w = bwl;
    pred_h = bhl;
  }
  assert(pred_w <= bwl);
  assert(pred_h <= bhl);

  // visit each subblock in raster order
  i = 0;
  for (y = 0; y < 1 << bhl; y += 1 << pred_h) {
    for (x = 0; x < 1 << bwl; x += 1 << pred_w) {
      visit(plane, i, bsize, pred_w, pred_h, arg);
      i += 1 << pred_w;
#if CONFIG_MASKED_INTERINTER
    }
    i += (1 << (bwl + pred_h)) - (1 << bwl);
#endif
  }
}
void vp9_build_inter_predictors_sby(MACROBLOCKD *xd,
                                    int mi_row,
                                    int mi_col,
                                    BLOCK_SIZE_TYPE bsize) {
  struct build_inter_predictors_args args = {
    xd, mi_col * MI_SIZE, mi_row * MI_SIZE,
    {xd->plane[0].dst.buf, NULL, NULL}, {xd->plane[0].dst.stride, 0, 0},
    {{xd->plane[0].pre[0].buf, NULL, NULL},
     {xd->plane[0].pre[1].buf, NULL, NULL}},
    {{xd->plane[0].pre[0].stride, 0, 0}, {xd->plane[0].pre[1].stride, 0, 0}},
  };

static void build_inter_predictors_for_planes(MACROBLOCKD *xd, BLOCK_SIZE bsize,
                                              int mi_row, int mi_col,
                                              int plane_from, int plane_to) {
  int plane;
  for (plane = plane_from; plane <= plane_to; ++plane) {
    struct build_inter_predictors_args args = {
      xd, mi_col * MI_SIZE, mi_row * MI_SIZE,
    };
    foreach_predicted_block_in_plane(xd, bsize, plane, build_inter_predictors,
                                     &args);
  foreach_predicted_block_in_plane(xd, bsize, 0, build_inter_predictors, &args);
}
void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd,
                                     int mi_row,
                                     int mi_col,
                                     BLOCK_SIZE_TYPE bsize) {
  struct build_inter_predictors_args args = {
    xd, mi_col * MI_SIZE, mi_row * MI_SIZE,
#if CONFIG_ALPHA
    {NULL, xd->plane[1].dst.buf, xd->plane[2].dst.buf,
     xd->plane[3].dst.buf},
    {0, xd->plane[1].dst.stride, xd->plane[1].dst.stride,
     xd->plane[3].dst.stride},
    {{NULL, xd->plane[1].pre[0].buf, xd->plane[2].pre[0].buf,
      xd->plane[3].pre[0].buf},
     {NULL, xd->plane[1].pre[1].buf, xd->plane[2].pre[1].buf,
      xd->plane[3].pre[1].buf}},
    {{0, xd->plane[1].pre[0].stride, xd->plane[1].pre[0].stride,
      xd->plane[3].pre[0].stride},
     {0, xd->plane[1].pre[1].stride, xd->plane[1].pre[1].stride,
      xd->plane[3].pre[1].stride}},
#else
    {NULL, xd->plane[1].dst.buf, xd->plane[2].dst.buf},
    {0, xd->plane[1].dst.stride, xd->plane[1].dst.stride},
    {{NULL, xd->plane[1].pre[0].buf, xd->plane[2].pre[0].buf},
     {NULL, xd->plane[1].pre[1].buf, xd->plane[2].pre[1].buf}},
    {{0, xd->plane[1].pre[0].stride, xd->plane[1].pre[0].stride},
     {0, xd->plane[1].pre[1].stride, xd->plane[1].pre[1].stride}},
#endif
  };
  foreach_predicted_block_uv(xd, bsize, build_inter_predictors, &args);
}
void vp9_build_inter_predictors_sb(MACROBLOCKD *xd,
                                   int mi_row, int mi_col,
                                   BLOCK_SIZE_TYPE bsize) {

#if CONFIG_INTERINTRA
  uint8_t *const y = xd->plane[0].dst.buf;
  uint8_t *const u = xd->plane[1].dst.buf;
  uint8_t *const v = xd->plane[2].dst.buf;
  const int y_stride = xd->plane[0].dst.stride;
  const int uv_stride = xd->plane[1].dst.stride;
#endif
  vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
  vp9_build_inter_predictors_sbuv(xd, mi_row, mi_col, bsize);
#if CONFIG_INTERINTRA
  if (xd->mode_info_context->mbmi.ref_frame[1] == INTRA_FRAME
      && is_interintra_allowed(xd->mode_info_context->mbmi.sb_type)) {
    xd->right_available = 0;
    vp9_build_interintra_predictors(xd, y, u, v,
                                    y_stride, uv_stride, bsize);
  }
}

void vp9_build_inter_predictors_sby(MACROBLOCKD *xd, int mi_row, int mi_col,
                                    BLOCK_SIZE bsize) {
  build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 0, 0);
}
void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd, int mi_row, int mi_col,
                                     BLOCK_SIZE bsize) {
  build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 1,
                                    MAX_MB_PLANE - 1);
}
void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
                                   BLOCK_SIZE bsize) {
  build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 0,
                                    MAX_MB_PLANE - 1);
#endif
}

// TODO(dkovalev): find better place for this function
@@ -257,7 +722,8 @@ void vp9_setup_scale_factors(VP9_COMMON *cm, int i) {
                                     fb->y_crop_width, fb->y_crop_height,
                                     cm->width, cm->height);

    if (vp9_is_scaled(sf))
    if (sf->x_scale_fp != VP9_REF_NO_SCALE ||
        sf->y_scale_fp != VP9_REF_NO_SCALE)
      vp9_extend_frame_borders(fb, cm->subsampling_x, cm->subsampling_y);
  }
}

@@ -15,19 +15,28 @@
#include "vp9/common/vp9_onyxc_int.h"

struct subpix_fn_table;
void vp9_build_inter_predictors_sby(MACROBLOCKD *xd, int mi_row, int mi_col,
                                    BLOCK_SIZE bsize);
void vp9_build_inter_predictors_sby(MACROBLOCKD *xd,
                                    int mb_row,
                                    int mb_col,
                                    BLOCK_SIZE_TYPE bsize);

void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd, int mi_row, int mi_col,
                                     BLOCK_SIZE bsize);
void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd,
                                     int mb_row,
                                     int mb_col,
                                     BLOCK_SIZE_TYPE bsize);

void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
                                   BLOCK_SIZE bsize);
void vp9_build_inter_predictors_sb(MACROBLOCKD *mb,
                                   int mb_row, int mb_col,
                                   BLOCK_SIZE_TYPE bsize);

void vp9_setup_interp_filters(MACROBLOCKD *xd,
                              INTERPOLATIONFILTERTYPE filter,
                              VP9_COMMON *cm);

void vp9_setup_scale_factors_for_frame(struct scale_factors *scale,
                                       int other_w, int other_h,
                                       int this_w, int this_h);

void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
                               uint8_t *dst, int dst_stride,
                               const MV *mv_q3,
@@ -98,4 +107,11 @@ static void set_scale_factors(MACROBLOCKD *xd, int ref0, int ref1,

void vp9_setup_scale_factors(VP9_COMMON *cm, int i);

#if CONFIG_MASKED_INTERINTER
void vp9_generate_masked_weight(int mask_index, BLOCK_SIZE_TYPE sb_type,
                                int h, int w, uint8_t *mask, int stride);
void vp9_generate_hard_mask(int mask_index, BLOCK_SIZE_TYPE sb_type,
                            int h, int w, uint8_t *mask, int stride);
#endif

#endif  // VP9_COMMON_VP9_RECONINTER_H_