Compare commits


6 Commits

Author SHA1 Message Date
Ami Fischman
28147a449a libvpx: enable building for iOS devices (armv7)
Allow output of gas syntax assembly directly from obj_int_extract

Change-Id: I33a747e87ef1c590a8766dea17f8cb2497e54591
2013-07-19 14:05:59 -07:00
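To illustrate what gas-syntax output means here (a hedged sketch; the format strings and names below are invented, not obj_int_extract's actual code): the tool reads compiled objects holding structure offsets and prints them as assembler constants, and targeting iOS (armv7, assembled with gas) means emitting .equ directives instead of RVCT-style EQU lines.

#include <stdio.h>

/* Illustrative only: print one extracted constant in either syntax. */
static void emit(const char *name, int value, int gas) {
  if (gas)
    printf(".equ %s, %d\n", name, value);  /* gas syntax, as on iOS */
  else
    printf("%s EQU %d\n", name, value);    /* RVCT ("rvds") syntax */
}

int main(void) {
  emit("VP8_COMMON_mb_rows", 8, 1);  /* hypothetical offset and value */
  return 0;
}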
Ronald S. Bultje
33149cbb4c Replace generated quant tables with static lookup tables.
This prevents possible float rounding issues between architectures.

Change-Id: I6ed260aebd49feb4cfb5596a5370c44be5f72167
2013-07-16 14:04:41 -07:00
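A minimal sketch of the motivation (the table name, formula, and values are illustrative, not libvpx's actual tables): a table computed at init time with floating-point math can round differently on different architectures, while a precomputed static table is identical everywhere.

#include <math.h>
#include <stdint.h>

/* Before (hypothetical): filled at runtime; pow()'s last-bit rounding
 * may vary across platforms and compilers. */
static int16_t ac_quant[8];
static void init_quant(void) {
  for (int q = 0; q < 8; q++)
    ac_quant[q] = (int16_t)(4.0 * pow(1.06, q) + 0.5);
}

/* After: the same values computed once and committed as a static
 * lookup, so every build agrees bit-for-bit. */
static const int16_t ac_quant_static[8] = { 4, 4, 4, 5, 5, 5, 6, 6 };

int main(void) {
  init_quant();
  return ac_quant[3] == ac_quant_static[3] ? 0 : 1;  /* both 5 */
}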
John Koleszar
3f454060bb Fix above context pointers
In the prior code, the above context pointers used for entropy
decoding were initialized on the first frame, and not updated when
the frame size changed. The per-frame code which initializes the
contexts assumes that the contexts are contiguous, leading to an
incomplete initialization when the frame is smaller. This commit
updates the pointers so that the context is contiguous whenever
the frame size changes.

Conflicts:
	vp9/common/vp9_alloccommon.c

Change-Id: I08b53e3a30c8289491212311682ff1b8028cff6c
2013-07-16 14:04:41 -07:00
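A minimal sketch of the fix (struct and field names are hypothetical, not the actual vp9 code): the per-plane above-context pointers must be re-derived from the base allocation whenever the frame width changes, so the rows they address stay contiguous; deriving them only on the first frame leaves stale pointers behind when the frame shrinks.

#include <stdint.h>

#define MAX_PLANES 3

typedef struct {
  uint8_t *above_ctx_base;         /* one allocation covering all planes */
  uint8_t *above_ctx[MAX_PLANES];  /* per-plane views into it */
} ctx_pointers_t;

/* Call on every frame-size change, not just the first frame, so the
 * per-plane contexts remain contiguous for the current width. */
static void set_above_ctx_pointers(ctx_pointers_t *c, int mi_cols) {
  for (int p = 0; p < MAX_PLANES; p++)
    c->above_ctx[p] = c->above_ctx_base + p * mi_cols;
}

int main(void) {
  uint8_t base[3 * 64];
  ctx_pointers_t c = { base, { 0 } };
  set_above_ctx_pointers(&c, 64);  /* re-run whenever mi_cols changes */
  return c.above_ctx[2] == base + 128 ? 0 : 1;
}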
Yaowu Xu
d19ed5f249 Change to extend full border only when needed
This is a short-term optimization until we work out a decoder
implementation that requires no frame border extension.

Change-Id: I02d15bfde4d926b50a4e58b393d8c4062d1be70f
2013-07-16 14:04:39 -07:00
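The shape of the optimization, as a hedged sketch (all names are invented): pay for the full border copy only when a consumer may actually read past the frame edge, rather than unconditionally on every frame.

typedef struct { int width, height; unsigned char *buf; } frame_t;

static void extend_frame_borders(frame_t *f) {
  (void)f;  /* copy edge pixels outward into the border; elided here */
}

/* Previously this ran unconditionally; now the caller passes whether
 * anything (e.g. scaled prediction) can read outside the frame. */
static void maybe_extend_borders(frame_t *f, int needs_extension) {
  if (needs_extension)
    extend_frame_borders(f);
}

int main(void) {
  frame_t f = { 352, 288, 0 };
  maybe_extend_borders(&f, 0);  /* common case: skip the copy */
  return 0;
}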
Ronald S. Bultje
a801f7a295 Increase border size from 96 to 160.
This is required because, upon downscaling, if a motion vector points
partially into the UMV (e.g. all but one of the 64+7 pixels, i.e. 70),
then we can point up to 140 pixels into the larger-resolution (2x)
reference buffer UMV, which means the UMV for reference buffers in
downscaling needs to be 140 rounded up to the nearest multiple of 32,
i.e. 160.

Longer-term, we should probably handle the UMV differently by detecting
edge coverage on-the-fly and using a temporary buffer for edge extensions
instead of adding 160 pixels on all sides of the image (which means a
CIF image uses 3x its own area size for borders).

Change-Id: I5184443e6731cd6721fc6a5d430a53e7d91b4f7e
2013-07-16 12:41:10 -07:00
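The arithmetic in the message above, checked as a small program (the macro names are illustrative): a motion vector can reach 64+7-1 = 70 pixels into the UMV; at a 2x-larger reference that becomes 140, and rounding up to the next multiple of 32 gives 160.

#include <assert.h>

#define BLOCK 64   /* largest block size */
#define TAPS 7     /* extra reach of the 8-tap interpolation filter */
#define ALIGN 32

int main(void) {
  const int umv_reach = BLOCK + TAPS - 1;               /* 70 */
  const int scaled_reach = 2 * umv_reach;               /* 140 at 2x */
  const int border = (scaled_reach + ALIGN - 1) / ALIGN * ALIGN;
  assert(border == 160);
  return 0;
}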
Dmitry Kovalev
e39bd6407f Fixing vp9_get_pred_context_comp_ref_p function.
Adding missing parentheses around boolean expressions. Bitstream is changed.
Regenerating test vectors.

Conflicts:
	vp9/common/vp9_pred_common.c

Change-Id: I4cc00b761e9473f92f180a9fc3a0c607f0aaae56
2013-07-16 12:40:48 -07:00
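An illustration of the bug class being fixed (the expression is invented, not the actual vp9_get_pred_context_comp_ref_p code): C's precedence makes + bind tighter than ==, so dropping parentheses silently changes the computed context, and with it the bitstream.

#include <assert.h>

int main(void) {
  const int above = 2, left = 1;
  /* Intended: test each neighbor, then sum the results. */
  const int with_parens = (above == 2) + (left == 2);  /* 1 + 0 = 1 */
  /* Without parentheses this parses as ((above == (2 + left)) == 2),
   * which evaluates to 0 -- a different context value. */
  const int without_parens = above == 2 + left == 2;
  assert(with_parens != without_parens);
  return 0;
}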
210 changed files with 16220 additions and 24565 deletions

.gitignore (vendored): 4 lines changed

@@ -1,8 +1,6 @@
*.a
*.asm.s
*.d
*.gcno
*.gcda
*.o
*~
/*.ivf
@@ -16,7 +14,7 @@
/.install-*
/.libs
/Makefile
/config.log
/config.err
/config.mk
/decode_to_md5
/decode_to_md5.c

README: 36 lines changed

@@ -1,7 +1,7 @@
vpx Multi-Format Codec SDK
README - 1 August 2013
README - 21 June 2012
Welcome to the WebM VP8/VP9 Codec SDK!
Welcome to the WebM VP8 Codec SDK!
COMPILING THE APPLICATIONS/LIBRARIES:
The build system used is similar to autotools. Building generally consists of
@@ -53,63 +53,33 @@ COMPILING THE APPLICATIONS/LIBRARIES:
armv5te-android-gcc
armv5te-linux-rvct
armv5te-linux-gcc
armv5te-none-rvct
armv6-darwin-gcc
armv6-linux-rvct
armv6-linux-gcc
armv6-none-rvct
armv7-android-gcc
armv7-darwin-gcc
armv7-linux-rvct
armv7-linux-gcc
armv7-none-rvct
armv7-win32-vs11
mips32-linux-gcc
ppc32-darwin8-gcc
ppc32-darwin9-gcc
ppc32-linux-gcc
ppc64-darwin8-gcc
ppc64-darwin9-gcc
ppc64-linux-gcc
sparc-solaris-gcc
x86-android-gcc
x86-darwin8-gcc
x86-darwin8-icc
x86-darwin9-gcc
x86-darwin9-icc
x86-darwin10-gcc
x86-darwin11-gcc
x86-darwin12-gcc
x86-darwin13-gcc
x86-linux-gcc
x86-linux-icc
x86-os2-gcc
x86-solaris-gcc
x86-win32-gcc
x86-win32-vs7
x86-win32-vs8
x86-win32-vs9
x86-win32-vs10
x86-win32-vs11
x86_64-darwin9-gcc
x86_64-darwin10-gcc
x86_64-darwin11-gcc
x86_64-darwin12-gcc
x86_64-darwin13-gcc
x86_64-linux-gcc
x86_64-linux-icc
x86_64-solaris-gcc
x86_64-win64-gcc
x86_64-win64-vs8
x86_64-win64-vs9
x86_64-win64-vs10
x86_64-win64-vs11
universal-darwin8-gcc
universal-darwin9-gcc
universal-darwin10-gcc
universal-darwin11-gcc
universal-darwin12-gcc
universal-darwin13-gcc
generic-gnu
The generic-gnu target, in conjunction with the CROSS environment variable,
@@ -127,7 +97,7 @@ COMPILING THE APPLICATIONS/LIBRARIES:
5. Configuration errors
If the configuration step fails, the first step is to look in the error log.
This defaults to config.log. This should give a good indication of what went
This defaults to config.err. This should give a good indication of what went
wrong. If not, contact us for support.
SUPPORT


@@ -7,7 +7,18 @@ REM in the file PATENTS. All contributing project authors may
REM be found in the AUTHORS file in the root of the source tree.
echo on
cl /I "./" /I "%1" /nologo /c /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP "%1/vp9/common/vp9_asm_com_offsets.c"
cl /I "./" /I "%1" /nologo /c /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP "%1/vp9/decoder/vp9_asm_dec_offsets.c"
cl /I "./" /I "%1" /nologo /c /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP "%1/vp9/encoder/vp9_asm_enc_offsets.c"
obj_int_extract.exe rvds "vp9_asm_com_offsets.obj" > "vp9_asm_com_offsets.asm"
obj_int_extract.exe rvds "vp9_asm_dec_offsets.obj" > "vp9_asm_dec_offsets.asm"
obj_int_extract.exe rvds "vp9_asm_enc_offsets.obj" > "vp9_asm_enc_offsets.asm"
cl /I "./" /I "%1" /nologo /c /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP "%1/vp8/common/vp8_asm_com_offsets.c"
cl /I "./" /I "%1" /nologo /c /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP "%1/vp8/decoder/vp8_asm_dec_offsets.c"
cl /I "./" /I "%1" /nologo /c /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP "%1/vp8/encoder/vp8_asm_enc_offsets.c"
obj_int_extract.exe rvds "vp8_asm_com_offsets.obj" > "vp8_asm_com_offsets.asm"
obj_int_extract.exe rvds "vp8_asm_dec_offsets.obj" > "vp8_asm_dec_offsets.asm"
obj_int_extract.exe rvds "vp8_asm_enc_offsets.obj" > "vp8_asm_enc_offsets.asm"
cl /I "./" /I "%1" /nologo /c /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP "%1/vpx_scale/vpx_scale_asm_offsets.c"


@@ -75,7 +75,7 @@ Options:
Build options:
--help print this message
--log=yes|no|FILE file configure log is written to [config.log]
--log=yes|no|FILE file configure log is written to [config.err]
--target=TARGET target platform tuple [generic-gnu]
--cpu=CPU optimize for a specific cpu rather than a family
--extra-cflags=ECFLAGS add ECFLAGS to CFLAGS [$CFLAGS]
@@ -653,10 +653,6 @@ process_common_toolchain() {
tgt_isa=x86_64
tgt_os=darwin12
;;
*darwin13*)
tgt_isa=x86_64
tgt_os=darwin13
;;
x86_64*mingw32*)
tgt_os=win64
;;
@@ -755,10 +751,6 @@ process_common_toolchain() {
add_cflags "-mmacosx-version-min=10.8"
add_ldflags "-mmacosx-version-min=10.8"
;;
*-darwin13-*)
add_cflags "-mmacosx-version-min=10.9"
add_ldflags "-mmacosx-version-min=10.9"
;;
esac
# Handle Solaris variants. Solaris 10 needs -lposix4
@@ -1189,12 +1181,6 @@ EOF
fi
fi
# default use_x86inc to yes if pic is no or 64bit or we are not on darwin
echo " checking here for x86inc \"${tgt_isa}\" \"$pic\" "
if [ ${tgt_isa} = x86_64 -o ! "$pic" == "yes" -o ! ${tgt_os:0:6} = darwin ]; then
soft_enable use_x86inc
fi
# Position Independent Code (PIC) support, for building relocatable
# shared objects
enabled gcc && enabled pic && check_add_cflags -fPIC
@@ -1310,7 +1296,7 @@ process_detect() {
}
enable logging
logfile="config.log"
logfile="config.err"
self=$0
process() {
cmdline_args="$@"


@@ -381,7 +381,7 @@ generate_vcproj() {
RuntimeLibrary="$debug_runtime" \
UsePrecompiledHeader="0" \
WarningLevel="3" \
DebugInformationFormat="2" \
DebugInformationFormat="1" \
$warn_64bit \
$uses_asm && tag Tool Name="YASM" IncludePaths="$incs" Debug="true"
@@ -395,7 +395,7 @@ generate_vcproj() {
RuntimeLibrary="$debug_runtime" \
UsePrecompiledHeader="0" \
WarningLevel="3" \
DebugInformationFormat="2" \
DebugInformationFormat="1" \
$warn_64bit \
$uses_asm && tag Tool Name="YASM" IncludePaths="$incs" Debug="true"


@@ -72,21 +72,10 @@ parse_project() {
eval "${var}_name=$name"
eval "${var}_guid=$guid"
if [ "$sfx" = "vcproj" ]; then
cur_config_list=`grep -A1 '<Configuration' $file |
grep Name | cut -d\" -f2`
else
cur_config_list=`grep -B1 'Label="Configuration"' $file |
grep Condition | cut -d\' -f4`
fi
new_config_list=$(for i in $config_list $cur_config_list; do
echo $i
done | sort | uniq)
if [ "$config_list" != "" ] && [ "$config_list" != "$new_config_list" ]; then
mixed_platforms=1
fi
config_list="$new_config_list"
eval "${var}_config_list=\"$cur_config_list\""
# assume that all projects have the same list of possible configurations,
# so overwriting old config_lists is not a problem
config_list=`grep -A1 '<Configuration' $file |
grep Name | cut -d\" -f2`
proj_list="${proj_list} ${var}"
}
@@ -136,11 +125,6 @@ process_global() {
indent_push
IFS_bak=${IFS}
IFS=$'\r'$'\n'
if [ "$mixed_platforms" != "" ]; then
config_list="
Release|Mixed Platforms
Debug|Mixed Platforms"
fi
for config in ${config_list}; do
echo "${indent}$config = $config"
done
@@ -155,17 +139,10 @@ Debug|Mixed Platforms"
indent_push
for proj in ${proj_list}; do
eval "local proj_guid=\${${proj}_guid}"
eval "local proj_config_list=\${${proj}_config_list}"
IFS=$'\r'$'\n'
for config in ${proj_config_list}; do
if [ "$mixed_platforms" != "" ]; then
local c=${config%%|*}
echo "${indent}${proj_guid}.${c}|Mixed Platforms.ActiveCfg = ${config}"
echo "${indent}${proj_guid}.${c}|Mixed Platforms.Build.0 = ${config}"
else
echo "${indent}${proj_guid}.${config}.ActiveCfg = ${config}"
echo "${indent}${proj_guid}.${config}.Build.0 = ${config}"
fi
for config in ${config_list}; do
echo "${indent}${proj_guid}.${config}.ActiveCfg = ${config}"
echo "${indent}${proj_guid}.${config}.Build.0 = ${config}"
done
IFS=${IFS_bak}
@@ -191,14 +168,9 @@ process_makefile() {
IFS=$'\r'$'\n'
local TAB=$'\t'
cat <<EOF
ifeq (\$(CONFIG_VS_VERSION),7)
MSBUILD_TOOL := devenv.com
else
MSBUILD_TOOL := msbuild.exe
endif
found_devenv := \$(shell which \$(MSBUILD_TOOL) >/dev/null 2>&1 && echo yes)
found_devenv := \$(shell which devenv.com >/dev/null 2>&1 && echo yes)
.nodevenv.once:
${TAB}@echo " * \$(MSBUILD_TOOL) not found in path."
${TAB}@echo " * devenv.com not found in path."
${TAB}@echo " * "
${TAB}@echo " * You will have to build all configurations manually using the"
${TAB}@echo " * Visual Studio IDE. To allow make to build them automatically,"
@@ -223,17 +195,16 @@ ${TAB}rm -rf "$platform"/"$config"
ifneq (\$(found_devenv),)
ifeq (\$(CONFIG_VS_VERSION),7)
$nows_sln_config: $outfile
${TAB}\$(MSBUILD_TOOL) $outfile -build "$config"
${TAB}devenv.com $outfile -build "$config"
else
$nows_sln_config: $outfile
${TAB}\$(MSBUILD_TOOL) $outfile -m -t:Build \\
${TAB}${TAB}-p:Configuration="$config" -p:Platform="$platform"
${TAB}devenv.com $outfile -build "$sln_config"
endif
else
$nows_sln_config: $outfile .nodevenv.once
${TAB}@echo " * Skipping build of $sln_config (\$(MSBUILD_TOOL) not in path)."
${TAB}@echo " * Skipping build of $sln_config (devenv.com not in path)."
${TAB}@echo " * "
endif


@@ -7,6 +7,17 @@ REM in the file PATENTS. All contributing project authors may
REM be found in the AUTHORS file in the root of the source tree.
echo on
cl /I "./" /I "%1" /nologo /c "%1/vp9/common/vp9_asm_com_offsets.c"
cl /I "./" /I "%1" /nologo /c "%1/vp9/decoder/vp9_asm_dec_offsets.c"
cl /I "./" /I "%1" /nologo /c "%1/vp9/encoder/vp9_asm_enc_offsets.c"
obj_int_extract.exe rvds "vp9_asm_com_offsets.obj" > "vp9_asm_com_offsets.asm"
obj_int_extract.exe rvds "vp9_asm_dec_offsets.obj" > "vp9_asm_dec_offsets.asm"
obj_int_extract.exe rvds "vp9_asm_enc_offsets.obj" > "vp9_asm_enc_offsets.asm"
cl /I "./" /I "%1" /nologo /c "%1/vp8/common/vp8_asm_com_offsets.c"
cl /I "./" /I "%1" /nologo /c "%1/vp8/decoder/vp8_asm_dec_offsets.c"
cl /I "./" /I "%1" /nologo /c "%1/vp8/encoder/vp8_asm_enc_offsets.c"
obj_int_extract.exe rvds "vp8_asm_com_offsets.obj" > "vp8_asm_com_offsets.asm"
obj_int_extract.exe rvds "vp8_asm_dec_offsets.obj" > "vp8_asm_dec_offsets.asm"
obj_int_extract.exe rvds "vp8_asm_enc_offsets.obj" > "vp8_asm_enc_offsets.asm"

configure (vendored): 18 lines changed

@@ -115,7 +115,6 @@ all_platforms="${all_platforms} x86-darwin9-icc"
all_platforms="${all_platforms} x86-darwin10-gcc"
all_platforms="${all_platforms} x86-darwin11-gcc"
all_platforms="${all_platforms} x86-darwin12-gcc"
all_platforms="${all_platforms} x86-darwin13-gcc"
all_platforms="${all_platforms} x86-linux-gcc"
all_platforms="${all_platforms} x86-linux-icc"
all_platforms="${all_platforms} x86-os2-gcc"
@@ -130,7 +129,6 @@ all_platforms="${all_platforms} x86_64-darwin9-gcc"
all_platforms="${all_platforms} x86_64-darwin10-gcc"
all_platforms="${all_platforms} x86_64-darwin11-gcc"
all_platforms="${all_platforms} x86_64-darwin12-gcc"
all_platforms="${all_platforms} x86_64-darwin13-gcc"
all_platforms="${all_platforms} x86_64-linux-gcc"
all_platforms="${all_platforms} x86_64-linux-icc"
all_platforms="${all_platforms} x86_64-solaris-gcc"
@@ -144,7 +142,6 @@ all_platforms="${all_platforms} universal-darwin9-gcc"
all_platforms="${all_platforms} universal-darwin10-gcc"
all_platforms="${all_platforms} universal-darwin11-gcc"
all_platforms="${all_platforms} universal-darwin12-gcc"
all_platforms="${all_platforms} universal-darwin13-gcc"
all_platforms="${all_platforms} generic-gnu"
# all_targets is a list of all targets that can be configured
@@ -250,10 +247,7 @@ EXPERIMENT_LIST="
multiple_arf
non420
alpha
interintra
filterintra
masked_interintra
masked_interinter
balanced_coeftree
"
CONFIG_LIST="
external_build
@@ -261,7 +255,6 @@ CONFIG_LIST="
install_bins
install_libs
install_srcs
use_x86inc
debug
gprof
gcov
@@ -318,7 +311,6 @@ CMDLINE_SELECT="
gprof
gcov
pic
use_x86inc
optimizations
ccache
runtime_cpu_detect
@@ -690,14 +682,6 @@ process_toolchain() {
# iOS/ARM builds do not work with gtest. This does not match
# x86 targets.
;;
*-win*)
# Some mingw toolchains don't have pthread available by default.
# Treat these more like visual studio where threading in gtest
# would be disabled for the same reason.
check_cxx "$@" <<EOF && soft_enable unit_tests
int z;
EOF
;;
*)
enabled pthread_h && check_cxx "$@" <<EOF && soft_enable unit_tests
int z;

libs.mk: 18 lines changed

@@ -57,13 +57,6 @@ CLEAN-OBJS += $$(BUILD_PFX)$(1).h
RTCD += $$(BUILD_PFX)$(1).h
endef
# x86inc.asm is not compatible with pic 32bit builds. Restrict
# files which use it to 64bit builds or 32bit without pic
USE_X86INC = no
ifeq ($(CONFIG_USE_X86INC),yes)
USE_X86INC = yes
endif
CODEC_SRCS-yes += CHANGELOG
CODEC_SRCS-yes += libs.mk
@@ -390,11 +383,6 @@ LIBVPX_TEST_DATA=$(addprefix $(LIBVPX_TEST_DATA_PATH)/,\
$(call enabled,LIBVPX_TEST_DATA))
libvpx_test_data_url=http://downloads.webmproject.org/test_data/libvpx/$(1)
libvpx_test_srcs.txt:
@echo " [CREATE] $@"
@echo $(LIBVPX_TEST_SRCS) | xargs -n1 echo | sort -u > $@
CLEAN-OBJS += libvpx_test_srcs.txt
$(LIBVPX_TEST_DATA):
@echo " [DOWNLOAD] $@"
$(qexec)trap 'rm -f $@' INT TERM &&\
@@ -455,10 +443,6 @@ else
include $(SRC_PATH_BARE)/third_party/googletest/gtest.mk
GTEST_SRCS := $(addprefix third_party/googletest/src/,$(call enabled,GTEST_SRCS))
GTEST_OBJS=$(call objs,$(GTEST_SRCS))
ifeq ($(filter win%,$(TGT_OS)),$(TGT_OS))
# Disabling pthreads globally will cause issues on darwin and possibly elsewhere
$(GTEST_OBJS) $(GTEST_OBJS:.o=.d): CXXFLAGS += -DGTEST_HAS_PTHREAD=0
endif
$(GTEST_OBJS) $(GTEST_OBJS:.o=.d): CXXFLAGS += -I$(SRC_PATH_BARE)/third_party/googletest/src
$(GTEST_OBJS) $(GTEST_OBJS:.o=.d): CXXFLAGS += -I$(SRC_PATH_BARE)/third_party/googletest/src/include
OBJS-$(BUILD_LIBVPX) += $(GTEST_OBJS)
@@ -483,7 +467,7 @@ $(foreach bin,$(LIBVPX_TEST_BINS),\
lib$(CODEC_LIB)$(CODEC_LIB_SUF) libgtest.a ))\
$(if $(BUILD_LIBVPX),$(eval $(call linkerxx_template,$(bin),\
$(LIBVPX_TEST_OBJS) \
-L. -lvpx -lgtest $(extralibs) -lm)\
-L. -lvpx -lgtest -lpthread -lm)\
)))\
$(if $(LIPO_LIBS),$(eval $(call lipo_bin_template,$(bin))))\


@@ -38,7 +38,7 @@ class ACMRandom {
// Returns a random value near 0 or near 255, to better exercise
// saturation behavior.
const uint8_t r = Rand8();
return r <= 128 ? 255 - (r >> 4) : r >> 4;
return r < 128 ? r << 4 : r >> 4;
}
int PseudoUniform(int range) {


@@ -33,6 +33,10 @@ class AltRefTest : public ::libvpx_test::EncoderTest,
altref_count_ = 0;
}
virtual bool Continue() const {
return !HasFatalFailure() && !abort_;
}
virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
libvpx_test::Encoder *encoder) {
if (video->frame() == 1) {


@@ -27,6 +27,10 @@ class BordersTest : public ::libvpx_test::EncoderTest,
SetMode(GET_PARAM(1));
}
virtual bool Continue() const {
return !HasFatalFailure() && !abort_;
}
virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
::libvpx_test::Encoder *encoder) {
if ( video->frame() == 1) {


@@ -134,14 +134,14 @@ class VP8CodecFactory : public CodecFactory {
const libvpx_test::VP8CodecFactory kVP8;
#define VP8_INSTANTIATE_TEST_CASE(test, ...)\
#define VP8_INSTANTIATE_TEST_CASE(test, params)\
INSTANTIATE_TEST_CASE_P(VP8, test, \
::testing::Combine( \
::testing::Values(static_cast<const libvpx_test::CodecFactory*>( \
&libvpx_test::kVP8)), \
__VA_ARGS__))
params))
#else
#define VP8_INSTANTIATE_TEST_CASE(test, ...)
#define VP8_INSTANTIATE_TEST_CASE(test, params)
#endif // CONFIG_VP8
@@ -216,14 +216,14 @@ class VP9CodecFactory : public CodecFactory {
const libvpx_test::VP9CodecFactory kVP9;
#define VP9_INSTANTIATE_TEST_CASE(test, ...)\
#define VP9_INSTANTIATE_TEST_CASE(test, params)\
INSTANTIATE_TEST_CASE_P(VP9, test, \
::testing::Combine( \
::testing::Values(static_cast<const libvpx_test::CodecFactory*>( \
&libvpx_test::kVP9)), \
__VA_ARGS__))
params))
#else
#define VP9_INSTANTIATE_TEST_CASE(test, ...)
#define VP9_INSTANTIATE_TEST_CASE(test, params)
#endif // CONFIG_VP9


@@ -40,6 +40,10 @@ class ConfigTest : public ::libvpx_test::EncoderTest,
++frame_count_out_;
}
virtual bool Continue() const {
return !HasFatalFailure() && !abort_;
}
unsigned int frame_count_in_;
unsigned int frame_count_out_;
unsigned int frame_count_max_;


@@ -22,8 +22,8 @@ extern "C" {
}
namespace {
typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
typedef void (*convolve_fn_t)(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
const int16_t *filter_x, int filter_x_stride,
const int16_t *filter_y, int filter_y_stride,
int w, int h);
@@ -211,7 +211,7 @@ class ConvolveTest : public PARAMS(int, int, const ConvolveFunctions*) {
virtual void SetUp() {
UUT_ = GET_PARAM(2);
/* Set up guard blocks for an inner block centered in the outer block */
/* Set up guard blocks for an inner block cetered in the outer block */
for (int i = 0; i < kOutputBufferSize; ++i) {
if (IsIndexInBorder(i))
output_[i] = 255;
@@ -527,9 +527,9 @@ INSTANTIATE_TEST_CASE_P(C, ConvolveTest, ::testing::Values(
#if HAVE_SSSE3
const ConvolveFunctions convolve8_ssse3(
vp9_convolve8_horiz_ssse3, vp9_convolve8_avg_horiz_ssse3,
vp9_convolve8_vert_ssse3, vp9_convolve8_avg_vert_ssse3,
vp9_convolve8_ssse3, vp9_convolve8_avg_ssse3);
vp9_convolve8_horiz_ssse3, vp9_convolve8_avg_horiz_c,
vp9_convolve8_vert_ssse3, vp9_convolve8_avg_vert_c,
vp9_convolve8_ssse3, vp9_convolve8_avg_c);
INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest, ::testing::Values(
make_tuple(4, 4, &convolve8_ssse3),
@@ -546,26 +546,4 @@ INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest, ::testing::Values(
make_tuple(32, 64, &convolve8_ssse3),
make_tuple(64, 64, &convolve8_ssse3)));
#endif
#if HAVE_NEON
const ConvolveFunctions convolve8_neon(
vp9_convolve8_horiz_neon, vp9_convolve8_avg_horiz_neon,
vp9_convolve8_vert_neon, vp9_convolve8_avg_vert_neon,
vp9_convolve8_neon, vp9_convolve8_avg_neon);
INSTANTIATE_TEST_CASE_P(NEON, ConvolveTest, ::testing::Values(
make_tuple(4, 4, &convolve8_neon),
make_tuple(8, 4, &convolve8_neon),
make_tuple(4, 8, &convolve8_neon),
make_tuple(8, 8, &convolve8_neon),
make_tuple(16, 8, &convolve8_neon),
make_tuple(8, 16, &convolve8_neon),
make_tuple(16, 16, &convolve8_neon),
make_tuple(32, 16, &convolve8_neon),
make_tuple(16, 32, &convolve8_neon),
make_tuple(32, 32, &convolve8_neon),
make_tuple(64, 32, &convolve8_neon),
make_tuple(32, 64, &convolve8_neon),
make_tuple(64, 64, &convolve8_neon)));
#endif
} // namespace


@@ -1,112 +0,0 @@
/*
* Copyright (c) 2012 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <climits>
#include <vector>
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "test/codec_factory.h"
#include "test/encode_test_driver.h"
#include "test/i420_video_source.h"
#include "test/util.h"
namespace {
class CpuSpeedTest : public ::libvpx_test::EncoderTest,
public ::libvpx_test::CodecTestWith2Params<
libvpx_test::TestMode, int> {
protected:
CpuSpeedTest() : EncoderTest(GET_PARAM(0)) {}
virtual void SetUp() {
InitializeConfig();
SetMode(GET_PARAM(1));
set_cpu_used_ = GET_PARAM(2);
}
virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
::libvpx_test::Encoder *encoder) {
if (video->frame() == 1) {
encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
encoder->Control(VP8E_SET_ARNR_STRENGTH, 5);
encoder->Control(VP8E_SET_ARNR_TYPE, 3);
}
}
virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
if (pkt->data.frame.flags & VPX_FRAME_IS_KEY) {
}
}
int set_cpu_used_;
};
TEST_P(CpuSpeedTest, TestQ0) {
// Validate that this non multiple of 64 wide clip encodes and decodes
// without a mismatch when passing in a very low max q. This pushes
// the encoder to producing lots of big partitions which will likely
// extend into the border and test the border condition.
cfg_.g_lag_in_frames = 25;
cfg_.rc_2pass_vbr_minsection_pct = 5;
cfg_.rc_2pass_vbr_minsection_pct = 2000;
cfg_.rc_target_bitrate = 400;
cfg_.rc_max_quantizer = 0;
cfg_.rc_min_quantizer = 0;
::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
20);
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
}
TEST_P(CpuSpeedTest, TestEncodeHighBitrate) {
// Validate that this non multiple of 64 wide clip encodes and decodes
// without a mismatch when passing in a very low max q. This pushes
// the encoder to producing lots of big partitions which will likely
// extend into the border and test the border condition.
cfg_.g_lag_in_frames = 25;
cfg_.rc_2pass_vbr_minsection_pct = 5;
cfg_.rc_2pass_vbr_minsection_pct = 2000;
cfg_.rc_target_bitrate = 12000;
cfg_.rc_max_quantizer = 10;
cfg_.rc_min_quantizer = 0;
::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
40);
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
}
TEST_P(CpuSpeedTest, TestLowBitrate) {
// Validate that this clip encodes and decodes without a mismatch
// when passing in a very high min q. This pushes the encoder to producing
// lots of small partitions which might will test the other condition.
cfg_.g_lag_in_frames = 25;
cfg_.rc_2pass_vbr_minsection_pct = 5;
cfg_.rc_2pass_vbr_minsection_pct = 2000;
cfg_.rc_target_bitrate = 200;
cfg_.rc_min_quantizer = 40;
::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
40);
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
}
using std::tr1::make_tuple;
#define VP9_FACTORY \
static_cast<const libvpx_test::CodecFactory*> (&libvpx_test::kVP9)
VP9_INSTANTIATE_TEST_CASE(
CpuSpeedTest,
::testing::Values(::libvpx_test::kTwoPassGood),
::testing::Range(0, 3));
} // namespace


@@ -42,6 +42,10 @@ class CQTest : public ::libvpx_test::EncoderTest,
n_frames_ = 0;
}
virtual bool Continue() const {
return !HasFatalFailure() && !abort_;
}
virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
libvpx_test::Encoder *encoder) {
if (video->frame() == 1) {


@@ -36,6 +36,10 @@ class DatarateTest : public ::libvpx_test::EncoderTest,
duration_ = 0.0;
}
virtual bool Continue() const {
return !HasFatalFailure() && !abort_;
}
virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
::libvpx_test::Encoder *encoder) {
const vpx_rational_t tb = video->timebase();


@@ -13,7 +13,6 @@
#include <string.h>
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "vpx_ports/mem.h"
extern "C" {
#include "vp9/common/vp9_entropy.h"
@@ -265,131 +264,6 @@ void reference_16x16_dct_2d(int16_t input[16*16], double output[16*16]) {
}
}
void fdct16x16(int16_t *in, int16_t *out, uint8_t* /*dst*/,
int stride, int /*tx_type*/) {
vp9_short_fdct16x16_c(in, out, stride);
}
void idct16x16_add(int16_t* /*in*/, int16_t *out, uint8_t *dst,
int stride, int /*tx_type*/) {
vp9_short_idct16x16_add_c(out, dst, stride >> 1);
}
void fht16x16(int16_t *in, int16_t *out, uint8_t* /*dst*/,
int stride, int tx_type) {
// FIXME(jingning): need to test both SSE2 and c
#if HAVE_SSE2
vp9_short_fht16x16_sse2(in, out, stride >> 1, tx_type);
#else
vp9_short_fht16x16_c(in, out, stride >> 1, tx_type);
#endif
}
void iht16x16_add(int16_t* /*in*/, int16_t *out, uint8_t *dst,
int stride, int tx_type) {
vp9_short_iht16x16_add_c(out, dst, stride >> 1, tx_type);
}
class FwdTrans16x16Test : public ::testing::TestWithParam<int> {
public:
virtual ~FwdTrans16x16Test() {}
virtual void SetUp() {
tx_type_ = GetParam();
if (tx_type_ == 0) {
fwd_txfm = fdct16x16;
inv_txfm = idct16x16_add;
} else {
fwd_txfm = fht16x16;
inv_txfm = iht16x16_add;
}
}
protected:
void RunFwdTxfm(int16_t *in, int16_t *out, uint8_t *dst,
int stride, int tx_type) {
(*fwd_txfm)(in, out, dst, stride, tx_type);
}
void RunInvTxfm(int16_t *in, int16_t *out, uint8_t *dst,
int stride, int tx_type) {
(*inv_txfm)(in, out, dst, stride, tx_type);
}
int tx_type_;
void (*fwd_txfm)(int16_t*, int16_t*, uint8_t*, int, int);
void (*inv_txfm)(int16_t*, int16_t*, uint8_t*, int, int);
};
TEST_P(FwdTrans16x16Test, AccuracyCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
int max_error = 0;
double total_error = 0;
const int count_test_block = 10000;
for (int i = 0; i < count_test_block; ++i) {
DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 256);
DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, 256);
DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 256);
DECLARE_ALIGNED_ARRAY(16, uint8_t, src, 256);
for (int j = 0; j < 256; ++j) {
src[j] = rnd.Rand8();
dst[j] = rnd.Rand8();
// Initialize a test block with input range [-255, 255].
test_input_block[j] = src[j] - dst[j];
}
const int pitch = 32;
RunFwdTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
RunInvTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
for (int j = 0; j < 256; ++j) {
const int diff = dst[j] - src[j];
const int error = diff * diff;
if (max_error < error)
max_error = error;
total_error += error;
}
}
EXPECT_GE(1, max_error)
<< "Error: 16x16 FHT/IHT has an individual round trip error > 1";
EXPECT_GE(count_test_block , total_error)
<< "Error: 16x16 FHT/IHT has average round trip error > 1 per block";
}
TEST_P(FwdTrans16x16Test, CoeffSizeCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
const int count_test_block = 1000;
for (int i = 0; i < count_test_block; ++i) {
DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, 256);
DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, 256);
DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, 256);
DECLARE_ALIGNED_ARRAY(16, int16_t, output_extreme_block, 256);
DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 256);
// Initialize a test block with input range [-255, 255].
for (int j = 0; j < 256; ++j) {
input_block[j] = rnd.Rand8() - rnd.Rand8();
input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
}
if (i == 0)
for (int j = 0; j < 256; ++j)
input_extreme_block[j] = 255;
const int pitch = 32;
RunFwdTxfm(input_block, output_block, dst, pitch, tx_type_);
RunFwdTxfm(input_extreme_block, output_extreme_block, dst, pitch, tx_type_);
// The minimum quant value is 4.
for (int j = 0; j < 256; ++j) {
EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_block[j]))
<< "Error: 16x16 FDCT has coefficient larger than 4*DCT_MAX_VALUE";
EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_extreme_block[j]))
<< "Error: 16x16 FDCT extreme has coefficient larger "
<< "than 4*DCT_MAX_VALUE";
}
}
}
INSTANTIATE_TEST_CASE_P(VP9, FwdTrans16x16Test, ::testing::Range(0, 4));
TEST(VP9Idct16x16Test, AccuracyCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
@@ -421,4 +295,72 @@ TEST(VP9Idct16x16Test, AccuracyCheck) {
}
}
// we need enable fdct test once we re-do the 16 point fdct.
TEST(VP9Fdct16x16Test, AccuracyCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
int max_error = 0;
double total_error = 0;
const int count_test_block = 1000;
for (int i = 0; i < count_test_block; ++i) {
int16_t test_input_block[256];
int16_t test_temp_block[256];
uint8_t dst[256], src[256];
for (int j = 0; j < 256; ++j) {
src[j] = rnd.Rand8();
dst[j] = rnd.Rand8();
}
// Initialize a test block with input range [-255, 255].
for (int j = 0; j < 256; ++j)
test_input_block[j] = src[j] - dst[j];
const int pitch = 32;
vp9_short_fdct16x16_c(test_input_block, test_temp_block, pitch);
vp9_short_idct16x16_add_c(test_temp_block, dst, 16);
for (int j = 0; j < 256; ++j) {
const int diff = dst[j] - src[j];
const int error = diff * diff;
if (max_error < error)
max_error = error;
total_error += error;
}
}
EXPECT_GE(1, max_error)
<< "Error: 16x16 FDCT/IDCT has an individual round trip error > 1";
EXPECT_GE(count_test_block , total_error)
<< "Error: 16x16 FDCT/IDCT has average round trip error > 1 per block";
}
TEST(VP9Fdct16x16Test, CoeffSizeCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
const int count_test_block = 1000;
for (int i = 0; i < count_test_block; ++i) {
int16_t input_block[256], input_extreme_block[256];
int16_t output_block[256], output_extreme_block[256];
// Initialize a test block with input range [-255, 255].
for (int j = 0; j < 256; ++j) {
input_block[j] = rnd.Rand8() - rnd.Rand8();
input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
}
if (i == 0)
for (int j = 0; j < 256; ++j)
input_extreme_block[j] = 255;
const int pitch = 32;
vp9_short_fdct16x16_c(input_block, output_block, pitch);
vp9_short_fdct16x16_c(input_extreme_block, output_extreme_block, pitch);
// The minimum quant value is 4.
for (int j = 0; j < 256; ++j) {
EXPECT_GE(4*DCT_MAX_VALUE, abs(output_block[j]))
<< "Error: 16x16 FDCT has coefficient larger than 4*DCT_MAX_VALUE";
EXPECT_GE(4*DCT_MAX_VALUE, abs(output_extreme_block[j]))
<< "Error: 16x16 FDCT extreme has coefficient larger than 4*DCT_MAX_VALUE";
}
}
}
} // namespace


@@ -190,9 +190,7 @@ class EncoderTest {
virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {}
// Hook to determine whether the encode loop should continue.
virtual bool Continue() const {
return !(::testing::Test::HasFatalFailure() || abort_);
}
virtual bool Continue() const { return !abort_; }
const CodecFactory *codec_;
// Hook to determine whether to decode frame after encoding


@@ -50,6 +50,10 @@ class ErrorResilienceTest : public ::libvpx_test::EncoderTest,
mismatch_nframes_ = 0;
}
virtual bool Continue() const {
return !HasFatalFailure() && !abort_;
}
virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
psnr_ += pkt->data.psnr.psnr[0];
nframes_++;


@@ -20,64 +20,63 @@ extern "C" {
#include "acm_random.h"
#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"
using libvpx_test::ACMRandom;
namespace {
void fdct4x4(int16_t *in, int16_t *out, uint8_t* /*dst*/,
int stride, int /*tx_type*/) {
void fdct4x4(int16_t *in, int16_t *out, uint8_t *dst, int stride, int tx_type) {
vp9_short_fdct4x4_c(in, out, stride);
}
void idct4x4_add(int16_t* /*in*/, int16_t *out, uint8_t *dst,
int stride, int /*tx_type*/) {
void idct4x4_add(int16_t *in, int16_t *out, uint8_t *dst,
int stride, int tx_type) {
vp9_short_idct4x4_add_c(out, dst, stride >> 1);
}
void fht4x4(int16_t *in, int16_t *out, uint8_t* /*dst*/,
int stride, int tx_type) {
void fht4x4(int16_t *in, int16_t *out, uint8_t *dst, int stride, int tx_type) {
vp9_short_fht4x4_c(in, out, stride >> 1, tx_type);
}
void iht4x4_add(int16_t* /*in*/, int16_t *out, uint8_t *dst,
void iht4x4_add(int16_t *in, int16_t *out, uint8_t *dst,
int stride, int tx_type) {
vp9_short_iht4x4_add_c(out, dst, stride >> 1, tx_type);
}
class FwdTrans4x4Test : public ::testing::TestWithParam<int> {
public:
virtual ~FwdTrans4x4Test() {}
virtual void SetUp() {
tx_type_ = GetParam();
if (tx_type_ == 0) {
fwd_txfm_ = fdct4x4;
inv_txfm_ = idct4x4_add;
FwdTrans4x4Test() {SetUpTestTxfm();}
~FwdTrans4x4Test() {}
void SetUpTestTxfm() {
tx_type = GetParam();
if (tx_type == 0) {
fwd_txfm = fdct4x4;
inv_txfm = idct4x4_add;
} else {
fwd_txfm_ = fht4x4;
inv_txfm_ = iht4x4_add;
fwd_txfm = fht4x4;
inv_txfm = iht4x4_add;
}
}
protected:
void RunFwdTxfm(int16_t *in, int16_t *out, uint8_t *dst,
int stride, int tx_type) {
(*fwd_txfm_)(in, out, dst, stride, tx_type);
(*fwd_txfm)(in, out, dst, stride, tx_type);
}
void RunInvTxfm(int16_t *in, int16_t *out, uint8_t *dst,
int stride, int tx_type) {
(*inv_txfm_)(in, out, dst, stride, tx_type);
(*inv_txfm)(in, out, dst, stride, tx_type);
}
int tx_type_;
void (*fwd_txfm_)(int16_t *in, int16_t *out, uint8_t *dst,
int tx_type;
void (*fwd_txfm)(int16_t *in, int16_t *out, uint8_t *dst,
int stride, int tx_type);
void (*inv_txfm_)(int16_t *in, int16_t *out, uint8_t *dst,
void (*inv_txfm)(int16_t *in, int16_t *out, uint8_t *dst,
int stride, int tx_type);
};
TEST_P(FwdTrans4x4Test, SignBiasCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 16);
DECLARE_ALIGNED_ARRAY(16, int16_t, test_output_block, 16);
int16_t test_input_block[16];
int16_t test_output_block[16];
const int pitch = 8;
int count_sign_block[16][2];
const int count_test_block = 1000000;
@@ -88,7 +87,7 @@ TEST_P(FwdTrans4x4Test, SignBiasCheck) {
for (int j = 0; j < 16; ++j)
test_input_block[j] = rnd.Rand8() - rnd.Rand8();
RunFwdTxfm(test_input_block, test_output_block, NULL, pitch, tx_type_);
RunFwdTxfm(test_input_block, test_output_block, NULL, pitch, tx_type);
for (int j = 0; j < 16; ++j) {
if (test_output_block[j] < 0)
@@ -104,7 +103,7 @@ TEST_P(FwdTrans4x4Test, SignBiasCheck) {
EXPECT_TRUE(bias_acceptable)
<< "Error: 4x4 FDCT/FHT has a sign bias > 1%"
<< " for input range [-255, 255] at index " << j
<< " tx_type " << tx_type_;
<< " tx_type " << tx_type;
}
memset(count_sign_block, 0, sizeof(count_sign_block));
@@ -113,7 +112,7 @@ TEST_P(FwdTrans4x4Test, SignBiasCheck) {
for (int j = 0; j < 16; ++j)
test_input_block[j] = (rnd.Rand8() >> 4) - (rnd.Rand8() >> 4);
RunFwdTxfm(test_input_block, test_output_block, NULL, pitch, tx_type_);
RunFwdTxfm(test_input_block, test_output_block, NULL, pitch, tx_type);
for (int j = 0; j < 16; ++j) {
if (test_output_block[j] < 0)
@@ -139,10 +138,9 @@ TEST_P(FwdTrans4x4Test, RoundTripErrorCheck) {
double total_error = 0;
const int count_test_block = 1000000;
for (int i = 0; i < count_test_block; ++i) {
DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 16);
DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, 16);
DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 16);
DECLARE_ALIGNED_ARRAY(16, uint8_t, src, 16);
int16_t test_input_block[16];
int16_t test_temp_block[16];
uint8_t dst[16], src[16];
for (int j = 0; j < 16; ++j) {
src[j] = rnd.Rand8();
@@ -153,7 +151,7 @@ TEST_P(FwdTrans4x4Test, RoundTripErrorCheck) {
test_input_block[j] = src[j] - dst[j];
const int pitch = 8;
RunFwdTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
RunFwdTxfm(test_input_block, test_temp_block, dst, pitch, tx_type);
for (int j = 0; j < 16; ++j) {
if(test_temp_block[j] > 0) {
@@ -168,7 +166,7 @@ TEST_P(FwdTrans4x4Test, RoundTripErrorCheck) {
}
// inverse transform and reconstruct the pixel block
RunInvTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
RunInvTxfm(test_input_block, test_temp_block, dst, pitch, tx_type);
for (int j = 0; j < 16; ++j) {
const int diff = dst[j] - src[j];
@@ -183,7 +181,7 @@ TEST_P(FwdTrans4x4Test, RoundTripErrorCheck) {
EXPECT_GE(count_test_block, total_error)
<< "Error: FDCT/IDCT or FHT/IHT has average "
<< "roundtrip error > 1 per block";
"roundtrip error > 1 per block";
}
INSTANTIATE_TEST_CASE_P(VP9, FwdTrans4x4Test, ::testing::Range(0, 4));


@@ -13,7 +13,6 @@
#include <string.h>
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "vpx_ports/mem.h"
extern "C" {
#include "vp9_rtcd.h"
@@ -26,62 +25,11 @@ void vp9_short_idct8x8_add_c(short *input, uint8_t *output, int pitch);
using libvpx_test::ACMRandom;
namespace {
void fdct8x8(int16_t *in, int16_t *out, uint8_t* /*dst*/,
int stride, int /*tx_type*/) {
vp9_short_fdct8x8_c(in, out, stride);
}
void idct8x8_add(int16_t* /*in*/, int16_t *out, uint8_t *dst,
int stride, int /*tx_type*/) {
vp9_short_idct8x8_add_c(out, dst, stride >> 1);
}
void fht8x8(int16_t *in, int16_t *out, uint8_t* /*dst*/,
int stride, int tx_type) {
// TODO(jingning): need to refactor this to test both _c and _sse2 functions,
// when we have all inverse dct functions done sse2.
#if HAVE_SSE2
vp9_short_fht8x8_sse2(in, out, stride >> 1, tx_type);
#else
vp9_short_fht8x8_c(in, out, stride >> 1, tx_type);
#endif
}
void iht8x8_add(int16_t* /*in*/, int16_t *out, uint8_t *dst,
int stride, int tx_type) {
vp9_short_iht8x8_add_c(out, dst, stride >> 1, tx_type);
}
class FwdTrans8x8Test : public ::testing::TestWithParam<int> {
public:
virtual ~FwdTrans8x8Test() {}
virtual void SetUp() {
tx_type_ = GetParam();
if (tx_type_ == 0) {
fwd_txfm = fdct8x8;
inv_txfm = idct8x8_add;
} else {
fwd_txfm = fht8x8;
inv_txfm = iht8x8_add;
}
}
protected:
void RunFwdTxfm(int16_t *in, int16_t *out, uint8_t *dst,
int stride, int tx_type) {
(*fwd_txfm)(in, out, dst, stride, tx_type);
}
void RunInvTxfm(int16_t *in, int16_t *out, uint8_t *dst,
int stride, int tx_type) {
(*inv_txfm)(in, out, dst, stride, tx_type);
}
int tx_type_;
void (*fwd_txfm)(int16_t*, int16_t*, uint8_t*, int, int);
void (*inv_txfm)(int16_t*, int16_t*, uint8_t*, int, int);
};
TEST_P(FwdTrans8x8Test, SignBiasCheck) {
TEST(VP9Fdct8x8Test, SignBiasCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 64);
DECLARE_ALIGNED_ARRAY(16, int16_t, test_output_block, 64);
int16_t test_input_block[64];
int16_t test_output_block[64];
const int pitch = 16;
int count_sign_block[64][2];
const int count_test_block = 100000;
@@ -93,7 +41,7 @@ TEST_P(FwdTrans8x8Test, SignBiasCheck) {
for (int j = 0; j < 64; ++j)
test_input_block[j] = rnd.Rand8() - rnd.Rand8();
RunFwdTxfm(test_input_block, test_output_block, NULL, pitch, tx_type_);
vp9_short_fdct8x8_c(test_input_block, test_output_block, pitch);
for (int j = 0; j < 64; ++j) {
if (test_output_block[j] < 0)
@@ -107,7 +55,7 @@ TEST_P(FwdTrans8x8Test, SignBiasCheck) {
const int diff = abs(count_sign_block[j][0] - count_sign_block[j][1]);
const int max_diff = 1125;
EXPECT_LT(diff, max_diff)
<< "Error: 8x8 FDCT/FHT has a sign bias > "
<< "Error: 8x8 FDCT has a sign bias > "
<< 1. * max_diff / count_test_block * 100 << "%"
<< " for input range [-255, 255] at index " << j
<< " count0: " << count_sign_block[j][0]
@@ -122,7 +70,7 @@ TEST_P(FwdTrans8x8Test, SignBiasCheck) {
for (int j = 0; j < 64; ++j)
test_input_block[j] = (rnd.Rand8() >> 4) - (rnd.Rand8() >> 4);
RunFwdTxfm(test_input_block, test_output_block, NULL, pitch, tx_type_);
vp9_short_fdct8x8_c(test_input_block, test_output_block, pitch);
for (int j = 0; j < 64; ++j) {
if (test_output_block[j] < 0)
@@ -136,25 +84,24 @@ TEST_P(FwdTrans8x8Test, SignBiasCheck) {
const int diff = abs(count_sign_block[j][0] - count_sign_block[j][1]);
const int max_diff = 10000;
EXPECT_LT(diff, max_diff)
<< "Error: 4x4 FDCT/FHT has a sign bias > "
<< "Error: 4x4 FDCT has a sign bias > "
<< 1. * max_diff / count_test_block * 100 << "%"
<< " for input range [-15, 15] at index " << j
<< " count0: " << count_sign_block[j][0]
<< " count1: " << count_sign_block[j][1]
<< " diff: " << diff;
}
}
};
TEST_P(FwdTrans8x8Test, RoundTripErrorCheck) {
TEST(VP9Fdct8x8Test, RoundTripErrorCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
int max_error = 0;
double total_error = 0;
const int count_test_block = 100000;
for (int i = 0; i < count_test_block; ++i) {
DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 64);
DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, 64);
DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 64);
DECLARE_ALIGNED_ARRAY(16, uint8_t, src, 64);
int16_t test_input_block[64];
int16_t test_temp_block[64];
uint8_t dst[64], src[64];
for (int j = 0; j < 64; ++j) {
src[j] = rnd.Rand8();
@@ -165,7 +112,7 @@ TEST_P(FwdTrans8x8Test, RoundTripErrorCheck) {
test_input_block[j] = src[j] - dst[j];
const int pitch = 16;
RunFwdTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
vp9_short_fdct8x8_c(test_input_block, test_temp_block, pitch);
for (int j = 0; j < 64; ++j){
if(test_temp_block[j] > 0) {
test_temp_block[j] += 2;
@@ -177,7 +124,7 @@ TEST_P(FwdTrans8x8Test, RoundTripErrorCheck) {
test_temp_block[j] *= 4;
}
}
RunInvTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
vp9_short_idct8x8_add_c(test_temp_block, dst, 8);
for (int j = 0; j < 64; ++j) {
const int diff = dst[j] - src[j];
@@ -189,23 +136,21 @@ TEST_P(FwdTrans8x8Test, RoundTripErrorCheck) {
}
EXPECT_GE(1, max_error)
<< "Error: 8x8 FDCT/IDCT or FHT/IHT has an individual roundtrip error > 1";
<< "Error: 8x8 FDCT/IDCT has an individual roundtrip error > 1";
EXPECT_GE(count_test_block/5, total_error)
<< "Error: 8x8 FDCT/IDCT or FHT/IHT has average roundtrip "
"error > 1/5 per block";
}
<< "Error: 8x8 FDCT/IDCT has average roundtrip error > 1/5 per block";
};
TEST_P(FwdTrans8x8Test, ExtremalCheck) {
TEST(VP9Fdct8x8Test, ExtremalCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
int max_error = 0;
double total_error = 0;
const int count_test_block = 100000;
for (int i = 0; i < count_test_block; ++i) {
DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 64);
DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, 64);
DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 64);
DECLARE_ALIGNED_ARRAY(16, uint8_t, src, 64);
int16_t test_input_block[64];
int16_t test_temp_block[64];
uint8_t dst[64], src[64];
for (int j = 0; j < 64; ++j) {
src[j] = rnd.Rand8() % 2 ? 255 : 0;
@@ -216,8 +161,8 @@ TEST_P(FwdTrans8x8Test, ExtremalCheck) {
test_input_block[j] = src[j] - dst[j];
const int pitch = 16;
RunFwdTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
RunInvTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
vp9_short_fdct8x8_c(test_input_block, test_temp_block, pitch);
vp9_short_idct8x8_add_c(test_temp_block, dst, 8);
for (int j = 0; j < 64; ++j) {
const int diff = dst[j] - src[j];
@@ -228,14 +173,13 @@ TEST_P(FwdTrans8x8Test, ExtremalCheck) {
}
EXPECT_GE(1, max_error)
<< "Error: Extremal 8x8 FDCT/IDCT or FHT/IHT has an"
<< "Error: Extremal 8x8 FDCT/IDCT has an"
<< " individual roundtrip error > 1";
EXPECT_GE(count_test_block/5, total_error)
<< "Error: Extremal 8x8 FDCT/IDCT or FHT/IHT has average"
<< "Error: Extremal 8x8 FDCT/IDCT has average"
<< " roundtrip error > 1/5 per block";
}
}
};
INSTANTIATE_TEST_CASE_P(VP9, FwdTrans8x8Test, ::testing::Range(0, 4));
} // namespace


@@ -49,7 +49,7 @@ class I420VideoSource : public VideoSource {
if (input_file_)
fclose(input_file_);
input_file_ = OpenTestDataFile(file_name_);
ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: "
ASSERT_TRUE(input_file_) << "Input file open failed. Filename: "
<< file_name_;
if (start_) {
fseek(input_file_, raw_sz_ * start_, SEEK_SET);
@@ -92,7 +92,6 @@ class I420VideoSource : public VideoSource {
}
virtual void FillFrame() {
ASSERT_TRUE(input_file_ != NULL);
// Read a frame from input_file.
if (fread(img_->img_data, raw_sz_, 1, input_file_) == 0) {
limit_ = frame_;
@@ -109,8 +108,8 @@ class I420VideoSource : public VideoSource {
unsigned int frame_;
unsigned int width_;
unsigned int height_;
int framerate_numerator_;
int framerate_denominator_;
unsigned int framerate_numerator_;
unsigned int framerate_denominator_;
};
} // namespace libvpx_test


@@ -8,6 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
extern "C" {
#include "./vpx_config.h"
#include "./vp8_rtcd.h"
@@ -21,94 +22,100 @@ typedef void (*idct_fn_t)(short *input, unsigned char *pred_ptr,
int dst_stride);
namespace {
class IDCTTest : public ::testing::TestWithParam<idct_fn_t> {
protected:
virtual void SetUp() {
int i;
protected:
virtual void SetUp() {
int i;
UUT = GetParam();
memset(input, 0, sizeof(input));
/* Set up guard blocks */
for (i = 0; i < 256; i++) output[i] = ((i & 0xF) < 4 && (i < 64)) ? 0 : -1;
}
UUT = GetParam();
memset(input, 0, sizeof(input));
/* Set up guard blocks */
for (i = 0; i < 256; i++)
output[i] = ((i & 0xF) < 4 && (i < 64)) ? 0 : -1;
}
virtual void TearDown() { libvpx_test::ClearSystemState(); }
virtual void TearDown() {
libvpx_test::ClearSystemState();
}
idct_fn_t UUT;
short input[16];
unsigned char output[256];
unsigned char predict[256];
idct_fn_t UUT;
short input[16];
unsigned char output[256];
unsigned char predict[256];
};
TEST_P(IDCTTest, TestGuardBlocks) {
int i;
int i;
for (i = 0; i < 256; i++)
if ((i & 0xF) < 4 && i < 64)
EXPECT_EQ(0, output[i]) << i;
else
EXPECT_EQ(255, output[i]);
for (i = 0; i < 256; i++)
if ((i & 0xF) < 4 && i < 64)
EXPECT_EQ(0, output[i]) << i;
else
EXPECT_EQ(255, output[i]);
}
TEST_P(IDCTTest, TestAllZeros) {
int i;
int i;
REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));
REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));
for (i = 0; i < 256; i++)
if ((i & 0xF) < 4 && i < 64)
EXPECT_EQ(0, output[i]) << "i==" << i;
else
EXPECT_EQ(255, output[i]) << "i==" << i;
for (i = 0; i < 256; i++)
if ((i & 0xF) < 4 && i < 64)
EXPECT_EQ(0, output[i]) << "i==" << i;
else
EXPECT_EQ(255, output[i]) << "i==" << i;
}
TEST_P(IDCTTest, TestAllOnes) {
int i;
int i;
input[0] = 4;
REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));
input[0] = 4;
REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));
for (i = 0; i < 256; i++)
if ((i & 0xF) < 4 && i < 64)
EXPECT_EQ(1, output[i]) << "i==" << i;
else
EXPECT_EQ(255, output[i]) << "i==" << i;
for (i = 0; i < 256; i++)
if ((i & 0xF) < 4 && i < 64)
EXPECT_EQ(1, output[i]) << "i==" << i;
else
EXPECT_EQ(255, output[i]) << "i==" << i;
}
TEST_P(IDCTTest, TestAddOne) {
int i;
int i;
for (i = 0; i < 256; i++) predict[i] = i;
input[0] = 4;
REGISTER_STATE_CHECK(UUT(input, predict, 16, output, 16));
for (i = 0; i < 256; i++)
predict[i] = i;
input[0] = 4;
REGISTER_STATE_CHECK(UUT(input, predict, 16, output, 16));
for (i = 0; i < 256; i++)
if ((i & 0xF) < 4 && i < 64)
EXPECT_EQ(i + 1, output[i]) << "i==" << i;
else
EXPECT_EQ(255, output[i]) << "i==" << i;
for (i = 0; i < 256; i++)
if ((i & 0xF) < 4 && i < 64)
EXPECT_EQ(i+1, output[i]) << "i==" << i;
else
EXPECT_EQ(255, output[i]) << "i==" << i;
}
TEST_P(IDCTTest, TestWithData) {
int i;
int i;
for (i = 0; i < 16; i++) input[i] = i;
for (i = 0; i < 16; i++)
input[i] = i;
REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));
REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));
for (i = 0; i < 256; i++)
if ((i & 0xF) > 3 || i > 63)
EXPECT_EQ(255, output[i]) << "i==" << i;
else if (i == 0)
EXPECT_EQ(11, output[i]) << "i==" << i;
else if (i == 34)
EXPECT_EQ(1, output[i]) << "i==" << i;
else if (i == 2 || i == 17 || i == 32)
EXPECT_EQ(3, output[i]) << "i==" << i;
else
EXPECT_EQ(0, output[i]) << "i==" << i;
for (i = 0; i < 256; i++)
if ((i & 0xF) > 3 || i > 63)
EXPECT_EQ(255, output[i]) << "i==" << i;
else if (i == 0)
EXPECT_EQ(11, output[i]) << "i==" << i;
else if (i == 34)
EXPECT_EQ(1, output[i]) << "i==" << i;
else if (i == 2 || i == 17 || i == 32)
EXPECT_EQ(3, output[i]) << "i==" << i;
else
EXPECT_EQ(0, output[i]) << "i==" << i;
}
INSTANTIATE_TEST_CASE_P(C, IDCTTest, ::testing::Values(vp8_short_idct4x4llm_c));
INSTANTIATE_TEST_CASE_P(C, IDCTTest,
::testing::Values(vp8_short_idct4x4llm_c));
#if HAVE_MMX
INSTANTIATE_TEST_CASE_P(MMX, IDCTTest,
::testing::Values(vp8_short_idct4x4llm_mmx));


@@ -27,8 +27,6 @@ using libvpx_test::ACMRandom;
class IntraPredBase {
public:
virtual ~IntraPredBase() {}
virtual void TearDown() {
libvpx_test::ClearSystemState();
}


@@ -47,13 +47,12 @@ class IVFVideoSource : public CompressedVideoSource {
virtual void Init() {
// Allocate a buffer for read in the compressed video frame.
compressed_frame_buf_ = new uint8_t[libvpx_test::kCodeBufferSize];
ASSERT_TRUE(compressed_frame_buf_ != NULL)
<< "Allocate frame buffer failed";
ASSERT_TRUE(compressed_frame_buf_) << "Allocate frame buffer failed";
}
virtual void Begin() {
input_file_ = OpenTestDataFile(file_name_);
ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: "
ASSERT_TRUE(input_file_) << "Input file open failed. Filename: "
<< file_name_;
// Read file header
@@ -73,7 +72,6 @@ class IVFVideoSource : public CompressedVideoSource {
}
void FillFrame() {
ASSERT_TRUE(input_file_ != NULL);
uint8_t frame_hdr[kIvfFrameHdrSize];
// Check frame header and read a frame from input_file.
if (fread(frame_hdr, 1, kIvfFrameHdrSize, input_file_)


@@ -31,6 +31,10 @@ class KeyframeTest : public ::libvpx_test::EncoderTest,
set_cpu_used_ = 0;
}
virtual bool Continue() const {
return !HasFatalFailure() && !abort_;
}
virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
::libvpx_test::Encoder *encoder) {
if (kf_do_force_kf_)


@@ -70,6 +70,10 @@ class ResizeTest : public ::libvpx_test::EncoderTest,
SetMode(GET_PARAM(1));
}
virtual bool Continue() const {
return !HasFatalFailure() && !abort_;
}
virtual void DecompressedFrameHook(const vpx_image_t &img,
vpx_codec_pts_t pts) {
frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h));


@@ -428,7 +428,6 @@ INSTANTIATE_TEST_CASE_P(MMX, SADTest, ::testing::ValuesIn(mmx_tests));
#if HAVE_SSE
#if CONFIG_VP9_ENCODER
#if CONFIG_USE_X86INC
const sad_m_by_n_fn_t sad_4x4_sse_vp9 = vp9_sad4x4_sse;
const sad_m_by_n_fn_t sad_4x8_sse_vp9 = vp9_sad4x8_sse;
INSTANTIATE_TEST_CASE_P(SSE, SADTest, ::testing::Values(
@@ -442,7 +441,6 @@ INSTANTIATE_TEST_CASE_P(SSE, SADx4Test, ::testing::Values(
make_tuple(4, 4, sad_4x4x4d_sse)));
#endif
#endif
#endif
#if HAVE_SSE2
#if CONFIG_VP8_ENCODER
@@ -453,20 +451,14 @@ const sad_m_by_n_fn_t sad_8x8_wmt = vp8_sad8x8_wmt;
const sad_m_by_n_fn_t sad_4x4_wmt = vp8_sad4x4_wmt;
#endif
#if CONFIG_VP9_ENCODER
#if CONFIG_USE_X86INC
const sad_m_by_n_fn_t sad_64x64_sse2_vp9 = vp9_sad64x64_sse2;
const sad_m_by_n_fn_t sad_64x32_sse2_vp9 = vp9_sad64x32_sse2;
const sad_m_by_n_fn_t sad_32x64_sse2_vp9 = vp9_sad32x64_sse2;
const sad_m_by_n_fn_t sad_32x32_sse2_vp9 = vp9_sad32x32_sse2;
const sad_m_by_n_fn_t sad_32x16_sse2_vp9 = vp9_sad32x16_sse2;
const sad_m_by_n_fn_t sad_16x32_sse2_vp9 = vp9_sad16x32_sse2;
const sad_m_by_n_fn_t sad_16x16_sse2_vp9 = vp9_sad16x16_sse2;
const sad_m_by_n_fn_t sad_16x8_sse2_vp9 = vp9_sad16x8_sse2;
const sad_m_by_n_fn_t sad_8x16_sse2_vp9 = vp9_sad8x16_sse2;
const sad_m_by_n_fn_t sad_16x8_sse2_vp9 = vp9_sad16x8_sse2;
const sad_m_by_n_fn_t sad_8x8_sse2_vp9 = vp9_sad8x8_sse2;
const sad_m_by_n_fn_t sad_8x4_sse2_vp9 = vp9_sad8x4_sse2;
#endif
#endif
const sad_m_by_n_test_param_t sse2_tests[] = {
#if CONFIG_VP8_ENCODER
make_tuple(16, 16, sad_16x16_wmt),
@@ -476,25 +468,18 @@ const sad_m_by_n_test_param_t sse2_tests[] = {
make_tuple(4, 4, sad_4x4_wmt),
#endif
#if CONFIG_VP9_ENCODER
#if CONFIG_USE_X86INC
make_tuple(64, 64, sad_64x64_sse2_vp9),
make_tuple(64, 32, sad_64x32_sse2_vp9),
make_tuple(32, 64, sad_32x64_sse2_vp9),
make_tuple(32, 32, sad_32x32_sse2_vp9),
make_tuple(32, 16, sad_32x16_sse2_vp9),
make_tuple(16, 32, sad_16x32_sse2_vp9),
make_tuple(16, 16, sad_16x16_sse2_vp9),
make_tuple(16, 8, sad_16x8_sse2_vp9),
make_tuple(8, 16, sad_8x16_sse2_vp9),
make_tuple(16, 8, sad_16x8_sse2_vp9),
make_tuple(8, 8, sad_8x8_sse2_vp9),
make_tuple(8, 4, sad_8x4_sse2_vp9),
#endif
#endif
};
INSTANTIATE_TEST_CASE_P(SSE2, SADTest, ::testing::ValuesIn(sse2_tests));
#if CONFIG_VP9_ENCODER
#if CONFIG_USE_X86INC
const sad_n_by_n_by_4_fn_t sad_64x64x4d_sse2 = vp9_sad64x64x4d_sse2;
const sad_n_by_n_by_4_fn_t sad_64x32x4d_sse2 = vp9_sad64x32x4d_sse2;
const sad_n_by_n_by_4_fn_t sad_32x64x4d_sse2 = vp9_sad32x64x4d_sse2;
@@ -520,7 +505,6 @@ INSTANTIATE_TEST_CASE_P(SSE2, SADx4Test, ::testing::Values(
make_tuple(8, 4, sad_8x4x4d_sse2)));
#endif
#endif
#endif
#if HAVE_SSE3
#if CONFIG_VP8_ENCODER
@@ -539,11 +523,9 @@ INSTANTIATE_TEST_CASE_P(SSE3, SADx4Test, ::testing::Values(
#endif
#if HAVE_SSSE3
#if CONFIG_USE_X86INC
const sad_m_by_n_fn_t sad_16x16_sse3 = vp8_sad16x16_sse3;
INSTANTIATE_TEST_CASE_P(SSE3, SADTest, ::testing::Values(
make_tuple(16, 16, sad_16x16_sse3)));
#endif
#endif
} // namespace


@@ -61,7 +61,7 @@ TEST_P(SubtractBlockTest, SimpleSubtract) {
int16_t *src_diff = be.src_diff;
for (int r = 0; r < kBlockHeight; ++r) {
for (int c = 0; c < kBlockWidth; ++c) {
src_diff[c] = static_cast<int16_t>(0xa5a5);
src_diff[c] = 0xa5a5;
}
src_diff += kDiffPredStride;
}


@@ -33,6 +33,10 @@ class SuperframeTest : public ::libvpx_test::EncoderTest,
delete[] modified_buf_;
}
virtual bool Continue() const {
return !HasFatalFailure() && !abort_;
}
virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
libvpx_test::Encoder *encoder) {
if (video->frame() == 1) {


@@ -25,8 +25,6 @@ LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += i420_video_source.h
LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += keyframe_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += borders_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += resize_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += cpu_speed_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_lossless_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += ../md5_utils.h ../md5_utils.c
LIBVPX_TEST_SRCS-yes += decode_test_driver.cc
@@ -89,7 +87,6 @@ LIBVPX_TEST_SRCS-yes += tile_independence_test.cc
endif
LIBVPX_TEST_SRCS-$(CONFIG_VP9) += convolve_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_thread_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct4x4_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc


@@ -181,7 +181,6 @@ class TestVectorTest : public ::libvpx_test::DecoderTest,
virtual void DecompressedFrameHook(const vpx_image_t& img,
const unsigned int frame_number) {
ASSERT_TRUE(md5_file_ != NULL);
char expected_md5[33];
char junk[128];


@@ -23,13 +23,10 @@ extern "C" {
namespace {
class TileIndependenceTest : public ::libvpx_test::EncoderTest,
public ::libvpx_test::CodecTestWithParam<int> {
public ::libvpx_test::CodecTestWithParam<int> {
protected:
TileIndependenceTest()
: EncoderTest(GET_PARAM(0)),
md5_fw_order_(),
md5_inv_order_(),
n_tiles_(GET_PARAM(1)) {
TileIndependenceTest() : EncoderTest(GET_PARAM(0)), n_tiles_(GET_PARAM(1)),
md5_fw_order_(), md5_inv_order_() {
init_flags_ = VPX_CODEC_USE_PSNR;
vpx_codec_dec_cfg_t cfg;
cfg.w = 704;
@@ -59,8 +56,9 @@ class TileIndependenceTest : public ::libvpx_test::EncoderTest,
void UpdateMD5(::libvpx_test::Decoder *dec, const vpx_codec_cx_pkt_t *pkt,
::libvpx_test::MD5 *md5) {
const vpx_codec_err_t res = dec->DecodeFrame(
reinterpret_cast<uint8_t*>(pkt->data.frame.buf), pkt->data.frame.sz);
const vpx_codec_err_t res =
dec->DecodeFrame(reinterpret_cast<uint8_t*>(pkt->data.frame.buf),
pkt->data.frame.sz);
if (res != VPX_CODEC_OK) {
abort_ = true;
ASSERT_EQ(VPX_CODEC_OK, res);
@@ -74,11 +72,11 @@ class TileIndependenceTest : public ::libvpx_test::EncoderTest,
UpdateMD5(inv_dec_, pkt, &md5_inv_order_);
}
::libvpx_test::MD5 md5_fw_order_, md5_inv_order_;
::libvpx_test::Decoder *fw_dec_, *inv_dec_;
private:
int n_tiles_;
protected:
::libvpx_test::MD5 md5_fw_order_, md5_inv_order_;
::libvpx_test::Decoder *fw_dec_, *inv_dec_;
};
// run an encode with 2 or 4 tiles, and do the decode both in normal and
@@ -95,7 +93,7 @@ TEST_P(TileIndependenceTest, MD5Match) {
timebase.den, timebase.num, 0, 30);
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
const char *md5_fw_str = md5_fw_order_.Get();
const char *md5_fw_str = md5_fw_order_.Get();
const char *md5_inv_str = md5_inv_order_.Get();
// could use ASSERT_EQ(!memcmp(.., .., 16) here, but this gives nicer
@@ -104,6 +102,7 @@ TEST_P(TileIndependenceTest, MD5Match) {
ASSERT_STREQ(md5_fw_str, md5_inv_str);
}
VP9_INSTANTIATE_TEST_CASE(TileIndependenceTest, ::testing::Range(0, 2, 1));
VP9_INSTANTIATE_TEST_CASE(TileIndependenceTest,
::testing::Range(0, 2, 1));
} // namespace


@@ -37,7 +37,7 @@ static double compute_psnr(const vpx_image_t *img1,
img2->planes[VPX_PLANE_Y][i * img2->stride[VPX_PLANE_Y] + j];
sqrerr += d * d;
}
double mse = static_cast<double>(sqrerr) / (width_y * height_y);
double mse = sqrerr / (width_y * height_y);
double psnr = 100.0;
if (mse > 0.0) {
psnr = 10 * log10(255.0 * 255.0 / mse);


@@ -218,7 +218,6 @@ class SubpelVarianceTest :
vpx_free(src_);
delete[] ref_;
vpx_free(sec_);
libvpx_test::ClearSystemState();
}
protected:
@@ -483,7 +482,6 @@ INSTANTIATE_TEST_CASE_P(
#endif
#if HAVE_SSE2
#if CONFIG_USE_X86INC
const vp9_variance_fn_t variance4x4_sse2 = vp9_variance4x4_sse2;
const vp9_variance_fn_t variance4x8_sse2 = vp9_variance4x8_sse2;
const vp9_variance_fn_t variance8x4_sse2 = vp9_variance8x4_sse2;
@@ -597,11 +595,8 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(6, 5, subpel_avg_variance64x32_sse2),
make_tuple(6, 6, subpel_avg_variance64x64_sse2)));
#endif
#endif
#if HAVE_SSSE3
#if CONFIG_USE_X86INC
const vp9_subpixvariance_fn_t subpel_variance4x4_ssse3 =
vp9_sub_pixel_variance4x4_ssse3;
const vp9_subpixvariance_fn_t subpel_variance4x8_ssse3 =
@@ -686,7 +681,6 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(6, 5, subpel_avg_variance64x32_ssse3),
make_tuple(6, 6, subpel_avg_variance64x64_ssse3)));
#endif
#endif
#endif // CONFIG_VP9_ENCODER
} // namespace vp9

View File

@@ -1,75 +0,0 @@
/*
Copyright (c) 2012 The WebM project authors. All Rights Reserved.
Use of this source code is governed by a BSD-style license
that can be found in the LICENSE file in the root of the source
tree. An additional intellectual property rights grant can be found
in the file PATENTS. All contributing project authors may
be found in the AUTHORS file in the root of the source tree.
*/
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "test/codec_factory.h"
#include "test/encode_test_driver.h"
#include "test/i420_video_source.h"
#include "test/util.h"
namespace {
const int kMaxPsnr = 100;
class LossLessTest : public ::libvpx_test::EncoderTest,
public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {
protected:
LossLessTest() : EncoderTest(GET_PARAM(0)),
psnr_(kMaxPsnr),
nframes_(0),
encoding_mode_(GET_PARAM(1)) {
}
virtual ~LossLessTest() {}
virtual void SetUp() {
InitializeConfig();
SetMode(encoding_mode_);
}
virtual void BeginPassHook(unsigned int /*pass*/) {
psnr_ = 0.0;
nframes_ = 0;
}
virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
if (pkt->data.psnr.psnr[0] < psnr_)
psnr_= pkt->data.psnr.psnr[0];
}
double GetMinPsnr() const {
return psnr_;
}
private:
double psnr_;
unsigned int nframes_;
libvpx_test::TestMode encoding_mode_;
};
TEST_P(LossLessTest, TestLossLessEncoding) {
const vpx_rational timebase = { 33333333, 1000000000 };
cfg_.g_timebase = timebase;
cfg_.rc_target_bitrate = 2000;
cfg_.g_lag_in_frames = 25;
cfg_.rc_min_quantizer = 0;
cfg_.rc_max_quantizer = 0;
init_flags_ = VPX_CODEC_USE_PSNR;
// intentionally changed the dimension for better testing coverage
libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 356, 284,
timebase.den, timebase.num, 0, 30);
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
const double psnr_lossless = GetMinPsnr();
EXPECT_GE(psnr_lossless, kMaxPsnr);
}
VP9_INSTANTIATE_TEST_CASE(LossLessTest, ALL_TEST_MODES);
} // namespace
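
The lossless setup above hinges on pinning both rate-control quantizer bounds to zero. A hedged sketch of the same configuration through the public encoder API (error handling omitted; in this era of the codebase q == 0 is what selects the lossless path, there is no separate control yet):

#include "vpx/vpx_encoder.h"
#include "vpx/vp8cx.h"

static void configure_lossless(vpx_codec_ctx_t *codec, unsigned int w,
                               unsigned int h) {
  vpx_codec_enc_cfg_t cfg;
  vpx_codec_enc_config_default(vpx_codec_vp9_cx(), &cfg, 0);
  cfg.g_w = w;
  cfg.g_h = h;
  cfg.rc_min_quantizer = 0;  /* force q = 0 on every frame */
  cfg.rc_max_quantizer = 0;
  vpx_codec_enc_init(codec, vpx_codec_vp9_cx(), &cfg, 0);
}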

View File

@@ -39,7 +39,7 @@ TEST_P(VP9SubtractBlockTest, SimpleSubtract) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
// FIXME(rbultje) split in its own file
for (BLOCK_SIZE_TYPE bsize = BLOCK_4X4; bsize < BLOCK_SIZE_TYPES;
for (BLOCK_SIZE_TYPE bsize = BLOCK_SIZE_AB4X4; bsize < BLOCK_SIZE_TYPES;
bsize = static_cast<BLOCK_SIZE_TYPE>(static_cast<int>(bsize) + 1)) {
const int block_width = 4 << b_width_log2(bsize);
const int block_height = 4 << b_height_log2(bsize);
@@ -93,8 +93,9 @@ TEST_P(VP9SubtractBlockTest, SimpleSubtract) {
INSTANTIATE_TEST_CASE_P(C, VP9SubtractBlockTest,
::testing::Values(vp9_subtract_block_c));
#if HAVE_SSE2 && CONFIG_USE_X86INC
#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P(SSE2, VP9SubtractBlockTest,
::testing::Values(vp9_subtract_block_sse2));
#endif
} // namespace vp9

View File

@@ -1,109 +0,0 @@
/*
* Copyright (c) 2013 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vp9/decoder/vp9_thread.h"
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "test/codec_factory.h"
#include "test/decode_test_driver.h"
#include "test/md5_helper.h"
#include "test/webm_video_source.h"
namespace {
class VP9WorkerThreadTest : public ::testing::Test {
protected:
virtual ~VP9WorkerThreadTest() {}
virtual void SetUp() {
vp9_worker_init(&worker_);
}
virtual void TearDown() {
vp9_worker_end(&worker_);
}
VP9Worker worker_;
};
int ThreadHook(void* data, void* return_value) {
int* const hook_data = reinterpret_cast<int*>(data);
*hook_data = 5;
return *reinterpret_cast<int*>(return_value);
}
TEST_F(VP9WorkerThreadTest, HookSuccess) {
EXPECT_TRUE(vp9_worker_sync(&worker_)); // should be a no-op.
for (int i = 0; i < 2; ++i) {
EXPECT_TRUE(vp9_worker_reset(&worker_));
int hook_data = 0;
int return_value = 1; // return successfully from the hook
worker_.hook = ThreadHook;
worker_.data1 = &hook_data;
worker_.data2 = &return_value;
vp9_worker_launch(&worker_);
EXPECT_TRUE(vp9_worker_sync(&worker_));
EXPECT_FALSE(worker_.had_error);
EXPECT_EQ(5, hook_data);
EXPECT_TRUE(vp9_worker_sync(&worker_)); // should be a no-op.
}
}
TEST_F(VP9WorkerThreadTest, HookFailure) {
EXPECT_TRUE(vp9_worker_reset(&worker_));
int hook_data = 0;
int return_value = 0; // return failure from the hook
worker_.hook = ThreadHook;
worker_.data1 = &hook_data;
worker_.data2 = &return_value;
vp9_worker_launch(&worker_);
EXPECT_FALSE(vp9_worker_sync(&worker_));
EXPECT_TRUE(worker_.had_error);
// Ensure _reset() clears the error and _launch() can be called again.
return_value = 1;
EXPECT_TRUE(vp9_worker_reset(&worker_));
EXPECT_FALSE(worker_.had_error);
vp9_worker_launch(&worker_);
EXPECT_TRUE(vp9_worker_sync(&worker_));
EXPECT_FALSE(worker_.had_error);
}
TEST(VP9DecodeMTTest, MTDecode) {
libvpx_test::WebMVideoSource video("vp90-2-03-size-226x226.webm");
video.Init();
vpx_codec_dec_cfg_t cfg = {0};
cfg.threads = 2;
libvpx_test::VP9Decoder decoder(cfg, 0);
libvpx_test::MD5 md5;
for (video.Begin(); video.cxdata(); video.Next()) {
const vpx_codec_err_t res =
decoder.DecodeFrame(video.cxdata(), video.frame_size());
ASSERT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError();
libvpx_test::DxDataIterator dec_iter = decoder.GetDxData();
const vpx_image_t *img = NULL;
// Get decompressed data
while ((img = dec_iter.Next())) {
md5.Add(img);
}
}
EXPECT_STREQ("b35a1b707b28e82be025d960aba039bc", md5.Get());
}
} // namespace

View File

@@ -99,7 +99,7 @@ class WebMVideoSource : public CompressedVideoSource {
virtual void Begin() {
input_file_ = OpenTestDataFile(file_name_);
ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: "
ASSERT_TRUE(input_file_) << "Input file open failed. Filename: "
<< file_name_;
nestegg_io io = {nestegg_read_cb, nestegg_seek_cb, nestegg_tell_cb,
@@ -130,7 +130,6 @@ class WebMVideoSource : public CompressedVideoSource {
}
void FillFrame() {
ASSERT_TRUE(input_file_ != NULL);
if (chunk_ >= chunks_) {
unsigned int track;

View File

@@ -1370,12 +1370,12 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
mov edx, [esp + 8 + 12] // src_stride
mov ecx, [esp + 8 + 16] // dst_width
mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
shr eax, 1
cmp eax, 0
je xloop1
cmp eax, 64
cmp eax, 128
je xloop2
shr eax, 1
mov ah,al
neg al
add al, 128
@@ -2132,11 +2132,11 @@ void ScaleFilterRows_SSSE3(uint8* dst_ptr,
"mov 0x14(%esp),%edx \n"
"mov 0x18(%esp),%ecx \n"
"mov 0x1c(%esp),%eax \n"
"shr %eax \n"
"cmp $0x0,%eax \n"
"je 2f \n"
"cmp $0x40,%eax \n"
"cmp $0x80,%eax \n"
"je 3f \n"
"shr %eax \n"
"mov %al,%ah \n"
"neg %al \n"
"add $0x80,%al \n"
@@ -2662,7 +2662,6 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr,
static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
const uint8* src_ptr, int src_stride,
int dst_width, int source_y_fraction) {
source_y_fraction >>= 1;
if (source_y_fraction == 0) {
asm volatile (
"1:"
@@ -2681,7 +2680,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
: "memory", "cc", "rax"
);
return;
} else if (source_y_fraction == 64) {
} else if (source_y_fraction == 128) {
asm volatile (
"1:"
"movdqa (%1),%%xmm0 \n"
@@ -2704,6 +2703,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
} else {
asm volatile (
"mov %3,%%eax \n"
"shr %%eax \n"
"mov %%al,%%ah \n"
"neg %%al \n"
"add $0x80,%%al \n"

View File

@@ -173,6 +173,7 @@ void vp8_create_common(VP8_COMMON *oci)
oci->use_bilinear_mc_filter = 0;
oci->full_pixel = 0;
oci->multi_token_partition = ONE_PARTITION;
oci->clr_type = REG_YUV;
oci->clamp_type = RECON_CLAMP_REQUIRED;
/* Initialize reference frame sign bias structure to defaults */

View File

@@ -72,6 +72,7 @@ typedef struct VP8Common
int horiz_scale;
int vert_scale;
YUV_TYPE clr_type;
CLAMP_TYPE clamp_type;
YV12_BUFFER_CONFIG *frame_to_show;
@@ -114,6 +115,9 @@ typedef struct VP8Common
int uvdc_delta_q;
int uvac_delta_q;
unsigned int frames_since_golden;
unsigned int frames_till_alt_ref_frame;
/* We allocate a MODE_INFO struct for each macroblock, together with
an extra row on top and column on the left to simplify prediction. */
@@ -153,6 +157,7 @@ typedef struct VP8Common
unsigned int current_video_frame;
int near_boffset[3];
int version;
TOKEN_PARTITION multi_token_partition;
@@ -160,10 +165,8 @@ typedef struct VP8Common
#ifdef PACKET_TESTING
VP8_HEADER oh;
#endif
#if CONFIG_POSTPROC_VISUALIZER
double bitrate;
double framerate;
#endif
#if CONFIG_MULTITHREAD
int processor_core_count;

View File

@@ -923,7 +923,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
if (flags & VP8D_DEBUG_TXT_RATE_INFO)
{
char message[512];
sprintf(message, "Bitrate: %10.2f framerate: %10.2f ", oci->bitrate, oci->framerate);
sprintf(message, "Bitrate: %10.2f frame_rate: %10.2f ", oci->bitrate, oci->framerate);
vp8_blit_text(message, oci->post_proc_buffer.y_buffer, oci->post_proc_buffer.y_stride);
}

View File

@@ -0,0 +1,52 @@
/*
* Copyright (c) 2011 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vpx_config.h"
#include "vpx/vpx_codec.h"
#include "vpx_ports/asm_offsets.h"
#include "vp8/common/blockd.h"
#if CONFIG_POSTPROC
#include "postproc.h"
#endif /* CONFIG_POSTPROC */
BEGIN
#if CONFIG_POSTPROC
/* mfqe.c / filter_by_weight */
DEFINE(MFQE_PRECISION_VAL, MFQE_PRECISION);
#endif /* CONFIG_POSTPROC */
END
/* add asserts for any offset that is not supported by assembly code */
/* add asserts for any size that is not supported by assembly code */
#if HAVE_MEDIA
/* switch case in vp8_intra4x4_predict_armv6 is based on these enumerated values */
ct_assert(B_DC_PRED, B_DC_PRED == 0);
ct_assert(B_TM_PRED, B_TM_PRED == 1);
ct_assert(B_VE_PRED, B_VE_PRED == 2);
ct_assert(B_HE_PRED, B_HE_PRED == 3);
ct_assert(B_LD_PRED, B_LD_PRED == 4);
ct_assert(B_RD_PRED, B_RD_PRED == 5);
ct_assert(B_VR_PRED, B_VR_PRED == 6);
ct_assert(B_VL_PRED, B_VL_PRED == 7);
ct_assert(B_HD_PRED, B_HD_PRED == 8);
ct_assert(B_HU_PRED, B_HU_PRED == 9);
#endif
#if HAVE_SSE2
#if CONFIG_POSTPROC
/* vp8_filter_by_weight16x16 and 8x8 */
ct_assert(MFQE_PRECISION_VAL, MFQE_PRECISION == 4)
#endif /* CONFIG_POSTPROC */
#endif /* HAVE_SSE2 */

View File

@@ -1095,7 +1095,7 @@ int vp8_decode_frame(VP8D_COMP *pbi)
vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate bool decoder 0");
if (pc->frame_type == KEY_FRAME) {
(void)vp8_read_bit(bc); // colorspace
pc->clr_type = (YUV_TYPE)vp8_read_bit(bc);
pc->clamp_type = (CLAMP_TYPE)vp8_read_bit(bc);
}

View File

@@ -430,6 +430,7 @@ int vp8dx_get_raw_frame(VP8D_COMP *pbi, YV12_BUFFER_CONFIG *sd, int64_t *time_st
*time_stamp = pbi->last_time_stamp;
*time_end_stamp = 0;
sd->clrtype = pbi->common.clr_type;
#if CONFIG_POSTPROC
ret = vp8_post_proc_frame(&pbi->common, sd, flags);
#else

View File

@@ -0,0 +1,26 @@
/*
* Copyright (c) 2011 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vpx_ports/asm_offsets.h"
#include "onyxd_int.h"
BEGIN
DEFINE(bool_decoder_user_buffer_end, offsetof(BOOL_DECODER, user_buffer_end));
DEFINE(bool_decoder_user_buffer, offsetof(BOOL_DECODER, user_buffer));
DEFINE(bool_decoder_value, offsetof(BOOL_DECODER, value));
DEFINE(bool_decoder_count, offsetof(BOOL_DECODER, count));
DEFINE(bool_decoder_range, offsetof(BOOL_DECODER, range));
END
/* add asserts for any offset that is not supported by assembly code */
/* add asserts for any size that is not supported by assembly code */
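
Both offset files above follow the same idiom: compile (but never link) a C file whose macros bake offsetof() results into the object file, then extract them into an assembly include so the asm can address struct fields symbolically. A hedged sketch of the kernel-style variant of the trick; libvpx's BEGIN/DEFINE/END macros from vpx_ports/asm_offsets.h differ in detail, and the struct here is illustrative:

#include <stddef.h>

/* Emits a marker like "->bd_value 8" into the generated assembly;
 * a build script greps these markers out into an .asm include file. */
#define DEFINE(sym, val) \
  __asm__ volatile("\n->" #sym " %0" : : "i"((int)(val)))

struct bool_decoder_sketch {        /* illustrative, not the real layout */
  const unsigned char *user_buffer;
  unsigned int value;
  int count;
};

void emit_offsets(void) {
  DEFINE(bd_user_buffer, offsetof(struct bool_decoder_sketch, user_buffer));
  DEFINE(bd_value, offsetof(struct bool_decoder_sketch, value));
  DEFINE(bd_count, offsetof(struct bool_decoder_sketch, count));
}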

View File

@@ -1322,7 +1322,7 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned char * dest
vp8_start_encode(bc, cx_data, cx_data_end);
/* signal clr type */
vp8_write_bit(bc, 0);
vp8_write_bit(bc, pc->clr_type);
vp8_write_bit(bc, pc->clamp_type);
}

View File

@@ -1325,7 +1325,7 @@ static int estimate_kf_group_q(VP8_COMP *cpi, double section_err, int section_ta
return Q;
}
extern void vp8_new_framerate(VP8_COMP *cpi, double framerate);
extern void vp8_new_frame_rate(VP8_COMP *cpi, double framerate);
void vp8_init_second_pass(VP8_COMP *cpi)
{
@@ -1349,9 +1349,9 @@ void vp8_init_second_pass(VP8_COMP *cpi)
* sum duration is not. Its calculated based on the actual durations of
* all frames from the first pass.
*/
vp8_new_framerate(cpi, 10000000.0 * cpi->twopass.total_stats.count / cpi->twopass.total_stats.duration);
vp8_new_frame_rate(cpi, 10000000.0 * cpi->twopass.total_stats.count / cpi->twopass.total_stats.duration);
cpi->output_framerate = cpi->framerate;
cpi->output_frame_rate = cpi->frame_rate;
cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats.duration * cpi->oxcf.target_bandwidth / 10000000.0) ;
cpi->twopass.bits_left -= (int64_t)(cpi->twopass.total_stats.duration * two_pass_min_rate / 10000000.0);
@@ -2398,7 +2398,7 @@ static void assign_std_frame_bits(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
target_frame_size += cpi->min_frame_bandwidth;
/* Every other frame gets a few extra bits */
if ( (cpi->frames_since_golden & 0x01) &&
if ( (cpi->common.frames_since_golden & 0x01) &&
(cpi->frames_till_gf_update_due > 0) )
{
target_frame_size += cpi->twopass.alt_extra_bits;
@@ -2529,7 +2529,7 @@ void vp8_second_pass(VP8_COMP *cpi)
/* Set nominal per second bandwidth for this frame */
cpi->target_bandwidth = (int)
(cpi->per_frame_bandwidth * cpi->output_framerate);
(cpi->per_frame_bandwidth * cpi->output_frame_rate);
if (cpi->target_bandwidth < 0)
cpi->target_bandwidth = 0;
@@ -3185,7 +3185,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
/* Convert to a per second bitrate */
cpi->target_bandwidth = (int)(cpi->twopass.kf_bits *
cpi->output_framerate);
cpi->output_frame_rate);
}
/* Note the total error score of the kf group minus the key frame itself */
@@ -3224,7 +3224,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
cpi->common.vert_scale = NORMAL;
/* Calculate Average bits per frame. */
av_bits_per_frame = cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->framerate);
av_bits_per_frame = cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->frame_rate);
/* CBR... Use the clip average as the target for deciding resample */
if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
@@ -3299,7 +3299,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
}
else
{
int64_t clip_bits = (int64_t)(cpi->twopass.total_stats.count * cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->framerate));
int64_t clip_bits = (int64_t)(cpi->twopass.total_stats.count * cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->frame_rate));
int64_t over_spend = cpi->oxcf.starting_buffer_level - cpi->buffer_level;
/* If triggered last time the threshold for triggering again is

View File

@@ -301,11 +301,11 @@ static int rescale(int val, int num, int denom)
static void init_temporal_layer_context(VP8_COMP *cpi,
VP8_CONFIG *oxcf,
const int layer,
double prev_layer_framerate)
double prev_layer_frame_rate)
{
LAYER_CONTEXT *lc = &cpi->layer_context[layer];
lc->framerate = cpi->output_framerate / cpi->oxcf.rate_decimator[layer];
lc->frame_rate = cpi->output_frame_rate / cpi->oxcf.rate_decimator[layer];
lc->target_bandwidth = cpi->oxcf.target_bitrate[layer] * 1000;
lc->starting_buffer_level_in_ms = oxcf->starting_buffer_level;
@@ -335,7 +335,7 @@ static void init_temporal_layer_context(VP8_COMP *cpi,
lc->avg_frame_size_for_layer =
(int)((cpi->oxcf.target_bitrate[layer] -
cpi->oxcf.target_bitrate[layer-1]) * 1000 /
(lc->framerate - prev_layer_framerate));
(lc->frame_rate - prev_layer_frame_rate));
lc->active_worst_quality = cpi->oxcf.worst_allowed_q;
lc->active_best_quality = cpi->oxcf.best_allowed_q;
@@ -363,7 +363,7 @@ static void reset_temporal_layer_change(VP8_COMP *cpi,
const int prev_num_layers)
{
int i;
double prev_layer_framerate = 0;
double prev_layer_frame_rate = 0;
const int curr_num_layers = cpi->oxcf.number_of_layers;
// If the previous state was 1 layer, get current layer context from cpi.
// We need this to set the layer context for the new layers below.
@@ -377,7 +377,7 @@ static void reset_temporal_layer_change(VP8_COMP *cpi,
LAYER_CONTEXT *lc = &cpi->layer_context[i];
if (i >= prev_num_layers)
{
init_temporal_layer_context(cpi, oxcf, i, prev_layer_framerate);
init_temporal_layer_context(cpi, oxcf, i, prev_layer_frame_rate);
}
// The initial buffer levels are set based on their starting levels.
// We could set the buffer levels based on the previous state (normalized
@@ -403,8 +403,8 @@ static void reset_temporal_layer_change(VP8_COMP *cpi,
lc->bits_off_target = lc->buffer_level;
restore_layer_context(cpi, 0);
}
prev_layer_framerate = cpi->output_framerate /
cpi->oxcf.rate_decimator[i];
prev_layer_frame_rate = cpi->output_frame_rate /
cpi->oxcf.rate_decimator[i];
}
}
@@ -1282,21 +1282,21 @@ int vp8_reverse_trans(int x)
return 63;
}
void vp8_new_framerate(VP8_COMP *cpi, double framerate)
void vp8_new_frame_rate(VP8_COMP *cpi, double framerate)
{
if(framerate < .1)
framerate = 30;
cpi->framerate = framerate;
cpi->output_framerate = framerate;
cpi->frame_rate = framerate;
cpi->output_frame_rate = framerate;
cpi->per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth /
cpi->output_framerate);
cpi->output_frame_rate);
cpi->av_per_frame_bandwidth = cpi->per_frame_bandwidth;
cpi->min_frame_bandwidth = (int)(cpi->av_per_frame_bandwidth *
cpi->oxcf.two_pass_vbrmin_section / 100);
/* Set Maximum gf/arf interval */
cpi->max_gf_interval = ((int)(cpi->output_framerate / 2.0) + 2);
cpi->max_gf_interval = ((int)(cpi->output_frame_rate / 2.0) + 2);
if(cpi->max_gf_interval < 12)
cpi->max_gf_interval = 12;
@@ -1337,13 +1337,13 @@ static void init_config(VP8_COMP *cpi, VP8_CONFIG *oxcf)
* seems like a reasonable framerate, then use that as a guess, otherwise
* use 30.
*/
cpi->framerate = (double)(oxcf->timebase.den) /
(double)(oxcf->timebase.num);
cpi->frame_rate = (double)(oxcf->timebase.den) /
(double)(oxcf->timebase.num);
if (cpi->framerate > 180)
cpi->framerate = 30;
if (cpi->frame_rate > 180)
cpi->frame_rate = 30;
cpi->ref_framerate = cpi->framerate;
cpi->ref_frame_rate = cpi->frame_rate;
/* change includes all joint functionality */
vp8_change_config(cpi, oxcf);
@@ -1369,13 +1369,13 @@ static void init_config(VP8_COMP *cpi, VP8_CONFIG *oxcf)
if (cpi->oxcf.number_of_layers > 1)
{
unsigned int i;
double prev_layer_framerate=0;
double prev_layer_frame_rate=0;
for (i=0; i<cpi->oxcf.number_of_layers; i++)
{
init_temporal_layer_context(cpi, oxcf, i, prev_layer_framerate);
prev_layer_framerate = cpi->output_framerate /
cpi->oxcf.rate_decimator[i];
init_temporal_layer_context(cpi, oxcf, i, prev_layer_frame_rate);
prev_layer_frame_rate = cpi->output_frame_rate /
cpi->oxcf.rate_decimator[i];
}
}
@@ -1399,14 +1399,14 @@ static void update_layer_contexts (VP8_COMP *cpi)
if (oxcf->number_of_layers > 1)
{
unsigned int i;
double prev_layer_framerate=0;
double prev_layer_frame_rate=0;
for (i=0; i<oxcf->number_of_layers; i++)
{
LAYER_CONTEXT *lc = &cpi->layer_context[i];
lc->framerate =
cpi->ref_framerate / oxcf->rate_decimator[i];
lc->frame_rate =
cpi->ref_frame_rate / oxcf->rate_decimator[i];
lc->target_bandwidth = oxcf->target_bitrate[i] * 1000;
lc->starting_buffer_level = rescale(
@@ -1432,9 +1432,9 @@ static void update_layer_contexts (VP8_COMP *cpi)
lc->avg_frame_size_for_layer =
(int)((oxcf->target_bitrate[i] -
oxcf->target_bitrate[i-1]) * 1000 /
(lc->framerate - prev_layer_framerate));
(lc->frame_rate - prev_layer_frame_rate));
prev_layer_framerate = lc->framerate;
prev_layer_frame_rate = lc->frame_rate;
}
}
}
@@ -1625,7 +1625,7 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf)
cpi->oxcf.target_bandwidth, 1000);
/* Set up frame rate and related parameters rate control values. */
vp8_new_framerate(cpi, cpi->framerate);
vp8_new_frame_rate(cpi, cpi->frame_rate);
/* Set absolute upper and lower quality limits */
cpi->worst_quality = cpi->oxcf.worst_allowed_q;
@@ -1945,7 +1945,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
for (i = 0; i < KEY_FRAME_CONTEXT; i++)
{
cpi->prior_key_frame_distance[i] = (int)cpi->output_framerate;
cpi->prior_key_frame_distance[i] = (int)cpi->output_frame_rate;
}
#ifdef OUTPUT_YUV_SRC
@@ -2273,7 +2273,7 @@ void vp8_remove_compressor(VP8_COMP **ptr)
{
extern int count_mb_seg[4];
FILE *f = fopen("modes.stt", "a");
double dr = (double)cpi->framerate * (double)bytes * (double)8 / (double)count / (double)1000 ;
double dr = (double)cpi->frame_rate * (double)bytes * (double)8 / (double)count / (double)1000 ;
fprintf(f, "intra_mode in Intra Frames:\n");
fprintf(f, "Y: %8d, %8d, %8d, %8d, %8d\n", y_modes[0], y_modes[1], y_modes[2], y_modes[3], y_modes[4]);
fprintf(f, "UV:%8d, %8d, %8d, %8d\n", uv_modes[0], uv_modes[1], uv_modes[2], uv_modes[3]);
@@ -2750,7 +2750,7 @@ static void update_alt_ref_frame_stats(VP8_COMP *cpi)
cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
/* this frame refreshes means next frames don't unless specified by user */
cpi->frames_since_golden = 0;
cpi->common.frames_since_golden = 0;
/* Clear the alternate reference update pending flag. */
cpi->source_alt_ref_pending = 0;
@@ -2802,7 +2802,7 @@ static void update_golden_frame_stats(VP8_COMP *cpi)
* user
*/
cm->refresh_golden_frame = 0;
cpi->frames_since_golden = 0;
cpi->common.frames_since_golden = 0;
cpi->recent_ref_frame_usage[INTRA_FRAME] = 1;
cpi->recent_ref_frame_usage[LAST_FRAME] = 1;
@@ -2834,12 +2834,12 @@ static void update_golden_frame_stats(VP8_COMP *cpi)
if (cpi->frames_till_gf_update_due > 0)
cpi->frames_till_gf_update_due--;
if (cpi->frames_till_alt_ref_frame)
cpi->frames_till_alt_ref_frame --;
if (cpi->common.frames_till_alt_ref_frame)
cpi->common.frames_till_alt_ref_frame --;
cpi->frames_since_golden ++;
cpi->common.frames_since_golden ++;
if (cpi->frames_since_golden > 1)
if (cpi->common.frames_since_golden > 1)
{
cpi->recent_ref_frame_usage[INTRA_FRAME] +=
cpi->mb.count_mb_ref_frame_usage[INTRA_FRAME];
@@ -2890,11 +2890,11 @@ static void update_rd_ref_frame_probs(VP8_COMP *cpi)
cpi->prob_last_coded = 200;
cpi->prob_gf_coded = 1;
}
else if (cpi->frames_since_golden == 0)
else if (cpi->common.frames_since_golden == 0)
{
cpi->prob_last_coded = 214;
}
else if (cpi->frames_since_golden == 1)
else if (cpi->common.frames_since_golden == 1)
{
cpi->prob_last_coded = 192;
cpi->prob_gf_coded = 220;
@@ -3368,12 +3368,12 @@ static void encode_frame_to_data_rate
cpi->per_frame_bandwidth = cpi->twopass.gf_bits;
/* per second target bitrate */
cpi->target_bandwidth = (int)(cpi->twopass.gf_bits *
cpi->output_framerate);
cpi->output_frame_rate);
}
}
else
#endif
cpi->per_frame_bandwidth = (int)(cpi->target_bandwidth / cpi->output_framerate);
cpi->per_frame_bandwidth = (int)(cpi->target_bandwidth / cpi->output_frame_rate);
/* Default turn off buffer to buffer copying */
cm->copy_buffer_to_gf = 0;
@@ -4557,7 +4557,7 @@ static void encode_frame_to_data_rate
{
LAYER_CONTEXT *lc = &cpi->layer_context[i];
int bits_off_for_this_layer =
(int)(lc->target_bandwidth / lc->framerate -
(int)(lc->target_bandwidth / lc->frame_rate -
cpi->projected_frame_size);
lc->bits_off_target += bits_off_for_this_layer;
@@ -4805,7 +4805,7 @@ static void Pass2Encode(VP8_COMP *cpi, unsigned long *size, unsigned char *dest,
{
double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth
*cpi->oxcf.two_pass_vbrmin_section / 100);
cpi->twopass.bits_left += (int64_t)(two_pass_min_rate / cpi->framerate);
cpi->twopass.bits_left += (int64_t)(two_pass_min_rate / cpi->frame_rate);
}
}
#endif
@@ -4821,10 +4821,8 @@ int vp8_receive_raw_frame(VP8_COMP *cpi, unsigned int frame_flags, YV12_BUFFER_C
{
#if HAVE_NEON
int64_t store_reg[8];
#if CONFIG_RUNTIME_CPU_DETECT
#endif
VP8_COMMON *cm = &cpi->common;
#endif
#endif
struct vpx_usec_timer timer;
int res = 0;
@@ -4850,6 +4848,7 @@ int vp8_receive_raw_frame(VP8_COMP *cpi, unsigned int frame_flags, YV12_BUFFER_C
if(vp8_lookahead_push(cpi->lookahead, sd, time_stamp, end_time,
frame_flags, cpi->active_map_enabled ? cpi->active_map : NULL))
res = -1;
cm->clr_type = sd->clrtype;
vpx_usec_timer_mark(&timer);
cpi->time_receive_data += vpx_usec_timer_elapsed(&timer);
@@ -4934,7 +4933,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
cpi->frames_till_gf_update_due);
force_src_buffer = &cpi->alt_ref_buffer;
}
cpi->frames_till_alt_ref_frame = cpi->frames_till_gf_update_due;
cm->frames_till_alt_ref_frame = cpi->frames_till_gf_update_due;
cm->refresh_alt_ref_frame = 1;
cm->refresh_golden_frame = 0;
cm->refresh_last_frame = 0;
@@ -5039,7 +5038,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
if (this_duration)
{
if (step)
cpi->ref_framerate = 10000000.0 / this_duration;
cpi->ref_frame_rate = 10000000.0 / this_duration;
else
{
double avg_duration, interval;
@@ -5053,11 +5052,11 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
if(interval > 10000000.0)
interval = 10000000;
avg_duration = 10000000.0 / cpi->ref_framerate;
avg_duration = 10000000.0 / cpi->ref_frame_rate;
avg_duration *= (interval - avg_duration + this_duration);
avg_duration /= interval;
cpi->ref_framerate = 10000000.0 / avg_duration;
cpi->ref_frame_rate = 10000000.0 / avg_duration;
}
if (cpi->oxcf.number_of_layers > 1)
@@ -5068,12 +5067,12 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
for (i=0; i<cpi->oxcf.number_of_layers; i++)
{
LAYER_CONTEXT *lc = &cpi->layer_context[i];
lc->framerate = cpi->ref_framerate /
cpi->oxcf.rate_decimator[i];
lc->frame_rate = cpi->ref_frame_rate /
cpi->oxcf.rate_decimator[i];
}
}
else
vp8_new_framerate(cpi, cpi->ref_framerate);
vp8_new_frame_rate(cpi, cpi->ref_frame_rate);
}
cpi->last_time_stamp_seen = cpi->source->ts_start;
@@ -5090,7 +5089,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
layer = cpi->oxcf.layer_id[
cpi->temporal_pattern_counter % cpi->oxcf.periodicity];
restore_layer_context (cpi, layer);
vp8_new_framerate(cpi, cpi->layer_context[layer].framerate);
vp8_new_frame_rate (cpi, cpi->layer_context[layer].frame_rate);
}
if (cpi->compressor_speed == 2)
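
The renamed frame-rate fields feed directly into simple per-frame budget arithmetic. A worked example under assumed inputs (30 fps, 600 kbps), mirroring vp8_new_frame_rate above:

#include <stdio.h>

int main(void) {
  const double frame_rate = 30.0;          /* cpi->frame_rate */
  const int target_bandwidth = 600000;     /* bits per second */
  /* per-frame budget, as in vp8_new_frame_rate */
  const int per_frame_bandwidth = (int)(target_bandwidth / frame_rate);
  /* maximum golden-frame interval, floored at 12 elsewhere in vp8 */
  int max_gf_interval = (int)(frame_rate / 2.0) + 2;
  if (max_gf_interval < 12)
    max_gf_interval = 12;
  printf("%d bits/frame, max gf interval %d\n",
         per_frame_bandwidth, max_gf_interval);   /* 20000, 17 */
  return 0;
}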

View File

@@ -232,7 +232,7 @@ enum
typedef struct
{
/* Layer configuration */
double framerate;
double frame_rate;
int target_bandwidth;
/* Layer specific coding parameters */
@@ -320,7 +320,6 @@ typedef struct VP8_COMP
YV12_BUFFER_CONFIG scaled_source;
YV12_BUFFER_CONFIG *last_frame_unscaled_source;
unsigned int frames_till_alt_ref_frame;
/* frame in src_buffers has been identified to be encoded as an alt ref */
int source_alt_ref_pending;
/* an alt ref frame has been encoded and is usable */
@@ -370,7 +369,6 @@ typedef struct VP8_COMP
double key_frame_rate_correction_factor;
double gf_rate_correction_factor;
unsigned int frames_since_golden;
/* Count down till next GF */
int frames_till_gf_update_due;
@@ -403,7 +401,7 @@ typedef struct VP8_COMP
/* Minimum allocation that should be used for any frame */
int min_frame_bandwidth;
int inter_frame_target;
double output_framerate;
double output_frame_rate;
int64_t last_time_stamp_seen;
int64_t last_end_time_stamp_seen;
int64_t first_time_stamp_ever;
@@ -417,8 +415,8 @@ typedef struct VP8_COMP
int buffered_mode;
double framerate;
double ref_framerate;
double frame_rate;
double ref_frame_rate;
int64_t buffer_level;
int64_t bits_off_target;

View File

@@ -234,7 +234,7 @@ void vp8_save_coding_context(VP8_COMP *cpi)
cc->frames_since_key = cpi->frames_since_key;
cc->filter_level = cpi->common.filter_level;
cc->frames_till_gf_update_due = cpi->frames_till_gf_update_due;
cc->frames_since_golden = cpi->frames_since_golden;
cc->frames_since_golden = cpi->common.frames_since_golden;
vp8_copy(cc->mvc, cpi->common.fc.mvc);
vp8_copy(cc->mvcosts, cpi->rd_costs.mvcosts);
@@ -271,7 +271,7 @@ void vp8_restore_coding_context(VP8_COMP *cpi)
cpi->frames_since_key = cc->frames_since_key;
cpi->common.filter_level = cc->filter_level;
cpi->frames_till_gf_update_due = cc->frames_till_gf_update_due;
cpi->frames_since_golden = cc->frames_since_golden;
cpi->common.frames_since_golden = cc->frames_since_golden;
vp8_copy(cpi->common.fc.mvc, cc->mvc);
@@ -388,7 +388,7 @@ static void calc_iframe_target_size(VP8_COMP *cpi)
int initial_boost = 32; /* |3.0 * per_frame_bandwidth| */
/* Boost depends somewhat on frame rate: only used for 1 layer case. */
if (cpi->oxcf.number_of_layers == 1) {
kf_boost = MAX(initial_boost, (int)(2 * cpi->output_framerate - 16));
kf_boost = MAX(initial_boost, (int)(2 * cpi->output_frame_rate - 16));
}
else {
/* Initial factor: set target size to: |3.0 * per_frame_bandwidth|. */
@@ -399,9 +399,9 @@ static void calc_iframe_target_size(VP8_COMP *cpi)
kf_boost = kf_boost * kf_boost_qadjustment[Q] / 100;
/* frame separation adjustment ( down) */
if (cpi->frames_since_key < cpi->output_framerate / 2)
if (cpi->frames_since_key < cpi->output_frame_rate / 2)
kf_boost = (int)(kf_boost
* cpi->frames_since_key / (cpi->output_framerate / 2));
* cpi->frames_since_key / (cpi->output_frame_rate / 2));
/* Minimal target size is |2* per_frame_bandwidth|. */
if (kf_boost < 16)
@@ -715,7 +715,7 @@ static void calc_pframe_target_size(VP8_COMP *cpi)
if (Adjustment > (cpi->this_frame_target - min_frame_target))
Adjustment = (cpi->this_frame_target - min_frame_target);
if (cpi->frames_since_golden == (cpi->current_gf_interval >> 1))
if (cpi->common.frames_since_golden == (cpi->current_gf_interval >> 1))
cpi->this_frame_target += ((cpi->current_gf_interval - 1) * Adjustment);
else
cpi->this_frame_target -= Adjustment;
@@ -1360,7 +1360,7 @@ static int estimate_keyframe_frequency(VP8_COMP *cpi)
* whichever is smaller.
*/
int key_freq = cpi->oxcf.key_freq>0 ? cpi->oxcf.key_freq : 1;
av_key_frame_frequency = 1 + (int)cpi->output_framerate * 2;
av_key_frame_frequency = 1 + (int)cpi->output_frame_rate * 2;
if (cpi->oxcf.auto_key && av_key_frame_frequency > key_freq)
av_key_frame_frequency = key_freq;
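
The keyframe boost logic above (single-layer case) is mostly small integer arithmetic. A simplified sketch, assuming the Q-dependent kf_boost_qadjustment step between the two branches is skipped:

#define MAX(a, b) ((a) > (b) ? (a) : (b))

static int iframe_boost_sketch(double output_frame_rate,
                               int frames_since_key) {
  const int initial_boost = 32;   /* ~ |3.0 * per_frame_bandwidth| */
  int kf_boost = MAX(initial_boost, (int)(2 * output_frame_rate - 16));
  /* frame separation adjustment (down) when the last keyframe is
   * less than half a second away */
  if (frames_since_key < output_frame_rate / 2)
    kf_boost = (int)(kf_boost * frames_since_key /
                     (output_frame_rate / 2));
  return MAX(kf_boost, 16);       /* minimal target: |2 * per_frame_bandwidth| */
}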

View File

@@ -341,7 +341,7 @@ void vp8_initialize_rd_consts(VP8_COMP *cpi, MACROBLOCK *x, int Qvalue)
void vp8_auto_select_speed(VP8_COMP *cpi)
{
int milliseconds_for_compress = (int)(1000000 / cpi->framerate);
int milliseconds_for_compress = (int)(1000000 / cpi->frame_rate);
milliseconds_for_compress = milliseconds_for_compress * (16 - cpi->oxcf.cpu_used) / 16;

View File

@@ -66,6 +66,7 @@ VP8_COMMON_SRCS-yes += common/setupintrarecon.c
VP8_COMMON_SRCS-yes += common/swapyv12buffer.c
VP8_COMMON_SRCS-yes += common/variance_c.c
VP8_COMMON_SRCS-yes += common/variance.h
VP8_COMMON_SRCS-yes += common/vp8_asm_com_offsets.c
VP8_COMMON_SRCS-yes += common/vp8_entropymodedata.h
@@ -191,4 +192,7 @@ VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance8x8_neon$(A
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance16x16_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance16x16s_neon$(ASM)
$(eval $(call asm_offsets_template,\
vp8_asm_com_offsets.asm, $(VP8_PREFIX)common/vp8_asm_com_offsets.c))
$(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.sh))

View File

@@ -695,6 +695,7 @@ static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img,
yv12->uv_stride = img->stride[VPX_PLANE_U];
yv12->border = (img->stride[VPX_PLANE_Y] - img->w) / 2;
yv12->clrtype = (img->fmt == VPX_IMG_FMT_VPXI420 || img->fmt == VPX_IMG_FMT_VPXYV12);
return res;
}
@@ -1078,7 +1079,11 @@ static vpx_image_t *vp8e_get_preview(vpx_codec_alg_priv_t *ctx)
ctx->preview_img.planes[VPX_PLANE_U] = sd.u_buffer;
ctx->preview_img.planes[VPX_PLANE_V] = sd.v_buffer;
ctx->preview_img.fmt = VPX_IMG_FMT_I420;
if (sd.clrtype == REG_YUV)
ctx->preview_img.fmt = VPX_IMG_FMT_I420;
else
ctx->preview_img.fmt = VPX_IMG_FMT_VPXI420;
ctx->preview_img.x_chroma_shift = 1;
ctx->preview_img.y_chroma_shift = 1;

View File

@@ -41,6 +41,15 @@ typedef enum
static unsigned long vp8_priv_sz(const vpx_codec_dec_cfg_t *si, vpx_codec_flags_t);
typedef struct
{
unsigned int id;
unsigned long sz;
unsigned int align;
unsigned int flags;
unsigned long(*calc_sz)(const vpx_codec_dec_cfg_t *, vpx_codec_flags_t);
} mem_req_t;
static const mem_req_t vp8_mem_req_segs[] =
{
{VP8_SEG_ALG_PRIV, 0, 8, VPX_CODEC_MEM_ZERO, vp8_priv_sz},
@@ -84,6 +93,65 @@ static unsigned long vp8_priv_sz(const vpx_codec_dec_cfg_t *si, vpx_codec_flags_
return sizeof(vpx_codec_alg_priv_t);
}
static void vp8_mmap_dtor(vpx_codec_mmap_t *mmap)
{
free(mmap->priv);
}
static vpx_codec_err_t vp8_mmap_alloc(vpx_codec_mmap_t *mmap)
{
vpx_codec_err_t res;
unsigned int align;
align = mmap->align ? mmap->align - 1 : 0;
if (mmap->flags & VPX_CODEC_MEM_ZERO)
mmap->priv = calloc(1, mmap->sz + align);
else
mmap->priv = malloc(mmap->sz + align);
res = (mmap->priv) ? VPX_CODEC_OK : VPX_CODEC_MEM_ERROR;
mmap->base = (void *)((((uintptr_t)mmap->priv) + align) & ~(uintptr_t)align);
mmap->dtor = vp8_mmap_dtor;
return res;
}
static vpx_codec_err_t vp8_validate_mmaps(const vp8_stream_info_t *si,
const vpx_codec_mmap_t *mmaps,
vpx_codec_flags_t init_flags)
{
int i;
vpx_codec_err_t res = VPX_CODEC_OK;
for (i = 0; i < NELEMENTS(vp8_mem_req_segs) - 1; i++)
{
/* Ensure the segment has been allocated */
if (!mmaps[i].base)
{
res = VPX_CODEC_MEM_ERROR;
break;
}
/* Verify variable size segment is big enough for the current si. */
if (vp8_mem_req_segs[i].calc_sz)
{
vpx_codec_dec_cfg_t cfg;
cfg.w = si->w;
cfg.h = si->h;
if (mmaps[i].sz < vp8_mem_req_segs[i].calc_sz(&cfg, init_flags))
{
res = VPX_CODEC_MEM_ERROR;
break;
}
}
}
return res;
}
static void vp8_init_ctx(vpx_codec_ctx_t *ctx, const vpx_codec_mmap_t *mmap)
{
int i;
@@ -110,6 +178,16 @@ static void vp8_init_ctx(vpx_codec_ctx_t *ctx, const vpx_codec_mmap_t *mmap)
}
}
static void *mmap_lkup(vpx_codec_alg_priv_t *ctx, unsigned int id)
{
int i;
for (i = 0; i < NELEMENTS(ctx->mmaps); i++)
if (ctx->mmaps[i].id == id)
return ctx->mmaps[i].base;
return NULL;
}
static void vp8_finalize_mmaps(vpx_codec_alg_priv_t *ctx)
{
/* nothing to clean up */
@@ -136,7 +214,7 @@ static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx,
mmap.align = vp8_mem_req_segs[0].align;
mmap.flags = vp8_mem_req_segs[0].flags;
res = vpx_mmap_alloc(&mmap);
res = vp8_mmap_alloc(&mmap);
if (res != VPX_CODEC_OK) return res;
vp8_init_ctx(ctx, &mmap);
@@ -288,7 +366,8 @@ static void yuvconfig2image(vpx_image_t *img,
* the Y, U, and V planes, nor other alignment adjustments that
* might be representable by a YV12_BUFFER_CONFIG, so we just
* initialize all the fields.*/
img->fmt = VPX_IMG_FMT_I420;
img->fmt = yv12->clrtype == REG_YUV ?
VPX_IMG_FMT_I420 : VPX_IMG_FMT_VPXI420;
img->w = yv12->y_stride;
img->h = (yv12->y_height + 2 * VP8BORDERINPIXELS + 15) & ~15;
img->d_w = yv12->y_width;
@@ -409,7 +488,7 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx,
ctx->mmaps[i].sz = vp8_mem_req_segs[i].calc_sz(&cfg,
ctx->base.init_flags);
res = vpx_mmap_alloc(&ctx->mmaps[i]);
res = vp8_mmap_alloc(&ctx->mmaps[i]);
}
if (!res)
@@ -421,9 +500,7 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx,
/* Initialize the decoder instance on the first frame*/
if (!res && !ctx->decoder_init)
{
res = vpx_validate_mmaps(&ctx->si, ctx->mmaps,
vp8_mem_req_segs, NELEMENTS(vp8_mem_req_segs),
ctx->base.init_flags);
res = vp8_validate_mmaps(&ctx->si, ctx->mmaps, ctx->base.init_flags);
if (!res)
{
@@ -720,6 +797,8 @@ static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img,
yv12->uv_stride = img->stride[VPX_PLANE_U];
yv12->border = (img->stride[VPX_PLANE_Y] - img->d_w) / 2;
yv12->clrtype = (img->fmt == VPX_IMG_FMT_VPXI420 || img->fmt == VPX_IMG_FMT_VPXYV12);
return res;
}
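
The vp8_mmap_alloc routine added above uses the standard over-allocate-and-round trick to honor an alignment without posix_memalign. A self-contained sketch, assuming align is a power of two (as the mem_req table guarantees):

#include <stdint.h>
#include <stdlib.h>

static void *mmap_alloc_sketch(size_t sz, unsigned int align,
                               void **raw_out) {
  const uintptr_t mask = align ? align - 1 : 0;  /* align must be 2^n */
  void *raw = malloc(sz + mask);                 /* slack for rounding up */
  if (raw == NULL)
    return NULL;
  *raw_out = raw;                                /* keep for free() later */
  return (void *)(((uintptr_t)raw + mask) & ~mask);
}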

View File

@@ -35,5 +35,9 @@ VP8_DX_SRCS-yes += decoder/onyxd_int.h
VP8_DX_SRCS-yes += decoder/treereader.h
VP8_DX_SRCS-yes += decoder/onyxd_if.c
VP8_DX_SRCS-$(CONFIG_MULTITHREAD) += decoder/threading.c
VP8_DX_SRCS-yes += decoder/vp8_asm_dec_offsets.c
VP8_DX_SRCS-yes := $(filter-out $(VP8_DX_SRCS_REMOVE-yes),$(VP8_DX_SRCS-yes))
$(eval $(call asm_offsets_template,\
vp8_asm_dec_offsets.asm, $(VP8_PREFIX)decoder/vp8_asm_dec_offsets.c))

View File

@@ -18,7 +18,6 @@
#include <stdlib.h>
#include <stdarg.h>
#include <string.h>
#include <time.h>
#define VPX_CODEC_DISABLE_COMPAT 1
#include "vpx/vpx_encoder.h"
#include "vpx/vp8cx.h"
@@ -138,8 +137,6 @@ int main(int argc, char **argv) {
int layer_flags[VPX_TS_MAX_PERIODICITY] = {0};
int flag_periodicity;
int max_intra_size_pct;
clock_t before;
clock_t after;
/* Check usage and arguments */
if (argc < 9)
@@ -642,7 +639,6 @@ int main(int argc, char **argv) {
vpx_codec_control(&codec, VP8E_SET_MAX_INTRA_BITRATE_PCT,
max_intra_size_pct);
before = clock();
frame_avail = 1;
while (frame_avail || got_data) {
vpx_codec_iter_t iter = NULL;
@@ -664,8 +660,8 @@ int main(int argc, char **argv) {
got_data = 1;
switch (pkt->kind) {
case VPX_CODEC_CX_FRAME_PKT:
for (i = cfg.ts_layer_id[frame_cnt % cfg.ts_periodicity];
i < cfg.ts_number_layers; i++)
for (i=cfg.ts_layer_id[frame_cnt % cfg.ts_periodicity];
i<cfg.ts_number_layers; i++)
{
write_ivf_frame_header(outfile[i], pkt);
(void) fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz,
@@ -680,13 +676,9 @@ int main(int argc, char **argv) {
frame_cnt++;
pts += frame_duration;
}
after = clock();
printf("Processed %d frames in %ld ms.\n", frame_cnt-1,
(int) (after - before) / (CLOCKS_PER_SEC / 1000));
fclose (infile);
printf ("Processed %d frames.\n",frame_cnt-1);
if (vpx_codec_destroy(&codec))
die_codec (&codec, "Failed to destroy codec");

View File

@@ -1,258 +0,0 @@
;
; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
; These functions are only valid when:
; x_step_q4 == 16
; w%4 == 0
; h%4 == 0
; taps == 8
; VP9_FILTER_WEIGHT == 128
; VP9_FILTER_SHIFT == 7
EXPORT |vp9_convolve8_avg_horiz_neon|
EXPORT |vp9_convolve8_avg_vert_neon|
IMPORT |vp9_convolve8_avg_horiz_c|
IMPORT |vp9_convolve8_avg_vert_c|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; Multiply and accumulate by q0
MACRO
MULTIPLY_BY_Q0 $dst, $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7
vmull.s16 $dst, $src0, d0[0]
vmlal.s16 $dst, $src1, d0[1]
vmlal.s16 $dst, $src2, d0[2]
vmlal.s16 $dst, $src3, d0[3]
vmlal.s16 $dst, $src4, d1[0]
vmlal.s16 $dst, $src5, d1[1]
vmlal.s16 $dst, $src6, d1[2]
vmlal.s16 $dst, $src7, d1[3]
MEND
; r0 const uint8_t *src
; r1 int src_stride
; r2 uint8_t *dst
; r3 int dst_stride
; sp[]const int16_t *filter_x
; sp[]int x_step_q4
; sp[]const int16_t *filter_y ; unused
; sp[]int y_step_q4 ; unused
; sp[]int w
; sp[]int h
|vp9_convolve8_avg_horiz_neon| PROC
ldr r12, [sp, #4] ; x_step_q4
cmp r12, #16
bne vp9_convolve8_avg_horiz_c
push {r4-r10, lr}
sub r0, r0, #3 ; adjust for taps
ldr r5, [sp, #32] ; filter_x
ldr r6, [sp, #48] ; w
ldr r7, [sp, #52] ; h
vld1.s16 {q0}, [r5] ; filter_x
add r8, r1, r1, lsl #1 ; src_stride * 3
add r8, r8, #4 ; src_stride * 3 + 4
rsb r8, r8, #0 ; reset for src
add r4, r3, r3, lsl #1 ; dst_stride * 3
sub r4, r4, #4 ; dst_stride * 3 - 4
rsb r4, r4, #0 ; reset for dst
sub r9, r1, #8 ; post increment for src load
rsb r1, r6, r1, lsl #2 ; reset src for outer loop
rsb r12, r6, r3, lsl #2 ; reset dst for outer loop
mov r10, r6 ; w loop counter
loop_horiz
vld1.8 {d24}, [r0]!
vld3.u8 {d28[0], d29[0], d30[0]}, [r0], r9
vld1.8 {d25}, [r0]!
vld3.u8 {d28[1], d29[1], d30[1]}, [r0], r9
vld1.8 {d26}, [r0]!
vld3.u8 {d28[2], d29[2], d30[2]}, [r0], r9
vld1.8 {d27}, [r0]!
vld3.u8 {d28[3], d29[3], d30[3]}, [r0], r8
vtrn.16 q12, q13
vtrn.8 d24, d25
vtrn.8 d26, d27
; extract to s16
vmovl.u8 q8, d24
vmovl.u8 q9, d25
vmovl.u8 q10, d26
vmovl.u8 q11, d27
vtrn.32 d28, d29 ; only the first half is populated
vmovl.u8 q12, d28
vmovl.u8 q13, d30
; slightly out of order load to match the existing data
vld1.u32 {d6[0]}, [r2], r3
vld1.u32 {d7[0]}, [r2], r3
vld1.u32 {d6[1]}, [r2], r3
vld1.u32 {d7[1]}, [r2], r3
sub r2, r2, r3, lsl #2 ; reset for store
; src[] * filter_x
MULTIPLY_BY_Q0 q1, d16, d18, d20, d22, d17, d19, d21, d23
MULTIPLY_BY_Q0 q2, d18, d20, d22, d17, d19, d21, d23, d24
MULTIPLY_BY_Q0 q14, d20, d22, d17, d19, d21, d23, d24, d25
MULTIPLY_BY_Q0 q15, d22, d17, d19, d21, d23, d24, d25, d26
; += 64 >> 7
vqrshrun.s32 d2, q1, #7
vqrshrun.s32 d3, q2, #7
vqrshrun.s32 d4, q14, #7
vqrshrun.s32 d5, q15, #7
; saturate
vqmovn.u16 d2, q1
vqmovn.u16 d3, q2
; transpose
vtrn.16 d2, d3
vtrn.32 d2, d3
vtrn.8 d2, d3
; average the new value and the dst value
vrhadd.u8 q1, q1, q3
vst1.u32 {d2[0]}, [r2], r3
vst1.u32 {d3[0]}, [r2], r3
vst1.u32 {d2[1]}, [r2], r3
vst1.u32 {d3[1]}, [r2], r4
subs r6, r6, #4 ; w -= 4
bgt loop_horiz
; outer loop
mov r6, r10 ; restore w counter
add r0, r0, r1 ; src += src_stride * 4 - w
add r2, r2, r12 ; dst += dst_stride * 4 - w
subs r7, r7, #4 ; h -= 4
bgt loop_horiz
pop {r4-r10, pc}
ENDP
|vp9_convolve8_avg_vert_neon| PROC
ldr r12, [sp, #12]
cmp r12, #16
bne vp9_convolve8_avg_vert_c
push {r4-r10, lr}
; adjust for taps
sub r0, r0, r1
sub r0, r0, r1, lsl #1
ldr r7, [sp, #40] ; filter_y
ldr r8, [sp, #48] ; w
ldr r9, [sp, #52] ; h
vld1.s16 {q0}, [r7] ; filter_y
mov r5, r1, lsl #1 ; src_stride * 2
add r5, r5, r1, lsl #3 ; src_stride * 10
sub r5, r5, #4 ; src_stride * 10 + 4
rsb r5, r5, #0 ; reset for src
add r6, r3, r3, lsl #1 ; dst_stride * 3
sub r6, r6, #4 ; dst_stride * 3 - 4
rsb r6, r6, #0 ; reset for dst
rsb r7, r8, r1, lsl #2 ; reset src for outer loop
rsb r12, r8, r3, lsl #2 ; reset dst for outer loop
mov r10, r8 ; w loop counter
loop_vert
; always process a 4x4 block at a time
vld1.u32 {d16[0]}, [r0], r1
vld1.u32 {d16[1]}, [r0], r1
vld1.u32 {d18[0]}, [r0], r1
vld1.u32 {d18[1]}, [r0], r1
vld1.u32 {d20[0]}, [r0], r1
vld1.u32 {d20[1]}, [r0], r1
vld1.u32 {d22[0]}, [r0], r1
vld1.u32 {d22[1]}, [r0], r1
vld1.u32 {d24[0]}, [r0], r1
vld1.u32 {d24[1]}, [r0], r1
vld1.u32 {d26[0]}, [r0], r5
; extract to s16
vmovl.u8 q8, d16
vmovl.u8 q9, d18
vmovl.u8 q10, d20
vmovl.u8 q11, d22
vmovl.u8 q12, d24
vmovl.u8 q13, d26
vld1.u32 {d6[0]}, [r2], r3
vld1.u32 {d6[1]}, [r2], r3
vld1.u32 {d7[0]}, [r2], r3
vld1.u32 {d7[1]}, [r2], r3
sub r2, r2, r3, lsl #2 ; reset for store
; src[] * filter_y
MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d23
MULTIPLY_BY_Q0 q2, d17, d18, d19, d20, d21, d22, d23, d24
MULTIPLY_BY_Q0 q14, d18, d19, d20, d21, d22, d23, d24, d25
MULTIPLY_BY_Q0 q15, d19, d20, d21, d22, d23, d24, d25, d26
; += 64 >> 7
vqrshrun.s32 d2, q1, #7
vqrshrun.s32 d3, q2, #7
vqrshrun.s32 d4, q14, #7
vqrshrun.s32 d5, q15, #7
; saturate
vqmovn.u16 d2, q1
vqmovn.u16 d3, q2
; average the new value and the dst value
vrhadd.u8 q1, q1, q3
vst1.u32 {d2[0]}, [r2], r3
vst1.u32 {d2[1]}, [r2], r3
vst1.u32 {d3[0]}, [r2], r3
vst1.u32 {d3[1]}, [r2], r6
subs r8, r8, #4 ; w -= 4
bgt loop_vert
; outer loop
mov r8, r10 ; restore w counter
add r0, r0, r7 ; src += 4 * src_stride - w
add r2, r2, r12 ; dst += 4 * dst_stride - w
subs r9, r9, #4 ; h -= 4
bgt loop_vert
pop {r4-r10, pc}
ENDP
END

View File

@@ -1,237 +0,0 @@
;
; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
; These functions are only valid when:
; x_step_q4 == 16
; w%4 == 0
; h%4 == 0
; taps == 8
; VP9_FILTER_WEIGHT == 128
; VP9_FILTER_SHIFT == 7
EXPORT |vp9_convolve8_horiz_neon|
EXPORT |vp9_convolve8_vert_neon|
IMPORT |vp9_convolve8_horiz_c|
IMPORT |vp9_convolve8_vert_c|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; Multiply and accumulate by q0
MACRO
MULTIPLY_BY_Q0 $dst, $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7
vmull.s16 $dst, $src0, d0[0]
vmlal.s16 $dst, $src1, d0[1]
vmlal.s16 $dst, $src2, d0[2]
vmlal.s16 $dst, $src3, d0[3]
vmlal.s16 $dst, $src4, d1[0]
vmlal.s16 $dst, $src5, d1[1]
vmlal.s16 $dst, $src6, d1[2]
vmlal.s16 $dst, $src7, d1[3]
MEND
; r0 const uint8_t *src
; r1 int src_stride
; r2 uint8_t *dst
; r3 int dst_stride
; sp[]const int16_t *filter_x
; sp[]int x_step_q4
; sp[]const int16_t *filter_y ; unused
; sp[]int y_step_q4 ; unused
; sp[]int w
; sp[]int h
|vp9_convolve8_horiz_neon| PROC
ldr r12, [sp, #4] ; x_step_q4
cmp r12, #16
bne vp9_convolve8_horiz_c
push {r4-r10, lr}
sub r0, r0, #3 ; adjust for taps
ldr r5, [sp, #32] ; filter_x
ldr r6, [sp, #48] ; w
ldr r7, [sp, #52] ; h
vld1.s16 {q0}, [r5] ; filter_x
add r8, r1, r1, lsl #1 ; src_stride * 3
add r8, r8, #4 ; src_stride * 3 + 4
rsb r8, r8, #0 ; reset for src
add r4, r3, r3, lsl #1 ; dst_stride * 3
sub r4, r4, #4 ; dst_stride * 3 - 4
rsb r4, r4, #0 ; reset for dst
sub r9, r1, #8 ; post increment for src load
rsb r1, r6, r1, lsl #2 ; reset src for outer loop
rsb r12, r6, r3, lsl #2 ; reset dst for outer loop
mov r10, r6 ; w loop counter
loop_horiz
vld1.8 {d24}, [r0]!
vld3.u8 {d28[0], d29[0], d30[0]}, [r0], r9
vld1.8 {d25}, [r0]!
vld3.u8 {d28[1], d29[1], d30[1]}, [r0], r9
vld1.8 {d26}, [r0]!
vld3.u8 {d28[2], d29[2], d30[2]}, [r0], r9
vld1.8 {d27}, [r0]!
vld3.u8 {d28[3], d29[3], d30[3]}, [r0], r8
vtrn.16 q12, q13
vtrn.8 d24, d25
vtrn.8 d26, d27
; extract to s16
vmovl.u8 q8, d24
vmovl.u8 q9, d25
vmovl.u8 q10, d26
vmovl.u8 q11, d27
vtrn.32 d28, d29 ; only the first half is populated
vmovl.u8 q12, d28
vmovl.u8 q13, d30
; src[] * filter_x
MULTIPLY_BY_Q0 q1, d16, d18, d20, d22, d17, d19, d21, d23
MULTIPLY_BY_Q0 q2, d18, d20, d22, d17, d19, d21, d23, d24
MULTIPLY_BY_Q0 q14, d20, d22, d17, d19, d21, d23, d24, d25
MULTIPLY_BY_Q0 q15, d22, d17, d19, d21, d23, d24, d25, d26
; += 64 >> 7
vqrshrun.s32 d2, q1, #7
vqrshrun.s32 d3, q2, #7
vqrshrun.s32 d4, q14, #7
vqrshrun.s32 d5, q15, #7
; saturate
vqmovn.u16 d2, q1
vqmovn.u16 d3, q2
; transpose
vtrn.16 d2, d3
vtrn.32 d2, d3
vtrn.8 d2, d3
vst1.u32 {d2[0]}, [r2], r3
vst1.u32 {d3[0]}, [r2], r3
vst1.u32 {d2[1]}, [r2], r3
vst1.u32 {d3[1]}, [r2], r4
subs r6, r6, #4 ; w -= 4
bgt loop_horiz
; outer loop
mov r6, r10 ; restore w counter
add r0, r0, r1 ; src += src_stride * 4 - w
add r2, r2, r12 ; dst += dst_stride * 4 - w
subs r7, r7, #4 ; h -= 4
bgt loop_horiz
pop {r4-r10, pc}
ENDP
|vp9_convolve8_vert_neon| PROC
ldr r12, [sp, #12]
cmp r12, #16
bne vp9_convolve8_vert_c
push {r4-r10, lr}
; adjust for taps
sub r0, r0, r1
sub r0, r0, r1, lsl #1
ldr r7, [sp, #40] ; filter_y
ldr r8, [sp, #48] ; w
ldr r9, [sp, #52] ; h
vld1.s16 {q0}, [r7] ; filter_y
mov r5, r1, lsl #1 ; src_stride * 2
add r5, r5, r1, lsl #3 ; src_stride * 10
sub r5, r5, #4 ; src_stride * 10 + 4
rsb r5, r5, #0 ; reset for src
add r6, r3, r3, lsl #1 ; dst_stride * 3
sub r6, r6, #4 ; dst_stride * 3 - 4
rsb r6, r6, #0 ; reset for dst
rsb r7, r8, r1, lsl #2 ; reset src for outer loop
rsb r12, r8, r3, lsl #2 ; reset dst for outer loop
mov r10, r8 ; w loop counter
loop_vert
; always process a 4x4 block at a time
vld1.u32 {d16[0]}, [r0], r1
vld1.u32 {d16[1]}, [r0], r1
vld1.u32 {d18[0]}, [r0], r1
vld1.u32 {d18[1]}, [r0], r1
vld1.u32 {d20[0]}, [r0], r1
vld1.u32 {d20[1]}, [r0], r1
vld1.u32 {d22[0]}, [r0], r1
vld1.u32 {d22[1]}, [r0], r1
vld1.u32 {d24[0]}, [r0], r1
vld1.u32 {d24[1]}, [r0], r1
vld1.u32 {d26[0]}, [r0], r5
; extract to s16
vmovl.u8 q8, d16
vmovl.u8 q9, d18
vmovl.u8 q10, d20
vmovl.u8 q11, d22
vmovl.u8 q12, d24
vmovl.u8 q13, d26
; src[] * filter_y
MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d23
MULTIPLY_BY_Q0 q2, d17, d18, d19, d20, d21, d22, d23, d24
MULTIPLY_BY_Q0 q14, d18, d19, d20, d21, d22, d23, d24, d25
MULTIPLY_BY_Q0 q15, d19, d20, d21, d22, d23, d24, d25, d26
; += 64 >> 7
vqrshrun.s32 d2, q1, #7
vqrshrun.s32 d3, q2, #7
vqrshrun.s32 d4, q14, #7
vqrshrun.s32 d5, q15, #7
; saturate
vqmovn.u16 d2, q1
vqmovn.u16 d3, q2
vst1.u32 {d2[0]}, [r2], r3
vst1.u32 {d2[1]}, [r2], r3
vst1.u32 {d3[0]}, [r2], r3
vst1.u32 {d3[1]}, [r2], r6
subs r8, r8, #4 ; w -= 4
bgt loop_vert
; outer loop
mov r8, r10 ; restore w counter
add r0, r0, r7 ; src += 4 * src_stride - w
add r2, r2, r12 ; dst += 4 * dst_stride - w
subs r9, r9, #4 ; h -= 4
bgt loop_vert
pop {r4-r10, pc}
ENDP
END

View File

@@ -1,77 +0,0 @@
/*
* Copyright (c) 2013 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_common.h"
void vp9_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
/* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the
* maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4).
*/
uint8_t temp[64 * 72];
// Account for the vertical phase needing 3 lines prior and 4 lines post
int intermediate_height = h + 7;
if (x_step_q4 != 16 || y_step_q4 != 16)
return vp9_convolve8_c(src, src_stride,
dst, dst_stride,
filter_x, x_step_q4,
filter_y, y_step_q4,
w, h);
/* Filter starting 3 lines back. The neon implementation will ignore the
* given height and filter a multiple of 4 lines. Since this goes in to
* the temp buffer which has lots of extra room and is subsequently discarded
* this is safe if somewhat less than ideal.
*/
vp9_convolve8_horiz_neon(src - src_stride * 3, src_stride,
temp, 64,
filter_x, x_step_q4, filter_y, y_step_q4,
w, intermediate_height);
/* Step into the temp buffer 3 lines to get the actual frame data */
vp9_convolve8_vert_neon(temp + 64 * 3, 64,
dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
}
void vp9_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
uint8_t temp[64 * 72];
int intermediate_height = h + 7;
if (x_step_q4 != 16 || y_step_q4 != 16)
return vp9_convolve8_avg_c(src, src_stride,
dst, dst_stride,
filter_x, x_step_q4,
filter_y, y_step_q4,
w, h);
/* This implementation has the same issues as above. In addition, we only want
* to average the values after both passes.
*/
vp9_convolve8_horiz_neon(src - src_stride * 3, src_stride,
temp, 64,
filter_x, x_step_q4, filter_y, y_step_q4,
w, intermediate_height);
vp9_convolve8_avg_vert_neon(temp + 64 * 3,
64, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
}
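
The wrappers above are the standard separable two-pass scheme: with 8 taps the vertical stage needs 3 rows above and 4 rows below each output row, hence the h + 7 intermediate rows. A scalar sketch under the same constraints (w <= 64, h <= 64, fixed phase, 7-bit filter weights); the names and clipping helper are illustrative:

#include <stdint.h>

#define TAPS 8
#define TEMP_STRIDE 64

static uint8_t clip8(int v) { return v < 0 ? 0 : (v > 255 ? 255 : (uint8_t)v); }

static void convolve8_two_pass_sketch(const uint8_t *src, int src_stride,
                                      uint8_t *dst, int dst_stride,
                                      const int16_t *fx, const int16_t *fy,
                                      int w, int h) {
  uint8_t temp[64 * 72];              /* w <= 64 wide, h + 7 <= 71 rows */
  const int ih = h + 7;               /* 3 lead-in rows, 4 tail rows */
  const uint8_t *s = src - 3 * src_stride - 3;  /* back up 3 rows and cols */
  int x, y, k;
  for (y = 0; y < ih; ++y)            /* pass 1: horizontal into temp */
    for (x = 0; x < w; ++x) {
      int sum = 0;
      for (k = 0; k < TAPS; ++k)
        sum += s[y * src_stride + x + k] * fx[k];
      temp[y * TEMP_STRIDE + x] = clip8((sum + 64) >> 7);
    }
  for (y = 0; y < h; ++y)             /* pass 2: vertical, skip 3 lead-in rows */
    for (x = 0; x < w; ++x) {
      int sum = 0;
      for (k = 0; k < TAPS; ++k)
        sum += temp[(y + k) * TEMP_STRIDE + x] * fy[k];
      dst[y * dst_stride + x] = clip8((sum + 64) >> 7);
    }
}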

View File

@@ -1,69 +0,0 @@
;
; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license and patent
; grant that can be found in the LICENSE file in the root of the source
; tree. All contributing project authors may be found in the AUTHORS
; file in the root of the source tree.
;
EXPORT |vp9_dc_only_idct_add_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;void vp9_dc_only_idct_add_neon(int input_dc, uint8_t *pred_ptr,
; uint8_t *dst_ptr, int pitch, int stride)
;
; r0 int input_dc
; r1 uint8_t *pred_ptr
; r2 uint8_t *dst_ptr
; r3 int pitch
; sp int stride
|vp9_dc_only_idct_add_neon| PROC
; generate cospi_16_64 = 11585
mov r12, #0x2d00
add r12, #0x41
; dct_const_round_shift(input_dc * cospi_16_64)
mul r0, r0, r12 ; input_dc * cospi_16_64
add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
asr r0, r0, #14 ; >> DCT_CONST_BITS
; dct_const_round_shift(out * cospi_16_64)
mul r0, r0, r12 ; out * cospi_16_64
add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
asr r0, r0, #14 ; >> DCT_CONST_BITS
; ROUND_POWER_OF_TWO(out, 4)
add r0, r0, #8 ; + (1 <<((4) - 1))
asr r0, r0, #4 ; >> 4
vdup.16 q0, r0; ; duplicate a1
ldr r12, [sp] ; load stride
vld1.32 {d2[0]}, [r1], r3
vld1.32 {d2[1]}, [r1], r3
vld1.32 {d4[0]}, [r1], r3
vld1.32 {d4[1]}, [r1]
vaddw.u8 q1, q0, d2 ; a1 + pred_ptr[c]
vaddw.u8 q2, q0, d4
vqmovun.s16 d2, q1 ; clip_pixel
vqmovun.s16 d4, q2
vst1.32 {d2[0]}, [r2], r12
vst1.32 {d2[1]}, [r2], r12
vst1.32 {d4[0]}, [r2], r12
vst1.32 {d4[1]}, [r2]
bx lr
ENDP ; |vp9_dc_only_idct_add_neon|
END
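
The deleted NEON routine above has a compact scalar equivalent: the DC coefficient passes through cospi_16_64 (11585 ~= round(2^14 * cos(pi/4))) with rounding twice, is scaled down to the 4x4 output range, then added to every predictor pixel with clamping. A hedged C reference:

#include <stdint.h>

#define DCT_CONST_BITS 14
#define ROUND_POWER_OF_TWO(v, n) (((v) + (1 << ((n) - 1))) >> (n))

static uint8_t clip_pixel(int v) {
  return v < 0 ? 0 : (v > 255 ? 255 : (uint8_t)v);
}

static void dc_only_idct_add_sketch(int input_dc, const uint8_t *pred,
                                    int pitch, uint8_t *dst, int stride) {
  const int cospi_16_64 = 11585;   /* 0x2d41, built in the asm as 0x2d00 + 0x41 */
  int out, a1, r, c;
  out = ROUND_POWER_OF_TWO(input_dc * cospi_16_64, DCT_CONST_BITS);
  out = ROUND_POWER_OF_TWO(out * cospi_16_64, DCT_CONST_BITS);
  a1 = ROUND_POWER_OF_TWO(out, 4); /* final inverse-transform scaling */
  for (r = 0; r < 4; ++r)
    for (c = 0; c < 4; ++c)
      dst[r * stride + c] = clip_pixel(pred[r * pitch + c] + a1);
}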

View File

@@ -1,708 +0,0 @@
;
; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp9_loop_filter_horizontal_edge_neon|
EXPORT |vp9_loop_filter_vertical_edge_neon|
EXPORT |vp9_mbloop_filter_horizontal_edge_neon|
EXPORT |vp9_mbloop_filter_vertical_edge_neon|
ARM
AREA ||.text||, CODE, READONLY, ALIGN=2
; Currently vp9 only works on iterations 8 at a time. The vp8 loop filter
; works on 16 iterations at a time.
; TODO(fgalligan): See about removing the count code as this function is only
; called with a count of 1.
;
; void vp9_loop_filter_horizontal_edge_neon(uint8_t *s,
; int p /* pitch */,
; const uint8_t *blimit,
; const uint8_t *limit,
; const uint8_t *thresh,
; int count)
;
; r0 uint8_t *s,
; r1 int p, /* pitch */
; r2 const uint8_t *blimit,
; r3 const uint8_t *limit,
; sp const uint8_t *thresh,
; sp+4 int count
|vp9_loop_filter_horizontal_edge_neon| PROC
push {lr}
vld1.8 {d0[]}, [r2] ; duplicate *blimit
ldr r12, [sp, #8] ; load count
ldr r2, [sp, #4] ; load thresh
add r1, r1, r1 ; double pitch
cmp r12, #0
beq end_vp9_lf_h_edge
vld1.8 {d1[]}, [r3] ; duplicate *limit
vld1.8 {d2[]}, [r2] ; duplicate *thresh
count_lf_h_loop
sub r2, r0, r1, lsl #1 ; move src pointer down by 4 lines
add r3, r2, r1, lsr #1 ; set to 3 lines down
vld1.u8 {d3}, [r2@64], r1 ; p3
vld1.u8 {d4}, [r3@64], r1 ; p2
vld1.u8 {d5}, [r2@64], r1 ; p1
vld1.u8 {d6}, [r3@64], r1 ; p0
vld1.u8 {d7}, [r2@64], r1 ; q0
vld1.u8 {d16}, [r3@64], r1 ; q1
vld1.u8 {d17}, [r2@64] ; q2
vld1.u8 {d18}, [r3@64] ; q3
sub r2, r2, r1, lsl #1
sub r3, r3, r1, lsl #1
bl vp9_loop_filter_neon
vst1.u8 {d4}, [r2@64], r1 ; store op1
vst1.u8 {d5}, [r3@64], r1 ; store op0
vst1.u8 {d6}, [r2@64], r1 ; store oq0
vst1.u8 {d7}, [r3@64], r1 ; store oq1
add r0, r0, #8
subs r12, r12, #1
bne count_lf_h_loop
end_vp9_lf_h_edge
pop {pc}
ENDP ; |vp9_loop_filter_horizontal_edge_neon|
; Currently vp9 only works on iterations 8 at a time. The vp8 loop filter
; works on 16 iterations at a time.
; TODO(fgalligan): See about removing the count code as this function is only
; called with a count of 1.
;
; void vp9_loop_filter_vertical_edge_neon(uint8_t *s,
; int p /* pitch */,
; const uint8_t *blimit,
; const uint8_t *limit,
; const uint8_t *thresh,
; int count)
;
; r0 uint8_t *s,
; r1 int p, /* pitch */
; r2 const uint8_t *blimit,
; r3 const uint8_t *limit,
; sp const uint8_t *thresh,
; sp+4 int count
|vp9_loop_filter_vertical_edge_neon| PROC
push {lr}
vld1.8 {d0[]}, [r2] ; duplicate *blimit
ldr r12, [sp, #8] ; load count
vld1.8 {d1[]}, [r3] ; duplicate *limit
ldr r3, [sp, #4] ; load thresh
sub r2, r0, #4 ; move s pointer down by 4 columns
cmp r12, #0
beq end_vp9_lf_v_edge
vld1.8 {d2[]}, [r3] ; duplicate *thresh
count_lf_v_loop
vld1.u8 {d3}, [r2], r1 ; load s data
vld1.u8 {d4}, [r2], r1
vld1.u8 {d5}, [r2], r1
vld1.u8 {d6}, [r2], r1
vld1.u8 {d7}, [r2], r1
vld1.u8 {d16}, [r2], r1
vld1.u8 {d17}, [r2], r1
vld1.u8 {d18}, [r2]
; transpose to 8x8 matrix
vtrn.32 d3, d7
vtrn.32 d4, d16
vtrn.32 d5, d17
vtrn.32 d6, d18
vtrn.16 d3, d5
vtrn.16 d4, d6
vtrn.16 d7, d17
vtrn.16 d16, d18
vtrn.8 d3, d4
vtrn.8 d5, d6
vtrn.8 d7, d16
vtrn.8 d17, d18
bl vp9_loop_filter_neon
sub r0, r0, #2
;store op1, op0, oq0, oq1
vst4.8 {d4[0], d5[0], d6[0], d7[0]}, [r0], r1
vst4.8 {d4[1], d5[1], d6[1], d7[1]}, [r0], r1
vst4.8 {d4[2], d5[2], d6[2], d7[2]}, [r0], r1
vst4.8 {d4[3], d5[3], d6[3], d7[3]}, [r0], r1
vst4.8 {d4[4], d5[4], d6[4], d7[4]}, [r0], r1
vst4.8 {d4[5], d5[5], d6[5], d7[5]}, [r0], r1
vst4.8 {d4[6], d5[6], d6[6], d7[6]}, [r0], r1
vst4.8 {d4[7], d5[7], d6[7], d7[7]}, [r0]
add r0, r0, r1, lsl #3 ; s += pitch * 8
subs r12, r12, #1
subne r2, r0, #4 ; move s pointer down by 4 columns
bne count_lf_v_loop
end_vp9_lf_v_edge
pop {pc}
ENDP ; |vp9_loop_filter_vertical_edge_neon|
; void vp9_loop_filter_neon();
; This is a helper function for the loopfilters. The individual functions do the
; necessary load, transpose (if necessary) and store. The function does not use
; registers d8-d15.
;
; Inputs:
; r0-r3, r12 PRESERVE
; d0 blimit
; d1 limit
; d2 thresh
; d3 p3
; d4 p2
; d5 p1
; d6 p0
; d7 q0
; d16 q1
; d17 q2
; d18 q3
;
; Outputs:
; d4 op1
; d5 op0
; d6 oq0
; d7 oq1
|vp9_loop_filter_neon| PROC
; filter_mask
vabd.u8 d19, d3, d4 ; m1 = abs(p3 - p2)
vabd.u8 d20, d4, d5 ; m2 = abs(p2 - p1)
vabd.u8 d21, d5, d6 ; m3 = abs(p1 - p0)
vabd.u8 d22, d16, d7 ; m4 = abs(q1 - q0)
vabd.u8 d3, d17, d16 ; m5 = abs(q2 - q1)
vabd.u8 d4, d18, d17 ; m6 = abs(q3 - q2)
; only compare the largest value to limit
vmax.u8 d19, d19, d20 ; m1 = max(m1, m2)
vmax.u8 d20, d21, d22 ; m2 = max(m3, m4)
vabd.u8 d17, d6, d7 ; abs(p0 - q0)
vmax.u8 d3, d3, d4 ; m3 = max(m5, m6)
vmov.u8 d18, #0x80
vmax.u8 d23, d19, d20 ; m1 = max(m1, m2)
; hevmask
vcgt.u8 d21, d21, d2 ; (abs(p1 - p0) > thresh)*-1
vcgt.u8 d22, d22, d2 ; (abs(q1 - q0) > thresh)*-1
vmax.u8 d23, d23, d3 ; m1 = max(m1, m3)
vabd.u8 d28, d5, d16 ; a = abs(p1 - q1)
vqadd.u8 d17, d17, d17 ; b = abs(p0 - q0) * 2
veor d7, d7, d18 ; qs0
vcge.u8 d23, d1, d23 ; (m1 <= limit) * -1
; filter() function
; convert to signed
vshr.u8 d28, d28, #1 ; a = a / 2
veor d6, d6, d18 ; ps0
veor d5, d5, d18 ; ps1
vqadd.u8 d17, d17, d28 ; a = b + a
veor d16, d16, d18 ; qs1
vmov.u8 d19, #3
vsub.s8 d28, d7, d6 ; ( qs0 - ps0)
vcge.u8 d17, d0, d17 ; (a <= blimit) * -1
vqsub.s8 d27, d5, d16 ; filter = clamp(ps1-qs1)
vorr d22, d21, d22 ; hevmask
vmull.s8 q12, d28, d19 ; 3 * ( qs0 - ps0)
vand d27, d27, d22 ; filter &= hev
vand d23, d23, d17 ; filter_mask
vaddw.s8 q12, q12, d27 ; filter + 3 * (qs0 - ps0)
vmov.u8 d17, #4
; filter = clamp(filter + 3 * ( qs0 - ps0))
vqmovn.s16 d27, q12
vand d27, d27, d23 ; filter &= mask
vqadd.s8 d28, d27, d19 ; filter2 = clamp(filter+3)
vqadd.s8 d27, d27, d17 ; filter1 = clamp(filter+4)
vshr.s8 d28, d28, #3 ; filter2 >>= 3
vshr.s8 d27, d27, #3 ; filter1 >>= 3
vqadd.s8 d19, d6, d28 ; u = clamp(ps0 + filter2)
vqsub.s8 d26, d7, d27 ; u = clamp(qs0 - filter1)
; outer tap adjustments
vrshr.s8 d27, d27, #1 ; filter = ++filter1 >> 1
veor d6, d26, d18 ; *oq0 = u^0x80
vbic d27, d27, d22 ; filter &= ~hev
vqadd.s8 d21, d5, d27 ; u = clamp(ps1 + filter)
vqsub.s8 d20, d16, d27 ; u = clamp(qs1 - filter)
veor d5, d19, d18 ; *op0 = u^0x80
veor d4, d21, d18 ; *op1 = u^0x80
veor d7, d20, d18 ; *oq1 = u^0x80
bx lr
ENDP ; |vp9_loop_filter_neon|
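The comments above correspond line-for-line to VP9's scalar 4-tap filter. A minimal per-pixel C sketch of the same math, assuming descriptive names (the NEON version evaluates eight such lanes at once, with mask and hev arriving as 0/-1 compare masks):

#include <stdint.h>

static int8_t signed_char_clamp(int t) {
  return (int8_t)(t < -128 ? -128 : (t > 127 ? 127 : t));
}

/* 4-tap loop filter for one pixel position across the edge. op1/op0 are the
 * two pixels on one side of the edge, oq0/oq1 the two on the other side. */
static void filter4(int8_t mask, int8_t hev,
                    uint8_t *op1, uint8_t *op0, uint8_t *oq0, uint8_t *oq1) {
  /* flip the sign bit to work in signed space (the veor with 0x80) */
  int8_t ps1 = (int8_t)(*op1 ^ 0x80), ps0 = (int8_t)(*op0 ^ 0x80);
  int8_t qs0 = (int8_t)(*oq0 ^ 0x80), qs1 = (int8_t)(*oq1 ^ 0x80);
  int8_t filter, filter1, filter2;

  filter = (int8_t)(signed_char_clamp(ps1 - qs1) & hev);
  filter = (int8_t)(signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask);

  filter1 = (int8_t)(signed_char_clamp(filter + 4) >> 3);
  filter2 = (int8_t)(signed_char_clamp(filter + 3) >> 3);

  *oq0 = (uint8_t)(signed_char_clamp(qs0 - filter1) ^ 0x80);
  *op0 = (uint8_t)(signed_char_clamp(ps0 + filter2) ^ 0x80);

  /* outer taps take half of filter1, rounded, only where hev is off */
  filter = (int8_t)(((filter1 + 1) >> 1) & ~hev);
  *op1 = (uint8_t)(signed_char_clamp(ps1 + filter) ^ 0x80);
  *oq1 = (uint8_t)(signed_char_clamp(qs1 - filter) ^ 0x80);
}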
; void vp9_mbloop_filter_horizontal_edge_neon(uint8_t *s, int p,
; const uint8_t *blimit,
; const uint8_t *limit,
; const uint8_t *thresh,
; int count)
; r0 uint8_t *s,
; r1 int p, /* pitch */
; r2 const uint8_t *blimit,
; r3 const uint8_t *limit,
; sp const uint8_t *thresh,
; sp+4 int count
|vp9_mbloop_filter_horizontal_edge_neon| PROC
push {r4-r5, lr}
vld1.8 {d0[]}, [r2] ; duplicate *blimit
ldr r12, [sp, #16] ; load count
ldr r2, [sp, #12] ; load thresh
add r1, r1, r1 ; double pitch
cmp r12, #0
beq end_vp9_mblf_h_edge
vld1.8 {d1[]}, [r3] ; duplicate *limit
vld1.8 {d2[]}, [r2] ; duplicate *thresh
count_mblf_h_loop
sub r3, r0, r1, lsl #1 ; move src pointer down by 4 lines
add r2, r3, r1, lsr #1 ; set to 3 lines down
vld1.u8 {d3}, [r3@64], r1 ; p3
vld1.u8 {d4}, [r2@64], r1 ; p2
vld1.u8 {d5}, [r3@64], r1 ; p1
vld1.u8 {d6}, [r2@64], r1 ; p0
vld1.u8 {d7}, [r3@64], r1 ; q0
vld1.u8 {d16}, [r2@64], r1 ; q1
vld1.u8 {d17}, [r3@64] ; q2
vld1.u8 {d18}, [r2@64], r1 ; q3
sub r3, r3, r1, lsl #1
sub r2, r2, r1, lsl #2
bl vp9_mbloop_filter_neon
vst1.u8 {d0}, [r2@64], r1 ; store op2
vst1.u8 {d1}, [r3@64], r1 ; store op1
vst1.u8 {d2}, [r2@64], r1 ; store op0
vst1.u8 {d3}, [r3@64], r1 ; store oq0
vst1.u8 {d4}, [r2@64], r1 ; store oq1
vst1.u8 {d5}, [r3@64], r1 ; store oq2
add r0, r0, #8
subs r12, r12, #1
bne count_mblf_h_loop
end_vp9_mblf_h_edge
pop {r4-r5, pc}
ENDP ; |vp9_mbloop_filter_horizontal_edge_neon|
; void vp9_mbloop_filter_vertical_edge_neon(uint8_t *s,
; int pitch,
; const uint8_t *blimit,
; const uint8_t *limit,
; const uint8_t *thresh,
; int count)
;
; r0 uint8_t *s,
; r1 int pitch,
; r2 const uint8_t *blimit,
; r3 const uint8_t *limit,
; sp const uint8_t *thresh,
; sp+4 int count
|vp9_mbloop_filter_vertical_edge_neon| PROC
push {r4-r5, lr}
vld1.8 {d0[]}, [r2] ; duplicate *blimit
ldr r12, [sp, #16] ; load count
vld1.8 {d1[]}, [r3] ; duplicate *limit
ldr r3, [sp, #12] ; load thresh
sub r2, r0, #4 ; move s pointer down by 4 columns
cmp r12, #0
beq end_vp9_mblf_v_edge
vld1.8 {d2[]}, [r3] ; duplicate *thresh
count_mblf_v_loop
vld1.u8 {d3}, [r2], r1 ; load s data
vld1.u8 {d4}, [r2], r1
vld1.u8 {d5}, [r2], r1
vld1.u8 {d6}, [r2], r1
vld1.u8 {d7}, [r2], r1
vld1.u8 {d16}, [r2], r1
vld1.u8 {d17}, [r2], r1
vld1.u8 {d18}, [r2]
; transpose to 8x8 matrix
vtrn.32 d3, d7
vtrn.32 d4, d16
vtrn.32 d5, d17
vtrn.32 d6, d18
vtrn.16 d3, d5
vtrn.16 d4, d6
vtrn.16 d7, d17
vtrn.16 d16, d18
vtrn.8 d3, d4
vtrn.8 d5, d6
vtrn.8 d7, d16
vtrn.8 d17, d18
sub r2, r0, #3
add r3, r0, #1
bl vp9_mbloop_filter_neon
;store op2, op1, op0, oq0
vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r2], r1
vst4.8 {d0[1], d1[1], d2[1], d3[1]}, [r2], r1
vst4.8 {d0[2], d1[2], d2[2], d3[2]}, [r2], r1
vst4.8 {d0[3], d1[3], d2[3], d3[3]}, [r2], r1
vst4.8 {d0[4], d1[4], d2[4], d3[4]}, [r2], r1
vst4.8 {d0[5], d1[5], d2[5], d3[5]}, [r2], r1
vst4.8 {d0[6], d1[6], d2[6], d3[6]}, [r2], r1
vst4.8 {d0[7], d1[7], d2[7], d3[7]}, [r2]
;store oq1, oq2
vst2.8 {d4[0], d5[0]}, [r3], r1
vst2.8 {d4[1], d5[1]}, [r3], r1
vst2.8 {d4[2], d5[2]}, [r3], r1
vst2.8 {d4[3], d5[3]}, [r3], r1
vst2.8 {d4[4], d5[4]}, [r3], r1
vst2.8 {d4[5], d5[5]}, [r3], r1
vst2.8 {d4[6], d5[6]}, [r3], r1
vst2.8 {d4[7], d5[7]}, [r3]
add r0, r0, r1, lsl #3 ; s += pitch * 8
subs r12, r12, #1
subne r2, r0, #4 ; move s pointer down by 4 columns
bne count_mblf_v_loop
end_vp9_mblf_v_edge
pop {r4-r5, pc}
ENDP ; |vp9_mbloop_filter_vertical_edge_neon|
; void vp9_mbloop_filter_neon();
; This is a helper function for the loopfilters. The individual functions do the
; necessary load, transpose (if necessary) and store. The function does not use
; registers d8-d15.
;
; Inputs:
; r0-r3, r12 PRESERVE
; d0 blimit
; d1 limit
; d2 thresh
; d3 p3
; d4 p2
; d5 p1
; d6 p0
; d7 q0
; d16 q1
; d17 q2
; d18 q3
;
; Outputs:
; d0 op2
; d1 op1
; d2 op0
; d3 oq0
; d4 oq1
; d5 oq2
|vp9_mbloop_filter_neon| PROC
; filter_mask
vabd.u8 d19, d3, d4 ; m1 = abs(p3 - p2)
vabd.u8 d20, d4, d5 ; m2 = abs(p2 - p1)
vabd.u8 d21, d5, d6 ; m3 = abs(p1 - p0)
vabd.u8 d22, d16, d7 ; m4 = abs(q1 - q0)
vabd.u8 d23, d17, d16 ; m5 = abs(q2 - q1)
vabd.u8 d24, d18, d17 ; m6 = abs(q3 - q2)
; only compare the largest value to limit
vmax.u8 d19, d19, d20 ; m1 = max(m1, m2)
vmax.u8 d20, d21, d22 ; m2 = max(m3, m4)
vabd.u8 d25, d6, d4 ; m7 = abs(p0 - p2)
vmax.u8 d23, d23, d24 ; m3 = max(m5, m6)
vabd.u8 d26, d7, d17 ; m8 = abs(q0 - q2)
vmax.u8 d19, d19, d20
vabd.u8 d24, d6, d7 ; m9 = abs(p0 - q0)
vabd.u8 d27, d3, d6 ; m10 = abs(p3 - p0)
vabd.u8 d28, d18, d7 ; m11 = abs(q3 - q0)
vmax.u8 d19, d19, d23
vabd.u8 d23, d5, d16 ; a = abs(p1 - q1)
vqadd.u8 d24, d24, d24 ; b = abs(p0 - q0) * 2
; mask: largest abs difference <= limit
vcge.u8 d19, d1, d19
; only compare the largest value to thresh
vmax.u8 d25, d25, d26 ; m4 = max(m7, m8)
vmax.u8 d26, d27, d28 ; m5 = max(m10, m11)
vshr.u8 d23, d23, #1 ; a = a / 2
vmax.u8 d25, d25, d26 ; m4 = max(m4, m5)
vqadd.u8 d24, d24, d23 ; a = b + a
vmax.u8 d20, d20, d25 ; m2 = max(m2, m4)
vmov.u8 d23, #1
vcge.u8 d24, d0, d24 ; (a <= blimit) * -1
vcgt.u8 d21, d21, d2 ; (abs(p1 - p0) > thresh)*-1
vcge.u8 d20, d23, d20 ; flat
vand d19, d19, d24 ; mask
vcgt.u8 d23, d22, d2 ; (abs(q1 - q0) > thresh)*-1
vand d20, d20, d19 ; flat & mask
vmov.u8 d22, #0x80
vorr d23, d21, d23 ; hev
; This instruction will truncate the "flat & mask" masks down to 4 bits
; each to fit into one 32-bit ARM register. The values are stored in
; q10.64[0].
vshrn.u16 d30, q10, #4
vmov.u32 r4, d30[0] ; flat & mask 4bits
adds r5, r4, #1 ; Check for all 1's
; If mask and flat are 1's for all vectors, then we only need to execute
; the power branch for all vectors.
beq power_branch_only
cmp r4, #0 ; Check for 0, set flag for later
; mbfilter() function
; filter() function
; convert to signed
veor d21, d7, d22 ; qs0
veor d24, d6, d22 ; ps0
veor d25, d5, d22 ; ps1
veor d26, d16, d22 ; qs1
vmov.u8 d27, #3
vsub.s8 d28, d21, d24 ; ( qs0 - ps0)
vqsub.s8 d29, d25, d26 ; filter = clamp(ps1-qs1)
vmull.s8 q15, d28, d27 ; 3 * ( qs0 - ps0)
vand d29, d29, d23 ; filter &= hev
vaddw.s8 q15, q15, d29 ; filter + 3 * (qs0 - ps0)
vmov.u8 d29, #4
; filter = clamp(filter + 3 * ( qs0 - ps0))
vqmovn.s16 d28, q15
vand d28, d28, d19 ; filter &= mask
vqadd.s8 d30, d28, d27 ; filter2 = clamp(filter+3)
vqadd.s8 d29, d28, d29 ; filter1 = clamp(filter+4)
vshr.s8 d30, d30, #3 ; filter2 >>= 3
vshr.s8 d29, d29, #3 ; filter1 >>= 3
vqadd.s8 d24, d24, d30 ; op0 = clamp(ps0 + filter2)
vqsub.s8 d21, d21, d29 ; oq0 = clamp(qs0 - filter1)
; outer tap adjustments: ++filter1 >> 1
vrshr.s8 d29, d29, #1
vbic d29, d29, d23 ; filter &= ~hev
vqadd.s8 d25, d25, d29 ; op1 = clamp(ps1 + filter)
vqsub.s8 d26, d26, d29 ; oq1 = clamp(qs1 - filter)
; If mask and flat are 0's for all vectors, then we only need to execute
; the filter branch for all vectors.
beq filter_branch_only
; If mask and flat are mixed then we must perform both branches and
; combine the data.
veor d24, d24, d22 ; *f_op0 = u^0x80
veor d21, d21, d22 ; *f_oq0 = u^0x80
veor d25, d25, d22 ; *f_op1 = u^0x80
veor d26, d26, d22 ; *f_oq1 = u^0x80
; At this point we have already executed the filter branch. The filter
; branch does not set op2 or oq2, so use p2 and q2. Execute the power
; branch and combine the data.
vmov.u8 d23, #2
vaddl.u8 q14, d6, d7 ; r_op2 = p0 + q0
vmlal.u8 q14, d3, d27 ; r_op2 += p3 * 3
vmlal.u8 q14, d4, d23 ; r_op2 += p2 * 2
vbif d0, d4, d20 ; op2 |= p2 & ~(flat & mask)
vaddw.u8 q14, d5 ; r_op2 += p1
vbif d1, d25, d20 ; op1 |= f_op1 & ~(flat & mask)
vqrshrn.u16 d30, q14, #3 ; r_op2
vsubw.u8 q14, d3 ; r_op1 = r_op2 - p3
vsubw.u8 q14, d4 ; r_op1 -= p2
vaddw.u8 q14, d5 ; r_op1 += p1
vaddw.u8 q14, d16 ; r_op1 += q1
vbif d2, d24, d20 ; op0 |= f_op0 & ~(flat & mask)
vqrshrn.u16 d31, q14, #3 ; r_op1
vsubw.u8 q14, d3 ; r_op0 = r_op1 - p3
vsubw.u8 q14, d5 ; r_op0 -= p1
vaddw.u8 q14, d6 ; r_op0 += p0
vaddw.u8 q14, d17 ; r_op0 += q2
vbit d0, d30, d20 ; op2 |= r_op2 & (flat & mask)
vqrshrn.u16 d23, q14, #3 ; r_op0
vsubw.u8 q14, d3 ; r_oq0 = r_op0 - p3
vsubw.u8 q14, d6 ; r_oq0 -= p0
vaddw.u8 q14, d7 ; r_oq0 += q0
vbit d1, d31, d20 ; op1 |= r_op1 & (flat & mask)
vaddw.u8 q14, d18 ; oq0 += q3
vbit d2, d23, d20 ; op0 |= r_op0 & (flat & mask)
vqrshrn.u16 d22, q14, #3 ; r_oq0
vsubw.u8 q14, d4 ; r_oq1 = r_oq0 - p2
vsubw.u8 q14, d7 ; r_oq1 -= q0
vaddw.u8 q14, d16 ; r_oq1 += q1
vbif d3, d21, d20 ; oq0 |= f_oq0 & ~(flat & mask)
vaddw.u8 q14, d18 ; r_oq1 += q3
vbif d4, d26, d20 ; oq1 |= f_oq1 & ~(flat & mask)
vqrshrn.u16 d6, q14, #3 ; r_oq1
vsubw.u8 q14, d5 ; r_oq2 = r_oq1 - p1
vsubw.u8 q14, d16 ; r_oq2 -= q1
vaddw.u8 q14, d17 ; r_oq2 += q2
vaddw.u8 q14, d18 ; r_oq2 += q3
vbif d5, d17, d20 ; oq2 |= q2 & ~(flat & mask)
vqrshrn.u16 d7, q14, #3 ; r_oq2
vbit d3, d22, d20 ; oq0 |= r_oq0 & (flat & mask)
vbit d4, d6, d20 ; oq1 |= r_oq1 & (flat & mask)
vbit d5, d7, d20 ; oq2 |= r_oq2 & (flat & mask)
bx lr
power_branch_only
vmov.u8 d27, #3
vmov.u8 d21, #2
vaddl.u8 q14, d6, d7 ; op2 = p0 + q0
vmlal.u8 q14, d3, d27 ; op2 += p3 * 3
vmlal.u8 q14, d4, d21 ; op2 += p2 * 2
vaddw.u8 q14, d5 ; op2 += p1
vqrshrn.u16 d0, q14, #3 ; op2
vsubw.u8 q14, d3 ; op1 = op2 - p3
vsubw.u8 q14, d4 ; op1 -= p2
vaddw.u8 q14, d5 ; op1 += p1
vaddw.u8 q14, d16 ; op1 += q1
vqrshrn.u16 d1, q14, #3 ; op1
vsubw.u8 q14, d3 ; op0 = op1 - p3
vsubw.u8 q14, d5 ; op0 -= p1
vaddw.u8 q14, d6 ; op0 += p0
vaddw.u8 q14, d17 ; op0 += q2
vqrshrn.u16 d2, q14, #3 ; op0
vsubw.u8 q14, d3 ; oq0 = op0 - p3
vsubw.u8 q14, d6 ; oq0 -= p0
vaddw.u8 q14, d7 ; oq0 += q0
vaddw.u8 q14, d18 ; oq0 += q3
vqrshrn.u16 d3, q14, #3 ; oq0
vsubw.u8 q14, d4 ; oq1 = oq0 - p2
vsubw.u8 q14, d7 ; oq1 -= q0
vaddw.u8 q14, d16 ; oq1 += q1
vaddw.u8 q14, d18 ; oq1 += q3
vqrshrn.u16 d4, q14, #3 ; oq1
vsubw.u8 q14, d5 ; oq2 = oq1 - p1
vsubw.u8 q14, d16 ; oq2 -= q1
vaddw.u8 q14, d17 ; oq2 += q2
vaddw.u8 q14, d18 ; oq2 += q3
vqrshrn.u16 d5, q14, #3 ; oq2
bx lr
filter_branch_only
; TODO(fgalligan): See if we can rearrange registers so we do not need to
; do the two vswp instructions.
vswp d0, d4 ; op2
vswp d5, d17 ; oq2
veor d2, d24, d22 ; *op0 = u^0x80
veor d3, d21, d22 ; *oq0 = u^0x80
veor d1, d25, d22 ; *op1 = u^0x80
veor d4, d26, d22 ; *oq1 = u^0x80
bx lr
ENDP ; |vp9_mbloop_filter_neon|
END
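Where flat && mask holds, the power branch above replaces the 4-tap result with 8-tap rounded averages, maintained as a running sum (each vsubw/vaddw pair turns one output's sum into the next); elsewhere the vbif/vbit selects keep the 4-tap output. A per-pixel C sketch of those averages, with descriptive names assumed:

#include <stdint.h>

#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

/* 8-tap averages applied across the edge; matches the vqrshrn.u16 #3 steps. */
static void filter8(uint8_t p3, uint8_t p2, uint8_t p1, uint8_t p0,
                    uint8_t q0, uint8_t q1, uint8_t q2, uint8_t q3,
                    uint8_t *op2, uint8_t *op1, uint8_t *op0,
                    uint8_t *oq0, uint8_t *oq1, uint8_t *oq2) {
  *op2 = ROUND_POWER_OF_TWO(3 * p3 + 2 * p2 + p1 + p0 + q0, 3);
  *op1 = ROUND_POWER_OF_TWO(2 * p3 + p2 + 2 * p1 + p0 + q0 + q1, 3);
  *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3);
  *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3);
  *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3, 3);
  *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + 3 * q3, 3);
}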


@@ -1,618 +0,0 @@
;
; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp9_mb_lpf_horizontal_edge_w_neon|
EXPORT |vp9_mb_lpf_vertical_edge_w_neon|
ARM
AREA ||.text||, CODE, READONLY, ALIGN=2
; void vp9_mb_lpf_horizontal_edge_w_neon(uint8_t *s, int p,
; const uint8_t *blimit,
; const uint8_t *limit,
; const uint8_t *thresh,
; int count)
; r0 uint8_t *s,
; r1 int p, /* pitch */
; r2 const uint8_t *blimit,
; r3 const uint8_t *limit,
; sp const uint8_t *thresh,
; sp+4 int count
|vp9_mb_lpf_horizontal_edge_w_neon| PROC
push {r4-r8, lr}
vpush {d8-d15}
ldr r4, [sp, #88] ; load thresh
ldr r12, [sp, #92] ; load count
h_count
vld1.8 {d16[]}, [r2] ; load *blimit
vld1.8 {d17[]}, [r3] ; load *limit
vld1.8 {d18[]}, [r4] ; load *thresh
sub r8, r0, r1, lsl #3 ; move src pointer down by 8 lines
vld1.u8 {d0}, [r8@64], r1 ; p7
vld1.u8 {d1}, [r8@64], r1 ; p6
vld1.u8 {d2}, [r8@64], r1 ; p5
vld1.u8 {d3}, [r8@64], r1 ; p4
vld1.u8 {d4}, [r8@64], r1 ; p3
vld1.u8 {d5}, [r8@64], r1 ; p2
vld1.u8 {d6}, [r8@64], r1 ; p1
vld1.u8 {d7}, [r8@64], r1 ; p0
vld1.u8 {d8}, [r8@64], r1 ; q0
vld1.u8 {d9}, [r8@64], r1 ; q1
vld1.u8 {d10}, [r8@64], r1 ; q2
vld1.u8 {d11}, [r8@64], r1 ; q3
vld1.u8 {d12}, [r8@64], r1 ; q4
vld1.u8 {d13}, [r8@64], r1 ; q5
vld1.u8 {d14}, [r8@64], r1 ; q6
vld1.u8 {d15}, [r8@64], r1 ; q7
bl vp9_wide_mbfilter_neon
tst r7, #1
beq h_mbfilter
; flat && mask were not set for any of the channels. Just store the values
; from filter.
sub r8, r0, r1, lsl #1
vst1.u8 {d25}, [r8@64], r1 ; store op1
vst1.u8 {d24}, [r8@64], r1 ; store op0
vst1.u8 {d23}, [r8@64], r1 ; store oq0
vst1.u8 {d26}, [r8@64], r1 ; store oq1
b h_next
h_mbfilter
tst r7, #2
beq h_wide_mbfilter
; flat2 was not set for any of the channels. Just store the values from
; mbfilter.
sub r8, r0, r1, lsl #1
sub r8, r8, r1
vst1.u8 {d18}, [r8@64], r1 ; store op2
vst1.u8 {d19}, [r8@64], r1 ; store op1
vst1.u8 {d20}, [r8@64], r1 ; store op0
vst1.u8 {d21}, [r8@64], r1 ; store oq0
vst1.u8 {d22}, [r8@64], r1 ; store oq1
vst1.u8 {d23}, [r8@64], r1 ; store oq2
b h_next
h_wide_mbfilter
sub r8, r0, r1, lsl #3
add r8, r8, r1
vst1.u8 {d16}, [r8@64], r1 ; store op6
vst1.u8 {d24}, [r8@64], r1 ; store op5
vst1.u8 {d25}, [r8@64], r1 ; store op4
vst1.u8 {d26}, [r8@64], r1 ; store op3
vst1.u8 {d27}, [r8@64], r1 ; store op2
vst1.u8 {d18}, [r8@64], r1 ; store op1
vst1.u8 {d19}, [r8@64], r1 ; store op0
vst1.u8 {d20}, [r8@64], r1 ; store oq0
vst1.u8 {d21}, [r8@64], r1 ; store oq1
vst1.u8 {d22}, [r8@64], r1 ; store oq2
vst1.u8 {d23}, [r8@64], r1 ; store oq3
vst1.u8 {d1}, [r8@64], r1 ; store oq4
vst1.u8 {d2}, [r8@64], r1 ; store oq5
vst1.u8 {d3}, [r8@64], r1 ; store oq6
h_next
add r0, r0, #8
subs r12, r12, #1
bne h_count
vpop {d8-d15}
pop {r4-r8, pc}
ENDP ; |vp9_mb_lpf_horizontal_edge_w_neon|
; void vp9_mb_lpf_vertical_edge_w_neon(uint8_t *s, int p,
; const uint8_t *blimit,
; const uint8_t *limit,
; const uint8_t *thresh)
; r0 uint8_t *s,
; r1 int p, /* pitch */
; r2 const uint8_t *blimit,
; r3 const uint8_t *limit,
; sp const uint8_t *thresh,
|vp9_mb_lpf_vertical_edge_w_neon| PROC
push {r4-r8, lr}
vpush {d8-d15}
ldr r4, [sp, #88] ; load thresh
vld1.8 {d16[]}, [r2] ; load *blimit
vld1.8 {d17[]}, [r3] ; load *limit
vld1.8 {d18[]}, [r4] ; load *thresh
sub r8, r0, #8
vld1.8 {d0}, [r8@64], r1
vld1.8 {d8}, [r0@64], r1
vld1.8 {d1}, [r8@64], r1
vld1.8 {d9}, [r0@64], r1
vld1.8 {d2}, [r8@64], r1
vld1.8 {d10}, [r0@64], r1
vld1.8 {d3}, [r8@64], r1
vld1.8 {d11}, [r0@64], r1
vld1.8 {d4}, [r8@64], r1
vld1.8 {d12}, [r0@64], r1
vld1.8 {d5}, [r8@64], r1
vld1.8 {d13}, [r0@64], r1
vld1.8 {d6}, [r8@64], r1
vld1.8 {d14}, [r0@64], r1
vld1.8 {d7}, [r8@64], r1
vld1.8 {d15}, [r0@64], r1
sub r0, r0, r1, lsl #3
vtrn.32 q0, q2
vtrn.32 q1, q3
vtrn.32 q4, q6
vtrn.32 q5, q7
vtrn.16 q0, q1
vtrn.16 q2, q3
vtrn.16 q4, q5
vtrn.16 q6, q7
vtrn.8 d0, d1
vtrn.8 d2, d3
vtrn.8 d4, d5
vtrn.8 d6, d7
vtrn.8 d8, d9
vtrn.8 d10, d11
vtrn.8 d12, d13
vtrn.8 d14, d15
bl vp9_wide_mbfilter_neon
tst r7, #1
beq v_mbfilter
; flat && mask were not set for any of the channels. Just store the values
; from filter.
sub r8, r0, #2
vswp d23, d25
vst4.8 {d23[0], d24[0], d25[0], d26[0]}, [r8], r1
vst4.8 {d23[1], d24[1], d25[1], d26[1]}, [r8], r1
vst4.8 {d23[2], d24[2], d25[2], d26[2]}, [r8], r1
vst4.8 {d23[3], d24[3], d25[3], d26[3]}, [r8], r1
vst4.8 {d23[4], d24[4], d25[4], d26[4]}, [r8], r1
vst4.8 {d23[5], d24[5], d25[5], d26[5]}, [r8], r1
vst4.8 {d23[6], d24[6], d25[6], d26[6]}, [r8], r1
vst4.8 {d23[7], d24[7], d25[7], d26[7]}, [r8], r1
b v_end
v_mbfilter
tst r7, #2
beq v_wide_mbfilter
; flat2 was not set for any of the channels. Just store the values from
; mbfilter.
sub r8, r0, #3
vst3.8 {d18[0], d19[0], d20[0]}, [r8], r1
vst3.8 {d21[0], d22[0], d23[0]}, [r0], r1
vst3.8 {d18[1], d19[1], d20[1]}, [r8], r1
vst3.8 {d21[1], d22[1], d23[1]}, [r0], r1
vst3.8 {d18[2], d19[2], d20[2]}, [r8], r1
vst3.8 {d21[2], d22[2], d23[2]}, [r0], r1
vst3.8 {d18[3], d19[3], d20[3]}, [r8], r1
vst3.8 {d21[3], d22[3], d23[3]}, [r0], r1
vst3.8 {d18[4], d19[4], d20[4]}, [r8], r1
vst3.8 {d21[4], d22[4], d23[4]}, [r0], r1
vst3.8 {d18[5], d19[5], d20[5]}, [r8], r1
vst3.8 {d21[5], d22[5], d23[5]}, [r0], r1
vst3.8 {d18[6], d19[6], d20[6]}, [r8], r1
vst3.8 {d21[6], d22[6], d23[6]}, [r0], r1
vst3.8 {d18[7], d19[7], d20[7]}, [r8], r1
vst3.8 {d21[7], d22[7], d23[7]}, [r0], r1
b v_end
v_wide_mbfilter
sub r8, r0, #8
vtrn.32 d0, d26
vtrn.32 d16, d27
vtrn.32 d24, d18
vtrn.32 d25, d19
vtrn.16 d0, d24
vtrn.16 d16, d25
vtrn.16 d26, d18
vtrn.16 d27, d19
vtrn.8 d0, d16
vtrn.8 d24, d25
vtrn.8 d26, d27
vtrn.8 d18, d19
vtrn.32 d20, d1
vtrn.32 d21, d2
vtrn.32 d22, d3
vtrn.32 d23, d15
vtrn.16 d20, d22
vtrn.16 d21, d23
vtrn.16 d1, d3
vtrn.16 d2, d15
vtrn.8 d20, d21
vtrn.8 d22, d23
vtrn.8 d1, d2
vtrn.8 d3, d15
vst1.8 {d0}, [r8@64], r1
vst1.8 {d20}, [r0@64], r1
vst1.8 {d16}, [r8@64], r1
vst1.8 {d21}, [r0@64], r1
vst1.8 {d24}, [r8@64], r1
vst1.8 {d22}, [r0@64], r1
vst1.8 {d25}, [r8@64], r1
vst1.8 {d23}, [r0@64], r1
vst1.8 {d26}, [r8@64], r1
vst1.8 {d1}, [r0@64], r1
vst1.8 {d27}, [r8@64], r1
vst1.8 {d2}, [r0@64], r1
vst1.8 {d18}, [r8@64], r1
vst1.8 {d3}, [r0@64], r1
vst1.8 {d19}, [r8@64], r1
vst1.8 {d15}, [r0@64], r1
v_end
vpop {d8-d15}
pop {r4-r8, pc}
ENDP ; |vp9_mb_lpf_vertical_edge_w_neon|
; void vp9_wide_mbfilter_neon();
; This is a helper function for the loopfilters. The individual functions do the
; necessary load, transpose (if necessary) and store.
;
; r0-r3 PRESERVE
; d16 blimit
; d17 limit
; d18 thresh
; d0 p7
; d1 p6
; d2 p5
; d3 p4
; d4 p3
; d5 p2
; d6 p1
; d7 p0
; d8 q0
; d9 q1
; d10 q2
; d11 q3
; d12 q4
; d13 q5
; d14 q6
; d15 q7
|vp9_wide_mbfilter_neon| PROC
mov r7, #0
; filter_mask
vabd.u8 d19, d4, d5 ; abs(p3 - p2)
vabd.u8 d20, d5, d6 ; abs(p2 - p1)
vabd.u8 d21, d6, d7 ; abs(p1 - p0)
vabd.u8 d22, d9, d8 ; abs(q1 - q0)
vabd.u8 d23, d10, d9 ; abs(q2 - q1)
vabd.u8 d24, d11, d10 ; abs(q3 - q2)
; only compare the largest value to limit
vmax.u8 d19, d19, d20 ; max(abs(p3 - p2), abs(p2 - p1))
vmax.u8 d20, d21, d22 ; max(abs(p1 - p0), abs(q1 - q0))
vmax.u8 d23, d23, d24 ; max(abs(q2 - q1), abs(q3 - q2))
vmax.u8 d19, d19, d20
vabd.u8 d24, d7, d8 ; abs(p0 - q0)
vmax.u8 d19, d19, d23
vabd.u8 d23, d6, d9 ; a = abs(p1 - q1)
vqadd.u8 d24, d24, d24 ; b = abs(p0 - q0) * 2
; mask: largest abs difference <= limit
vcge.u8 d19, d17, d19
; flatmask4
vabd.u8 d25, d7, d5 ; abs(p0 - p2)
vabd.u8 d26, d8, d10 ; abs(q0 - q2)
vabd.u8 d27, d4, d7 ; abs(p3 - p0)
vabd.u8 d28, d11, d8 ; abs(q3 - q0)
; only compare the largest value to thresh
vmax.u8 d25, d25, d26 ; max(abs(p0 - p2), abs(q0 - q2))
vmax.u8 d26, d27, d28 ; max(abs(p3 - p0), abs(q3 - q0))
vmax.u8 d25, d25, d26
vmax.u8 d20, d20, d25
vshr.u8 d23, d23, #1 ; a = a / 2
vqadd.u8 d24, d24, d23 ; a = b + a
vmov.u8 d30, #1
vcge.u8 d24, d16, d24 ; (a <= blimit) * -1
vcge.u8 d20, d30, d20 ; flat
vand d19, d19, d24 ; mask
; hevmask
vcgt.u8 d21, d21, d18 ; (abs(p1 - p0) > thresh)*-1
vcgt.u8 d22, d22, d18 ; (abs(q1 - q0) > thresh)*-1
vorr d21, d21, d22 ; hev
vand d16, d20, d19 ; flat && mask
vmov r5, r6, d16
orrs r5, r5, r6 ; Check for 0
orreq r7, r7, #1 ; Only do filter branch
; flatmask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7)
vabd.u8 d22, d3, d7 ; abs(p4 - p0)
vabd.u8 d23, d12, d8 ; abs(q4 - q0)
vabd.u8 d24, d7, d2 ; abs(p0 - p5)
vabd.u8 d25, d8, d13 ; abs(q0 - q5)
vabd.u8 d26, d1, d7 ; abs(p6 - p0)
vabd.u8 d27, d14, d8 ; abs(q6 - q0)
vabd.u8 d28, d0, d7 ; abs(p7 - p0)
vabd.u8 d29, d15, d8 ; abs(q7 - q0)
; only compare the largest value to thresh
vmax.u8 d22, d22, d23 ; max(abs(p4 - p0), abs(q4 - q0))
vmax.u8 d23, d24, d25 ; max(abs(p0 - p5), abs(q0 - q5))
vmax.u8 d24, d26, d27 ; max(abs(p6 - p0), abs(q6 - q0))
vmax.u8 d25, d28, d29 ; max(abs(p7 - p0), abs(q7 - q0))
vmax.u8 d26, d22, d23
vmax.u8 d27, d24, d25
vmax.u8 d23, d26, d27
vcge.u8 d18, d30, d23 ; flat2
vmov.u8 d22, #0x80
vand d17, d18, d16 ; flat2 && flat && mask
vmov r5, r6, d17
orrs r5, r5, r6 ; Check for 0
orreq r7, r7, #2 ; Only do mbfilter branch
; mbfilter() function
; filter() function
; convert to signed
veor d23, d8, d22 ; qs0
veor d24, d7, d22 ; ps0
veor d25, d6, d22 ; ps1
veor d26, d9, d22 ; qs1
vmov.u8 d27, #3
vsub.s8 d28, d23, d24 ; ( qs0 - ps0)
vqsub.s8 d29, d25, d26 ; filter = clamp(ps1-qs1)
vmull.s8 q15, d28, d27 ; 3 * ( qs0 - ps0)
vand d29, d29, d21 ; filter &= hev
vaddw.s8 q15, q15, d29 ; filter + 3 * (qs0 - ps0)
vmov.u8 d29, #4
; filter = clamp(filter + 3 * ( qs0 - ps0))
vqmovn.s16 d28, q15
vand d28, d28, d19 ; filter &= mask
vqadd.s8 d30, d28, d27 ; filter2 = clamp(filter+3)
vqadd.s8 d29, d28, d29 ; filter1 = clamp(filter+4)
vshr.s8 d30, d30, #3 ; filter2 >>= 3
vshr.s8 d29, d29, #3 ; filter1 >>= 3
vqadd.s8 d24, d24, d30 ; op0 = clamp(ps0 + filter2)
vqsub.s8 d23, d23, d29 ; oq0 = clamp(qs0 - filter1)
; outer tap adjustments: ++filter1 >> 1
vrshr.s8 d29, d29, #1
vbic d29, d29, d21 ; filter &= ~hev
vqadd.s8 d25, d25, d29 ; op1 = clamp(ps1 + filter)
vqsub.s8 d26, d26, d29 ; oq1 = clamp(qs1 - filter)
veor d24, d24, d22 ; *f_op0 = u^0x80
veor d23, d23, d22 ; *f_oq0 = u^0x80
veor d25, d25, d22 ; *f_op1 = u^0x80
veor d26, d26, d22 ; *f_oq1 = u^0x80
tst r7, #1
bxne lr
; mbfilter flat && mask branch
; TODO(fgalligan): Can I decrease the cycles by shifting to consecutive d's
; and using vbit on the q's?
vmov.u8 d29, #2
vaddl.u8 q15, d7, d8 ; op2 = p0 + q0
vmlal.u8 q15, d4, d27 ; op2 = p0 + q0 + p3 * 3
vmlal.u8 q15, d5, d29 ; op2 = p0 + q0 + p3 * 3 + p2 * 2
vaddw.u8 q15, d6 ; op2=p1 + p0 + q0 + p3 * 3 + p2 *2
vqrshrn.u16 d18, q15, #3 ; r_op2
vsubw.u8 q15, d4 ; op1 = op2 - p3
vsubw.u8 q15, d5 ; op1 -= p2
vaddw.u8 q15, d6 ; op1 += p1
vaddw.u8 q15, d9 ; op1 += q1
vqrshrn.u16 d19, q15, #3 ; r_op1
vsubw.u8 q15, d4 ; op0 = op1 - p3
vsubw.u8 q15, d6 ; op0 -= p1
vaddw.u8 q15, d7 ; op0 += p0
vaddw.u8 q15, d10 ; op0 += q2
vqrshrn.u16 d20, q15, #3 ; r_op0
vsubw.u8 q15, d4 ; oq0 = op0 - p3
vsubw.u8 q15, d7 ; oq0 -= p0
vaddw.u8 q15, d8 ; oq0 += q0
vaddw.u8 q15, d11 ; oq0 += q3
vqrshrn.u16 d21, q15, #3 ; r_oq0
vsubw.u8 q15, d5 ; oq1 = oq0 - p2
vsubw.u8 q15, d8 ; oq1 -= q0
vaddw.u8 q15, d9 ; oq1 += q1
vaddw.u8 q15, d11 ; oq1 += q3
vqrshrn.u16 d22, q15, #3 ; r_oq1
vsubw.u8 q15, d6 ; oq2 = oq0 - p1
vsubw.u8 q15, d9 ; oq2 -= q1
vaddw.u8 q15, d10 ; oq2 += q2
vaddw.u8 q15, d11 ; oq2 += q3
vqrshrn.u16 d27, q15, #3 ; r_oq2
; Filter does not set op2 or oq2, so use p2 and q2.
vbif d18, d5, d16 ; t_op2 |= p2 & ~(flat & mask)
vbif d19, d25, d16 ; t_op1 |= f_op1 & ~(flat & mask)
vbif d20, d24, d16 ; t_op0 |= f_op0 & ~(flat & mask)
vbif d21, d23, d16 ; t_oq0 |= f_oq0 & ~(flat & mask)
vbif d22, d26, d16 ; t_oq1 |= f_oq1 & ~(flat & mask)
vbit d23, d27, d16 ; t_oq2 |= r_oq2 & (flat & mask)
vbif d23, d10, d16 ; t_oq2 |= q2 & ~(flat & mask)
tst r7, #2
bxne lr
; wide_mbfilter flat2 && flat && mask branch
vmov.u8 d16, #7
vaddl.u8 q15, d7, d8 ; op6 = p0 + q0
vmlal.u8 q15, d0, d16 ; op6 += p7 * 7
vmlal.u8 q15, d1, d29 ; op6 += p6 * 2
vaddw.u8 q15, d2 ; op6 += p5
vaddw.u8 q15, d3 ; op6 += p4
vaddw.u8 q15, d4 ; op6 += p3
vaddw.u8 q15, d5 ; op6 += p2
vaddw.u8 q15, d6 ; op6 += p1
vqrshrn.u16 d16, q15, #4 ; w_op6
vsubw.u8 q15, d0 ; op5 = op6 - p7
vsubw.u8 q15, d1 ; op5 -= p6
vaddw.u8 q15, d2 ; op5 += p5
vaddw.u8 q15, d9 ; op5 += q1
vqrshrn.u16 d24, q15, #4 ; w_op5
vsubw.u8 q15, d0 ; op4 = op5 - p7
vsubw.u8 q15, d2 ; op4 -= p5
vaddw.u8 q15, d3 ; op4 += p4
vaddw.u8 q15, d10 ; op4 += q2
vqrshrn.u16 d25, q15, #4 ; w_op4
vsubw.u8 q15, d0 ; op3 = op4 - p7
vsubw.u8 q15, d3 ; op3 -= p4
vaddw.u8 q15, d4 ; op3 += p3
vaddw.u8 q15, d11 ; op3 += q3
vqrshrn.u16 d26, q15, #4 ; w_op3
vsubw.u8 q15, d0 ; op2 = op3 - p7
vsubw.u8 q15, d4 ; op2 -= p3
vaddw.u8 q15, d5 ; op2 += p2
vaddw.u8 q15, d12 ; op2 += q4
vqrshrn.u16 d27, q15, #4 ; w_op2
vbif d27, d18, d17 ; op2 |= t_op2 & ~(f2 & f & m)
vsubw.u8 q15, d0 ; op1 = op2 - p7
vsubw.u8 q15, d5 ; op1 -= p2
vaddw.u8 q15, d6 ; op1 += p1
vaddw.u8 q15, d13 ; op1 += q5
vqrshrn.u16 d18, q15, #4 ; w_op1
vbif d18, d19, d17 ; op1 |= t_op1 & ~(f2 & f & m)
vsubw.u8 q15, d0 ; op0 = op1 - p7
vsubw.u8 q15, d6 ; op0 -= p1
vaddw.u8 q15, d7 ; op0 += p0
vaddw.u8 q15, d14 ; op0 += q6
vqrshrn.u16 d19, q15, #4 ; w_op0
vbif d19, d20, d17 ; op0 |= t_op0 & ~(f2 & f & m)
vsubw.u8 q15, d0 ; oq0 = op0 - p7
vsubw.u8 q15, d7 ; oq0 -= p0
vaddw.u8 q15, d8 ; oq0 += q0
vaddw.u8 q15, d15 ; oq0 += q7
vqrshrn.u16 d20, q15, #4 ; w_oq0
vbif d20, d21, d17 ; oq0 |= t_oq0 & ~(f2 & f & m)
vsubw.u8 q15, d1 ; oq1 = oq0 - p6
vsubw.u8 q15, d8 ; oq1 -= q0
vaddw.u8 q15, d9 ; oq1 += q1
vaddw.u8 q15, d15 ; oq1 += q7
vqrshrn.u16 d21, q15, #4 ; w_oq1
vbif d21, d22, d17 ; oq1 |= t_oq1 & ~(f2 & f & m)
vsubw.u8 q15, d2 ; oq2 = oq1 - p5
vsubw.u8 q15, d9 ; oq2 -= q1
vaddw.u8 q15, d10 ; oq2 += q2
vaddw.u8 q15, d15 ; oq2 += q7
vqrshrn.u16 d22, q15, #4 ; w_oq2
vbif d22, d23, d17 ; oq2 |= t_oq2 & ~(f2 & f & m)
vsubw.u8 q15, d3 ; oq3 = oq2 - p4
vsubw.u8 q15, d10 ; oq3 -= q2
vaddw.u8 q15, d11 ; oq3 += q3
vaddw.u8 q15, d15 ; oq3 += q7
vqrshrn.u16 d23, q15, #4 ; w_oq3
vbif d16, d1, d17 ; op6 |= p6 & ~(f2 & f & m)
vsubw.u8 q15, d4 ; oq4 = oq3 - p3
vsubw.u8 q15, d11 ; oq4 -= q3
vaddw.u8 q15, d12 ; oq4 += q4
vaddw.u8 q15, d15 ; oq4 += q7
vqrshrn.u16 d1, q15, #4 ; w_oq4
vbif d24, d2, d17 ; op5 |= p5 & ~(f2 & f & m)
vsubw.u8 q15, d5 ; oq5 = oq4 - p2
vsubw.u8 q15, d12 ; oq5 -= q4
vaddw.u8 q15, d13 ; oq5 += q5
vaddw.u8 q15, d15 ; oq5 += q7
vqrshrn.u16 d2, q15, #4 ; w_oq5
vbif d25, d3, d17 ; op4 |= p4 & ~(f2 & f & m)
vsubw.u8 q15, d6 ; oq6 = oq5 - p1
vsubw.u8 q15, d13 ; oq6 -= q5
vaddw.u8 q15, d14 ; oq6 += q6
vaddw.u8 q15, d15 ; oq6 += q7
vqrshrn.u16 d3, q15, #4 ; w_oq6
vbif d26, d4, d17 ; op3 |= p3 & ~(f2 & f & m)
vbif d23, d11, d17 ; oq3 |= q3 & ~(f2 & f & m)
vbif d1, d12, d17 ; oq4 |= q4 & ~(f2 & f & m)
vbif d2, d13, d17 ; oq5 |= q5 & ~(f2 & f & m)
vbif d3, d14, d17 ; oq6 |= q6 & ~(f2 & f & m)
bx lr
ENDP ; |vp9_wide_mbfilter_neon|
END
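For reference, the three per-pixel decisions the mask code above vectorizes, written as boolean C with descriptive names assumed (the NEON code produces 0/-1 lane masks instead of booleans): mask gates any filtering, flat selects the 8-tap branch, and flat2 the full 16-tap branch.

#include <stdlib.h> /* abs */

static int max2(int a, int b) { return a > b ? a : b; }

/* mask: may any filtering happen at this pixel position? */
static int filter_mask(int limit, int blimit,
                       int p3, int p2, int p1, int p0,
                       int q0, int q1, int q2, int q3) {
  int m = max2(abs(p3 - p2), abs(p2 - p1));
  m = max2(m, max2(abs(p1 - p0), abs(q1 - q0)));
  m = max2(m, max2(abs(q2 - q1), abs(q3 - q2)));
  return m <= limit && abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit;
}

/* flat: is the 8-pixel neighborhood smooth enough for the 8-tap branch? */
static int flat_mask(int p3, int p2, int p1, int p0,
                     int q0, int q1, int q2, int q3) {
  int m = max2(abs(p1 - p0), abs(q1 - q0));
  m = max2(m, max2(abs(p0 - p2), abs(q0 - q2)));
  m = max2(m, max2(abs(p3 - p0), abs(q3 - q0)));
  return m <= 1; /* the vcge.u8 against #1 */
}

/* flat2: is the full 16-pixel span smooth enough for the wide branch? */
static int flat2_mask(int p7, int p6, int p5, int p4, int p0,
                      int q0, int q4, int q5, int q6, int q7) {
  int m = max2(abs(p4 - p0), abs(q4 - q0));
  m = max2(m, max2(abs(p0 - p5), abs(q0 - q5)));
  m = max2(m, max2(abs(p6 - p0), abs(q6 - q0)));
  m = max2(m, max2(abs(p7 - p0), abs(q7 - q0)));
  return m <= 1;
}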


@@ -1,356 +0,0 @@
;
; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp9_short_idct8x8_add_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; Parallel 1D IDCT on all the columns of an 8x8 16-bit data matrix which are
; loaded in q8-q15. The output will be stored back into q8-q15 registers.
; This macro will touch q0-q7 registers and use them as buffer during
; calculation.
MACRO
IDCT8x8_1D
; stage 1
vdup.16 d0, r3 ; duplicate cospi_28_64
vdup.16 d1, r4 ; duplicate cospi_4_64
; input[1] * cospi_28_64
vmull.s16 q2, d18, d0
vmull.s16 q3, d19, d0
; input[7] * cospi_4_64
vmull.s16 q4, d30, d1
vmull.s16 q5, d31, d1
; input[1]*cospi_28_64-input[7]*cospi_4_64
vsub.s32 q6, q2, q4
vsub.s32 q7, q3, q5
; dct_const_round_shift(...)
vqrshrn.s32 d8, q6, #14 ; >> 14
vqrshrn.s32 d9, q7, #14 ; >> 14
; input[1] * cospi_4_64
vmull.s16 q2, d18, d1
vmull.s16 q3, d19, d1
; input[7] * cospi_28_64
vmull.s16 q1, d30, d0
vmull.s16 q5, d31, d0
; input[1]*cospi_4_64+input[7]*cospi_28_64
vadd.s32 q2, q2, q1
vadd.s32 q3, q3, q5
; dct_const_round_shift(...)
vqrshrn.s32 d14, q2, #14 ; >> 14
vqrshrn.s32 d15, q3, #14 ; >> 14
vdup.16 d0, r5 ; duplicate cospi_12_64
vdup.16 d1, r6 ; duplicate cospi_20_64
; input[5] * cospi_12_64
vmull.s16 q2, d26, d0
vmull.s16 q3, d27, d0
; input[3] * cospi_20_64
vmull.s16 q5, d22, d1
vmull.s16 q6, d23, d1
; input[5] * cospi_12_64 - input[3] * cospi_20_64
vsub.s32 q2, q2, q5
vsub.s32 q3, q3, q6
; dct_const_round_shift(...)
vqrshrn.s32 d10, q2, #14 ; >> 14
vqrshrn.s32 d11, q3, #14 ; >> 14
; input[5] * cospi_20_64
vmull.s16 q2, d26, d1
vmull.s16 q3, d27, d1
; input[3] * cospi_12_64
vmull.s16 q9, d22, d0
vmull.s16 q15, d23, d0
; input[5] * cospi_20_64 + input[3] * cospi_12_64
vadd.s32 q0, q2, q9
vadd.s32 q1, q3, q15
; dct_const_round_shift(...)
vqrshrn.s32 d12, q0, #14 ; >> 14
vqrshrn.s32 d13, q1, #14 ; >> 14
; stage 2 & stage 3 - even half
vdup.16 d0, r7 ; duplicate cospi_16_64
; input[0] * cospi_16_64
vmull.s16 q2, d16, d0
vmull.s16 q3, d17, d0
; input[2] * cospi_16_64
vmull.s16 q9, d24, d0
vmull.s16 q11, d25, d0
; (input[0] + input[2]) * cospi_16_64
vadd.s32 q9, q2, q9
vadd.s32 q11, q3, q11
; dct_const_round_shift(...)
vqrshrn.s32 d18, q9, #14 ; >> 14
vqrshrn.s32 d19, q11, #14 ; >> 14
; input[0] * cospi_16_64
vmull.s16 q2, d16, d0
vmull.s16 q3, d17, d0
; input[2] * cospi_16_64
vmull.s16 q13, d24, d0
vmull.s16 q15, d25, d0
; (input[0] - input[2]) * cospi_16_64
vsub.s32 q2, q2, q13
vsub.s32 q3, q3, q15
; dct_const_round_shift(...)
vqrshrn.s32 d22, q2, #14 ; >> 14
vqrshrn.s32 d23, q3, #14 ; >> 14
; input[1] * cospi_24_64 - input[3] * cospi_8_64
vdup.16 d0, r8 ; duplicate cospi_24_64
vdup.16 d1, r9 ; duplicate cospi_8_64
; input[1] * cospi_24_64
vmull.s16 q2, d20, d0
vmull.s16 q3, d21, d0
; input[3] * cospi_8_64
vmull.s16 q13, d28, d1
vmull.s16 q15, d29, d1
; input[1] * cospi_24_64 - input[3] * cospi_8_64
vsub.s32 q2, q2, q13
vsub.s32 q3, q3, q15
; dct_const_round_shift(...)
vqrshrn.s32 d26, q2, #14 ; >> 14
vqrshrn.s32 d27, q3, #14 ; >> 14
; input[1] * cospi_8_64
vmull.s16 q2, d20, d1
vmull.s16 q3, d21, d1
; input[3] * cospi_24_64
vmull.s16 q8, d28, d0
vmull.s16 q10, d29, d0
; input[1] * cospi_8_64 + input[3] * cospi_24_64
vadd.s32 q0, q2, q8
vadd.s32 q1, q3, q10
; dct_const_round_shift(...)
vqrshrn.s32 d30, q0, #14 ; >> 14
vqrshrn.s32 d31, q1, #14 ; >> 14
vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3]
vadd.s16 q1, q11, q13 ; output[1] = step[1] + step[2]
vsub.s16 q2, q11, q13 ; output[2] = step[1] - step[2]
vsub.s16 q3, q9, q15 ; output[3] = step[0] - step[3]
; stage 2 - odd half
vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5]
vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5]
vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7]
vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7]
; stage 3 -odd half
vdup.16 d16, r7 ; duplicate cospi_16_64
; step2[6] * cospi_16_64
vmull.s16 q9, d28, d16
vmull.s16 q10, d29, d16
; step2[5] * cospi_16_64
vmull.s16 q11, d26, d16
vmull.s16 q12, d27, d16
; (step2[6] - step2[5]) * cospi_16_64
vsub.s32 q9, q9, q11
vsub.s32 q10, q10, q12
; dct_const_round_shift(...)
vqrshrn.s32 d10, q9, #14 ; >> 14
vqrshrn.s32 d11, q10, #14 ; >> 14
; step2[6] * cospi_16_64
vmull.s16 q9, d28, d16
vmull.s16 q10, d29, d16
; step2[5] * cospi_16_64
vmull.s16 q11, d26, d16
vmull.s16 q12, d27, d16
; (step2[5] + step2[6]) * cospi_16_64
vadd.s32 q9, q9, q11
vadd.s32 q10, q10, q12
; dct_const_round_shift(...)
vqrshrn.s32 d12, q9, #14 ; >> 14
vqrshrn.s32 d13, q10, #14 ; >> 14
; stage 4
vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7];
vadd.s16 q9, q1, q6 ; output[1] = step1[1] + step1[6];
vadd.s16 q10, q2, q5 ; output[2] = step1[2] + step1[5];
vadd.s16 q11, q3, q4 ; output[3] = step1[3] + step1[4];
vsub.s16 q12, q3, q4 ; output[4] = step1[3] - step1[4];
vsub.s16 q13, q2, q5 ; output[5] = step1[2] - step1[5];
vsub.s16 q14, q1, q6 ; output[6] = step1[1] - step1[6];
vsub.s16 q15, q0, q7 ; output[7] = step1[0] - step1[7];
MEND
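Each vmull/vadd (or vsub)/vqrshrn #14 group in the macro above is one fixed-point butterfly. A scalar C sketch of the pattern, with helper names assumed (the NEON narrowing shift additionally saturates to 16 bits):

#include <stdint.h>

/* Q14 rounding shift: the scalar counterpart of vqrshrn.s32 #14. */
static int16_t dct_const_round_shift(int32_t input) {
  return (int16_t)((input + (1 << 13)) >> 14);
}

/* One IDCT butterfly; e.g. stage 1 forms step1[4] and step1[7] from
 * input[1] and input[7] with c0 = cospi_28_64 and c1 = cospi_4_64. */
static void idct_butterfly(int16_t a, int16_t b, int c0, int c1,
                           int16_t *out0, int16_t *out1) {
  *out0 = dct_const_round_shift(a * c0 - b * c1);
  *out1 = dct_const_round_shift(a * c1 + b * c0);
}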
; Transpose an 8x8 16-bit data matrix. Data is loaded in q8-q15.
MACRO
TRANSPOSE8X8
vswp d17, d24
vswp d23, d30
vswp d21, d28
vswp d19, d26
vtrn.32 q8, q10
vtrn.32 q9, q11
vtrn.32 q12, q14
vtrn.32 q13, q15
vtrn.16 q8, q9
vtrn.16 q10, q11
vtrn.16 q12, q13
vtrn.16 q14, q15
MEND
AREA Block, CODE, READONLY ; name this block of code
;void vp9_short_idct8x8_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
;
; r0 int16_t *input
; r1 uint8_t *dest
; r2 int dest_stride
|vp9_short_idct8x8_add_neon| PROC
push {r4-r9}
vld1.s16 {q8}, [r0]!
vld1.s16 {q9}, [r0]!
vld1.s16 {q10}, [r0]!
vld1.s16 {q11}, [r0]!
vld1.s16 {q12}, [r0]!
vld1.s16 {q13}, [r0]!
vld1.s16 {q14}, [r0]!
vld1.s16 {q15}, [r0]!
; transpose the input data
TRANSPOSE8X8
; generate cospi_28_64 = 3196
mov r3, #0x0c00
add r3, #0x7c
; generate cospi_4_64 = 16069
mov r4, #0x3e00
add r4, #0xc5
; generate cospi_12_64 = 13623
mov r5, #0x3500
add r5, #0x37
; generate cospi_20_64 = 9102
mov r6, #0x2300
add r6, #0x8e
; generate cospi_16_64 = 11585
mov r7, #0x2d00
add r7, #0x41
; generate cospi_24_64 = 6270
mov r8, #0x1800
add r8, #0x7e
; generate cospi_8_64 = 15137
mov r9, #0x3b00
add r9, #0x21
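; Note: each constant above is cospi_k_64 = round(16384 * cos(k * pi / 64)),
; e.g. cospi_16_64 = round(16384 * cos(pi / 4)) = 11585 = 0x2d41. The values
; are built with mov/add pairs because an ARM data-processing immediate is
; limited to an 8-bit value rotated by an even amount.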
; First transform rows
IDCT8x8_1D
; Transpose the matrix
TRANSPOSE8X8
; Then transform columns
IDCT8x8_1D
; ROUND_POWER_OF_TWO(temp_out[j], 5)
vrshr.s16 q8, q8, #5
vrshr.s16 q9, q9, #5
vrshr.s16 q10, q10, #5
vrshr.s16 q11, q11, #5
vrshr.s16 q12, q12, #5
vrshr.s16 q13, q13, #5
vrshr.s16 q14, q14, #5
vrshr.s16 q15, q15, #5
; save dest pointer
mov r0, r1
; load destination data
vld1.64 {d0}, [r1], r2
vld1.64 {d1}, [r1], r2
vld1.64 {d2}, [r1], r2
vld1.64 {d3}, [r1], r2
vld1.64 {d4}, [r1], r2
vld1.64 {d5}, [r1], r2
vld1.64 {d6}, [r1], r2
vld1.64 {d7}, [r1]
; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]
vaddw.u8 q8, q8, d0
vaddw.u8 q9, q9, d1
vaddw.u8 q10, q10, d2
vaddw.u8 q11, q11, d3
vaddw.u8 q12, q12, d4
vaddw.u8 q13, q13, d5
vaddw.u8 q14, q14, d6
vaddw.u8 q15, q15, d7
; clip_pixel
vqmovun.s16 d0, q8
vqmovun.s16 d1, q9
vqmovun.s16 d2, q10
vqmovun.s16 d3, q11
vqmovun.s16 d4, q12
vqmovun.s16 d5, q13
vqmovun.s16 d6, q14
vqmovun.s16 d7, q15
; store the data
vst1.64 {d0}, [r0], r2
vst1.64 {d1}, [r0], r2
vst1.64 {d2}, [r0], r2
vst1.64 {d3}, [r0], r2
vst1.64 {d4}, [r0], r2
vst1.64 {d5}, [r0], r2
vst1.64 {d6}, [r0], r2
vst1.64 {d7}, [r0], r2
pop {r4-r9}
bx lr
ENDP ; |vp9_short_idct8x8_add_neon|
END


@@ -11,7 +11,6 @@
#include "./vpx_config.h"
#include "vpx_mem/vpx_mem.h"
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_entropymode.h"
#include "vp9/common/vp9_entropymv.h"
@@ -53,6 +52,7 @@ void vp9_free_frame_buffers(VP9_COMMON *oci) {
for (i = 0; i < NUM_YV12_BUFFERS; i++)
vp9_free_frame_buffer(&oci->yv12_fb[i]);
vp9_free_frame_buffer(&oci->temp_scale_frame);
vp9_free_frame_buffer(&oci->post_proc_buffer);
vpx_free(oci->mip);
@@ -62,9 +62,9 @@ void vp9_free_frame_buffers(VP9_COMMON *oci) {
vpx_free(oci->above_context[0]);
for (i = 0; i < MAX_MB_PLANE; i++)
oci->above_context[i] = 0;
oci->mip = NULL;
oci->prev_mip = NULL;
oci->above_seg_context = NULL;
oci->mip = 0;
oci->prev_mip = 0;
oci->above_seg_context = 0;
}
static void set_mb_mi(VP9_COMMON *cm, int aligned_width, int aligned_height) {
@@ -74,7 +74,7 @@ static void set_mb_mi(VP9_COMMON *cm, int aligned_width, int aligned_height) {
cm->mi_cols = aligned_width >> LOG2_MI_SIZE;
cm->mi_rows = aligned_height >> LOG2_MI_SIZE;
cm->mode_info_stride = cm->mi_cols + MI_BLOCK_SIZE;
cm->mode_info_stride = cm->mi_cols + 64 / MI_SIZE;
}
static void setup_mi(VP9_COMMON *cm) {
@@ -94,11 +94,11 @@ static void setup_mi(VP9_COMMON *cm) {
int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height) {
int i, mi_cols;
const int aligned_width = ALIGN_POWER_OF_TWO(width, LOG2_MI_SIZE);
const int aligned_height = ALIGN_POWER_OF_TWO(height, LOG2_MI_SIZE);
// Our internal buffers are always multiples of 8
const int aligned_width = multiple8(width);
const int aligned_height = multiple8(height);
const int ss_x = oci->subsampling_x;
const int ss_y = oci->subsampling_y;
int mi_size;
vp9_free_frame_buffers(oci);
@@ -120,6 +120,10 @@ int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height) {
oci->fb_idx_ref_cnt[i] = 1;
}
if (vp9_alloc_frame_buffer(&oci->temp_scale_frame, width, 16, ss_x, ss_y,
VP9BORDERINPIXELS) < 0)
goto fail;
if (vp9_alloc_frame_buffer(&oci->post_proc_buffer, width, height, ss_x, ss_y,
VP9BORDERINPIXELS) < 0)
goto fail;
@@ -127,13 +131,14 @@ int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height) {
set_mb_mi(oci, aligned_width, aligned_height);
// Allocation
mi_size = oci->mode_info_stride * (oci->mi_rows + MI_BLOCK_SIZE);
oci->mip = vpx_calloc(mi_size, sizeof(MODE_INFO));
oci->mip = vpx_calloc(oci->mode_info_stride * (oci->mi_rows + 64 / MI_SIZE),
sizeof(MODE_INFO));
if (!oci->mip)
goto fail;
oci->prev_mip = vpx_calloc(mi_size, sizeof(MODE_INFO));
oci->prev_mip = vpx_calloc(oci->mode_info_stride *
(oci->mi_rows + 64 / MI_SIZE),
sizeof(MODE_INFO));
if (!oci->prev_mip)
goto fail;
@@ -141,7 +146,7 @@ int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height) {
// FIXME(jkoleszar): allocate subsampled arrays for U/V once subsampling
// information is exposed at this level
mi_cols = mi_cols_aligned_to_sb(oci->mi_cols);
mi_cols = mi_cols_aligned_to_sb(oci);
// 2 contexts per 'mi unit', so that we have one context per 4x4 txfm
// block where mi unit size is 8x8.
@@ -169,8 +174,9 @@ void vp9_create_common(VP9_COMMON *oci) {
vp9_init_mbmode_probs(oci);
oci->tx_mode = ONLY_4X4;
oci->txfm_mode = ONLY_4X4;
oci->comp_pred_mode = HYBRID_PREDICTION;
oci->clr_type = REG_YUV;
// Initialize reference frame sign bias structure to defaults
vpx_memset(oci->ref_frame_sign_bias, 0, sizeof(oci->ref_frame_sign_bias));
@@ -188,13 +194,13 @@ void vp9_initialize_common() {
void vp9_update_frame_size(VP9_COMMON *cm) {
int i, mi_cols;
const int aligned_width = ALIGN_POWER_OF_TWO(cm->width, LOG2_MI_SIZE);
const int aligned_height = ALIGN_POWER_OF_TWO(cm->height, LOG2_MI_SIZE);
const int aligned_width = multiple8(cm->width);
const int aligned_height = multiple8(cm->height);
set_mb_mi(cm, aligned_width, aligned_height);
setup_mi(cm);
mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
mi_cols = mi_cols_aligned_to_sb(cm);
for (i = 1; i < MAX_MB_PLANE; i++)
cm->above_context[i] =
cm->above_context[0] + i * sizeof(ENTROPY_CONTEXT) * 2 * mi_cols;


@@ -0,0 +1,21 @@
/*
* Copyright (c) 2011 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vpx_config.h"
#include "vpx/vpx_codec.h"
#include "vpx_ports/asm_offsets.h"
BEGIN
END
/* add asserts for any offset that is not supported by assembly code */
/* add asserts for any size that is not supported by assembly code */
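The body above is an empty scaffold between BEGIN and END. If an offset were ever needed here, it would presumably be exported with the DEFINE macro from vpx_ports/asm_offsets.h, roughly as in this hypothetical sketch (the struct and field are illustrative assumptions, not taken from this file):

/* hypothetical: export a struct offset so assembly code can address it */
#include <stddef.h>
#include "vpx_ports/asm_offsets.h"
#include "vpx_scale/yv12config.h"

BEGIN
DEFINE(yv12_buffer_config_y_stride, offsetof(YV12_BUFFER_CONFIG, y_stride));
END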


@@ -13,22 +13,28 @@
#define VP9_COMMON_VP9_BLOCKD_H_
#include "./vpx_config.h"
#include "vpx_ports/mem.h"
#include "vpx_scale/yv12config.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_common_data.h"
#include "vp9/common/vp9_convolve.h"
#include "vp9/common/vp9_enums.h"
#include "vp9/common/vp9_mv.h"
#include "vp9/common/vp9_seg_common.h"
#include "vp9/common/vp9_treecoder.h"
#include "vpx_ports/mem.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_enums.h"
#define BLOCK_SIZE_GROUPS 4
#define MAX_MB_SEGMENTS 8
#define MB_SEG_TREE_PROBS (MAX_MB_SEGMENTS-1)
#define PREDICTION_PROBS 3
#define MBSKIP_CONTEXTS 3
#define MAX_REF_LF_DELTAS 4
#define MAX_MODE_LF_DELTAS 2
/* Segment Feature Masks */
#define SEGMENT_DELTADATA 0
#define SEGMENT_ABSDATA 1
#define MAX_MV_REF_CANDIDATES 2
#define INTRA_INTER_CONTEXTS 4
@@ -56,11 +62,11 @@ typedef enum {
} FRAME_TYPE;
typedef enum {
EIGHTTAP = 0,
EIGHTTAP_SMOOTH = 1,
EIGHTTAP_SHARP = 2,
BILINEAR = 3,
SWITCHABLE = 4 /* should be the last one */
EIGHTTAP_SMOOTH,
EIGHTTAP,
EIGHTTAP_SHARP,
BILINEAR,
SWITCHABLE /* should be the last one */
} INTERPOLATIONFILTERTYPE;
typedef enum {
@@ -81,37 +87,56 @@ typedef enum {
MB_MODE_COUNT
} MB_PREDICTION_MODE;
static INLINE int is_intra_mode(MB_PREDICTION_MODE mode) {
return mode <= TM_PRED;
}
static INLINE int is_inter_mode(MB_PREDICTION_MODE mode) {
return mode >= NEARESTMV && mode <= NEWMV;
}
#if CONFIG_FILTERINTRA
static INLINE int is_filter_allowed(MB_PREDICTION_MODE mode) {
return mode != DC_PRED &&
mode != D45_PRED &&
mode != D27_PRED &&
mode != D63_PRED;
}
#endif
// Segment level features.
typedef enum {
SEG_LVL_ALT_Q = 0, // Use alternate Quantizer ....
SEG_LVL_ALT_LF = 1, // Use alternate loop filter value...
SEG_LVL_REF_FRAME = 2, // Optional Segment reference frame
SEG_LVL_SKIP = 3, // Optional Segment (0,0) + skip mode
SEG_LVL_MAX = 4 // Number of MB level features supported
} SEG_LVL_FEATURES;
// Supported transform sizes.
typedef enum {
TX_4X4 = 0, // 4x4 dct transform
TX_8X8 = 1, // 8x8 dct transform
TX_16X16 = 2, // 16x16 dct transform
TX_32X32 = 3, // 32x32 dct transform
TX_SIZE_MAX_SB, // Number of transforms available to SBs
} TX_SIZE;
typedef enum {
DCT_DCT = 0, // DCT in both horizontal and vertical
ADST_DCT = 1, // ADST in vertical, DCT in horizontal
DCT_ADST = 2, // DCT in vertical, ADST in horizontal
ADST_ADST = 3 // ADST in both directions
} TX_TYPE;
#define VP9_INTRA_MODES (TM_PRED + 1)
#define VP9_INTER_MODES (1 + NEWMV - NEARESTMV)
static INLINE int inter_mode_offset(MB_PREDICTION_MODE mode) {
return (mode - NEARESTMV);
}
#define WHT_UPSCALE_FACTOR 2
#define TX_SIZE_PROBS 6 // (TX_SIZE_MAX_SB * (TX_SIZE_MAX_SB - 1) / 2)
#define get_tx_probs(c, b) ((b) < BLOCK_SIZE_MB16X16 ? \
(c)->fc.tx_probs_8x8p : \
(b) < BLOCK_SIZE_SB32X32 ? \
(c)->fc.tx_probs_16x16p : (c)->fc.tx_probs_32x32p)
/* For keyframes, intra block modes are predicted by the (already decoded)
modes for the Y blocks to the left and above us; for interframes, there
is a single probability table. */
union b_mode_info {
MB_PREDICTION_MODE as_mode;
struct {
MB_PREDICTION_MODE first;
} as_mode;
int_mv as_mv[2]; // first, second inter predictor motion vectors
};
@@ -125,101 +150,71 @@ typedef enum {
} MV_REFERENCE_FRAME;
static INLINE int b_width_log2(BLOCK_SIZE_TYPE sb_type) {
return b_width_log2_lookup[sb_type];
}
static INLINE int b_height_log2(BLOCK_SIZE_TYPE sb_type) {
return b_height_log2_lookup[sb_type];
}
static INLINE int mi_width_log2(BLOCK_SIZE_TYPE sb_type) {
return mi_width_log2_lookup[sb_type];
}
static INLINE int mi_height_log2(BLOCK_SIZE_TYPE sb_type) {
return mi_height_log2_lookup[sb_type];
}
#if CONFIG_INTERINTRA
static INLINE TX_SIZE intra_size_log2_for_interintra(int bs) {
switch (bs) {
case 4:
return TX_4X4;
break;
case 8:
return TX_8X8;
break;
case 16:
return TX_16X16;
break;
case 32:
return TX_32X32;
break;
default:
return TX_32X32;
break;
switch (sb_type) {
case BLOCK_SIZE_SB4X8:
case BLOCK_SIZE_AB4X4: return 0;
case BLOCK_SIZE_SB8X4:
case BLOCK_SIZE_SB8X8:
case BLOCK_SIZE_SB8X16: return 1;
case BLOCK_SIZE_SB16X8:
case BLOCK_SIZE_MB16X16:
case BLOCK_SIZE_SB16X32: return 2;
case BLOCK_SIZE_SB32X16:
case BLOCK_SIZE_SB32X32:
case BLOCK_SIZE_SB32X64: return 3;
case BLOCK_SIZE_SB64X32:
case BLOCK_SIZE_SB64X64: return 4;
default: assert(0);
return -1;
}
}
static INLINE int is_interintra_allowed(BLOCK_SIZE_TYPE sb_type) {
return ((sb_type >= BLOCK_8X8) && (sb_type < BLOCK_64X64));
static INLINE int b_height_log2(BLOCK_SIZE_TYPE sb_type) {
switch (sb_type) {
case BLOCK_SIZE_SB8X4:
case BLOCK_SIZE_AB4X4: return 0;
case BLOCK_SIZE_SB4X8:
case BLOCK_SIZE_SB8X8:
case BLOCK_SIZE_SB16X8: return 1;
case BLOCK_SIZE_SB8X16:
case BLOCK_SIZE_MB16X16:
case BLOCK_SIZE_SB32X16: return 2;
case BLOCK_SIZE_SB16X32:
case BLOCK_SIZE_SB32X32:
case BLOCK_SIZE_SB64X32: return 3;
case BLOCK_SIZE_SB32X64:
case BLOCK_SIZE_SB64X64: return 4;
default: assert(0);
return -1;
}
}
#if CONFIG_MASKED_INTERINTRA
#define MASK_BITS_SML_INTERINTRA 3
#define MASK_BITS_MED_INTERINTRA 4
#define MASK_BITS_BIG_INTERINTRA 5
#define MASK_NONE_INTERINTRA -1
static INLINE int get_mask_bits_interintra(BLOCK_SIZE_TYPE sb_type) {
if (sb_type == BLOCK_4X4)
return 0;
if (sb_type <= BLOCK_8X8)
return MASK_BITS_SML_INTERINTRA;
else if (sb_type <= BLOCK_32X32)
return MASK_BITS_MED_INTERINTRA;
else
return MASK_BITS_BIG_INTERINTRA;
static INLINE int mi_width_log2(BLOCK_SIZE_TYPE sb_type) {
int a = b_width_log2(sb_type) - 1;
// align 4x4 block to mode_info
if (a < 0)
a = 0;
assert(a >= 0);
return a;
}
#endif
#endif
#if CONFIG_MASKED_INTERINTER
#define MASK_BITS_SML 3
#define MASK_BITS_MED 4
#define MASK_BITS_BIG 5
#define MASK_NONE -1
static inline int get_mask_bits(BLOCK_SIZE_TYPE sb_type) {
if (sb_type == BLOCK_4X4)
return 0;
if (sb_type <= BLOCK_8X8)
return MASK_BITS_SML;
else if (sb_type <= BLOCK_32X32)
return MASK_BITS_MED;
else
return MASK_BITS_BIG;
static INLINE int mi_height_log2(BLOCK_SIZE_TYPE sb_type) {
int a = b_height_log2(sb_type) - 1;
if (a < 0)
a = 0;
assert(a >= 0);
return a;
}
#endif
typedef struct {
MB_PREDICTION_MODE mode, uv_mode;
#if CONFIG_INTERINTRA
MB_PREDICTION_MODE interintra_mode, interintra_uv_mode;
#if CONFIG_MASKED_INTERINTRA
int interintra_mask_index;
int interintra_uv_mask_index;
int use_masked_interintra;
#endif
#endif
#if CONFIG_FILTERINTRA
int filterbit, uv_filterbit;
#endif
MV_REFERENCE_FRAME ref_frame[2];
TX_SIZE txfm_size;
int_mv mv[2]; // for each reference frame used
int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
int_mv best_mv, best_second_mv;
uint8_t mb_mode_context[MAX_REF_FRAMES];
int mb_mode_context[MAX_REF_FRAMES];
unsigned char mb_skip_coeff; /* does this mb have coefficients at all; 1 = no coefficients, 0 = need to decode tokens */
unsigned char segment_id; // Segment id for current frame
@@ -235,33 +230,15 @@ typedef struct {
INTERPOLATIONFILTERTYPE interp_filter;
BLOCK_SIZE_TYPE sb_type;
#if CONFIG_MASKED_INTERINTER
int use_masked_compound;
int mask_index;
#endif
} MB_MODE_INFO;
typedef struct {
MB_MODE_INFO mbmi;
#if CONFIG_FILTERINTRA
int b_filter_info[4];
#endif
union b_mode_info bmi[4];
} MODE_INFO;
static int is_inter_block(const MB_MODE_INFO *mbmi) {
return mbmi->ref_frame[0] > INTRA_FRAME;
}
enum mv_precision {
MV_PRECISION_Q3,
MV_PRECISION_Q4
};
#define VP9_REF_SCALE_SHIFT 14
#define VP9_REF_NO_SCALE (1 << VP9_REF_SCALE_SHIFT)
#define VP9_REF_NO_SCALE 16384
struct scale_factors {
int x_scale_fp; // horizontal fixed point scale factor
@@ -274,8 +251,9 @@ struct scale_factors {
int (*scale_value_x)(int val, const struct scale_factors *scale);
int (*scale_value_y)(int val, const struct scale_factors *scale);
void (*set_scaled_offsets)(struct scale_factors *scale, int row, int col);
MV32 (*scale_mv_q3_to_q4)(const MV *mv, const struct scale_factors *scale);
MV32 (*scale_mv_q4)(const MV *mv, const struct scale_factors *scale);
int_mv32 (*scale_mv_q3_to_q4)(const int_mv *src_mv,
const struct scale_factors *scale);
int32_t (*scale_mv_component_q4)(int mv_q4, int scale_fp, int offset_q4);
convolve_fn_t predict[2][2][2]; // horiz, vert, avg
};
@@ -307,53 +285,71 @@ struct macroblockd_plane {
#define BLOCK_OFFSET(x, i, n) ((x) + (i) * (n))
#define MAX_REF_LF_DELTAS 4
#define MAX_MODE_LF_DELTAS 2
struct loopfilter {
int filter_level;
int sharpness_level;
int last_sharpness_level;
uint8_t mode_ref_delta_enabled;
uint8_t mode_ref_delta_update;
// 0 = Intra, Last, GF, ARF
signed char ref_deltas[MAX_REF_LF_DELTAS];
signed char last_ref_deltas[MAX_REF_LF_DELTAS];
// 0 = ZERO_MV, MV
signed char mode_deltas[MAX_MODE_LF_DELTAS];
signed char last_mode_deltas[MAX_MODE_LF_DELTAS];
};
typedef struct macroblockd {
struct macroblockd_plane plane[MAX_MB_PLANE];
struct scale_factors scale_factor[2];
struct scale_factors scale_factor_uv[2];
MODE_INFO *prev_mode_info_context;
MODE_INFO *mode_info_context;
int mode_info_stride;
FRAME_TYPE frame_type;
int up_available;
int left_available;
int right_available;
struct segmentation seg;
struct loopfilter lf;
// partition contexts
PARTITION_CONTEXT *above_seg_context;
PARTITION_CONTEXT *left_seg_context;
/* 0 (disable) 1 (enable) segmentation */
unsigned char segmentation_enabled;
/* 0 (do not update) 1 (update) the macroblock segmentation map. */
unsigned char update_mb_segmentation_map;
/* 0 (do not update) 1 (update) the macroblock segmentation feature data. */
unsigned char update_mb_segmentation_data;
/* 0 (do not update) 1 (update) the macroblock segmentation feature data. */
unsigned char mb_segment_abs_delta;
/* Per-frame flags that define which MB-level features (such as quantizer or loop filter level) */
/* are enabled, and, when enabled, the probabilities used to decode the per-MB flags in MB_MODE_INFO */
// Probability Tree used to code Segment number
vp9_prob mb_segment_tree_probs[MB_SEG_TREE_PROBS];
// Segment features
int16_t segment_feature_data[MAX_MB_SEGMENTS][SEG_LVL_MAX];
unsigned int segment_feature_mask[MAX_MB_SEGMENTS];
/* mode_based Loop filter adjustment */
unsigned char mode_ref_lf_delta_enabled;
unsigned char mode_ref_lf_delta_update;
/* Delta values have the range +/- MAX_LOOP_FILTER */
/* 0 = Intra, Last, GF, ARF */
signed char last_ref_lf_deltas[MAX_REF_LF_DELTAS];
/* 0 = Intra, Last, GF, ARF */
signed char ref_lf_deltas[MAX_REF_LF_DELTAS];
/* 0 = ZERO_MV, MV */
signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS];
/* 0 = ZERO_MV, MV */
signed char mode_lf_deltas[MAX_MODE_LF_DELTAS];
/* Distance of MB away from frame edges */
int mb_to_left_edge;
int mb_to_right_edge;
int mb_to_top_edge;
int mb_to_bottom_edge;
unsigned int frames_since_golden;
unsigned int frames_till_alt_ref_frame;
int lossless;
/* Inverse transform function pointers. */
void (*inv_txm4x4_1_add)(int16_t *input, uint8_t *dest, int stride);
@@ -366,33 +362,32 @@ typedef struct macroblockd {
int corrupted;
unsigned char sb_index; // index of 32x32 block inside the 64x64 block
unsigned char mb_index; // index of 16x16 block inside the 32x32 block
unsigned char b_index; // index of 8x8 block inside the 16x16 block
unsigned char ab_index; // index of 4x4 block inside the 8x8 block
int sb_index; // index of 32x32 block inside the 64x64 block
int mb_index; // index of 16x16 block inside the 32x32 block
int b_index; // index of 8x8 block inside the 16x16 block
int ab_index; // index of 4x4 block inside the 8x8 block
int q_index;
} MACROBLOCKD;
static INLINE unsigned char *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE_TYPE subsize) {
static int *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE_TYPE subsize) {
switch (subsize) {
case BLOCK_64X64:
case BLOCK_64X32:
case BLOCK_32X64:
case BLOCK_32X32:
case BLOCK_SIZE_SB64X64:
case BLOCK_SIZE_SB64X32:
case BLOCK_SIZE_SB32X64:
case BLOCK_SIZE_SB32X32:
return &xd->sb_index;
case BLOCK_32X16:
case BLOCK_16X32:
case BLOCK_16X16:
case BLOCK_SIZE_SB32X16:
case BLOCK_SIZE_SB16X32:
case BLOCK_SIZE_MB16X16:
return &xd->mb_index;
case BLOCK_16X8:
case BLOCK_8X16:
case BLOCK_8X8:
case BLOCK_SIZE_SB16X8:
case BLOCK_SIZE_SB8X16:
case BLOCK_SIZE_SB8X8:
return &xd->b_index;
case BLOCK_8X4:
case BLOCK_4X8:
case BLOCK_4X4:
case BLOCK_SIZE_SB8X4:
case BLOCK_SIZE_SB4X8:
case BLOCK_SIZE_AB4X4:
return &xd->ab_index;
default:
assert(0);
@@ -403,28 +398,45 @@ static INLINE unsigned char *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE_TYPE subsi
static INLINE void update_partition_context(MACROBLOCKD *xd,
BLOCK_SIZE_TYPE sb_type,
BLOCK_SIZE_TYPE sb_size) {
const int bsl = b_width_log2(sb_size), bs = (1 << bsl) / 2;
const int bwl = b_width_log2(sb_type);
const int bhl = b_height_log2(sb_type);
const int boffset = b_width_log2(BLOCK_64X64) - bsl;
const char pcval0 = ~(0xe << boffset);
const char pcval1 = ~(0xf << boffset);
const char pcvalue[2] = {pcval0, pcval1};
assert(MAX(bwl, bhl) <= bsl);
int bsl = b_width_log2(sb_size), bs = (1 << bsl) / 2;
int bwl = b_width_log2(sb_type);
int bhl = b_height_log2(sb_type);
int boffset = b_width_log2(BLOCK_SIZE_SB64X64) - bsl;
int i;
// Update the partition context at the end node: set partition bits of
// block sizes larger than the current one to one, and partition bits of
// smaller block sizes to zero.
vpx_memset(xd->above_seg_context, pcvalue[bwl == bsl], bs);
vpx_memset(xd->left_seg_context, pcvalue[bhl == bsl], bs);
if ((bwl == bsl) && (bhl == bsl)) {
for (i = 0; i < bs; i++)
xd->left_seg_context[i] = ~(0xf << boffset);
for (i = 0; i < bs; i++)
xd->above_seg_context[i] = ~(0xf << boffset);
} else if ((bwl == bsl) && (bhl < bsl)) {
for (i = 0; i < bs; i++)
xd->left_seg_context[i] = ~(0xe << boffset);
for (i = 0; i < bs; i++)
xd->above_seg_context[i] = ~(0xf << boffset);
} else if ((bwl < bsl) && (bhl == bsl)) {
for (i = 0; i < bs; i++)
xd->left_seg_context[i] = ~(0xf << boffset);
for (i = 0; i < bs; i++)
xd->above_seg_context[i] = ~(0xe << boffset);
} else if ((bwl < bsl) && (bhl < bsl)) {
for (i = 0; i < bs; i++)
xd->left_seg_context[i] = ~(0xe << boffset);
for (i = 0; i < bs; i++)
xd->above_seg_context[i] = ~(0xe << boffset);
} else {
assert(0);
}
}
static INLINE int partition_plane_context(MACROBLOCKD *xd,
BLOCK_SIZE_TYPE sb_type) {
int bsl = mi_width_log2(sb_type), bs = 1 << bsl;
int above = 0, left = 0, i;
int boffset = mi_width_log2(BLOCK_64X64) - bsl;
int boffset = mi_width_log2(BLOCK_SIZE_SB64X64) - bsl;
assert(mi_width_log2(sb_type) == mi_height_log2(sb_type));
assert(bsl >= 0);
@@ -443,57 +455,134 @@ static INLINE int partition_plane_context(MACROBLOCKD *xd,
static BLOCK_SIZE_TYPE get_subsize(BLOCK_SIZE_TYPE bsize,
PARTITION_TYPE partition) {
BLOCK_SIZE_TYPE subsize = subsize_lookup[partition][bsize];
assert(subsize != BLOCK_SIZE_TYPES);
BLOCK_SIZE_TYPE subsize;
switch (partition) {
case PARTITION_NONE:
subsize = bsize;
break;
case PARTITION_HORZ:
if (bsize == BLOCK_SIZE_SB64X64)
subsize = BLOCK_SIZE_SB64X32;
else if (bsize == BLOCK_SIZE_SB32X32)
subsize = BLOCK_SIZE_SB32X16;
else if (bsize == BLOCK_SIZE_MB16X16)
subsize = BLOCK_SIZE_SB16X8;
else if (bsize == BLOCK_SIZE_SB8X8)
subsize = BLOCK_SIZE_SB8X4;
else
assert(0);
break;
case PARTITION_VERT:
if (bsize == BLOCK_SIZE_SB64X64)
subsize = BLOCK_SIZE_SB32X64;
else if (bsize == BLOCK_SIZE_SB32X32)
subsize = BLOCK_SIZE_SB16X32;
else if (bsize == BLOCK_SIZE_MB16X16)
subsize = BLOCK_SIZE_SB8X16;
else if (bsize == BLOCK_SIZE_SB8X8)
subsize = BLOCK_SIZE_SB4X8;
else
assert(0);
break;
case PARTITION_SPLIT:
if (bsize == BLOCK_SIZE_SB64X64)
subsize = BLOCK_SIZE_SB32X32;
else if (bsize == BLOCK_SIZE_SB32X32)
subsize = BLOCK_SIZE_MB16X16;
else if (bsize == BLOCK_SIZE_MB16X16)
subsize = BLOCK_SIZE_SB8X8;
else if (bsize == BLOCK_SIZE_SB8X8)
subsize = BLOCK_SIZE_AB4X4;
else
assert(0);
break;
default:
assert(0);
}
return subsize;
}
extern const TX_TYPE mode2txfm_map[MB_MODE_COUNT];
// transform mapping
static TX_TYPE txfm_map(MB_PREDICTION_MODE bmode) {
switch (bmode) {
case TM_PRED :
case D135_PRED :
return ADST_ADST;
static INLINE TX_TYPE get_tx_type_4x4(PLANE_TYPE plane_type,
const MACROBLOCKD *xd, int ib) {
const MODE_INFO *const mi = xd->mode_info_context;
const MB_MODE_INFO *const mbmi = &mi->mbmi;
case V_PRED :
case D117_PRED :
case D63_PRED:
return ADST_DCT;
if (plane_type != PLANE_TYPE_Y_WITH_DC ||
xd->lossless ||
is_inter_block(mbmi))
return DCT_DCT;
case H_PRED :
case D153_PRED :
case D27_PRED :
return DCT_ADST;
return mode2txfm_map[mbmi->sb_type < BLOCK_8X8 ?
mi->bmi[ib].as_mode : mbmi->mode];
}
static INLINE TX_TYPE get_tx_type_8x8(PLANE_TYPE plane_type,
const MACROBLOCKD *xd) {
return plane_type == PLANE_TYPE_Y_WITH_DC ?
mode2txfm_map[xd->mode_info_context->mbmi.mode] : DCT_DCT;
}
static INLINE TX_TYPE get_tx_type_16x16(PLANE_TYPE plane_type,
const MACROBLOCKD *xd) {
return plane_type == PLANE_TYPE_Y_WITH_DC ?
mode2txfm_map[xd->mode_info_context->mbmi.mode] : DCT_DCT;
}
static void setup_block_dptrs(MACROBLOCKD *xd, int ss_x, int ss_y) {
int i;
for (i = 0; i < MAX_MB_PLANE; i++) {
xd->plane[i].plane_type = i ? PLANE_TYPE_UV : PLANE_TYPE_Y_WITH_DC;
xd->plane[i].subsampling_x = i ? ss_x : 0;
xd->plane[i].subsampling_y = i ? ss_y : 0;
default:
return DCT_DCT;
}
#if CONFIG_ALPHA
// TODO(jkoleszar): Using the Y w/h for now
xd->plane[3].subsampling_x = 0;
xd->plane[3].subsampling_y = 0;
#endif
}
static TX_TYPE get_tx_type_4x4(const MACROBLOCKD *xd, int ib) {
TX_TYPE tx_type;
MODE_INFO *mi = xd->mode_info_context;
MB_MODE_INFO *const mbmi = &mi->mbmi;
if (xd->lossless || mbmi->ref_frame[0] != INTRA_FRAME)
return DCT_DCT;
if (mbmi->sb_type < BLOCK_SIZE_SB8X8) {
tx_type = txfm_map(mi->bmi[ib].as_mode.first);
} else {
assert(mbmi->mode <= TM_PRED);
tx_type = txfm_map(mbmi->mode);
}
return tx_type;
}
static INLINE TX_SIZE get_uv_tx_size(const MB_MODE_INFO *mbmi) {
return MIN(mbmi->txfm_size, max_uv_txsize_lookup[mbmi->sb_type]);
static TX_TYPE get_tx_type_8x8(const MACROBLOCKD *xd, int ib) {
TX_TYPE tx_type = DCT_DCT;
if (xd->mode_info_context->mbmi.mode <= TM_PRED) {
tx_type = txfm_map(xd->mode_info_context->mbmi.mode);
}
return tx_type;
}
static TX_TYPE get_tx_type_16x16(const MACROBLOCKD *xd, int ib) {
TX_TYPE tx_type = DCT_DCT;
if (xd->mode_info_context->mbmi.mode <= TM_PRED) {
tx_type = txfm_map(xd->mode_info_context->mbmi.mode);
}
return tx_type;
}
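
In the lookup-based replacement, mode2txfm_map plays the role of txfm_map(); its contents can be inferred from the switch above (a sketch, not a quote of the table):

/* Sketch of mode2txfm_map, inferred from the switch in txfm_map():
 *   TM_PRED, D135_PRED          -> ADST_ADST
 *   V_PRED, D117_PRED, D63_PRED -> ADST_DCT
 *   H_PRED, D153_PRED, D27_PRED -> DCT_ADST
 *   all remaining modes         -> DCT_DCT */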
void vp9_setup_block_dptrs(MACROBLOCKD *xd,
int subsampling_x, int subsampling_y);
static TX_SIZE get_uv_tx_size(const MB_MODE_INFO *mbmi) {
const TX_SIZE size = mbmi->txfm_size;
switch (mbmi->sb_type) {
case BLOCK_SIZE_SB64X64:
return size;
case BLOCK_SIZE_SB64X32:
case BLOCK_SIZE_SB32X64:
case BLOCK_SIZE_SB32X32:
if (size == TX_32X32)
return TX_16X16;
else
return size;
case BLOCK_SIZE_SB32X16:
case BLOCK_SIZE_SB16X32:
case BLOCK_SIZE_MB16X16:
if (size == TX_16X16)
return TX_8X8;
else
return size;
default:
return TX_4X4;
}
return size;
}
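
The same reduction applies here: the switch collapses to a clamp against the per-block-size chroma maximum, which is the one-line form at the top of this hunk. A sketch, using max_uv_txsize_lookup from vp9_common_data.c below:

/* Sketch: equivalent lookup form of get_uv_tx_size(). For example, a
 * BLOCK_32X32 block with txfm_size == TX_32X32 yields TX_16X16 for
 * chroma, since max_uv_txsize_lookup[BLOCK_32X32] == TX_16X16. */
static INLINE TX_SIZE get_uv_tx_size_sketch(const MB_MODE_INFO *mbmi) {
  return MIN(mbmi->txfm_size, max_uv_txsize_lookup[mbmi->sb_type]);
}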
struct plane_block_idx {
@@ -532,16 +621,6 @@ static INLINE int plane_block_height(BLOCK_SIZE_TYPE bsize,
return 4 << (b_height_log2(bsize) - plane->subsampling_y);
}
static INLINE int plane_block_width_log2by4(
BLOCK_SIZE_TYPE bsize, const struct macroblockd_plane* plane) {
return (b_width_log2(bsize) - plane->subsampling_x);
}
static INLINE int plane_block_height_log2by4(
BLOCK_SIZE_TYPE bsize, const struct macroblockd_plane* plane) {
return (b_height_log2(bsize) - plane->subsampling_y);
}
typedef void (*foreach_transformed_block_visitor)(int plane, int block,
BLOCK_SIZE_TYPE bsize,
int ss_txfrm_size,
@@ -587,16 +666,16 @@ static INLINE void foreach_transformed_block_in_plane(
// it to 4x4 block sizes.
if (xd->mb_to_right_edge < 0)
max_blocks_wide +=
(xd->mb_to_right_edge >> (5 + xd->plane[plane].subsampling_x));
+ (xd->mb_to_right_edge >> (5 + xd->plane[plane].subsampling_x));
if (xd->mb_to_bottom_edge < 0)
max_blocks_high +=
(xd->mb_to_bottom_edge >> (5 + xd->plane[plane].subsampling_y));
+ (xd->mb_to_bottom_edge >> (5 + xd->plane[plane].subsampling_y));
i = 0;
// Unlike the normal case, here we have to keep track of the
// row and column of the blocks we use so that we know if we are in
// the unrestricted motion border.
// the unrestricted motion border..
for (r = 0; r < (1 << sh); r += (1 << tx_size)) {
for (c = 0; c < (1 << sw); c += (1 << tx_size)) {
if (r < max_blocks_high && c < max_blocks_wide)
@@ -654,8 +733,8 @@ static INLINE void foreach_predicted_block_in_plane(
// size of the predictor to use.
int pred_w, pred_h;
if (xd->mode_info_context->mbmi.sb_type < BLOCK_8X8) {
assert(bsize == BLOCK_8X8);
if (xd->mode_info_context->mbmi.sb_type < BLOCK_SIZE_SB8X8) {
assert(bsize == BLOCK_SIZE_SB8X8);
pred_w = 0;
pred_h = 0;
} else {
@@ -718,11 +797,11 @@ static int txfrm_block_to_raster_block(MACROBLOCKD *xd,
int ss_txfrm_size) {
const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
const int txwl = ss_txfrm_size / 2;
const int tx_cols_log2 = bwl - txwl;
const int tx_cols = 1 << tx_cols_log2;
const int tx_cols_lg2 = bwl - txwl;
const int tx_cols = 1 << tx_cols_lg2;
const int raster_mb = block >> ss_txfrm_size;
const int x = (raster_mb & (tx_cols - 1)) << (txwl);
const int y = raster_mb >> tx_cols_log2 << (txwl);
const int y = raster_mb >> tx_cols_lg2 << (txwl);
return x + (y << bwl);
}
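
A worked example makes the index arithmetic concrete (all values hypothetical):

/* Example: a 32x32 luma plane (bwl == 3) with 8x8 transforms
 * (ss_txfrm_size == 2, so txwl == 1, tx_cols == 1 << (3 - 1) == 4).
 * For transform block index 6, block == 24 and raster_mb == 24 >> 2 == 6:
 *   x = (6 & 3) << 1 == 4,  y = (6 >> 2) << 1 == 2,
 * giving a raster 4x4-block offset of x + (y << bwl) == 4 + 16 == 20. */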
@@ -733,50 +812,13 @@ static void txfrm_block_to_raster_xy(MACROBLOCKD *xd,
int *x, int *y) {
const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
const int txwl = ss_txfrm_size / 2;
const int tx_cols_log2 = bwl - txwl;
const int tx_cols = 1 << tx_cols_log2;
const int tx_cols_lg2 = bwl - txwl;
const int tx_cols = 1 << tx_cols_lg2;
const int raster_mb = block >> ss_txfrm_size;
*x = (raster_mb & (tx_cols - 1)) << (txwl);
*y = raster_mb >> tx_cols_log2 << (txwl);
*y = raster_mb >> tx_cols_lg2 << (txwl);
}
#if CONFIG_INTERINTRA
static void extend_for_interintra(MACROBLOCKD* const xd,
BLOCK_SIZE_TYPE bsize) {
int bh = 4 << b_height_log2(bsize), bw = 4 << b_width_log2(bsize);
int ystride = xd->plane[0].dst.stride, uvstride = xd->plane[1].dst.stride;
uint8_t *pixel_y, *pixel_u, *pixel_v;
int ymargin, uvmargin;
if (xd->mb_to_bottom_edge < 0) {
int r;
ymargin = 0 - xd->mb_to_bottom_edge / 8;
uvmargin = 0 - xd->mb_to_bottom_edge / 16;
pixel_y = xd->plane[0].dst.buf - 1 + (bh - ymargin -1) * ystride;
pixel_u = xd->plane[1].dst.buf - 1 + (bh / 2 - uvmargin - 1) * uvstride;
pixel_v = xd->plane[2].dst.buf - 1 + (bh / 2 - uvmargin - 1) * uvstride;
for (r = 0; r < ymargin; r++)
xd->plane[0].dst.buf[-1 + (bh - r -1) * ystride] = *pixel_y;
for (r = 0; r < uvmargin; r++) {
xd->plane[1].dst.buf[-1 + (bh / 2 - r -1) * uvstride] = *pixel_u;
xd->plane[2].dst.buf[-1 + (bh / 2 - r -1) * uvstride] = *pixel_v;
}
}
if (xd->mb_to_right_edge < 0) {
ymargin = 0 - xd->mb_to_right_edge / 8;
uvmargin = 0 - xd->mb_to_right_edge / 16;
pixel_y = xd->plane[0].dst.buf + bw - ymargin - 1 - ystride;
pixel_u = xd->plane[1].dst.buf + bw / 2 - uvmargin - 1 - uvstride;
pixel_v = xd->plane[2].dst.buf + bw / 2 - uvmargin - 1 - uvstride;
vpx_memset(xd->plane[0].dst.buf + bw - ymargin - ystride,
*pixel_y, ymargin);
vpx_memset(xd->plane[1].dst.buf + bw / 2 - uvmargin - uvstride,
*pixel_u, uvmargin);
vpx_memset(xd->plane[2].dst.buf + bw / 2 - uvmargin - uvstride,
*pixel_v, uvmargin);
}
}
#endif
static void extend_for_intra(MACROBLOCKD* const xd, int plane, int block,
BLOCK_SIZE_TYPE bsize, int ss_txfrm_size) {
const int bw = plane_block_width(bsize, &xd->plane[plane]);
@@ -817,39 +859,46 @@ static void extend_for_intra(MACROBLOCKD* const xd, int plane, int block,
}
}
static void set_contexts_on_border(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize,
int plane, int tx_size_in_blocks,
int eob, int aoff, int loff,
ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L) {
struct macroblockd_plane *pd = &xd->plane[plane];
int plane, int ss_tx_size, int eob, int aoff,
int loff, ENTROPY_CONTEXT *A,
ENTROPY_CONTEXT *L) {
const int bw = b_width_log2(bsize), bh = b_height_log2(bsize);
const int sw = bw - xd->plane[plane].subsampling_x;
const int sh = bh - xd->plane[plane].subsampling_y;
int mi_blocks_wide = 1 << sw;
int mi_blocks_high = 1 << sh;
int tx_size_in_blocks = (1 << ss_tx_size);
int above_contexts = tx_size_in_blocks;
int left_contexts = tx_size_in_blocks;
int mi_blocks_wide = 1 << plane_block_width_log2by4(bsize, pd);
int mi_blocks_high = 1 << plane_block_height_log2by4(bsize, pd);
int pt;
// xd->mb_to_right_edge is in units of pixels * 8. This converts
// it to 4x4 block sizes.
if (xd->mb_to_right_edge < 0)
mi_blocks_wide += (xd->mb_to_right_edge >> (5 + pd->subsampling_x));
if (xd->mb_to_right_edge < 0) {
mi_blocks_wide += (xd->mb_to_right_edge
>> (5 + xd->plane[plane].subsampling_x));
}
// this code attempts to avoid copying into contexts that are outside
// our border. Any blocks that do are set to 0...
if (above_contexts + aoff > mi_blocks_wide)
above_contexts = mi_blocks_wide - aoff;
if (xd->mb_to_bottom_edge < 0)
mi_blocks_high += (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
if (left_contexts + loff > mi_blocks_high)
if (xd->mb_to_bottom_edge < 0) {
mi_blocks_high += (xd->mb_to_bottom_edge
>> (5 + xd->plane[plane].subsampling_y));
}
if (left_contexts + loff > mi_blocks_high) {
left_contexts = mi_blocks_high - loff;
}
for (pt = 0; pt < above_contexts; pt++)
A[pt] = eob > 0;
for (pt = above_contexts; pt < tx_size_in_blocks; pt++)
for (pt = above_contexts; pt < (1 << ss_tx_size); pt++)
A[pt] = 0;
for (pt = 0; pt < left_contexts; pt++)
L[pt] = eob > 0;
for (pt = left_contexts; pt < tx_size_in_blocks; pt++)
for (pt = left_contexts; pt < (1 << ss_tx_size); pt++)
L[pt] = 0;
}
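
The clipping is easiest to verify on a concrete edge case (numbers hypothetical):

/* Example: a 32x32 transform (tx_size_in_blocks == 8) starting at
 * aoff == 4 in a plane only mi_blocks_wide == 8 4x4-columns wide:
 * above_contexts = 8 - 4 == 4, so A[0..3] record eob > 0 while
 * A[4..7] are forced to 0 because they lie outside the frame. */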


@@ -22,11 +22,12 @@
#define MIN(x, y) (((x) < (y)) ? (x) : (y))
#define MAX(x, y) (((x) > (y)) ? (x) : (y))
#define ROUND_POWER_OF_TWO(value, n) \
(((value) + (1 << ((n) - 1))) >> (n))
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))
#define ALIGN_POWER_OF_TWO(value, n) \
(((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
/* If we don't want to use ROUND_POWER_OF_TWO macro
static INLINE int16_t round_power_of_two(int16_t value, int n) {
return (value + (1 << (n - 1))) >> n;
}*/
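
Concretely, the two macros behave as follows (arbitrary sample values):

/* ROUND_POWER_OF_TWO(9, 2)  == (9 + 2)  >> 2 == 2    (9/4  -> 2.25 -> 2)
 * ROUND_POWER_OF_TWO(10, 2) == (10 + 2) >> 2 == 3    (10/4 -> 2.5  -> 3)
 * ALIGN_POWER_OF_TWO(37, 4) == (37 + 15) & ~15 == 48 (round up to 16) */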
// Only needed for fixed-size arrays; for structs, just assign.
#define vp9_copy(dest, src) { \
@@ -55,35 +56,10 @@ static INLINE double fclamp(double value, double low, double high) {
return value < low ? low : (value > high ? high : value);
}
static int get_unsigned_bits(unsigned int num_values) {
int cat = 0;
if (num_values <= 1)
return 0;
num_values--;
while (num_values > 0) {
cat++;
num_values >>= 1;
}
return cat;
static INLINE int multiple8(int value) {
return (value + 7) & ~7;
}
#if CONFIG_DEBUG
#define CHECK_MEM_ERROR(cm, lval, expr) do { \
lval = (expr); \
if (!lval) \
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, \
"Failed to allocate "#lval" at %s:%d", \
__FILE__, __LINE__); \
} while (0)
#else
#define CHECK_MEM_ERROR(cm, lval, expr) do { \
lval = (expr); \
if (!lval) \
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, \
"Failed to allocate "#lval); \
} while (0)
#endif
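
A typical call shape for the macro (a sketch; the destination field and element count are illustrative, not taken from this diff):

/* Allocate a row of contexts, reporting failure through the common
 * error handler; vpx_calloc(num, size) follows vpx_mem.h. */
CHECK_MEM_ERROR(cm, cm->above_seg_context,
                vpx_calloc(mi_cols, sizeof(*cm->above_seg_context)));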
#define SYNC_CODE_0 0x49
#define SYNC_CODE_1 0x83
#define SYNC_CODE_2 0x42


@@ -1,124 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_common_data.h"
// Log 2 conversion lookup tables for block width and height
const int b_width_log2_lookup[BLOCK_SIZE_TYPES] =
{0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4};
const int b_height_log2_lookup[BLOCK_SIZE_TYPES] =
{0, 1, 0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4};
const int num_4x4_blocks_wide_lookup[BLOCK_SIZE_TYPES] =
{1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 8, 16, 16};
const int num_4x4_blocks_high_lookup[BLOCK_SIZE_TYPES] =
{1, 2, 1, 2, 4, 2, 4, 8, 4, 8, 16, 8, 16};
// Log 2 conversion lookup tables for modeinfo width and height
const int mi_width_log2_lookup[BLOCK_SIZE_TYPES] =
{0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3};
const int num_8x8_blocks_wide_lookup[BLOCK_SIZE_TYPES] =
{1, 1, 1, 1, 1, 2, 2, 2, 4, 4, 4, 8, 8};
const int mi_height_log2_lookup[BLOCK_SIZE_TYPES] =
{0, 0, 0, 0, 1, 0, 1, 2, 1, 2, 3, 2, 3};
const int num_8x8_blocks_high_lookup[BLOCK_SIZE_TYPES] =
{1, 1, 1, 1, 2, 1, 2, 4, 2, 4, 8, 4, 8};
// MIN(3, MIN(b_width_log2(bsize), b_height_log2(bsize)))
const int size_group_lookup[BLOCK_SIZE_TYPES] =
{0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3};
const int num_pels_log2_lookup[BLOCK_SIZE_TYPES] =
{4, 5, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12};
const PARTITION_TYPE partition_lookup[][BLOCK_SIZE_TYPES] = {
{ // 4X4
// 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
PARTITION_NONE, PARTITION_INVALID, PARTITION_INVALID,
PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
PARTITION_INVALID
}, { // 8X8
// 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
PARTITION_SPLIT, PARTITION_VERT, PARTITION_HORZ, PARTITION_NONE,
PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID
}, { // 16X16
// 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
PARTITION_VERT, PARTITION_HORZ, PARTITION_NONE, PARTITION_INVALID,
PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
PARTITION_INVALID, PARTITION_INVALID
}, { // 32X32
// 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_VERT,
PARTITION_HORZ, PARTITION_NONE, PARTITION_INVALID,
PARTITION_INVALID, PARTITION_INVALID
}, { // 64X64
// 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_VERT, PARTITION_HORZ,
PARTITION_NONE
}
};
const BLOCK_SIZE_TYPE subsize_lookup[PARTITION_TYPES][BLOCK_SIZE_TYPES] = {
{ // PARTITION_NONE
BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
BLOCK_8X8, BLOCK_8X16, BLOCK_16X8,
BLOCK_16X16, BLOCK_16X32, BLOCK_32X16,
BLOCK_32X32, BLOCK_32X64, BLOCK_64X32,
BLOCK_64X64,
}, { // PARTITION_HORZ
BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
BLOCK_8X4, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
BLOCK_16X8, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
BLOCK_32X16, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
BLOCK_64X32,
}, { // PARTITION_VERT
BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
BLOCK_4X8, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
BLOCK_8X16, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
BLOCK_16X32, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
BLOCK_32X64,
}, { // PARTITION_SPLIT
BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
BLOCK_4X4, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
BLOCK_8X8, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
BLOCK_16X16, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
BLOCK_32X32,
}
};
const TX_SIZE max_txsize_lookup[BLOCK_SIZE_TYPES] = {
TX_4X4, TX_4X4, TX_4X4,
TX_8X8, TX_8X8, TX_8X8,
TX_16X16, TX_16X16, TX_16X16,
TX_32X32, TX_32X32, TX_32X32, TX_32X32
};
const TX_SIZE max_uv_txsize_lookup[BLOCK_SIZE_TYPES] = {
TX_4X4, TX_4X4, TX_4X4,
TX_4X4, TX_4X4, TX_4X4,
TX_8X8, TX_8X8, TX_8X8,
TX_16X16, TX_16X16, TX_16X16, TX_32X32
};
const BLOCK_SIZE_TYPE bsize_from_dim_lookup[5][5] = {
{ BLOCK_4X4, BLOCK_4X8, BLOCK_4X8, BLOCK_4X8, BLOCK_4X8 },
{ BLOCK_8X4, BLOCK_8X8, BLOCK_8X16, BLOCK_8X16, BLOCK_8X16 },
{ BLOCK_16X8, BLOCK_16X8, BLOCK_16X16, BLOCK_16X32, BLOCK_16X32 },
{ BLOCK_32X16, BLOCK_32X16, BLOCK_32X16, BLOCK_32X32, BLOCK_32X64 },
{ BLOCK_64X32, BLOCK_64X32, BLOCK_64X32, BLOCK_64X32, BLOCK_64X64 }
};
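
This hunk does not show how the 5x5 table is indexed; a hedged helper, assuming the first index is log2(width/4) and the second log2(height/4), each clamped to 4:

static BLOCK_SIZE_TYPE bsize_from_dim_sketch(int w, int h) {
  int wl = 0, hl = 0;
  while ((4 << wl) < w && wl < 4) wl++;  /* log2(w / 4), clamped */
  while ((4 << hl) < h && hl < 4) hl++;  /* log2(h / 4), clamped */
  return bsize_from_dim_lookup[wl][hl];  /* e.g. (16, 8) -> BLOCK_16X8 */
}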


@@ -1,32 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP9_COMMON_VP9_COMMON_DATA_H_
#define VP9_COMMON_VP9_COMMON_DATA_H_
#include "vp9/common/vp9_enums.h"
extern const int b_width_log2_lookup[BLOCK_SIZE_TYPES];
extern const int b_height_log2_lookup[BLOCK_SIZE_TYPES];
extern const int mi_width_log2_lookup[BLOCK_SIZE_TYPES];
extern const int mi_height_log2_lookup[BLOCK_SIZE_TYPES];
extern const int num_8x8_blocks_wide_lookup[BLOCK_SIZE_TYPES];
extern const int num_8x8_blocks_high_lookup[BLOCK_SIZE_TYPES];
extern const int num_4x4_blocks_high_lookup[BLOCK_SIZE_TYPES];
extern const int num_4x4_blocks_wide_lookup[BLOCK_SIZE_TYPES];
extern const int size_group_lookup[BLOCK_SIZE_TYPES];
extern const int num_pels_log2_lookup[BLOCK_SIZE_TYPES];
extern const PARTITION_TYPE partition_lookup[][BLOCK_SIZE_TYPES];
extern const BLOCK_SIZE_TYPE subsize_lookup[PARTITION_TYPES][BLOCK_SIZE_TYPES];
extern const TX_SIZE max_txsize_lookup[BLOCK_SIZE_TYPES];
extern const TX_SIZE max_uv_txsize_lookup[BLOCK_SIZE_TYPES];
extern const BLOCK_SIZE_TYPE bsize_from_dim_lookup[5][5];
#endif  // VP9_COMMON_VP9_COMMON_DATA_H_


@@ -38,8 +38,8 @@
*/
#define ALIGN_FILTERS_256 1
static void convolve_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
static void convolve_horiz_c(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
const int16_t *filter_x0, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h, int taps) {
@@ -80,8 +80,8 @@ static void convolve_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
}
}
static void convolve_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
static void convolve_avg_horiz_c(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
const int16_t *filter_x0, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h, int taps) {
@@ -122,8 +122,8 @@ static void convolve_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
}
}
static void convolve_vert_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
static void convolve_vert_c(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y0, int y_step_q4,
int w, int h, int taps) {
@@ -164,8 +164,8 @@ static void convolve_vert_c(const uint8_t *src, ptrdiff_t src_stride,
}
}
static void convolve_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
static void convolve_avg_vert_c(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y0, int y_step_q4,
int w, int h, int taps) {
@@ -207,8 +207,8 @@ static void convolve_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
}
}
static void convolve_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
static void convolve_c(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h, int taps) {
@@ -217,13 +217,12 @@ static void convolve_c(const uint8_t *src, ptrdiff_t src_stride,
* h == 64, taps == 8.
*/
uint8_t temp[64 * 135];
int intermediate_height = MAX(((h * y_step_q4) >> 4), 1) + taps - 1;
int intermediate_height = ((h * y_step_q4) >> 4) + taps - 1;
assert(w <= 64);
assert(h <= 64);
assert(taps <= 8);
assert(y_step_q4 <= 32);
assert(x_step_q4 <= 32);
if (intermediate_height < h)
intermediate_height = h;
@@ -237,8 +236,8 @@ static void convolve_c(const uint8_t *src, ptrdiff_t src_stride,
w, h, taps);
}
static void convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
static void convolve_avg_c(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h, int taps) {
@@ -247,13 +246,12 @@ static void convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride,
* h == 64, taps == 8.
*/
uint8_t temp[64 * 135];
int intermediate_height = MAX(((h * y_step_q4) >> 4), 1) + taps - 1;
int intermediate_height = ((h * y_step_q4) >> 4) + taps - 1;
assert(w <= 64);
assert(h <= 64);
assert(taps <= 8);
assert(y_step_q4 <= 32);
assert(x_step_q4 <= 32);
if (intermediate_height < h)
intermediate_height = h;
@@ -267,8 +265,8 @@ static void convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride,
w, h, taps);
}
void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
void vp9_convolve8_horiz_c(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
@@ -277,8 +275,8 @@ void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
w, h, 8);
}
void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
void vp9_convolve8_avg_horiz_c(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
@@ -287,8 +285,8 @@ void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
w, h, 8);
}
void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
void vp9_convolve8_vert_c(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
@@ -297,8 +295,8 @@ void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
w, h, 8);
}
void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
void vp9_convolve8_avg_vert_c(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
@@ -307,8 +305,8 @@ void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
w, h, 8);
}
void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
void vp9_convolve8_c(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
@@ -317,8 +315,8 @@ void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
w, h, 8);
}
void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
void vp9_convolve8_avg_c(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
@@ -339,25 +337,33 @@ void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
w, h);
}
void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int filter_x_stride,
const int16_t *filter_y, int filter_y_stride,
int w, int h) {
int r;
void vp9_convolve_copy(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
const int16_t *filter_x, int filter_x_stride,
const int16_t *filter_y, int filter_y_stride,
int w, int h) {
if (w == 16 && h == 16) {
vp9_copy_mem16x16(src, src_stride, dst, dst_stride);
} else if (w == 8 && h == 8) {
vp9_copy_mem8x8(src, src_stride, dst, dst_stride);
} else if (w == 8 && h == 4) {
vp9_copy_mem8x4(src, src_stride, dst, dst_stride);
} else {
int r;
for (r = h; r > 0; --r) {
memcpy(dst, src, w);
src += src_stride;
dst += dst_stride;
for (r = h; r > 0; --r) {
memcpy(dst, src, w);
src += src_stride;
dst += dst_stride;
}
}
}
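
The vp9_convolve_avg body below is cut short by the hunk; in sketch form (a hedged reconstruction, not a verbatim quote), its inner loop is a rounded per-pixel mean:

int x, y;
for (y = 0; y < h; ++y) {
  for (x = 0; x < w; ++x)
    dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1);  /* (a + b + 1) >> 1 */
  src += src_stride;
  dst += dst_stride;
}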
void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int filter_x_stride,
const int16_t *filter_y, int filter_y_stride,
int w, int h) {
void vp9_convolve_avg(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
const int16_t *filter_x, int filter_x_stride,
const int16_t *filter_y, int filter_y_stride,
int w, int h) {
int x, y;
for (y = 0; y < h; ++y) {


@@ -13,12 +13,26 @@
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
typedef void (*convolve_fn_t)(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h);
// Not a convolution, a block copy conforming to the convolution prototype
void vp9_convolve_copy(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h);
// Not a convolution, a block average conforming to the convolution prototype
void vp9_convolve_avg(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h);
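
Call shape for any convolve_fn_t implementation (a sketch; buffers and the 8-tap filter kernels are assumed to be set up elsewhere, e.g. from the subpix_fn_table declared just below):

/* 8-tap filtering of a 16x16 block with no scaling: the step
 * parameters are 4.4 fixed point, so 16 advances one source pixel
 * per output pixel. */
vp9_convolve8_c(src, src_stride, dst, dst_stride,
                filter_x, 16, filter_y, 16, 16, 16);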
struct subpix_fn_table {
const int16_t (*filter_x)[8];
const int16_t (*filter_y)[8];


@@ -8,7 +8,695 @@
* be found in the AUTHORS file in the root of the source tree.
*/
/* Generated file, included by vp9_entropy.c */
#if CONFIG_BALANCED_COEFTREE
static const vp9_coeff_probs_model default_coef_probs_4x4[BLOCK_TYPES] = {
{ /* block Type 0 */
{ /* Intra */
{ /* Coeff Band 0 */
{ 6, 213, 178 },
{ 26, 113, 132 },
{ 34, 17, 68 }
}, { /* Coeff Band 1 */
{ 66, 96, 178 },
{ 63, 96, 174 },
{ 67, 54, 154 },
{ 62, 28, 126 },
{ 48, 9, 84 },
{ 20, 1, 32 }
}, { /* Coeff Band 2 */
{ 64, 144, 206 },
{ 70, 99, 191 },
{ 69, 36, 152 },
{ 55, 9, 106 },
{ 35, 1, 60 },
{ 14, 1, 22 }
}, { /* Coeff Band 3 */
{ 82, 154, 222 },
{ 83, 112, 205 },
{ 81, 31, 164 },
{ 62, 7, 118 },
{ 42, 1, 74 },
{ 18, 1, 30 }
}, { /* Coeff Band 4 */
{ 52, 179, 233 },
{ 64, 132, 214 },
{ 73, 36, 170 },
{ 59, 8, 116 },
{ 38, 1, 65 },
{ 15, 1, 26 }
}, { /* Coeff Band 5 */
{ 29, 175, 238 },
{ 26, 169, 223 },
{ 41, 80, 182 },
{ 39, 32, 127 },
{ 26, 10, 69 },
{ 11, 2, 28 }
}
}, { /* Inter */
{ /* Coeff Band 0 */
{ 21, 226, 234 },
{ 52, 182, 212 },
{ 80, 112, 177 }
}, { /* Coeff Band 1 */
{ 111, 164, 243 },
{ 88, 152, 231 },
{ 90, 43, 186 },
{ 70, 12, 132 },
{ 44, 2, 76 },
{ 19, 1, 33 }
}, { /* Coeff Band 2 */
{ 96, 185, 246 },
{ 99, 127, 231 },
{ 88, 21, 177 },
{ 64, 5, 122 },
{ 38, 1, 69 },
{ 18, 1, 30 }
}, { /* Coeff Band 3 */
{ 84, 206, 249 },
{ 94, 147, 237 },
{ 95, 33, 187 },
{ 71, 8, 131 },
{ 47, 1, 83 },
{ 26, 1, 44 }
}, { /* Coeff Band 4 */
{ 38, 221, 252 },
{ 58, 177, 241 },
{ 78, 46, 188 },
{ 59, 9, 122 },
{ 34, 1, 66 },
{ 18, 1, 34 }
}, { /* Coeff Band 5 */
{ 21, 216, 253 },
{ 21, 206, 244 },
{ 42, 93, 200 },
{ 43, 41, 146 },
{ 36, 13, 93 },
{ 31, 1, 55 }
}
}
}, { /* block Type 1 */
{ /* Intra */
{ /* Coeff Band 0 */
{ 7, 213, 219 },
{ 23, 139, 182 },
{ 38, 60, 125 }
}, { /* Coeff Band 1 */
{ 69, 156, 220 },
{ 52, 178, 213 },
{ 69, 111, 190 },
{ 69, 58, 155 },
{ 58, 21, 104 },
{ 39, 7, 60 }
}, { /* Coeff Band 2 */
{ 68, 189, 228 },
{ 70, 158, 221 },
{ 83, 64, 189 },
{ 73, 18, 141 },
{ 48, 4, 88 },
{ 23, 1, 41 }
}, { /* Coeff Band 3 */
{ 99, 194, 236 },
{ 91, 138, 224 },
{ 91, 53, 189 },
{ 74, 20, 142 },
{ 48, 6, 90 },
{ 22, 1, 41 }
}, { /* Coeff Band 4 */
{ 52, 203, 244 },
{ 60, 168, 231 },
{ 75, 62, 189 },
{ 61, 18, 132 },
{ 38, 4, 72 },
{ 17, 1, 39 }
}, { /* Coeff Band 5 */
{ 33, 192, 247 },
{ 31, 185, 234 },
{ 46, 85, 185 },
{ 39, 35, 132 },
{ 28, 15, 80 },
{ 13, 5, 38 }
}
}, { /* Inter */
{ /* Coeff Band 0 */
{ 5, 247, 246 },
{ 28, 209, 228 },
{ 65, 137, 203 }
}, { /* Coeff Band 1 */
{ 69, 208, 250 },
{ 54, 207, 242 },
{ 81, 92, 204 },
{ 70, 54, 153 },
{ 58, 40, 108 },
{ 58, 35, 71 }
}, { /* Coeff Band 2 */
{ 65, 215, 250 },
{ 72, 185, 239 },
{ 92, 50, 197 },
{ 75, 14, 147 },
{ 49, 2, 99 },
{ 26, 1, 53 }
}, { /* Coeff Band 3 */
{ 70, 220, 251 },
{ 76, 186, 241 },
{ 90, 65, 198 },
{ 75, 26, 151 },
{ 58, 12, 112 },
{ 34, 6, 49 }
}, { /* Coeff Band 4 */
{ 34, 224, 253 },
{ 44, 204, 245 },
{ 69, 85, 204 },
{ 64, 31, 150 },
{ 44, 2, 78 },
{ 1, 1, 128 }
}, { /* Coeff Band 5 */
{ 25, 216, 253 },
{ 21, 215, 248 },
{ 47, 108, 214 },
{ 47, 48, 160 },
{ 26, 20, 90 },
{ 64, 171, 128 }
}
}
}
};
static const vp9_coeff_probs_model default_coef_probs_8x8[BLOCK_TYPES] = {
{ /* block Type 0 */
{ /* Intra */
{ /* Coeff Band 0 */
{ 9, 203, 199 },
{ 26, 92, 128 },
{ 28, 11, 55 }
}, { /* Coeff Band 1 */
{ 99, 54, 160 },
{ 78, 99, 155 },
{ 80, 44, 138 },
{ 71, 17, 115 },
{ 51, 5, 80 },
{ 27, 1, 40 }
}, { /* Coeff Band 2 */
{ 135, 81, 190 },
{ 113, 61, 182 },
{ 93, 16, 153 },
{ 70, 4, 115 },
{ 41, 1, 68 },
{ 16, 1, 27 }
}, { /* Coeff Band 3 */
{ 155, 103, 214 },
{ 129, 48, 199 },
{ 95, 10, 159 },
{ 63, 1, 110 },
{ 32, 1, 58 },
{ 12, 1, 21 }
}, { /* Coeff Band 4 */
{ 163, 149, 231 },
{ 137, 69, 213 },
{ 95, 11, 164 },
{ 62, 3, 108 },
{ 32, 1, 57 },
{ 13, 1, 22 }
}, { /* Coeff Band 5 */
{ 136, 189, 239 },
{ 123, 102, 223 },
{ 97, 19, 170 },
{ 66, 4, 111 },
{ 38, 1, 60 },
{ 18, 1, 26 }
}
}, { /* Inter */
{ /* Coeff Band 0 */
{ 24, 226, 244 },
{ 54, 178, 211 },
{ 80, 74, 152 }
}, { /* Coeff Band 1 */
{ 145, 153, 236 },
{ 101, 163, 223 },
{ 108, 50, 187 },
{ 90, 22, 145 },
{ 66, 8, 97 },
{ 42, 4, 50 }
}, { /* Coeff Band 2 */
{ 150, 159, 238 },
{ 128, 90, 218 },
{ 94, 9, 163 },
{ 64, 3, 110 },
{ 34, 1, 61 },
{ 13, 1, 24 }
}, { /* Coeff Band 3 */
{ 151, 162, 242 },
{ 135, 80, 222 },
{ 93, 9, 166 },
{ 61, 3, 111 },
{ 31, 1, 59 },
{ 12, 1, 22 }
}, { /* Coeff Band 4 */
{ 161, 170, 245 },
{ 140, 84, 228 },
{ 99, 8, 174 },
{ 64, 1, 116 },
{ 34, 1, 63 },
{ 14, 1, 26 }
}, { /* Coeff Band 5 */
{ 138, 197, 246 },
{ 127, 109, 233 },
{ 100, 16, 179 },
{ 66, 3, 119 },
{ 37, 1, 66 },
{ 16, 1, 30 }
}
}
}, { /* block Type 1 */
{ /* Intra */
{ /* Coeff Band 0 */
{ 6, 216, 212 },
{ 25, 134, 171 },
{ 43, 48, 118 }
}, { /* Coeff Band 1 */
{ 93, 112, 209 },
{ 66, 159, 206 },
{ 82, 78, 184 },
{ 75, 28, 148 },
{ 46, 4, 82 },
{ 18, 1, 28 }
}, { /* Coeff Band 2 */
{ 108, 148, 220 },
{ 90, 130, 216 },
{ 92, 40, 186 },
{ 73, 10, 135 },
{ 46, 1, 79 },
{ 20, 1, 35 }
}, { /* Coeff Band 3 */
{ 125, 173, 232 },
{ 109, 117, 223 },
{ 97, 31, 183 },
{ 71, 7, 127 },
{ 44, 1, 76 },
{ 21, 1, 36 }
}, { /* Coeff Band 4 */
{ 133, 195, 236 },
{ 112, 121, 224 },
{ 97, 23, 178 },
{ 69, 3, 122 },
{ 42, 1, 72 },
{ 19, 1, 34 }
}, { /* Coeff Band 5 */
{ 132, 180, 238 },
{ 119, 102, 225 },
{ 101, 18, 179 },
{ 71, 3, 124 },
{ 42, 1, 70 },
{ 17, 1, 28 }
}
}, { /* Inter */
{ /* Coeff Band 0 */
{ 5, 242, 250 },
{ 26, 198, 226 },
{ 58, 98, 168 }
}, { /* Coeff Band 1 */
{ 82, 201, 246 },
{ 50, 219, 237 },
{ 94, 107, 205 },
{ 89, 61, 167 },
{ 77, 31, 131 },
{ 57, 14, 91 }
}, { /* Coeff Band 2 */
{ 99, 202, 247 },
{ 96, 165, 234 },
{ 100, 31, 190 },
{ 72, 8, 131 },
{ 41, 1, 72 },
{ 14, 1, 24 }
}, { /* Coeff Band 3 */
{ 108, 204, 248 },
{ 107, 156, 235 },
{ 103, 27, 186 },
{ 71, 4, 124 },
{ 39, 1, 66 },
{ 14, 1, 19 }
}, { /* Coeff Band 4 */
{ 120, 211, 248 },
{ 118, 149, 234 },
{ 107, 19, 182 },
{ 72, 3, 126 },
{ 40, 1, 69 },
{ 16, 1, 24 }
}, { /* Coeff Band 5 */
{ 127, 199, 245 },
{ 122, 125, 232 },
{ 112, 20, 186 },
{ 82, 3, 136 },
{ 55, 1, 88 },
{ 10, 1, 38 }
}
}
}
};
static const vp9_coeff_probs_model default_coef_probs_16x16[BLOCK_TYPES] = {
{ /* block Type 0 */
{ /* Intra */
{ /* Coeff Band 0 */
{ 25, 9, 101 },
{ 25, 2, 67 },
{ 15, 1, 28 }
}, { /* Coeff Band 1 */
{ 67, 30, 118 },
{ 61, 56, 116 },
{ 60, 31, 105 },
{ 52, 11, 85 },
{ 34, 2, 54 },
{ 14, 1, 22 }
}, { /* Coeff Band 2 */
{ 107, 58, 149 },
{ 92, 53, 147 },
{ 78, 14, 123 },
{ 56, 3, 87 },
{ 35, 1, 56 },
{ 17, 1, 27 }
}, { /* Coeff Band 3 */
{ 142, 61, 171 },
{ 111, 30, 162 },
{ 80, 4, 128 },
{ 53, 1, 87 },
{ 31, 1, 52 },
{ 14, 1, 24 }
}, { /* Coeff Band 4 */
{ 171, 73, 200 },
{ 129, 28, 184 },
{ 86, 3, 140 },
{ 54, 1, 90 },
{ 28, 1, 49 },
{ 12, 1, 21 }
}, { /* Coeff Band 5 */
{ 193, 129, 227 },
{ 148, 28, 200 },
{ 90, 2, 144 },
{ 53, 1, 90 },
{ 28, 1, 50 },
{ 13, 1, 22 }
}
}, { /* Inter */
{ /* Coeff Band 0 */
{ 60, 7, 234 },
{ 64, 4, 184 },
{ 56, 1, 104 }
}, { /* Coeff Band 1 */
{ 150, 111, 210 },
{ 87, 185, 202 },
{ 101, 81, 177 },
{ 90, 34, 142 },
{ 67, 11, 95 },
{ 38, 2, 51 }
}, { /* Coeff Band 2 */
{ 153, 139, 218 },
{ 120, 72, 195 },
{ 90, 11, 147 },
{ 63, 3, 101 },
{ 39, 1, 61 },
{ 20, 1, 33 }
}, { /* Coeff Band 3 */
{ 171, 132, 223 },
{ 131, 56, 200 },
{ 92, 6, 147 },
{ 58, 1, 95 },
{ 32, 1, 52 },
{ 14, 1, 23 }
}, { /* Coeff Band 4 */
{ 183, 137, 227 },
{ 139, 48, 204 },
{ 91, 3, 148 },
{ 55, 1, 91 },
{ 28, 1, 47 },
{ 13, 1, 21 }
}, { /* Coeff Band 5 */
{ 198, 149, 234 },
{ 153, 32, 208 },
{ 95, 2, 148 },
{ 55, 1, 90 },
{ 30, 1, 51 },
{ 16, 1, 25 }
}
}
}, { /* block Type 1 */
{ /* Intra */
{ /* Coeff Band 0 */
{ 7, 209, 217 },
{ 31, 106, 151 },
{ 40, 21, 86 }
}, { /* Coeff Band 1 */
{ 101, 71, 184 },
{ 74, 131, 177 },
{ 88, 50, 158 },
{ 78, 16, 129 },
{ 51, 2, 82 },
{ 18, 1, 29 }
}, { /* Coeff Band 2 */
{ 116, 115, 199 },
{ 102, 88, 191 },
{ 94, 22, 160 },
{ 74, 6, 122 },
{ 47, 1, 77 },
{ 18, 1, 30 }
}, { /* Coeff Band 3 */
{ 157, 124, 210 },
{ 130, 53, 201 },
{ 102, 10, 165 },
{ 73, 1, 120 },
{ 42, 1, 69 },
{ 16, 1, 27 }
}, { /* Coeff Band 4 */
{ 174, 147, 225 },
{ 134, 67, 212 },
{ 100, 10, 168 },
{ 66, 1, 111 },
{ 36, 1, 60 },
{ 16, 1, 27 }
}, { /* Coeff Band 5 */
{ 185, 165, 232 },
{ 147, 56, 214 },
{ 105, 5, 165 },
{ 66, 1, 108 },
{ 35, 1, 59 },
{ 16, 1, 27 }
}
}, { /* Inter */
{ /* Coeff Band 0 */
{ 3, 232, 245 },
{ 18, 162, 210 },
{ 38, 64, 131 }
}, { /* Coeff Band 1 */
{ 84, 187, 239 },
{ 35, 231, 231 },
{ 82, 150, 209 },
{ 87, 97, 181 },
{ 81, 64, 151 },
{ 67, 60, 119 }
}, { /* Coeff Band 2 */
{ 107, 185, 239 },
{ 100, 149, 224 },
{ 107, 34, 185 },
{ 83, 12, 141 },
{ 49, 4, 92 },
{ 21, 1, 40 }
}, { /* Coeff Band 3 */
{ 125, 184, 243 },
{ 121, 127, 228 },
{ 113, 25, 185 },
{ 82, 6, 134 },
{ 48, 1, 82 },
{ 26, 1, 38 }
}, { /* Coeff Band 4 */
{ 143, 185, 245 },
{ 133, 115, 231 },
{ 114, 14, 184 },
{ 77, 3, 126 },
{ 43, 1, 68 },
{ 34, 1, 40 }
}, { /* Coeff Band 5 */
{ 170, 194, 241 },
{ 151, 80, 226 },
{ 118, 9, 180 },
{ 81, 1, 130 },
{ 51, 1, 78 },
{ 18, 1, 49 }
}
}
}
};
static const vp9_coeff_probs_model default_coef_probs_32x32[BLOCK_TYPES] = {
{ /* block Type 0 */
{ /* Intra */
{ /* Coeff Band 0 */
{ 29, 42, 137 },
{ 26, 3, 60 },
{ 13, 1, 23 }
}, { /* Coeff Band 1 */
{ 69, 36, 122 },
{ 63, 57, 123 },
{ 60, 33, 112 },
{ 52, 11, 90 },
{ 32, 2, 52 },
{ 10, 1, 15 }
}, { /* Coeff Band 2 */
{ 107, 55, 143 },
{ 86, 69, 143 },
{ 74, 24, 116 },
{ 52, 5, 78 },
{ 29, 1, 44 },
{ 12, 1, 18 }
}, { /* Coeff Band 3 */
{ 137, 71, 160 },
{ 107, 34, 152 },
{ 73, 6, 114 },
{ 44, 1, 69 },
{ 25, 1, 40 },
{ 12, 1, 18 }
}, { /* Coeff Band 4 */
{ 165, 70, 174 },
{ 118, 24, 159 },
{ 74, 3, 117 },
{ 45, 1, 73 },
{ 26, 1, 43 },
{ 12, 1, 19 }
}, { /* Coeff Band 5 */
{ 220, 93, 223 },
{ 153, 10, 187 },
{ 86, 2, 131 },
{ 49, 1, 79 },
{ 26, 1, 43 },
{ 12, 1, 20 }
}
}, { /* Inter */
{ /* Coeff Band 0 */
{ 30, 58, 227 },
{ 35, 10, 172 },
{ 24, 23, 112 }
}, { /* Coeff Band 1 */
{ 117, 145, 219 },
{ 51, 221, 216 },
{ 75, 169, 196 },
{ 88, 96, 165 },
{ 77, 43, 117 },
{ 53, 18, 60 }
}, { /* Coeff Band 2 */
{ 128, 176, 225 },
{ 108, 114, 202 },
{ 92, 19, 152 },
{ 65, 4, 103 },
{ 38, 1, 61 },
{ 19, 1, 30 }
}, { /* Coeff Band 3 */
{ 146, 184, 228 },
{ 122, 95, 205 },
{ 92, 11, 149 },
{ 62, 1, 98 },
{ 35, 1, 57 },
{ 17, 1, 26 }
}, { /* Coeff Band 4 */
{ 165, 192, 230 },
{ 132, 81, 206 },
{ 93, 6, 147 },
{ 58, 1, 94 },
{ 32, 1, 52 },
{ 15, 1, 24 }
}, { /* Coeff Band 5 */
{ 204, 223, 234 },
{ 156, 49, 204 },
{ 97, 3, 145 },
{ 59, 1, 92 },
{ 33, 1, 52 },
{ 15, 1, 24 }
}
}
}, { /* block Type 1 */
{ /* Intra */
{ /* Coeff Band 0 */
{ 7, 184, 200 },
{ 25, 67, 113 },
{ 30, 9, 59 }
}, { /* Coeff Band 1 */
{ 92, 42, 158 },
{ 65, 121, 159 },
{ 77, 56, 146 },
{ 70, 22, 120 },
{ 47, 4, 76 },
{ 18, 1, 26 }
}, { /* Coeff Band 2 */
{ 113, 81, 177 },
{ 96, 75, 167 },
{ 84, 24, 136 },
{ 63, 8, 100 },
{ 37, 1, 58 },
{ 13, 1, 19 }
}, { /* Coeff Band 3 */
{ 147, 85, 194 },
{ 119, 36, 178 },
{ 88, 8, 139 },
{ 59, 1, 93 },
{ 31, 1, 49 },
{ 10, 1, 18 }
}, { /* Coeff Band 4 */
{ 169, 108, 210 },
{ 131, 41, 191 },
{ 92, 5, 144 },
{ 56, 1, 88 },
{ 29, 1, 47 },
{ 14, 1, 22 }
}, { /* Coeff Band 5 */
{ 210, 106, 223 },
{ 148, 14, 192 },
{ 89, 2, 138 },
{ 52, 1, 84 },
{ 29, 1, 47 },
{ 14, 1, 23 }
}
}, { /* Inter */
{ /* Coeff Band 0 */
{ 3, 207, 245 },
{ 12, 102, 213 },
{ 18, 33, 144 }
}, { /* Coeff Band 1 */
{ 85, 205, 245 },
{ 18, 249, 242 },
{ 59, 221, 229 },
{ 91, 166, 213 },
{ 88, 117, 183 },
{ 70, 95, 149 }
}, { /* Coeff Band 2 */
{ 114, 193, 241 },
{ 104, 155, 221 },
{ 100, 33, 181 },
{ 78, 10, 132 },
{ 43, 2, 75 },
{ 15, 1, 48 }
}, { /* Coeff Band 3 */
{ 118, 198, 244 },
{ 117, 142, 224 },
{ 111, 25, 179 },
{ 83, 4, 134 },
{ 57, 1, 84 },
{ 1, 1, 1 }
}, { /* Coeff Band 4 */
{ 144, 201, 248 },
{ 136, 130, 234 },
{ 124, 12, 188 },
{ 83, 1, 130 },
{ 61, 1, 66 },
{ 64, 171, 128 }
}, { /* Coeff Band 5 */
{ 174, 227, 250 },
{ 165, 118, 242 },
{ 132, 21, 197 },
{ 84, 3, 134 },
{ 70, 1, 69 },
{ 1, 1, 1 }
}
}
}
};
#else
static const vp9_coeff_probs_model default_coef_probs_4x4[BLOCK_TYPES] = {
{ /* block Type 0 */
{ /* Intra */
@@ -693,4 +1381,4 @@ static const vp9_coeff_probs_model default_coef_probs_32x32[BLOCK_TYPES] = {
}
}
};
#endif
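
Each innermost triple in the tables above holds only the UNCONSTRAINED_NODES probabilities; the remaining tree nodes are derived from the Pareto model. In sketch form:

/* Expand one 3-probability model entry into the full tree
 * distribution; vp9_model_to_full_probs() is defined in
 * vp9_entropy.c below. */
vp9_prob full[ENTROPY_NODES];
vp9_model_to_full_probs(default_coef_probs_4x4[0][0][1][0], full);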


@@ -15,8 +15,6 @@
#include "vpx_mem/vpx_mem.h"
#include "vpx/vpx_integer.h"
#define MODEL_NODES (ENTROPY_NODES - UNCONSTRAINED_NODES)
DECLARE_ALIGNED(16, const uint8_t, vp9_norm[256]) = {
0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
@@ -52,28 +50,28 @@ DECLARE_ALIGNED(16, const uint8_t, vp9_pt_energy_class[MAX_ENTROPY_TOKENS]) = {
0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 5, 5
};
DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_4x4[16]) = {
DECLARE_ALIGNED(16, const int, vp9_default_scan_4x4[16]) = {
0, 4, 1, 5,
8, 2, 12, 9,
3, 6, 13, 10,
7, 14, 11, 15,
};
DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_4x4[16]) = {
DECLARE_ALIGNED(16, const int, vp9_col_scan_4x4[16]) = {
0, 4, 8, 1,
12, 5, 9, 2,
13, 6, 10, 3,
7, 14, 11, 15,
};
DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_4x4[16]) = {
DECLARE_ALIGNED(16, const int, vp9_row_scan_4x4[16]) = {
0, 1, 4, 2,
5, 3, 6, 8,
9, 7, 12, 10,
13, 11, 14, 15,
};
DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_8x8[64]) = {
DECLARE_ALIGNED(64, const int, vp9_default_scan_8x8[64]) = {
0, 8, 1, 16, 9, 2, 17, 24,
10, 3, 18, 25, 32, 11, 4, 26,
33, 19, 40, 12, 34, 27, 5, 41,
@@ -84,7 +82,7 @@ DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_8x8[64]) = {
46, 39, 61, 54, 47, 62, 55, 63,
};
DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_8x8[64]) = {
DECLARE_ALIGNED(16, const int, vp9_col_scan_8x8[64]) = {
0, 8, 16, 1, 24, 9, 32, 17,
2, 40, 25, 10, 33, 18, 48, 3,
26, 41, 11, 56, 19, 34, 4, 49,
@@ -95,7 +93,7 @@ DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_8x8[64]) = {
31, 61, 39, 54, 47, 62, 55, 63,
};
DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_8x8[64]) = {
DECLARE_ALIGNED(16, const int, vp9_row_scan_8x8[64]) = {
0, 1, 2, 8, 9, 3, 16, 10,
4, 17, 11, 24, 5, 18, 25, 12,
19, 26, 32, 6, 13, 20, 33, 27,
@@ -106,7 +104,7 @@ DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_8x8[64]) = {
60, 39, 61, 47, 54, 55, 62, 63,
};
DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_16x16[256]) = {
DECLARE_ALIGNED(16, const int, vp9_default_scan_16x16[256]) = {
0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 34, 49, 19, 65, 80,
50, 4, 35, 66, 20, 81, 96, 51, 5, 36, 82, 97, 67, 112, 21, 52,
98, 37, 83, 113, 6, 68, 128, 53, 22, 99, 114, 84, 7, 129, 38, 69,
@@ -125,7 +123,7 @@ DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_16x16[256]) = {
190, 221, 175, 236, 237, 191, 206, 252, 222, 253, 207, 238, 223, 254, 239, 255,
};
DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_16x16[256]) = {
DECLARE_ALIGNED(16, const int, vp9_col_scan_16x16[256]) = {
0, 16, 32, 48, 1, 64, 17, 80, 33, 96, 49, 2, 65, 112, 18, 81,
34, 128, 50, 97, 3, 66, 144, 19, 113, 35, 82, 160, 98, 51, 129, 4,
67, 176, 20, 114, 145, 83, 36, 99, 130, 52, 192, 5, 161, 68, 115, 21,
@@ -144,7 +142,7 @@ DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_16x16[256]) = {
159, 190, 221, 252, 175, 206, 237, 191, 253, 222, 238, 207, 254, 223, 239, 255,
};
DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_16x16[256]) = {
DECLARE_ALIGNED(16, const int, vp9_row_scan_16x16[256]) = {
0, 1, 2, 16, 3, 17, 4, 18, 32, 5, 33, 19, 6, 34, 48, 20,
49, 7, 35, 21, 50, 64, 8, 36, 65, 22, 51, 37, 80, 9, 66, 52,
23, 38, 81, 67, 10, 53, 24, 82, 68, 96, 39, 11, 54, 83, 97, 69,
@@ -163,7 +161,7 @@ DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_16x16[256]) = {
190, 251, 221, 191, 206, 236, 207, 237, 252, 222, 253, 223, 238, 239, 254, 255,
};
DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_32x32[1024]) = {
DECLARE_ALIGNED(16, const int, vp9_default_scan_32x32[1024]) = {
0, 32, 1, 64, 33, 2, 96, 65, 34, 128, 3, 97, 66, 160, 129, 35, 98, 4, 67, 130, 161, 192, 36, 99, 224, 5, 162, 193, 68, 131, 37, 100,
225, 194, 256, 163, 69, 132, 6, 226, 257, 288, 195, 101, 164, 38, 258, 7, 227, 289, 133, 320, 70, 196, 165, 290, 259, 228, 39, 321, 102, 352, 8, 197,
71, 134, 322, 291, 260, 353, 384, 229, 166, 103, 40, 354, 323, 292, 135, 385, 198, 261, 72, 9, 416, 167, 386, 355, 230, 324, 104, 293, 41, 417, 199, 136,
@@ -202,8 +200,13 @@ DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_32x32[1024]) = {
const vp9_tree_index vp9_coef_tree[ 22] = /* corresponding _CONTEXT_NODEs */
{
#if CONFIG_BALANCED_COEFTREE
-ZERO_TOKEN, 2, /* 0 = ZERO */
-DCT_EOB_TOKEN, 4, /* 1 = EOB */
#else
-DCT_EOB_TOKEN, 2, /* 0 = EOB */
-ZERO_TOKEN, 4, /* 1 = ZERO */
#endif
-ONE_TOKEN, 6, /* 2 = ONE */
8, 12, /* 3 = LOW_VAL */
-TWO_TOKEN, 10, /* 4 = TWO */
@@ -230,8 +233,13 @@ static const vp9_prob Pcat6[] = {
};
const vp9_tree_index vp9_coefmodel_tree[6] = {
#if CONFIG_BALANCED_COEFTREE
-ZERO_TOKEN, 2,
-DCT_EOB_MODEL_TOKEN, 4,
#else
-DCT_EOB_MODEL_TOKEN, 2, /* 0 = EOB */
-ZERO_TOKEN, 4, /* 1 = ZERO */
#endif
-ONE_TOKEN, -TWO_TOKEN,
};
@@ -244,7 +252,7 @@ const vp9_tree_index vp9_coefmodel_tree[6] = {
// the probabilities for the rest of the nodes.
// beta = 8
static const vp9_prob modelcoefprobs_pareto8[COEFPROB_MODELS][MODEL_NODES] = {
const vp9_prob vp9_modelcoefprobs_pareto8[COEFPROB_MODELS][MODEL_NODES] = {
{ 3, 86, 128, 6, 86, 23, 88, 29},
{ 9, 86, 129, 17, 88, 61, 94, 76},
{ 15, 87, 129, 28, 89, 93, 100, 110},
@@ -378,7 +386,8 @@ static const vp9_prob modelcoefprobs_pareto8[COEFPROB_MODELS][MODEL_NODES] = {
static void extend_model_to_full_distribution(vp9_prob p,
vp9_prob *tree_probs) {
const int l = ((p - 1) / 2);
const vp9_prob (*model)[MODEL_NODES] = modelcoefprobs_pareto8;
const vp9_prob (*model)[MODEL_NODES];
model = vp9_modelcoefprobs_pareto8;
if (p & 1) {
vpx_memcpy(tree_probs + UNCONSTRAINED_NODES,
model[l], MODEL_NODES * sizeof(vp9_prob));
@@ -397,6 +406,16 @@ void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full) {
extend_model_to_full_distribution(model[PIVOT_NODE], full);
}
void vp9_model_to_full_probs_sb(
vp9_prob model[COEF_BANDS][PREV_COEF_CONTEXTS][UNCONSTRAINED_NODES],
vp9_prob full[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES]) {
int c, p;
for (c = 0; c < COEF_BANDS; ++c)
for (p = 0; p < PREV_COEF_CONTEXTS; ++p) {
vp9_model_to_full_probs(model[c][p], full[c][p]);
}
}
static vp9_tree_index cat1[2], cat2[4], cat3[6], cat4[8], cat5[10], cat6[28];
static void init_bit_tree(vp9_tree_index *p, int n) {
@@ -419,7 +438,7 @@ static void init_bit_trees() {
init_bit_tree(cat6, 14);
}
const vp9_extra_bit vp9_extra_bits[12] = {
vp9_extra_bit vp9_extra_bits[12] = {
{ 0, 0, 0, 0},
{ 0, 0, 0, 1},
{ 0, 0, 0, 2},
@@ -436,50 +455,69 @@ const vp9_extra_bit vp9_extra_bits[12] = {
#include "vp9/common/vp9_default_coef_probs.h"
// This function updates and then returns the AC coefficient context.
// This is currently a placeholder function to allow experimentation
// using various context models based on the energy of earlier tokens
// within the current block.
//
// For now it just returns the previously used context.
#define MAX_NEIGHBORS 2
int vp9_get_coef_context(const int *scan, const int *neighbors,
int nb_pad, uint8_t *token_cache, int c, int l) {
int eob = l;
assert(nb_pad == MAX_NEIGHBORS);
if (c == eob) {
return 0;
} else {
int ctx;
assert(neighbors[MAX_NEIGHBORS * c + 0] >= 0);
if (neighbors[MAX_NEIGHBORS * c + 1] >= 0) {
ctx = (1 + token_cache[scan[neighbors[MAX_NEIGHBORS * c + 0]]] +
token_cache[scan[neighbors[MAX_NEIGHBORS * c + 1]]]) >> 1;
} else {
ctx = token_cache[scan[neighbors[MAX_NEIGHBORS * c + 0]]];
}
return ctx;
}
}
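
A worked example of the context rule (cache contents hypothetical):

/* With both neighbors valid and cached energy classes 2 and 3, the
 * context is (1 + 2 + 3) >> 1 == 3; with a single valid neighbor of
 * class 2, the context is simply 2; at c == eob it is always 0. */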
void vp9_default_coef_probs(VP9_COMMON *pc) {
vp9_copy(pc->fc.coef_probs[TX_4X4], default_coef_probs_4x4);
vp9_copy(pc->fc.coef_probs[TX_8X8], default_coef_probs_8x8);
vp9_copy(pc->fc.coef_probs[TX_16X16], default_coef_probs_16x16);
vp9_copy(pc->fc.coef_probs[TX_32X32], default_coef_probs_32x32);
vpx_memcpy(pc->fc.coef_probs[TX_4X4], default_coef_probs_4x4,
sizeof(pc->fc.coef_probs[TX_4X4]));
vpx_memcpy(pc->fc.coef_probs[TX_8X8], default_coef_probs_8x8,
sizeof(pc->fc.coef_probs[TX_8X8]));
vpx_memcpy(pc->fc.coef_probs[TX_16X16], default_coef_probs_16x16,
sizeof(pc->fc.coef_probs[TX_16X16]));
vpx_memcpy(pc->fc.coef_probs[TX_32X32], default_coef_probs_32x32,
sizeof(pc->fc.coef_probs[TX_32X32]));
}
// Neighborhood 2-tuples for various scans and block sizes,
// in {top, left} order
// for each position in raster scan order.
// -1 indicates the neighbor does not exist.
DECLARE_ALIGNED(16, int16_t,
vp9_default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
DECLARE_ALIGNED(16, int16_t,
vp9_col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
DECLARE_ALIGNED(16, int16_t,
vp9_row_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
DECLARE_ALIGNED(16, int16_t,
vp9_col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
DECLARE_ALIGNED(16, int16_t,
vp9_row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
DECLARE_ALIGNED(16, int16_t,
vp9_default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
DECLARE_ALIGNED(16, int16_t,
vp9_col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
DECLARE_ALIGNED(16, int16_t,
vp9_row_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
DECLARE_ALIGNED(16, int16_t,
vp9_default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
DECLARE_ALIGNED(16, int16_t,
vp9_default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]);
DECLARE_ALIGNED(16, int,
vp9_default_scan_4x4_neighbors[16 * MAX_NEIGHBORS]);
DECLARE_ALIGNED(16, int,
vp9_col_scan_4x4_neighbors[16 * MAX_NEIGHBORS]);
DECLARE_ALIGNED(16, int,
vp9_row_scan_4x4_neighbors[16 * MAX_NEIGHBORS]);
DECLARE_ALIGNED(16, int,
vp9_col_scan_8x8_neighbors[64 * MAX_NEIGHBORS]);
DECLARE_ALIGNED(16, int,
vp9_row_scan_8x8_neighbors[64 * MAX_NEIGHBORS]);
DECLARE_ALIGNED(16, int,
vp9_default_scan_8x8_neighbors[64 * MAX_NEIGHBORS]);
DECLARE_ALIGNED(16, int,
vp9_col_scan_16x16_neighbors[256 * MAX_NEIGHBORS]);
DECLARE_ALIGNED(16, int,
vp9_row_scan_16x16_neighbors[256 * MAX_NEIGHBORS]);
DECLARE_ALIGNED(16, int,
vp9_default_scan_16x16_neighbors[256 * MAX_NEIGHBORS]);
DECLARE_ALIGNED(16, int,
vp9_default_scan_32x32_neighbors[1024 * MAX_NEIGHBORS]);
DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_4x4[16]);
DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_4x4[16]);
DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_4x4[16]);
DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_8x8[64]);
DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_8x8[64]);
DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_8x8[64]);
DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_16x16[256]);
DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_16x16[256]);
DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_16x16[256]);
DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_32x32[1024]);
static int find_in_scan(const int16_t *scan, int l, int idx) {
static int find_in_scan(const int *scan, int l, int idx) {
int n, l2 = l * l;
for (n = 0; n < l2; n++) {
int rc = scan[n];
@@ -489,19 +527,14 @@ static int find_in_scan(const int16_t *scan, int l, int idx) {
assert(0);
return -1;
}
static void init_scan_neighbors(const int16_t *scan,
int16_t *iscan,
int l, int16_t *neighbors) {
static void init_scan_neighbors(const int *scan, int l, int *neighbors,
int max_neighbors) {
int l2 = l * l;
int n, i, j;
// dc doesn't use this type of prediction
neighbors[MAX_NEIGHBORS * 0 + 0] = 0;
neighbors[MAX_NEIGHBORS * 0 + 1] = 0;
iscan[0] = find_in_scan(scan, l, 0);
for (n = 1; n < l2; n++) {
for (n = 0; n < l2; n++) {
int rc = scan[n];
iscan[n] = find_in_scan(scan, l, n);
assert(max_neighbors == MAX_NEIGHBORS);
i = rc / l;
j = rc % l;
if (i > 0 && j > 0) {
@@ -513,84 +546,93 @@ static void init_scan_neighbors(const int16_t *scan,
// Therefore, if we use ADST/DCT, prefer the DCT neighbor coeff
// as a context. If ADST or DCT is used in both directions, we
// use the combination of the two as a context.
int a = (i - 1) * l + j;
int b = i * l + j - 1;
int a = find_in_scan(scan, l, (i - 1) * l + j);
int b = find_in_scan(scan, l, i * l + j - 1);
if (scan == vp9_col_scan_4x4 || scan == vp9_col_scan_8x8 ||
scan == vp9_col_scan_16x16) {
// in the col/row scan cases (as well as left/top edge cases), we set
// both contexts to the same value, so we can branchlessly do a+b+1>>1
// which automatically becomes a if a == b
neighbors[MAX_NEIGHBORS * n + 0] =
neighbors[MAX_NEIGHBORS * n + 1] = a;
neighbors[max_neighbors * n + 0] = a;
neighbors[max_neighbors * n + 1] = -1;
} else if (scan == vp9_row_scan_4x4 || scan == vp9_row_scan_8x8 ||
scan == vp9_row_scan_16x16) {
neighbors[MAX_NEIGHBORS * n + 0] =
neighbors[MAX_NEIGHBORS * n + 1] = b;
neighbors[max_neighbors * n + 0] = b;
neighbors[max_neighbors * n + 1] = -1;
} else {
neighbors[MAX_NEIGHBORS * n + 0] = a;
neighbors[MAX_NEIGHBORS * n + 1] = b;
neighbors[max_neighbors * n + 0] = a;
neighbors[max_neighbors * n + 1] = b;
}
} else if (i > 0) {
neighbors[MAX_NEIGHBORS * n + 0] =
neighbors[MAX_NEIGHBORS * n + 1] = (i - 1) * l + j;
neighbors[max_neighbors * n + 0] = find_in_scan(scan, l, (i - 1) * l + j);
neighbors[max_neighbors * n + 1] = -1;
} else if (j > 0) {
neighbors[max_neighbors * n + 0] =
find_in_scan(scan, l, i * l + j - 1);
neighbors[max_neighbors * n + 1] = -1;
} else {
assert(j > 0);
neighbors[MAX_NEIGHBORS * n + 0] =
neighbors[MAX_NEIGHBORS * n + 1] = i * l + j - 1;
assert(n == 0);
// dc predictor doesn't use previous tokens
neighbors[max_neighbors * n + 0] = -1;
}
assert(iscan[neighbors[MAX_NEIGHBORS * n + 0]] < n);
assert(neighbors[max_neighbors * n + 0] < n);
}
// one padding item so we don't have to add branches in code to handle
// calls to get_coef_context() for the token after the final dc token
neighbors[MAX_NEIGHBORS * l2 + 0] = 0;
neighbors[MAX_NEIGHBORS * l2 + 1] = 0;
}
void vp9_init_neighbors() {
init_scan_neighbors(vp9_default_scan_4x4, vp9_default_iscan_4x4, 4,
vp9_default_scan_4x4_neighbors);
init_scan_neighbors(vp9_row_scan_4x4, vp9_row_iscan_4x4, 4,
vp9_row_scan_4x4_neighbors);
init_scan_neighbors(vp9_col_scan_4x4, vp9_col_iscan_4x4, 4,
vp9_col_scan_4x4_neighbors);
init_scan_neighbors(vp9_default_scan_8x8, vp9_default_iscan_8x8, 8,
vp9_default_scan_8x8_neighbors);
init_scan_neighbors(vp9_row_scan_8x8, vp9_row_iscan_8x8, 8,
vp9_row_scan_8x8_neighbors);
init_scan_neighbors(vp9_col_scan_8x8, vp9_col_iscan_8x8, 8,
vp9_col_scan_8x8_neighbors);
init_scan_neighbors(vp9_default_scan_16x16, vp9_default_iscan_16x16, 16,
vp9_default_scan_16x16_neighbors);
init_scan_neighbors(vp9_row_scan_16x16, vp9_row_iscan_16x16, 16,
vp9_row_scan_16x16_neighbors);
init_scan_neighbors(vp9_col_scan_16x16, vp9_col_iscan_16x16, 16,
vp9_col_scan_16x16_neighbors);
init_scan_neighbors(vp9_default_scan_32x32, vp9_default_iscan_32x32, 32,
vp9_default_scan_32x32_neighbors);
init_scan_neighbors(vp9_default_scan_4x4, 4,
vp9_default_scan_4x4_neighbors, MAX_NEIGHBORS);
init_scan_neighbors(vp9_row_scan_4x4, 4,
vp9_row_scan_4x4_neighbors, MAX_NEIGHBORS);
init_scan_neighbors(vp9_col_scan_4x4, 4,
vp9_col_scan_4x4_neighbors, MAX_NEIGHBORS);
init_scan_neighbors(vp9_default_scan_8x8, 8,
vp9_default_scan_8x8_neighbors, MAX_NEIGHBORS);
init_scan_neighbors(vp9_row_scan_8x8, 8,
vp9_row_scan_8x8_neighbors, MAX_NEIGHBORS);
init_scan_neighbors(vp9_col_scan_8x8, 8,
vp9_col_scan_8x8_neighbors, MAX_NEIGHBORS);
init_scan_neighbors(vp9_default_scan_16x16, 16,
vp9_default_scan_16x16_neighbors, MAX_NEIGHBORS);
init_scan_neighbors(vp9_row_scan_16x16, 16,
vp9_row_scan_16x16_neighbors, MAX_NEIGHBORS);
init_scan_neighbors(vp9_col_scan_16x16, 16,
vp9_col_scan_16x16_neighbors, MAX_NEIGHBORS);
init_scan_neighbors(vp9_default_scan_32x32, 32,
vp9_default_scan_32x32_neighbors, MAX_NEIGHBORS);
}
const int16_t *vp9_get_coef_neighbors_handle(const int16_t *scan) {
const int *vp9_get_coef_neighbors_handle(const int *scan, int *pad) {
if (scan == vp9_default_scan_4x4) {
*pad = MAX_NEIGHBORS;
return vp9_default_scan_4x4_neighbors;
} else if (scan == vp9_row_scan_4x4) {
*pad = MAX_NEIGHBORS;
return vp9_row_scan_4x4_neighbors;
} else if (scan == vp9_col_scan_4x4) {
*pad = MAX_NEIGHBORS;
return vp9_col_scan_4x4_neighbors;
} else if (scan == vp9_default_scan_8x8) {
*pad = MAX_NEIGHBORS;
return vp9_default_scan_8x8_neighbors;
} else if (scan == vp9_row_scan_8x8) {
*pad = 2;
return vp9_row_scan_8x8_neighbors;
} else if (scan == vp9_col_scan_8x8) {
*pad = 2;
return vp9_col_scan_8x8_neighbors;
} else if (scan == vp9_default_scan_16x16) {
*pad = MAX_NEIGHBORS;
return vp9_default_scan_16x16_neighbors;
} else if (scan == vp9_row_scan_16x16) {
*pad = 2;
return vp9_row_scan_16x16_neighbors;
} else if (scan == vp9_col_scan_16x16) {
*pad = 2;
return vp9_col_scan_16x16_neighbors;
} else {
assert(scan == vp9_default_scan_32x32);
} else if (scan == vp9_default_scan_32x32) {
*pad = MAX_NEIGHBORS;
return vp9_default_scan_32x32_neighbors;
} else {
assert(0);
return NULL;
}
}
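
Usage shape (sketch): callers recover both the neighbor table and its pad for a given scan order:

int pad;
const int *nb = vp9_get_coef_neighbors_handle(vp9_default_scan_8x8, &pad);
/* pad == MAX_NEIGHBORS (2) here; the row/col 8x8 and 16x16 scans
 * likewise report 2. */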
@@ -609,17 +651,40 @@ void vp9_coef_tree_initialize() {
#define COEF_COUNT_SAT_AFTER_KEY 24
#define COEF_MAX_UPDATE_FACTOR_AFTER_KEY 128
static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE tx_size,
unsigned int count_sat,
unsigned int update_factor) {
FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx];
void vp9_full_to_model_count(unsigned int *model_count,
unsigned int *full_count) {
int n;
model_count[ZERO_TOKEN] = full_count[ZERO_TOKEN];
model_count[ONE_TOKEN] = full_count[ONE_TOKEN];
model_count[TWO_TOKEN] = full_count[TWO_TOKEN];
for (n = THREE_TOKEN; n < DCT_EOB_TOKEN; ++n)
model_count[TWO_TOKEN] += full_count[n];
model_count[DCT_EOB_MODEL_TOKEN] = full_count[DCT_EOB_TOKEN];
}
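
A small worked example of the folding (counts hypothetical):

/* full: ZERO=10, ONE=5, TWO=3, THREE=2, FOUR=1, all CAT tokens 0,
 * EOB=7  =>  model: ZERO=10, ONE=5, TWO=3+2+1=6, EOB=7; every token
 * >= THREE_TOKEN lands in the TWO_TOKEN bin. */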
vp9_coeff_probs_model *dst_coef_probs = cm->fc.coef_probs[tx_size];
vp9_coeff_probs_model *pre_coef_probs = pre_fc->coef_probs[tx_size];
vp9_coeff_count_model *coef_counts = cm->counts.coef[tx_size];
void vp9_full_to_model_counts(
vp9_coeff_count_model *model_count, vp9_coeff_count *full_count) {
int i, j, k, l;
for (i = 0; i < BLOCK_TYPES; ++i)
for (j = 0; j < REF_TYPES; ++j)
for (k = 0; k < COEF_BANDS; ++k)
for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
if (l >= 3 && k == 0)
continue;
vp9_full_to_model_count(model_count[i][j][k][l],
full_count[i][j][k][l]);
}
}
static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE txfm_size,
int count_sat, int update_factor) {
vp9_coeff_probs_model *dst_coef_probs = cm->fc.coef_probs[txfm_size];
vp9_coeff_probs_model *pre_coef_probs = cm->fc.pre_coef_probs[txfm_size];
vp9_coeff_count_model *coef_counts = cm->fc.coef_counts[txfm_size];
unsigned int (*eob_branch_count)[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] =
cm->counts.eob_branch[tx_size];
int t, i, j, k, l;
cm->fc.eob_branch_counts[txfm_size];
int t, i, j, k, l, count;
int factor;
unsigned int branch_ct[UNCONSTRAINED_NODES][2];
vp9_prob coef_probs[UNCONSTRAINED_NODES];
int entropy_nodes_adapt = UNCONSTRAINED_NODES;
@@ -630,23 +695,34 @@ static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE tx_size,
for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
if (l >= 3 && k == 0)
continue;
vp9_tree_probs_from_distribution(vp9_coefmodel_tree, coef_probs,
branch_ct, coef_counts[i][j][k][l],
0);
vp9_tree_probs_from_distribution(
vp9_coefmodel_tree,
coef_probs, branch_ct,
coef_counts[i][j][k][l], 0);
#if CONFIG_BALANCED_COEFTREE
branch_ct[1][1] = eob_branch_count[i][j][k][l] - branch_ct[1][0];
coef_probs[1] = get_binary_prob(branch_ct[1][0], branch_ct[1][1]);
#else
branch_ct[0][1] = eob_branch_count[i][j][k][l] - branch_ct[0][0];
coef_probs[0] = get_binary_prob(branch_ct[0][0], branch_ct[0][1]);
for (t = 0; t < entropy_nodes_adapt; ++t)
dst_coef_probs[i][j][k][l][t] = merge_probs(
pre_coef_probs[i][j][k][l][t], coef_probs[t],
branch_ct[t], count_sat, update_factor);
#endif
for (t = 0; t < entropy_nodes_adapt; ++t) {
count = branch_ct[t][0] + branch_ct[t][1];
count = count > count_sat ? count_sat : count;
factor = (update_factor * count / count_sat);
dst_coef_probs[i][j][k][l][t] =
weighted_prob(pre_coef_probs[i][j][k][l][t],
coef_probs[t], factor);
}
}
}
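
Both variants above funnel into the same update rule: saturate the branch count at count_sat, scale update_factor proportionally, then blend the previous and freshly measured probabilities in 1/256 units. A self-contained sketch, assuming the helpers behave as in libvpx (8-bit probabilities, rounded blend); the constants and counts are illustrative:

#include <stdio.h>

typedef unsigned char vp9_prob;

static vp9_prob binary_prob(unsigned n0, unsigned n1) {
  const unsigned den = n0 + n1;
  unsigned p = den ? (n0 * 256 + (den >> 1)) / den : 128;
  if (p < 1) p = 1;
  if (p > 255) p = 255;
  return (vp9_prob)p;
}

static vp9_prob blend_prob(vp9_prob pre, vp9_prob cur, int factor) {
  /* Assumed to match weighted_prob(): rounded 8-bit interpolation. */
  return (vp9_prob)((pre * (256 - factor) + cur * factor + 128) >> 8);
}

int main(void) {
  const unsigned ct[2] = { 30, 10 };   /* branch counts, illustrative   */
  const unsigned count_sat = 24;       /* cf. COEF_COUNT_SAT_AFTER_KEY  */
  const unsigned update_factor = 128;  /* cf. ..._UPDATE_FACTOR_AFTER_KEY */
  unsigned count = ct[0] + ct[1];
  int factor;
  if (count > count_sat)
    count = count_sat;                 /* saturate */
  factor = (int)(update_factor * count / count_sat);
  printf("adapted = %u\n", blend_prob(200, binary_prob(ct[0], ct[1]), factor));
  return 0;
}
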
void vp9_adapt_coef_probs(VP9_COMMON *cm) {
TX_SIZE t;
unsigned int count_sat, update_factor;
int count_sat;
int update_factor; /* denominator 256 */
if (cm->frame_type == KEY_FRAME || cm->intra_only) {
if ((cm->frame_type == KEY_FRAME) || cm->intra_only) {
update_factor = COEF_MAX_UPDATE_FACTOR_KEY;
count_sat = COEF_COUNT_SAT_KEY;
} else if (cm->last_frame_type == KEY_FRAME) {


@@ -50,7 +50,9 @@ typedef struct {
int base_val;
} vp9_extra_bit;
extern const vp9_extra_bit vp9_extra_bits[12]; /* indexed by token value */
extern vp9_extra_bit vp9_extra_bits[12]; /* indexed by token value */
#define PROB_UPDATE_BASELINE_COST 7
#define MAX_PROB 255
#define DCT_MAX_VALUE 16384
@@ -80,6 +82,7 @@ extern const vp9_extra_bit vp9_extra_bits[12]; /* indexed by token value */
coefficient band (and since zigzag positions 0, 1, and 2 are in
distinct bands). */
/*# define DC_TOKEN_CONTEXTS 3*/ /* 00, 0!0, !0!0 */
#define PREV_COEF_CONTEXTS 6
// #define ENTROPY_STATS
@@ -96,62 +99,22 @@ typedef vp9_prob vp9_coeff_probs[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS]
struct VP9Common;
void vp9_default_coef_probs(struct VP9Common *);
extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_4x4[16]);
extern DECLARE_ALIGNED(16, const int, vp9_default_scan_4x4[16]);
extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_4x4[16]);
extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_4x4[16]);
extern DECLARE_ALIGNED(16, const int, vp9_col_scan_4x4[16]);
extern DECLARE_ALIGNED(16, const int, vp9_row_scan_4x4[16]);
extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_8x8[64]);
extern DECLARE_ALIGNED(64, const int, vp9_default_scan_8x8[64]);
extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_8x8[64]);
extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_8x8[64]);
extern DECLARE_ALIGNED(16, const int, vp9_col_scan_8x8[64]);
extern DECLARE_ALIGNED(16, const int, vp9_row_scan_8x8[64]);
extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_16x16[256]);
extern DECLARE_ALIGNED(16, const int, vp9_default_scan_16x16[256]);
extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_16x16[256]);
extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_16x16[256]);
extern DECLARE_ALIGNED(16, const int, vp9_col_scan_16x16[256]);
extern DECLARE_ALIGNED(16, const int, vp9_row_scan_16x16[256]);
extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_32x32[1024]);
extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_4x4[16]);
extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_4x4[16]);
extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_4x4[16]);
extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_8x8[64]);
extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_8x8[64]);
extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_8x8[64]);
extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_16x16[256]);
extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_16x16[256]);
extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_16x16[256]);
extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_32x32[1024]);
#define MAX_NEIGHBORS 2
extern DECLARE_ALIGNED(16, int16_t,
vp9_default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
extern DECLARE_ALIGNED(16, int16_t,
vp9_col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
extern DECLARE_ALIGNED(16, int16_t,
vp9_row_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
extern DECLARE_ALIGNED(16, int16_t,
vp9_col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
extern DECLARE_ALIGNED(16, int16_t,
vp9_row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
extern DECLARE_ALIGNED(16, int16_t,
vp9_default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
extern DECLARE_ALIGNED(16, int16_t,
vp9_col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
extern DECLARE_ALIGNED(16, int16_t,
vp9_row_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
extern DECLARE_ALIGNED(16, int16_t,
vp9_default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
extern DECLARE_ALIGNED(16, int16_t,
vp9_default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]);
extern DECLARE_ALIGNED(16, const int, vp9_default_scan_32x32[1024]);
void vp9_coef_tree_initialize(void);
void vp9_adapt_coef_probs(struct VP9Common *);
@@ -185,14 +148,9 @@ static int get_coef_band(const uint8_t * band_translate, int coef_index) {
? (COEF_BANDS-1) : band_translate[coef_index];
}
static INLINE int get_coef_context(const int16_t *neighbors,
uint8_t *token_cache,
int c) {
return (1 + token_cache[neighbors[MAX_NEIGHBORS * c + 0]] +
token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >> 1;
}
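
The replacement context derivation is tiny: average the token_cache values of the two already-coded neighbors of scan position c, with a +1 bias before the halving. A runnable restatement (the neighbor table and cache contents below are made up):

#include <stdio.h>
#include <stdint.h>

#define MAX_NEIGHBORS 2

/* Mirrors the inline above: ctx = (1 + cache[nb0] + cache[nb1]) >> 1. */
static int coef_context(const int16_t *neighbors,
                        const uint8_t *token_cache, int c) {
  return (1 + token_cache[neighbors[MAX_NEIGHBORS * c + 0]] +
          token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >> 1;
}

int main(void) {
  /* Position 2 looks at the cached token magnitudes of positions 0 and 1. */
  const int16_t neighbors[] = { 0, 0,  0, 0,  0, 1 };
  const uint8_t token_cache[] = { 2, 1 };
  printf("ctx = %d\n", coef_context(neighbors, token_cache, 2));
  /* (1 + 2 + 1) >> 1 = 2 */
  return 0;
}
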
const int16_t *vp9_get_coef_neighbors_handle(const int16_t *scan);
extern int vp9_get_coef_context(const int *scan, const int *neighbors,
int nb_pad, uint8_t *token_cache, int c, int l);
const int *vp9_get_coef_neighbors_handle(const int *scan, int *pad);
// 128 lists of probabilities are stored for the following ONE node probs:
@@ -202,6 +160,7 @@ const int16_t *vp9_get_coef_neighbors_handle(const int16_t *scan);
#define COEFPROB_MODELS 128
#define UNCONSTRAINED_NODES 3
#define MODEL_NODES (ENTROPY_NODES - UNCONSTRAINED_NODES)
#define PIVOT_NODE 2 // which node is pivot
@@ -215,10 +174,20 @@ typedef unsigned int vp9_coeff_count_model[REF_TYPES][COEF_BANDS]
typedef unsigned int vp9_coeff_stats_model[REF_TYPES][COEF_BANDS]
[PREV_COEF_CONTEXTS]
[UNCONSTRAINED_NODES][2];
extern void vp9_full_to_model_count(unsigned int *model_count,
unsigned int *full_count);
extern void vp9_full_to_model_counts(
vp9_coeff_count_model *model_count, vp9_coeff_count *full_count);
void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full);
static INLINE const int16_t* get_scan_4x4(TX_TYPE tx_type) {
void vp9_model_to_full_probs_sb(
vp9_prob model[COEF_BANDS][PREV_COEF_CONTEXTS][UNCONSTRAINED_NODES],
vp9_prob full[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES]);
extern const vp9_prob vp9_modelcoefprobs[COEFPROB_MODELS][ENTROPY_NODES - 1];
static INLINE const int* get_scan_4x4(TX_TYPE tx_type) {
switch (tx_type) {
case ADST_DCT:
return vp9_row_scan_4x4;
@@ -229,36 +198,7 @@ static INLINE const int16_t* get_scan_4x4(TX_TYPE tx_type) {
}
}
static INLINE void get_scan_nb_4x4(TX_TYPE tx_type,
const int16_t **scan, const int16_t **nb) {
switch (tx_type) {
case ADST_DCT:
*scan = vp9_row_scan_4x4;
*nb = vp9_row_scan_4x4_neighbors;
break;
case DCT_ADST:
*scan = vp9_col_scan_4x4;
*nb = vp9_col_scan_4x4_neighbors;
break;
default:
*scan = vp9_default_scan_4x4;
*nb = vp9_default_scan_4x4_neighbors;
break;
}
}
static INLINE const int16_t* get_iscan_4x4(TX_TYPE tx_type) {
switch (tx_type) {
case ADST_DCT:
return vp9_row_iscan_4x4;
case DCT_ADST:
return vp9_col_iscan_4x4;
default:
return vp9_default_iscan_4x4;
}
}
static INLINE const int16_t* get_scan_8x8(TX_TYPE tx_type) {
static INLINE const int* get_scan_8x8(TX_TYPE tx_type) {
switch (tx_type) {
case ADST_DCT:
return vp9_row_scan_8x8;
@@ -269,36 +209,7 @@ static INLINE const int16_t* get_scan_8x8(TX_TYPE tx_type) {
}
}
static INLINE void get_scan_nb_8x8(TX_TYPE tx_type,
const int16_t **scan, const int16_t **nb) {
switch (tx_type) {
case ADST_DCT:
*scan = vp9_row_scan_8x8;
*nb = vp9_row_scan_8x8_neighbors;
break;
case DCT_ADST:
*scan = vp9_col_scan_8x8;
*nb = vp9_col_scan_8x8_neighbors;
break;
default:
*scan = vp9_default_scan_8x8;
*nb = vp9_default_scan_8x8_neighbors;
break;
}
}
static INLINE const int16_t* get_iscan_8x8(TX_TYPE tx_type) {
switch (tx_type) {
case ADST_DCT:
return vp9_row_iscan_8x8;
case DCT_ADST:
return vp9_col_iscan_8x8;
default:
return vp9_default_iscan_8x8;
}
}
static INLINE const int16_t* get_scan_16x16(TX_TYPE tx_type) {
static INLINE const int* get_scan_16x16(TX_TYPE tx_type) {
switch (tx_type) {
case ADST_DCT:
return vp9_row_scan_16x16;
@@ -309,35 +220,6 @@ static INLINE const int16_t* get_scan_16x16(TX_TYPE tx_type) {
}
}
static INLINE void get_scan_nb_16x16(TX_TYPE tx_type,
const int16_t **scan, const int16_t **nb) {
switch (tx_type) {
case ADST_DCT:
*scan = vp9_row_scan_16x16;
*nb = vp9_row_scan_16x16_neighbors;
break;
case DCT_ADST:
*scan = vp9_col_scan_16x16;
*nb = vp9_col_scan_16x16_neighbors;
break;
default:
*scan = vp9_default_scan_16x16;
*nb = vp9_default_scan_16x16_neighbors;
break;
}
}
static INLINE const int16_t* get_iscan_16x16(TX_TYPE tx_type) {
switch (tx_type) {
case ADST_DCT:
return vp9_row_iscan_16x16;
case DCT_ADST:
return vp9_col_iscan_16x16;
default:
return vp9_default_iscan_16x16;
}
}
enum { VP9_COEF_UPDATE_PROB = 252 };
#endif // VP9_COMMON_VP9_ENTROPY_H_



@@ -8,14 +8,15 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vp9/common/vp9_onyxc_int.h"
#include "vp9/common/vp9_modecont.h"
#include "vp9/common/vp9_seg_common.h"
#include "vp9/common/vp9_alloccommon.h"
#include "vpx_mem/vpx_mem.h"
#include "vp9/common/vp9_alloccommon.h"
#include "vp9/common/vp9_onyxc_int.h"
#include "vp9/common/vp9_seg_common.h"
const vp9_prob vp9_kf_uv_mode_prob[VP9_INTRA_MODES]
[VP9_INTRA_MODES - 1] = {
static const vp9_prob default_kf_uv_probs[VP9_INTRA_MODES]
[VP9_INTRA_MODES - 1] = {
{ 144, 11, 54, 157, 195, 130, 46, 58, 108 } /* y = dc */,
{ 118, 15, 123, 148, 131, 101, 44, 93, 131 } /* y = v */,
{ 113, 12, 23, 188, 226, 142, 26, 32, 125 } /* y = h */,
@@ -50,9 +51,8 @@ static const vp9_prob default_if_uv_probs[VP9_INTRA_MODES]
{ 101, 21, 107, 181, 192, 103, 19, 67, 125 } /* y = tm */
};
static const vp9_prob default_partition_probs[NUM_FRAME_TYPES]
[NUM_PARTITION_CONTEXTS]
[PARTITION_TYPES - 1] = {
const vp9_prob vp9_partition_probs[NUM_FRAME_TYPES][NUM_PARTITION_CONTEXTS]
[PARTITION_TYPES - 1] = {
{ /* frame_type = keyframe */
/* 8x8 -> 4x4 */
{ 158, 97, 94 } /* a/l both not split */,
@@ -98,143 +98,6 @@ static const vp9_prob default_partition_probs[NUM_FRAME_TYPES]
}
};
const vp9_prob vp9_kf_y_mode_prob[VP9_INTRA_MODES]
[VP9_INTRA_MODES]
[VP9_INTRA_MODES - 1] = {
{ /* above = dc */
{ 137, 30, 42, 148, 151, 207, 70, 52, 91 } /* left = dc */,
{ 92, 45, 102, 136, 116, 180, 74, 90, 100 } /* left = v */,
{ 73, 32, 19, 187, 222, 215, 46, 34, 100 } /* left = h */,
{ 91, 30, 32, 116, 121, 186, 93, 86, 94 } /* left = d45 */,
{ 72, 35, 36, 149, 68, 206, 68, 63, 105 } /* left = d135 */,
{ 73, 31, 28, 138, 57, 124, 55, 122, 151 } /* left = d117 */,
{ 67, 23, 21, 140, 126, 197, 40, 37, 171 } /* left = d153 */,
{ 86, 27, 28, 128, 154, 212, 45, 43, 53 } /* left = d27 */,
{ 74, 32, 27, 107, 86, 160, 63, 134, 102 } /* left = d63 */,
{ 59, 67, 44, 140, 161, 202, 78, 67, 119 } /* left = tm */
}, { /* above = v */
{ 63, 36, 126, 146, 123, 158, 60, 90, 96 } /* left = dc */,
{ 43, 46, 168, 134, 107, 128, 69, 142, 92 } /* left = v */,
{ 44, 29, 68, 159, 201, 177, 50, 57, 77 } /* left = h */,
{ 58, 38, 76, 114, 97, 172, 78, 133, 92 } /* left = d45 */,
{ 46, 41, 76, 140, 63, 184, 69, 112, 57 } /* left = d135 */,
{ 38, 32, 85, 140, 46, 112, 54, 151, 133 } /* left = d117 */,
{ 39, 27, 61, 131, 110, 175, 44, 75, 136 } /* left = d153 */,
{ 52, 30, 74, 113, 130, 175, 51, 64, 58 } /* left = d27 */,
{ 47, 35, 80, 100, 74, 143, 64, 163, 74 } /* left = d63 */,
{ 36, 61, 116, 114, 128, 162, 80, 125, 82 } /* left = tm */
}, { /* above = h */
{ 82, 26, 26, 171, 208, 204, 44, 32, 105 } /* left = dc */,
{ 55, 44, 68, 166, 179, 192, 57, 57, 108 } /* left = v */,
{ 42, 26, 11, 199, 241, 228, 23, 15, 85 } /* left = h */,
{ 68, 42, 19, 131, 160, 199, 55, 52, 83 } /* left = d45 */,
{ 58, 50, 25, 139, 115, 232, 39, 52, 118 } /* left = d135 */,
{ 50, 35, 33, 153, 104, 162, 64, 59, 131 } /* left = d117 */,
{ 44, 24, 16, 150, 177, 202, 33, 19, 156 } /* left = d153 */,
{ 55, 27, 12, 153, 203, 218, 26, 27, 49 } /* left = d27 */,
{ 53, 49, 21, 110, 116, 168, 59, 80, 76 } /* left = d63 */,
{ 38, 72, 19, 168, 203, 212, 50, 50, 107 } /* left = tm */
}, { /* above = d45 */
{ 103, 26, 36, 129, 132, 201, 83, 80, 93 } /* left = dc */,
{ 59, 38, 83, 112, 103, 162, 98, 136, 90 } /* left = v */,
{ 62, 30, 23, 158, 200, 207, 59, 57, 50 } /* left = h */,
{ 67, 30, 29, 84, 86, 191, 102, 91, 59 } /* left = d45 */,
{ 60, 32, 33, 112, 71, 220, 64, 89, 104 } /* left = d135 */,
{ 53, 26, 34, 130, 56, 149, 84, 120, 103 } /* left = d117 */,
{ 53, 21, 23, 133, 109, 210, 56, 77, 172 } /* left = d153 */,
{ 77, 19, 29, 112, 142, 228, 55, 66, 36 } /* left = d27 */,
{ 61, 29, 29, 93, 97, 165, 83, 175, 162 } /* left = d63 */,
{ 47, 47, 43, 114, 137, 181, 100, 99, 95 } /* left = tm */
}, { /* above = d135 */
{ 69, 23, 29, 128, 83, 199, 46, 44, 101 } /* left = dc */,
{ 53, 40, 55, 139, 69, 183, 61, 80, 110 } /* left = v */,
{ 40, 29, 19, 161, 180, 207, 43, 24, 91 } /* left = h */,
{ 60, 34, 19, 105, 61, 198, 53, 64, 89 } /* left = d45 */,
{ 52, 31, 22, 158, 40, 209, 58, 62, 89 } /* left = d135 */,
{ 44, 31, 29, 147, 46, 158, 56, 102, 198 } /* left = d117 */,
{ 35, 19, 12, 135, 87, 209, 41, 45, 167 } /* left = d153 */,
{ 55, 25, 21, 118, 95, 215, 38, 39, 66 } /* left = d27 */,
{ 51, 38, 25, 113, 58, 164, 70, 93, 97 } /* left = d63 */,
{ 47, 54, 34, 146, 108, 203, 72, 103, 151 } /* left = tm */
}, { /* above = d117 */
{ 64, 19, 37, 156, 66, 138, 49, 95, 133 } /* left = dc */,
{ 46, 27, 80, 150, 55, 124, 55, 121, 135 } /* left = v */,
{ 36, 23, 27, 165, 149, 166, 54, 64, 118 } /* left = h */,
{ 53, 21, 36, 131, 63, 163, 60, 109, 81 } /* left = d45 */,
{ 40, 26, 35, 154, 40, 185, 51, 97, 123 } /* left = d135 */,
{ 35, 19, 34, 179, 19, 97, 48, 129, 124 } /* left = d117 */,
{ 36, 20, 26, 136, 62, 164, 33, 77, 154 } /* left = d153 */,
{ 45, 18, 32, 130, 90, 157, 40, 79, 91 } /* left = d27 */,
{ 45, 26, 28, 129, 45, 129, 49, 147, 123 } /* left = d63 */,
{ 38, 44, 51, 136, 74, 162, 57, 97, 121 } /* left = tm */
}, { /* above = d153 */
{ 75, 17, 22, 136, 138, 185, 32, 34, 166 } /* left = dc */,
{ 56, 39, 58, 133, 117, 173, 48, 53, 187 } /* left = v */,
{ 35, 21, 12, 161, 212, 207, 20, 23, 145 } /* left = h */,
{ 56, 29, 19, 117, 109, 181, 55, 68, 112 } /* left = d45 */,
{ 47, 29, 17, 153, 64, 220, 59, 51, 114 } /* left = d135 */,
{ 46, 16, 24, 136, 76, 147, 41, 64, 172 } /* left = d117 */,
{ 34, 17, 11, 108, 152, 187, 13, 15, 209 } /* left = d153 */,
{ 51, 24, 14, 115, 133, 209, 32, 26, 104 } /* left = d27 */,
{ 55, 30, 18, 122, 79, 179, 44, 88, 116 } /* left = d63 */,
{ 37, 49, 25, 129, 168, 164, 41, 54, 148 } /* left = tm */
}, { /* above = d27 */
{ 82, 22, 32, 127, 143, 213, 39, 41, 70 } /* left = dc */,
{ 62, 44, 61, 123, 105, 189, 48, 57, 64 } /* left = v */,
{ 47, 25, 17, 175, 222, 220, 24, 30, 86 } /* left = h */,
{ 68, 36, 17, 106, 102, 206, 59, 74, 74 } /* left = d45 */,
{ 57, 39, 23, 151, 68, 216, 55, 63, 58 } /* left = d135 */,
{ 49, 30, 35, 141, 70, 168, 82, 40, 115 } /* left = d117 */,
{ 51, 25, 15, 136, 129, 202, 38, 35, 139 } /* left = d153 */,
{ 68, 26, 16, 111, 141, 215, 29, 28, 28 } /* left = d27 */,
{ 59, 39, 19, 114, 75, 180, 77, 104, 42 } /* left = d63 */,
{ 40, 61, 26, 126, 152, 206, 61, 59, 93 } /* left = tm */
}, { /* above = d63 */
{ 78, 23, 39, 111, 117, 170, 74, 124, 94 } /* left = dc */,
{ 48, 34, 86, 101, 92, 146, 78, 179, 134 } /* left = v */,
{ 47, 22, 24, 138, 187, 178, 68, 69, 59 } /* left = h */,
{ 56, 25, 33, 105, 112, 187, 95, 177, 129 } /* left = d45 */,
{ 48, 31, 27, 114, 63, 183, 82, 116, 56 } /* left = d135 */,
{ 43, 28, 37, 121, 63, 123, 61, 192, 169 } /* left = d117 */,
{ 42, 17, 24, 109, 97, 177, 56, 76, 122 } /* left = d153 */,
{ 58, 18, 28, 105, 139, 182, 70, 92, 63 } /* left = d27 */,
{ 46, 23, 32, 74, 86, 150, 67, 183, 88 } /* left = d63 */,
{ 36, 38, 48, 92, 122, 165, 88, 137, 91 } /* left = tm */
}, { /* above = tm */
{ 65, 70, 60, 155, 159, 199, 61, 60, 81 } /* left = dc */,
{ 44, 78, 115, 132, 119, 173, 71, 112, 93 } /* left = v */,
{ 39, 38, 21, 184, 227, 206, 42, 32, 64 } /* left = h */,
{ 58, 47, 36, 124, 137, 193, 80, 82, 78 } /* left = d45 */,
{ 49, 50, 35, 144, 95, 205, 63, 78, 59 } /* left = d135 */,
{ 41, 53, 52, 148, 71, 142, 65, 128, 51 } /* left = d117 */,
{ 40, 36, 28, 143, 143, 202, 40, 55, 137 } /* left = d153 */,
{ 52, 34, 29, 129, 183, 227, 42, 35, 43 } /* left = d27 */,
{ 42, 44, 44, 104, 105, 164, 64, 130, 80 } /* left = d63 */,
{ 43, 81, 53, 140, 169, 204, 68, 84, 72 } /* left = tm */
}
};
#if CONFIG_FILTERINTRA
const vp9_prob vp9_default_filterintra_prob[TX_SIZES][VP9_INTRA_MODES] = {
// DC V H D45 D135 D117 D153 D27 D63 TM
{160, 153, 171, 160, 140, 117, 115, 160, 160, 116}, // TX_4X4
{180, 151, 191, 180, 118, 66, 97, 180, 180, 120}, // TX_8X8
{200, 200, 200, 200, 200, 200, 200, 200, 200, 200}, // TX_16X16
{220, 220, 220, 220, 220, 220, 220, 220, 220, 220}, // TX_32X32
};
#endif
static const vp9_prob default_inter_mode_probs[INTER_MODE_CONTEXTS]
[VP9_INTER_MODES - 1] = {
{2, 173, 34}, // 0 = both zero mv
{7, 145, 85}, // 1 = one zero mv + one a predicted mv
{7, 166, 63}, // 2 = two predicted mvs
{7, 94, 66}, // 3 = one predicted/zero and one new mv
{8, 64, 46}, // 4 = two new mvs
{17, 81, 31}, // 5 = one intra neighbour + x
{25, 29, 30}, // 6 = two intra neighbours
};
/* Array indices are identical to previously-existing INTRAMODECONTEXTNODES. */
const vp9_tree_index vp9_intra_mode_tree[VP9_INTRA_MODES * 2 - 2] = {
-DC_PRED, 2, /* 0 = DC_NODE */
@@ -248,7 +111,7 @@ const vp9_tree_index vp9_intra_mode_tree[VP9_INTRA_MODES * 2 - 2] = {
-D153_PRED, -D27_PRED /* 8 = D153_NODE */
};
const vp9_tree_index vp9_inter_mode_tree[6] = {
const vp9_tree_index vp9_sb_mv_ref_tree[6] = {
-ZEROMV, 2,
-NEARESTMV, 4,
-NEARMV, -NEWMV
@@ -261,7 +124,8 @@ const vp9_tree_index vp9_partition_tree[6] = {
};
struct vp9_token vp9_intra_mode_encodings[VP9_INTRA_MODES];
struct vp9_token vp9_inter_mode_encodings[VP9_INTER_MODES];
struct vp9_token vp9_sb_mv_ref_encoding_array[VP9_INTER_MODES];
struct vp9_token vp9_partition_encodings[PARTITION_TYPES];
@@ -285,15 +149,20 @@ static const vp9_prob default_single_ref_p[REF_CONTEXTS][2] = {
{ 238, 247 }
};
static const struct tx_probs default_tx_probs = {
{ { 3, 136, 37 },
{ 5, 52, 13 } },
{ { 20, 152 },
{ 15, 101 } },
{ { 100 },
{ 66 } }
const vp9_prob vp9_default_tx_probs_32x32p[TX_SIZE_CONTEXTS]
[TX_SIZE_MAX_SB - 1] = {
{ 3, 136, 37, },
{ 5, 52, 13, },
};
const vp9_prob vp9_default_tx_probs_16x16p[TX_SIZE_CONTEXTS]
[TX_SIZE_MAX_SB - 2] = {
{ 20, 152, },
{ 15, 101, },
};
const vp9_prob vp9_default_tx_probs_8x8p[TX_SIZE_CONTEXTS]
[TX_SIZE_MAX_SB - 3] = {
{ 100, },
{ 66, },
};
void tx_counts_to_branch_counts_32x32(unsigned int *tx_count_32x32p,
@@ -312,96 +181,144 @@ void tx_counts_to_branch_counts_32x32(unsigned int *tx_count_32x32p,
void tx_counts_to_branch_counts_16x16(unsigned int *tx_count_16x16p,
unsigned int (*ct_16x16p)[2]) {
ct_16x16p[0][0] = tx_count_16x16p[TX_4X4];
ct_16x16p[0][1] = tx_count_16x16p[TX_8X8] + tx_count_16x16p[TX_16X16];
ct_16x16p[0][1] = tx_count_16x16p[TX_8X8] +
tx_count_16x16p[TX_16X16];
ct_16x16p[1][0] = tx_count_16x16p[TX_8X8];
ct_16x16p[1][1] = tx_count_16x16p[TX_16X16];
}
void tx_counts_to_branch_counts_8x8(unsigned int *tx_count_8x8p,
unsigned int (*ct_8x8p)[2]) {
ct_8x8p[0][0] = tx_count_8x8p[TX_4X4];
ct_8x8p[0][1] = tx_count_8x8p[TX_8X8];
ct_8x8p[0][0] = tx_count_8x8p[TX_4X4];
ct_8x8p[0][1] = tx_count_8x8p[TX_8X8];
}
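
The tx_counts_to_branch_counts_* helpers reshape a histogram over the usable transform sizes into one binary count pair per node of the tx-size tree; for the 16x16 case, node 0 is "4x4 vs. larger" and node 1 is "8x8 vs. 16x16". A sketch with made-up counts:

#include <stdio.h>

int main(void) {
  /* Illustrative per-block counts of the chosen transform size. */
  const unsigned tx_count[3] = { 50, 30, 20 }; /* 4x4, 8x8, 16x16 */
  unsigned ct[2][2];
  ct[0][0] = tx_count[0];               /* node 0: stopped at 4x4 */
  ct[0][1] = tx_count[1] + tx_count[2]; /* node 0: went larger    */
  ct[1][0] = tx_count[1];               /* node 1: stopped at 8x8 */
  ct[1][1] = tx_count[2];               /* node 1: chose 16x16    */
  printf("%u/%u  %u/%u\n", ct[0][0], ct[0][1], ct[1][0], ct[1][1]);
  return 0;
}
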
static const vp9_prob default_mbskip_probs[MBSKIP_CONTEXTS] = {
const vp9_prob vp9_default_mbskip_probs[MBSKIP_CONTEXTS] = {
192, 128, 64
};
static const vp9_prob default_switchable_interp_prob[VP9_SWITCHABLE_FILTERS+1]
[VP9_SWITCHABLE_FILTERS-1] = {
void vp9_init_mbmode_probs(VP9_COMMON *x) {
vpx_memcpy(x->fc.uv_mode_prob, default_if_uv_probs,
sizeof(default_if_uv_probs));
vpx_memcpy(x->kf_uv_mode_prob, default_kf_uv_probs,
sizeof(default_kf_uv_probs));
vpx_memcpy(x->fc.y_mode_prob, default_if_y_probs,
sizeof(default_if_y_probs));
vpx_memcpy(x->fc.switchable_interp_prob, vp9_switchable_interp_prob,
sizeof(vp9_switchable_interp_prob));
vpx_memcpy(x->fc.partition_prob, vp9_partition_probs,
sizeof(vp9_partition_probs));
vpx_memcpy(x->fc.intra_inter_prob, default_intra_inter_p,
sizeof(default_intra_inter_p));
vpx_memcpy(x->fc.comp_inter_prob, default_comp_inter_p,
sizeof(default_comp_inter_p));
vpx_memcpy(x->fc.comp_ref_prob, default_comp_ref_p,
sizeof(default_comp_ref_p));
vpx_memcpy(x->fc.single_ref_prob, default_single_ref_p,
sizeof(default_single_ref_p));
vpx_memcpy(x->fc.tx_probs_32x32p, vp9_default_tx_probs_32x32p,
sizeof(vp9_default_tx_probs_32x32p));
vpx_memcpy(x->fc.tx_probs_16x16p, vp9_default_tx_probs_16x16p,
sizeof(vp9_default_tx_probs_16x16p));
vpx_memcpy(x->fc.tx_probs_8x8p, vp9_default_tx_probs_8x8p,
sizeof(vp9_default_tx_probs_8x8p));
vpx_memcpy(x->fc.mbskip_probs, vp9_default_mbskip_probs,
sizeof(vp9_default_mbskip_probs));
}
const vp9_tree_index vp9_switchable_interp_tree[VP9_SWITCHABLE_FILTERS*2-2] = {
-0, 2,
-1, -2
};
struct vp9_token vp9_switchable_interp_encodings[VP9_SWITCHABLE_FILTERS];
const INTERPOLATIONFILTERTYPE vp9_switchable_interp[VP9_SWITCHABLE_FILTERS] = {
EIGHTTAP, EIGHTTAP_SMOOTH, EIGHTTAP_SHARP};
const int vp9_switchable_interp_map[SWITCHABLE+1] = {1, 0, 2, -1, -1};
const vp9_prob vp9_switchable_interp_prob [VP9_SWITCHABLE_FILTERS+1]
[VP9_SWITCHABLE_FILTERS-1] = {
{ 235, 162, },
{ 36, 255, },
{ 34, 3, },
{ 149, 144, },
};
#if CONFIG_INTERINTRA
static const vp9_prob default_interintra_prob[BLOCK_SIZE_TYPES] = {
192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192
};
#if CONFIG_MASKED_INTERINTRA
static const vp9_prob default_masked_interintra_prob[BLOCK_SIZE_TYPES] = {
// 180, 180, 180, 180, 180, 180, 180, 180, 180, 180, 180, 180, 180
192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192
};
#endif
#endif
#if CONFIG_MASKED_INTERINTER
static const vp9_prob default_masked_interinter_prob[BLOCK_SIZE_TYPES] = {
192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192
};
#endif
void vp9_init_mbmode_probs(VP9_COMMON *cm) {
vp9_copy(cm->fc.uv_mode_prob, default_if_uv_probs);
vp9_copy(cm->fc.y_mode_prob, default_if_y_probs);
vp9_copy(cm->fc.switchable_interp_prob, default_switchable_interp_prob);
vp9_copy(cm->fc.partition_prob, default_partition_probs);
vp9_copy(cm->fc.intra_inter_prob, default_intra_inter_p);
vp9_copy(cm->fc.comp_inter_prob, default_comp_inter_p);
vp9_copy(cm->fc.comp_ref_prob, default_comp_ref_p);
vp9_copy(cm->fc.single_ref_prob, default_single_ref_p);
cm->fc.tx_probs = default_tx_probs;
vp9_copy(cm->fc.mbskip_probs, default_mbskip_probs);
#if CONFIG_INTERINTRA
vp9_copy(cm->fc.interintra_prob, default_interintra_prob);
#if CONFIG_MASKED_INTERINTRA
vp9_copy(cm->fc.masked_interintra_prob, default_masked_interintra_prob);
#endif
#endif
#if CONFIG_FILTERINTRA
vp9_copy(cm->fc.filterintra_prob, vp9_default_filterintra_prob);
#endif
#if CONFIG_MASKED_INTERINTER
vp9_copy(cm->fc.masked_compound_prob, default_masked_interinter_prob);
#endif
}
const vp9_tree_index vp9_switchable_interp_tree[VP9_SWITCHABLE_FILTERS*2-2] = {
-EIGHTTAP, 2,
-EIGHTTAP_SMOOTH, -EIGHTTAP_SHARP
};
struct vp9_token vp9_switchable_interp_encodings[VP9_SWITCHABLE_FILTERS];
// Indicates if the filter is interpolating or non-interpolating
const int vp9_is_interpolating_filter[SWITCHABLE + 1] = {1, 1, 1, 1, -1};
void vp9_entropy_mode_init() {
vp9_tokens_from_tree(vp9_intra_mode_encodings, vp9_intra_mode_tree);
vp9_tokens_from_tree(vp9_switchable_interp_encodings,
vp9_switchable_interp_tree);
vp9_tokens_from_tree(vp9_partition_encodings, vp9_partition_tree);
vp9_tokens_from_tree_offset(vp9_inter_mode_encodings,
vp9_inter_mode_tree, NEARESTMV);
vp9_tokens_from_tree_offset(vp9_sb_mv_ref_encoding_array,
vp9_sb_mv_ref_tree, NEARESTMV);
}
#define COUNT_SAT 20
#define MAX_UPDATE_FACTOR 128
static int update_ct(vp9_prob pre_prob, vp9_prob prob, unsigned int ct[2]) {
return merge_probs(pre_prob, prob, ct, COUNT_SAT, MAX_UPDATE_FACTOR);
void vp9_init_mode_contexts(VP9_COMMON *pc) {
vpx_memset(pc->fc.inter_mode_counts, 0, sizeof(pc->fc.inter_mode_counts));
vpx_memcpy(pc->fc.inter_mode_probs,
vp9_default_inter_mode_probs,
sizeof(vp9_default_inter_mode_probs));
}
static int update_ct2(vp9_prob pre_prob, unsigned int ct[2]) {
return merge_probs2(pre_prob, ct, COUNT_SAT, MAX_UPDATE_FACTOR);
void vp9_accum_mv_refs(VP9_COMMON *pc,
MB_PREDICTION_MODE m,
const int context) {
unsigned int (*inter_mode_counts)[VP9_INTER_MODES - 1][2] =
pc->fc.inter_mode_counts;
if (m == ZEROMV) {
++inter_mode_counts[context][0][0];
} else {
++inter_mode_counts[context][0][1];
if (m == NEARESTMV) {
++inter_mode_counts[context][1][0];
} else {
++inter_mode_counts[context][1][1];
if (m == NEARMV) {
++inter_mode_counts[context][2][0];
} else {
++inter_mode_counts[context][2][1];
}
}
}
}
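
The nest above is a manual walk down the inter-mode tree: each level records one binary event, stopping at the leaf of the coded mode (ZEROMV first, then NEARESTMV, then NEARMV vs. NEWMV). The same walk as a loop, as a sketch with local mode indices 0..3 standing in for the real enum values:

/* cnt[node][0] = took the leaf at this node, cnt[node][1] = went deeper. */
static void accum_mode(unsigned cnt[3][2], int mode /* 0..3 */) {
  int node;
  for (node = 0; node < 3; ++node) {
    if (mode == node) {
      ++cnt[node][0];
      return;
    }
    ++cnt[node][1];
  }
}
/* accum_mode(c, 2) for NEARMV bumps c[0][1], c[1][1], c[2][0];
   mode 3 (NEWMV) ends at c[2][1], matching the function above. */
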
#define MVREF_COUNT_SAT 20
#define MVREF_MAX_UPDATE_FACTOR 128
void vp9_adapt_mode_context(VP9_COMMON *pc) {
int i, j;
unsigned int (*inter_mode_counts)[VP9_INTER_MODES - 1][2] =
pc->fc.inter_mode_counts;
vp9_prob (*mode_context)[VP9_INTER_MODES - 1] = pc->fc.inter_mode_probs;
for (j = 0; j < INTER_MODE_CONTEXTS; j++) {
for (i = 0; i < VP9_INTER_MODES - 1; i++) {
int count = inter_mode_counts[j][i][0] + inter_mode_counts[j][i][1];
int factor;
count = count > MVREF_COUNT_SAT ? MVREF_COUNT_SAT : count;
factor = (MVREF_MAX_UPDATE_FACTOR * count / MVREF_COUNT_SAT);
mode_context[j][i] = weighted_prob(
pc->fc.pre_inter_mode_probs[j][i],
get_binary_prob(inter_mode_counts[j][i][0],
inter_mode_counts[j][i][1]),
factor);
}
}
}
#define MODE_COUNT_SAT 20
#define MODE_MAX_UPDATE_FACTOR 128
static int update_mode_ct(vp9_prob pre_prob, vp9_prob prob,
unsigned int branch_ct[2]) {
int factor, count = branch_ct[0] + branch_ct[1];
count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
return weighted_prob(pre_prob, prob, factor);
}
static void update_mode_probs(int n_modes,
@@ -416,170 +333,189 @@ static void update_mode_probs(int n_modes,
assert(n_modes - 1 < MAX_PROBS);
vp9_tree_probs_from_distribution(tree, probs, branch_ct, cnt, tok0_offset);
for (t = 0; t < n_modes - 1; ++t)
dst_probs[t] = update_ct(pre_probs[t], probs[t], branch_ct[t]);
dst_probs[t] = update_mode_ct(pre_probs[t], probs[t], branch_ct[t]);
}
static int update_mode_ct2(vp9_prob pre_prob, unsigned int branch_ct[2]) {
return update_mode_ct(pre_prob, get_binary_prob(branch_ct[0],
branch_ct[1]), branch_ct);
}
// #define MODE_COUNT_TESTING
void vp9_adapt_mode_probs(VP9_COMMON *cm) {
int i, j;
FRAME_CONTEXT *fc = &cm->fc;
FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx];
FRAME_COUNTS *counts = &cm->counts;
#ifdef MODE_COUNT_TESTING
int t;
printf("static const unsigned int\nymode_counts"
"[VP9_INTRA_MODES] = {\n");
for (t = 0; t < VP9_INTRA_MODES; ++t)
printf("%d, ", fc->ymode_counts[t]);
printf("};\n");
printf("static const unsigned int\nuv_mode_counts"
"[VP9_INTRA_MODES] [VP9_INTRA_MODES] = {\n");
for (i = 0; i < VP9_INTRA_MODES; ++i) {
printf(" {");
for (t = 0; t < VP9_INTRA_MODES; ++t)
printf("%d, ", fc->uv_mode_counts[i][t]);
printf("},\n");
}
printf("};\n");
printf("static const unsigned int\nbmode_counts"
"[VP9_NKF_BINTRAMODES] = {\n");
for (t = 0; t < VP9_NKF_BINTRAMODES; ++t)
printf("%d, ", fc->bmode_counts[t]);
printf("};\n");
printf("static const unsigned int\ni8x8_mode_counts"
"[VP9_I8X8_MODES] = {\n");
for (t = 0; t < VP9_I8X8_MODES; ++t)
printf("%d, ", fc->i8x8_mode_counts[t]);
printf("};\n");
printf("static const unsigned int\nmbsplit_counts"
"[VP9_NUMMBSPLITS] = {\n");
for (t = 0; t < VP9_NUMMBSPLITS; ++t)
printf("%d, ", fc->mbsplit_counts[t]);
printf("};\n");
#endif
for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
fc->intra_inter_prob[i] = update_ct2(pre_fc->intra_inter_prob[i],
counts->intra_inter[i]);
fc->intra_inter_prob[i] = update_mode_ct2(fc->pre_intra_inter_prob[i],
fc->intra_inter_count[i]);
for (i = 0; i < COMP_INTER_CONTEXTS; i++)
fc->comp_inter_prob[i] = update_ct2(pre_fc->comp_inter_prob[i],
counts->comp_inter[i]);
fc->comp_inter_prob[i] = update_mode_ct2(fc->pre_comp_inter_prob[i],
fc->comp_inter_count[i]);
for (i = 0; i < REF_CONTEXTS; i++)
fc->comp_ref_prob[i] = update_ct2(pre_fc->comp_ref_prob[i],
counts->comp_ref[i]);
fc->comp_ref_prob[i] = update_mode_ct2(fc->pre_comp_ref_prob[i],
fc->comp_ref_count[i]);
for (i = 0; i < REF_CONTEXTS; i++)
for (j = 0; j < 2; j++)
fc->single_ref_prob[i][j] = update_ct2(pre_fc->single_ref_prob[i][j],
counts->single_ref[i][j]);
for (i = 0; i < INTER_MODE_CONTEXTS; i++)
update_mode_probs(VP9_INTER_MODES, vp9_inter_mode_tree,
counts->inter_mode[i], pre_fc->inter_mode_probs[i],
fc->inter_mode_probs[i], NEARESTMV);
fc->single_ref_prob[i][j] = update_mode_ct2(fc->pre_single_ref_prob[i][j],
fc->single_ref_count[i][j]);
for (i = 0; i < BLOCK_SIZE_GROUPS; i++)
update_mode_probs(VP9_INTRA_MODES, vp9_intra_mode_tree,
counts->y_mode[i], pre_fc->y_mode_prob[i],
fc->y_mode_counts[i], fc->pre_y_mode_prob[i],
fc->y_mode_prob[i], 0);
for (i = 0; i < VP9_INTRA_MODES; ++i)
update_mode_probs(VP9_INTRA_MODES, vp9_intra_mode_tree,
counts->uv_mode[i], pre_fc->uv_mode_prob[i],
fc->uv_mode_counts[i], fc->pre_uv_mode_prob[i],
fc->uv_mode_prob[i], 0);
for (i = 0; i < NUM_PARTITION_CONTEXTS; i++)
update_mode_probs(PARTITION_TYPES, vp9_partition_tree,
counts->partition[i],
pre_fc->partition_prob[INTER_FRAME][i],
fc->partition_counts[i], fc->pre_partition_prob[i],
fc->partition_prob[INTER_FRAME][i], 0);
if (cm->mcomp_filter_type == SWITCHABLE) {
for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++)
for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++) {
update_mode_probs(VP9_SWITCHABLE_FILTERS, vp9_switchable_interp_tree,
counts->switchable_interp[i],
pre_fc->switchable_interp_prob[i],
fc->switchable_interp_count[i],
fc->pre_switchable_interp_prob[i],
fc->switchable_interp_prob[i], 0);
}
}
if (cm->tx_mode == TX_MODE_SELECT) {
if (cm->txfm_mode == TX_MODE_SELECT) {
int j;
unsigned int branch_ct_8x8p[TX_SIZES - 3][2];
unsigned int branch_ct_16x16p[TX_SIZES - 2][2];
unsigned int branch_ct_32x32p[TX_SIZES - 1][2];
unsigned int branch_ct_8x8p[TX_SIZE_MAX_SB - 3][2];
unsigned int branch_ct_16x16p[TX_SIZE_MAX_SB - 2][2];
unsigned int branch_ct_32x32p[TX_SIZE_MAX_SB - 1][2];
for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
tx_counts_to_branch_counts_8x8(counts->tx.p8x8[i], branch_ct_8x8p);
for (j = 0; j < TX_SIZES - 3; ++j)
fc->tx_probs.p8x8[i][j] = update_ct2(pre_fc->tx_probs.p8x8[i][j],
branch_ct_8x8p[j]);
tx_counts_to_branch_counts_16x16(counts->tx.p16x16[i],
branch_ct_16x16p);
for (j = 0; j < TX_SIZES - 2; ++j)
fc->tx_probs.p16x16[i][j] = update_ct2(pre_fc->tx_probs.p16x16[i][j],
branch_ct_16x16p[j]);
tx_counts_to_branch_counts_32x32(counts->tx.p32x32[i],
branch_ct_32x32p);
for (j = 0; j < TX_SIZES - 1; ++j)
fc->tx_probs.p32x32[i][j] = update_ct2(pre_fc->tx_probs.p32x32[i][j],
branch_ct_32x32p[j]);
}
}
for (i = 0; i < MBSKIP_CONTEXTS; ++i)
fc->mbskip_probs[i] = update_ct2(pre_fc->mbskip_probs[i],
counts->mbskip[i]);
#if CONFIG_INTERINTRA
if (cm->use_interintra) {
for (i = 0; i < BLOCK_SIZE_TYPES; ++i) {
if (is_interintra_allowed(i))
fc->interintra_prob[i] = update_ct2(pre_fc->interintra_prob[i],
counts->interintra[i]);
}
#if CONFIG_MASKED_INTERINTRA
if (cm->use_masked_interintra) {
for (i = 0; i < BLOCK_SIZE_TYPES; ++i) {
if (is_interintra_allowed(i) && get_mask_bits_interintra(i))
fc->masked_interintra_prob[i] = update_ct2(
pre_fc->masked_interintra_prob[i],
counts->masked_interintra[i]);
tx_counts_to_branch_counts_8x8(cm->fc.tx_count_8x8p[i],
branch_ct_8x8p);
for (j = 0; j < TX_SIZE_MAX_SB - 3; ++j) {
int factor;
int count = branch_ct_8x8p[j][0] + branch_ct_8x8p[j][1];
vp9_prob prob = get_binary_prob(branch_ct_8x8p[j][0],
branch_ct_8x8p[j][1]);
count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
cm->fc.tx_probs_8x8p[i][j] = weighted_prob(
cm->fc.pre_tx_probs_8x8p[i][j], prob, factor);
}
}
#endif
}
#endif
#if CONFIG_FILTERINTRA
for (i = 0; i < TX_SIZES; ++i)
for (j = 0; j < VP9_INTRA_MODES; ++j)
fc->filterintra_prob[i][j] = update_ct2(pre_fc->filterintra_prob[i][j],
counts->filterintra[i][j]);
#endif
#if CONFIG_MASKED_INTERINTER
if (cm->use_masked_compound) {
for (i = 0; i < BLOCK_SIZE_TYPES; ++i) {
if (get_mask_bits(i))
fc->masked_compound_prob[i] = update_ct2
(pre_fc->masked_compound_prob[i],
counts->masked_compound[i]);
for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
tx_counts_to_branch_counts_16x16(cm->fc.tx_count_16x16p[i],
branch_ct_16x16p);
for (j = 0; j < TX_SIZE_MAX_SB - 2; ++j) {
int factor;
int count = branch_ct_16x16p[j][0] + branch_ct_16x16p[j][1];
vp9_prob prob = get_binary_prob(branch_ct_16x16p[j][0],
branch_ct_16x16p[j][1]);
count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
cm->fc.tx_probs_16x16p[i][j] = weighted_prob(
cm->fc.pre_tx_probs_16x16p[i][j], prob, factor);
}
}
for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
tx_counts_to_branch_counts_32x32(cm->fc.tx_count_32x32p[i],
branch_ct_32x32p);
for (j = 0; j < TX_SIZE_MAX_SB - 1; ++j) {
int factor;
int count = branch_ct_32x32p[j][0] + branch_ct_32x32p[j][1];
vp9_prob prob = get_binary_prob(branch_ct_32x32p[j][0],
branch_ct_32x32p[j][1]);
count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
cm->fc.tx_probs_32x32p[i][j] = weighted_prob(
cm->fc.pre_tx_probs_32x32p[i][j], prob, factor);
}
}
}
#endif
for (i = 0; i < MBSKIP_CONTEXTS; ++i)
fc->mbskip_probs[i] = update_mode_ct2(fc->pre_mbskip_probs[i],
fc->mbskip_count[i]);
}
static void set_default_lf_deltas(struct loopfilter *lf) {
lf->mode_ref_delta_enabled = 1;
lf->mode_ref_delta_update = 1;
static void set_default_lf_deltas(MACROBLOCKD *xd) {
xd->mode_ref_lf_delta_enabled = 1;
xd->mode_ref_lf_delta_update = 1;
lf->ref_deltas[INTRA_FRAME] = 1;
lf->ref_deltas[LAST_FRAME] = 0;
lf->ref_deltas[GOLDEN_FRAME] = -1;
lf->ref_deltas[ALTREF_FRAME] = -1;
xd->ref_lf_deltas[INTRA_FRAME] = 1;
xd->ref_lf_deltas[LAST_FRAME] = 0;
xd->ref_lf_deltas[GOLDEN_FRAME] = -1;
xd->ref_lf_deltas[ALTREF_FRAME] = -1;
lf->mode_deltas[0] = 0;
lf->mode_deltas[1] = 0;
xd->mode_lf_deltas[0] = 0; // Zero
xd->mode_lf_deltas[1] = 0; // New mv
}
void vp9_setup_past_independence(VP9_COMMON *cm, MACROBLOCKD *xd) {
// Reset the segment feature data to the default stats:
// Features disabled, 0, with delta coding (Default state).
struct loopfilter *const lf = &xd->lf;
int i;
vp9_clearall_segfeatures(&xd->seg);
xd->seg.abs_delta = SEGMENT_DELTADATA;
vp9_clearall_segfeatures(xd);
xd->mb_segment_abs_delta = SEGMENT_DELTADATA;
if (cm->last_frame_seg_map)
vpx_memset(cm->last_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols));
// Reset the mode ref deltas for loop filter
vp9_zero(lf->last_ref_deltas);
vp9_zero(lf->last_mode_deltas);
set_default_lf_deltas(lf);
// To force update of the sharpness
lf->last_sharpness_level = -1;
vpx_memset(xd->last_ref_lf_deltas, 0, sizeof(xd->last_ref_lf_deltas));
vpx_memset(xd->last_mode_lf_deltas, 0, sizeof(xd->last_mode_lf_deltas));
set_default_lf_deltas(xd);
vp9_default_coef_probs(cm);
vp9_init_mbmode_probs(cm);
vpx_memcpy(cm->kf_y_mode_prob, vp9_kf_default_bmode_probs,
sizeof(vp9_kf_default_bmode_probs));
vp9_init_mv_probs(cm);
vp9_copy(cm->fc.inter_mode_probs, default_inter_mode_probs);
if (cm->frame_type == KEY_FRAME ||
cm->error_resilient_mode || cm->reset_frame_context == 3) {
// To force update of the sharpness
cm->last_sharpness_level = -1;
vp9_init_mode_contexts(cm);
if ((cm->frame_type == KEY_FRAME) ||
cm->error_resilient_mode || (cm->reset_frame_context == 3)) {
// Reset all frame contexts.
for (i = 0; i < NUM_FRAME_CONTEXTS; ++i)
cm->frame_contexts[i] = cm->fc;
vpx_memcpy(&cm->frame_contexts[i], &cm->fc, sizeof(cm->fc));
} else if (cm->reset_frame_context == 2) {
// Reset only the frame context specified in the frame header.
cm->frame_contexts[cm->frame_context_idx] = cm->fc;
vpx_memcpy(&cm->frame_contexts[cm->frame_context_idx], &cm->fc,
sizeof(cm->fc));
}
vpx_memset(cm->prev_mip, 0,
@@ -593,7 +529,7 @@ void vp9_setup_past_independence(VP9_COMMON *cm, MACROBLOCKD *xd) {
vp9_update_mode_info_border(cm, cm->prev_mip);
vp9_update_mode_info_in_image(cm, cm->prev_mi);
vp9_zero(cm->ref_frame_sign_bias);
vpx_memset(cm->ref_frame_sign_bias, 0, sizeof(cm->ref_frame_sign_bias));
cm->frame_context_idx = 0;
}


@@ -16,69 +16,81 @@
#define SUBMVREF_COUNT 5
#define TX_SIZE_CONTEXTS 2
#define VP9_MODE_UPDATE_PROB 252
#define VP9_SWITCHABLE_FILTERS 3 // number of switchable filters
#if CONFIG_INTERINTRA
#define VP9_UPD_INTERINTRA_PROB 248
#define SEPARATE_INTERINTRA_UV 0
#if CONFIG_MASKED_INTERINTRA
#define VP9_UPD_MASKED_INTERINTRA_PROB 248
#endif
#endif
#if CONFIG_MASKED_INTERINTER
#define VP9_UPD_MASKED_COMPOUND_PROB 248
#endif
// #define MODE_STATS
struct VP9Common;
extern int vp9_mv_cont(const int_mv *l, const int_mv *a);
struct tx_probs {
vp9_prob p32x32[TX_SIZE_CONTEXTS][TX_SIZES - 1];
vp9_prob p16x16[TX_SIZE_CONTEXTS][TX_SIZES - 2];
vp9_prob p8x8[TX_SIZE_CONTEXTS][TX_SIZES - 3];
};
struct tx_counts {
unsigned int p32x32[TX_SIZE_CONTEXTS][TX_SIZES];
unsigned int p16x16[TX_SIZE_CONTEXTS][TX_SIZES - 1];
unsigned int p8x8[TX_SIZE_CONTEXTS][TX_SIZES - 2];
};
extern const vp9_prob vp9_kf_uv_mode_prob[VP9_INTRA_MODES][VP9_INTRA_MODES - 1];
extern const vp9_prob vp9_kf_y_mode_prob[VP9_INTRA_MODES][VP9_INTRA_MODES]
[VP9_INTRA_MODES - 1];
extern const vp9_prob vp9_kf_default_bmode_probs[VP9_INTRA_MODES]
[VP9_INTRA_MODES]
[VP9_INTRA_MODES - 1];
extern const vp9_tree_index vp9_intra_mode_tree[];
extern const vp9_tree_index vp9_inter_mode_tree[];
extern const vp9_tree_index vp9_sb_mv_ref_tree[];
extern struct vp9_token vp9_intra_mode_encodings[VP9_INTRA_MODES];
extern struct vp9_token vp9_inter_mode_encodings[VP9_INTER_MODES];
/* Inter mode values do not start at zero */
extern struct vp9_token vp9_sb_mv_ref_encoding_array[VP9_INTER_MODES];
// probability models for partition information
extern const vp9_tree_index vp9_partition_tree[];
extern const vp9_tree_index vp9_partition_tree[];
extern struct vp9_token vp9_partition_encodings[PARTITION_TYPES];
extern const vp9_prob vp9_partition_probs[NUM_FRAME_TYPES]
[NUM_PARTITION_CONTEXTS]
[PARTITION_TYPES - 1];
extern const vp9_tree_index vp9_switchable_interp_tree
[2 * (VP9_SWITCHABLE_FILTERS - 1)];
void vp9_entropy_mode_init(void);
extern struct vp9_token vp9_switchable_interp_encodings[VP9_SWITCHABLE_FILTERS];
void vp9_entropy_mode_init();
struct VP9Common;
/* sets up common features to forget past dependence */
void vp9_setup_past_independence(struct VP9Common *cm, MACROBLOCKD *xd);
void vp9_init_mbmode_probs(struct VP9Common *x);
extern void vp9_init_mode_contexts(struct VP9Common *pc);
extern void vp9_adapt_mode_context(struct VP9Common *pc);
extern void vp9_accum_mv_refs(struct VP9Common *pc,
MB_PREDICTION_MODE m,
const int context);
void vp9_adapt_mode_probs(struct VP9Common *);
void tx_counts_to_branch_counts_32x32(unsigned int *tx_count_32x32p,
unsigned int (*ct_32x32p)[2]);
void tx_counts_to_branch_counts_16x16(unsigned int *tx_count_16x16p,
unsigned int (*ct_16x16p)[2]);
void tx_counts_to_branch_counts_8x8(unsigned int *tx_count_8x8p,
unsigned int (*ct_8x8p)[2]);
#define VP9_SWITCHABLE_FILTERS 3 /* number of switchable filters */
extern const INTERPOLATIONFILTERTYPE vp9_switchable_interp
[VP9_SWITCHABLE_FILTERS];
extern const int vp9_switchable_interp_map[SWITCHABLE + 1];
extern const int vp9_is_interpolating_filter[SWITCHABLE + 1];
extern const vp9_tree_index vp9_switchable_interp_tree
[2 * (VP9_SWITCHABLE_FILTERS - 1)];
extern struct vp9_token vp9_switchable_interp_encodings[VP9_SWITCHABLE_FILTERS];
extern const vp9_prob vp9_switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1]
[VP9_SWITCHABLE_FILTERS - 1];
extern const vp9_prob vp9_default_tx_probs_32x32p[TX_SIZE_CONTEXTS]
[TX_SIZE_MAX_SB - 1];
extern const vp9_prob vp9_default_tx_probs_16x16p[TX_SIZE_CONTEXTS]
[TX_SIZE_MAX_SB - 2];
extern const vp9_prob vp9_default_tx_probs_8x8p[TX_SIZE_CONTEXTS]
[TX_SIZE_MAX_SB - 3];
extern void tx_counts_to_branch_counts_32x32(unsigned int *tx_count_32x32p,
unsigned int (*ct_32x32p)[2]);
extern void tx_counts_to_branch_counts_16x16(unsigned int *tx_count_16x16p,
unsigned int (*ct_16x16p)[2]);
extern void tx_counts_to_branch_counts_8x8(unsigned int *tx_count_8x8p,
unsigned int (*ct_8x8p)[2]);
#endif // VP9_COMMON_VP9_ENTROPYMODE_H_


@@ -12,11 +12,16 @@
#include "vp9/common/vp9_onyxc_int.h"
#include "vp9/common/vp9_entropymv.h"
//#define MV_COUNT_TESTING
#define MV_COUNT_SAT 20
#define MV_MAX_UPDATE_FACTOR 128
/* Integer pel reference mv threshold for use of high-precision 1/8 mv */
#define COMPANDED_MVREF_THRESH 8
#define COMPANDED_MVREF_THRESH 8
/* Smooth or bias the mv-counts before prob computation */
/* #define SMOOTH_MV_COUNTS */
const vp9_tree_index vp9_mv_joint_tree[2 * MV_JOINTS - 2] = {
-MV_JOINT_ZERO, 2,
@@ -51,7 +56,7 @@ const vp9_tree_index vp9_mv_fp_tree [2 * 4 - 2] = {
};
struct vp9_token vp9_mv_fp_encodings[4];
static const nmv_context default_nmv_context = {
const nmv_context vp9_default_nmv_context = {
{32, 64, 96},
{
{ /* vert component */
@@ -77,10 +82,21 @@ static const nmv_context default_nmv_context = {
},
};
MV_JOINT_TYPE vp9_get_mv_joint(const MV *mv) {
if (mv->row == 0 && mv->col == 0)
return MV_JOINT_ZERO;
else if (mv->row == 0 && mv->col != 0)
return MV_JOINT_HNZVZ;
else if (mv->row != 0 && mv->col == 0)
return MV_JOINT_HZVNZ;
else
return MV_JOINT_HNZVNZ;
}
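
The if-chain classifies which motion-vector components are nonzero. Assuming the joint enum is laid out in the order shown (ZERO, HNZVZ, HZVNZ, HNZVNZ), it collapses to two bit tests; a sketch with stand-in type names:

typedef enum { JOINT_ZERO, JOINT_HNZVZ,
               JOINT_HZVNZ, JOINT_HNZVNZ } JOINT;
typedef struct { short row, col; } XMV;  /* stand-in for MV */

static JOINT mv_joint(const XMV *mv) {
  return (JOINT)(((mv->row != 0) << 1) | (mv->col != 0));
}
/* (0,0)->ZERO, (0,5)->HNZVZ, (3,0)->HZVNZ, (3,5)->HNZVNZ */
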
#define mv_class_base(c) ((c) ? (CLASS0_SIZE << (c + 2)) : 0)
MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset) {
MV_CLASS_TYPE c = MV_CLASS_0;
MV_CLASS_TYPE c;
if (z < CLASS0_SIZE * 8) c = MV_CLASS_0;
else if (z < CLASS0_SIZE * 16) c = MV_CLASS_1;
else if (z < CLASS0_SIZE * 32) c = MV_CLASS_2;
@@ -107,6 +123,12 @@ int vp9_get_mv_mag(MV_CLASS_TYPE c, int offset) {
return mv_class_base(c) + offset;
}
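
mv_class_base(c) gives the first magnitude in class c: 0, 16, 32, 64, ... when CLASS0_SIZE is 2, so the if-chain above amounts to finding the last class whose base does not exceed z. A sketch under that assumption (the upper bound of 10 classes is illustrative):

#include <stdio.h>

#define CLASS0_SIZE 2  /* assumed; gives boundaries 16, 32, 64, ... */
#define mv_class_base(c) ((c) ? (CLASS0_SIZE << ((c) + 2)) : 0)

static int mv_class(int z, int *offset) {
  int c = 0;
  while (c < 10 && z >= mv_class_base(c + 1))
    ++c;
  *offset = z - mv_class_base(c);
  return c;
}

int main(void) {
  int off;
  const int c = mv_class(40, &off);
  printf("class %d, offset %d\n", c, off);  /* class 2, offset 8 */
  return 0;
}
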
static void inc_mv_component_count(int v, nmv_component_counts *comp_counts,
int incr) {
assert (v != 0);
comp_counts->mvcount[MV_MAX + v] += incr;
}
static void inc_mv_component(int v, nmv_component_counts *comp_counts,
int incr, int usehp) {
int s, z, c, o, d, e, f;
@@ -149,6 +171,24 @@ static void inc_mv_component(int v, nmv_component_counts *comp_counts,
}
}
#ifdef SMOOTH_MV_COUNTS
static void smooth_counts(nmv_component_counts *mvcomp) {
static const int flen = 3; // (filter_length + 1) / 2
static const int fval[] = {8, 3, 1};
static const int fvalbits = 4;
int i;
unsigned int smvcount[MV_VALS];
vpx_memcpy(smvcount, mvcomp->mvcount, sizeof(smvcount));
smvcount[MV_MAX] = (smvcount[MV_MAX - 1] + smvcount[MV_MAX + 1]) >> 1;
for (i = flen - 1; i <= MV_VALS - flen; ++i) {
int j, s = smvcount[i] * fval[0];
for (j = 1; j < flen; ++j)
s += (smvcount[i - j] + smvcount[i + j]) * fval[j];
mvcomp->mvcount[i] = (s + (1 << (fvalbits - 1))) >> fvalbits;
}
}
#endif
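
Under SMOOTH_MV_COUNTS the magnitude histogram is convolved with a symmetric 5-tap kernel {1, 3, 8, 3, 1}/16 (fval with fvalbits = 4) before probabilities are computed; the original additionally replaces the zero-magnitude bin with the average of its two neighbours first. The filter itself, restated as a sketch (n <= 1024 assumed):

#include <string.h>

static void smooth_hist(unsigned *h, int n) {
  unsigned tmp[1024];
  int i;
  memcpy(tmp, h, n * sizeof(*h));    /* read from a copy, write in place */
  for (i = 2; i <= n - 3; ++i)
    h[i] = (8 * tmp[i] + 3 * (tmp[i - 1] + tmp[i + 1]) +
            (tmp[i - 2] + tmp[i + 2]) + 8) >> 4;
}
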
static void counts_to_context(nmv_component_counts *mvcomp, int usehp) {
int v;
vpx_memset(mvcomp->sign, 0, sizeof(nmv_component_counts) - sizeof(mvcomp->mvcount));
@@ -158,19 +198,27 @@ static void counts_to_context(nmv_component_counts *mvcomp, int usehp) {
}
}
void vp9_inc_mv(const MV *mv, nmv_context_counts *counts) {
void vp9_increment_nmv(const MV *mv, const MV *ref, nmv_context_counts *mvctx,
int usehp) {
const MV_JOINT_TYPE j = vp9_get_mv_joint(mv);
++counts->joints[j];
mvctx->joints[j]++;
usehp = usehp && vp9_use_mv_hp(ref);
if (mv_joint_vertical(j))
++counts->comps[0].mvcount[MV_MAX + mv->row];
inc_mv_component_count(mv->row, &mvctx->comps[0], 1);
if (mv_joint_horizontal(j))
++counts->comps[1].mvcount[MV_MAX + mv->col];
inc_mv_component_count(mv->col, &mvctx->comps[1], 1);
}
static vp9_prob adapt_prob(vp9_prob prep, const unsigned int ct[2]) {
return merge_probs2(prep, ct, MV_COUNT_SAT, MV_MAX_UPDATE_FACTOR);
static void adapt_prob(vp9_prob *dest, vp9_prob prep, unsigned int ct[2]) {
const int count = MIN(ct[0] + ct[1], MV_COUNT_SAT);
if (count) {
const vp9_prob newp = get_binary_prob(ct[0], ct[1]);
const int factor = MV_MAX_UPDATE_FACTOR * count / MV_COUNT_SAT;
*dest = weighted_prob(prep, newp, factor);
} else {
*dest = prep;
}
}
void vp9_counts_process(nmv_context_counts *nmv_count, int usehp) {
@@ -178,61 +226,212 @@ void vp9_counts_process(nmv_context_counts *nmv_count, int usehp) {
counts_to_context(&nmv_count->comps[1], usehp);
}
void vp9_counts_to_nmv_context(
nmv_context_counts *nmv_count,
nmv_context *prob,
int usehp,
unsigned int (*branch_ct_joint)[2],
unsigned int (*branch_ct_sign)[2],
unsigned int (*branch_ct_classes)[MV_CLASSES - 1][2],
unsigned int (*branch_ct_class0)[CLASS0_SIZE - 1][2],
unsigned int (*branch_ct_bits)[MV_OFFSET_BITS][2],
unsigned int (*branch_ct_class0_fp)[CLASS0_SIZE][4 - 1][2],
unsigned int (*branch_ct_fp)[4 - 1][2],
unsigned int (*branch_ct_class0_hp)[2],
unsigned int (*branch_ct_hp)[2]) {
int i, j, k;
vp9_counts_process(nmv_count, usehp);
vp9_tree_probs_from_distribution(vp9_mv_joint_tree,
prob->joints,
branch_ct_joint,
nmv_count->joints, 0);
for (i = 0; i < 2; ++i) {
const uint32_t s0 = nmv_count->comps[i].sign[0];
const uint32_t s1 = nmv_count->comps[i].sign[1];
prob->comps[i].sign = get_binary_prob(s0, s1);
branch_ct_sign[i][0] = s0;
branch_ct_sign[i][1] = s1;
vp9_tree_probs_from_distribution(vp9_mv_class_tree,
prob->comps[i].classes,
branch_ct_classes[i],
nmv_count->comps[i].classes, 0);
vp9_tree_probs_from_distribution(vp9_mv_class0_tree,
prob->comps[i].class0,
branch_ct_class0[i],
nmv_count->comps[i].class0, 0);
for (j = 0; j < MV_OFFSET_BITS; ++j) {
const uint32_t b0 = nmv_count->comps[i].bits[j][0];
const uint32_t b1 = nmv_count->comps[i].bits[j][1];
prob->comps[i].bits[j] = get_binary_prob(b0, b1);
branch_ct_bits[i][j][0] = b0;
branch_ct_bits[i][j][1] = b1;
}
}
for (i = 0; i < 2; ++i) {
for (k = 0; k < CLASS0_SIZE; ++k) {
vp9_tree_probs_from_distribution(vp9_mv_fp_tree,
prob->comps[i].class0_fp[k],
branch_ct_class0_fp[i][k],
nmv_count->comps[i].class0_fp[k], 0);
}
vp9_tree_probs_from_distribution(vp9_mv_fp_tree,
prob->comps[i].fp,
branch_ct_fp[i],
nmv_count->comps[i].fp, 0);
}
if (usehp) {
for (i = 0; i < 2; ++i) {
const uint32_t c0_hp0 = nmv_count->comps[i].class0_hp[0];
const uint32_t c0_hp1 = nmv_count->comps[i].class0_hp[1];
const uint32_t hp0 = nmv_count->comps[i].hp[0];
const uint32_t hp1 = nmv_count->comps[i].hp[1];
prob->comps[i].class0_hp = get_binary_prob(c0_hp0, c0_hp1);
branch_ct_class0_hp[i][0] = c0_hp0;
branch_ct_class0_hp[i][1] = c0_hp1;
prob->comps[i].hp = get_binary_prob(hp0, hp1);
branch_ct_hp[i][0] = hp0;
branch_ct_hp[i][1] = hp1;
}
}
}
static unsigned int adapt_probs(unsigned int i,
vp9_tree tree,
vp9_prob this_probs[],
const vp9_prob last_probs[],
const unsigned int num_events[]) {
vp9_prob this_prob;
const unsigned int left = tree[i] <= 0
const uint32_t left = tree[i] <= 0
? num_events[-tree[i]]
: adapt_probs(tree[i], tree, this_probs, last_probs, num_events);
const unsigned int right = tree[i + 1] <= 0
const uint32_t right = tree[i + 1] <= 0
? num_events[-tree[i + 1]]
: adapt_probs(tree[i + 1], tree, this_probs, last_probs, num_events);
const unsigned int ct[2] = { left, right };
this_probs[i >> 1] = adapt_prob(last_probs[i >> 1], ct);
uint32_t weight = left + right;
if (weight) {
this_prob = get_binary_prob(left, right);
weight = weight > MV_COUNT_SAT ? MV_COUNT_SAT : weight;
this_prob = weighted_prob(last_probs[i >> 1], this_prob,
MV_MAX_UPDATE_FACTOR * weight / MV_COUNT_SAT);
} else {
this_prob = last_probs[i >> 1];
}
this_probs[i >> 1] = this_prob;
return left + right;
}
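
The recursion above is a bottom-up fold over a vp9-style tree: leaves contribute raw event counts, each internal node receives its two subtree totals, updates one probability, and passes the sum upward. A minimal sketch of just the counting part (tree encoding assumed: entries <= 0 are negated leaf tokens, positive entries index child nodes):

typedef signed char tree_index;  /* stand-in for vp9_tree_index */

static unsigned subtree_events(const tree_index *tree, int i,
                               const unsigned *num_events) {
  const tree_index l = tree[i], r = tree[i + 1];
  const unsigned left  = l <= 0 ? num_events[-l]
                                : subtree_events(tree, l, num_events);
  const unsigned right = r <= 0 ? num_events[-r]
                                : subtree_events(tree, r, num_events);
  /* adapt_probs() would update probs[i >> 1] from {left, right} here. */
  return left + right;
}
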
void vp9_adapt_mv_probs(VP9_COMMON *cm, int allow_hp) {
void vp9_adapt_mv_probs(VP9_COMMON *cm, int usehp) {
int i, j;
#ifdef MV_COUNT_TESTING
printf("joints count: ");
for (j = 0; j < MV_JOINTS; ++j) printf("%d ", cm->fc.NMVcount.joints[j]);
printf("\n"); fflush(stdout);
printf("signs count:\n");
for (i = 0; i < 2; ++i)
printf("%d/%d ", cm->fc.NMVcount.comps[i].sign[0], cm->fc.NMVcount.comps[i].sign[1]);
printf("\n"); fflush(stdout);
printf("classes count:\n");
for (i = 0; i < 2; ++i) {
for (j = 0; j < MV_CLASSES; ++j)
printf("%d ", cm->fc.NMVcount.comps[i].classes[j]);
printf("\n"); fflush(stdout);
}
printf("class0 count:\n");
for (i = 0; i < 2; ++i) {
for (j = 0; j < CLASS0_SIZE; ++j)
printf("%d ", cm->fc.NMVcount.comps[i].class0[j]);
printf("\n"); fflush(stdout);
}
printf("bits count:\n");
for (i = 0; i < 2; ++i) {
for (j = 0; j < MV_OFFSET_BITS; ++j)
printf("%d/%d ", cm->fc.NMVcount.comps[i].bits[j][0],
cm->fc.NMVcount.comps[i].bits[j][1]);
printf("\n"); fflush(stdout);
}
printf("class0_fp count:\n");
for (i = 0; i < 2; ++i) {
for (j = 0; j < CLASS0_SIZE; ++j) {
printf("{");
for (k = 0; k < 4; ++k)
printf("%d ", cm->fc.NMVcount.comps[i].class0_fp[j][k]);
printf("}, ");
}
printf("\n"); fflush(stdout);
}
printf("fp count:\n");
for (i = 0; i < 2; ++i) {
for (j = 0; j < 4; ++j)
printf("%d ", cm->fc.NMVcount.comps[i].fp[j]);
printf("\n"); fflush(stdout);
}
if (usehp) {
printf("class0_hp count:\n");
for (i = 0; i < 2; ++i)
printf("%d/%d ", cm->fc.NMVcount.comps[i].class0_hp[0],
cm->fc.NMVcount.comps[i].class0_hp[1]);
printf("\n"); fflush(stdout);
printf("hp count:\n");
for (i = 0; i < 2; ++i)
printf("%d/%d ", cm->fc.NMVcount.comps[i].hp[0],
cm->fc.NMVcount.comps[i].hp[1]);
printf("\n"); fflush(stdout);
}
#endif
#ifdef SMOOTH_MV_COUNTS
smooth_counts(&cm->fc.NMVcount.comps[0]);
smooth_counts(&cm->fc.NMVcount.comps[1]);
#endif
vp9_counts_process(&cm->fc.NMVcount, usehp);
FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx];
nmv_context *ctx = &cm->fc.nmvc;
nmv_context *pre_ctx = &pre_fc->nmvc;
nmv_context_counts *cts = &cm->counts.mv;
vp9_counts_process(cts, allow_hp);
adapt_probs(0, vp9_mv_joint_tree, ctx->joints, pre_ctx->joints, cts->joints);
adapt_probs(0, vp9_mv_joint_tree,
cm->fc.nmvc.joints, cm->fc.pre_nmvc.joints,
cm->fc.NMVcount.joints);
for (i = 0; i < 2; ++i) {
ctx->comps[i].sign = adapt_prob(pre_ctx->comps[i].sign, cts->comps[i].sign);
adapt_probs(0, vp9_mv_class_tree, ctx->comps[i].classes,
pre_ctx->comps[i].classes, cts->comps[i].classes);
adapt_probs(0, vp9_mv_class0_tree, ctx->comps[i].class0,
pre_ctx->comps[i].class0, cts->comps[i].class0);
for (j = 0; j < MV_OFFSET_BITS; ++j)
ctx->comps[i].bits[j] = adapt_prob(pre_ctx->comps[i].bits[j],
cts->comps[i].bits[j]);
for (j = 0; j < CLASS0_SIZE; ++j)
adapt_probs(0, vp9_mv_fp_tree, ctx->comps[i].class0_fp[j],
pre_ctx->comps[i].class0_fp[j], cts->comps[i].class0_fp[j]);
adapt_probs(0, vp9_mv_fp_tree, ctx->comps[i].fp, pre_ctx->comps[i].fp,
cts->comps[i].fp);
if (allow_hp) {
ctx->comps[i].class0_hp = adapt_prob(pre_ctx->comps[i].class0_hp,
cts->comps[i].class0_hp);
ctx->comps[i].hp = adapt_prob(pre_ctx->comps[i].hp, cts->comps[i].hp);
adapt_prob(&cm->fc.nmvc.comps[i].sign,
cm->fc.pre_nmvc.comps[i].sign,
cm->fc.NMVcount.comps[i].sign);
adapt_probs(0, vp9_mv_class_tree,
cm->fc.nmvc.comps[i].classes, cm->fc.pre_nmvc.comps[i].classes,
cm->fc.NMVcount.comps[i].classes);
adapt_probs(0, vp9_mv_class0_tree,
cm->fc.nmvc.comps[i].class0, cm->fc.pre_nmvc.comps[i].class0,
cm->fc.NMVcount.comps[i].class0);
for (j = 0; j < MV_OFFSET_BITS; ++j) {
adapt_prob(&cm->fc.nmvc.comps[i].bits[j],
cm->fc.pre_nmvc.comps[i].bits[j],
cm->fc.NMVcount.comps[i].bits[j]);
}
}
for (i = 0; i < 2; ++i) {
for (j = 0; j < CLASS0_SIZE; ++j) {
adapt_probs(0, vp9_mv_fp_tree,
cm->fc.nmvc.comps[i].class0_fp[j],
cm->fc.pre_nmvc.comps[i].class0_fp[j],
cm->fc.NMVcount.comps[i].class0_fp[j]);
}
adapt_probs(0, vp9_mv_fp_tree,
cm->fc.nmvc.comps[i].fp,
cm->fc.pre_nmvc.comps[i].fp,
cm->fc.NMVcount.comps[i].fp);
}
if (usehp) {
for (i = 0; i < 2; ++i) {
adapt_prob(&cm->fc.nmvc.comps[i].class0_hp,
cm->fc.pre_nmvc.comps[i].class0_hp,
cm->fc.NMVcount.comps[i].class0_hp);
adapt_prob(&cm->fc.nmvc.comps[i].hp,
cm->fc.pre_nmvc.comps[i].hp,
cm->fc.NMVcount.comps[i].hp);
}
}
}
@@ -245,5 +444,5 @@ void vp9_entropy_mv_init() {
}
void vp9_init_mv_probs(VP9_COMMON *cm) {
cm->fc.nmvc = default_nmv_context;
vpx_memcpy(&cm->fc.nmvc, &vp9_default_nmv_context, sizeof(nmv_context));
}


@@ -26,6 +26,10 @@ int vp9_use_mv_hp(const MV *ref);
#define VP9_NMV_UPDATE_PROB 252
//#define MV_GROUP_UPDATE
#define LOW_PRECISION_MV_UPDATE /* Use 7 bit forward update */
/* Symbols for coding which components are zero jointly */
#define MV_JOINTS 4
typedef enum {
@@ -95,14 +99,7 @@ typedef struct {
nmv_component comps[2];
} nmv_context;
static INLINE MV_JOINT_TYPE vp9_get_mv_joint(const MV *mv) {
if (mv->row == 0) {
return mv->col == 0 ? MV_JOINT_ZERO : MV_JOINT_HNZVZ;
} else {
return mv->col == 0 ? MV_JOINT_HZVNZ : MV_JOINT_HNZVNZ;
}
}
MV_JOINT_TYPE vp9_get_mv_joint(const MV *mv);
MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset);
int vp9_get_mv_mag(MV_CLASS_TYPE c, int offset);
@@ -124,8 +121,22 @@ typedef struct {
nmv_component_counts comps[2];
} nmv_context_counts;
void vp9_inc_mv(const MV *mv, nmv_context_counts *mvctx);
void vp9_increment_nmv(const MV *mv, const MV *ref, nmv_context_counts *mvctx,
int usehp);
extern const nmv_context vp9_default_nmv_context;
void vp9_counts_to_nmv_context(
nmv_context_counts *NMVcount,
nmv_context *prob,
int usehp,
unsigned int (*branch_ct_joint)[2],
unsigned int (*branch_ct_sign)[2],
unsigned int (*branch_ct_classes)[MV_CLASSES - 1][2],
unsigned int (*branch_ct_class0)[CLASS0_SIZE - 1][2],
unsigned int (*branch_ct_bits)[MV_OFFSET_BITS][2],
unsigned int (*branch_ct_class0_fp)[CLASS0_SIZE][4 - 1][2],
unsigned int (*branch_ct_fp)[4 - 1][2],
unsigned int (*branch_ct_class0_hp)[2],
unsigned int (*branch_ct_hp)[2]);
void vp9_counts_process(nmv_context_counts *NMVcount, int usehp);
#endif // VP9_COMMON_VP9_ENTROPYMV_H_


@@ -14,28 +14,24 @@
#include "./vpx_config.h"
#define LOG2_MI_SIZE 3
#define LOG2_MI_BLOCK_SIZE (6 - LOG2_MI_SIZE) // 64 = 2^6
#define MAX_BLOCK_SIZE (1 << 6) // max block size in pixel
#define MI_SIZE (1 << LOG2_MI_SIZE) // pixels per mi-unit
#define MI_BLOCK_SIZE (1 << LOG2_MI_BLOCK_SIZE) // mi-units per max block
#define MI_MASK (MI_BLOCK_SIZE - 1)
#define MI_SIZE (1 << LOG2_MI_SIZE)
#define MI_MASK ((64 >> LOG2_MI_SIZE) - 1)
typedef enum BLOCK_SIZE_TYPE {
BLOCK_4X4,
BLOCK_4X8,
BLOCK_8X4,
BLOCK_8X8,
BLOCK_8X16,
BLOCK_16X8,
BLOCK_16X16,
BLOCK_16X32,
BLOCK_32X16,
BLOCK_32X32,
BLOCK_32X64,
BLOCK_64X32,
BLOCK_64X64,
BLOCK_SIZE_AB4X4,
BLOCK_SIZE_SB4X8,
BLOCK_SIZE_SB8X4,
BLOCK_SIZE_SB8X8,
BLOCK_SIZE_SB8X16,
BLOCK_SIZE_SB16X8,
BLOCK_SIZE_MB16X16,
BLOCK_SIZE_SB16X32,
BLOCK_SIZE_SB32X16,
BLOCK_SIZE_SB32X32,
BLOCK_SIZE_SB32X64,
BLOCK_SIZE_SB64X32,
BLOCK_SIZE_SB64X64,
BLOCK_SIZE_TYPES
} BLOCK_SIZE_TYPE;
@@ -44,34 +40,10 @@ typedef enum PARTITION_TYPE {
PARTITION_HORZ,
PARTITION_VERT,
PARTITION_SPLIT,
PARTITION_TYPES, PARTITION_INVALID = PARTITION_TYPES
PARTITION_TYPES
} PARTITION_TYPE;
#define PARTITION_PLOFFSET 4 // number of probability models per block size
#define NUM_PARTITION_CONTEXTS (4 * PARTITION_PLOFFSET)
typedef enum {
TX_4X4 = 0, // 4x4 dct transform
TX_8X8 = 1, // 8x8 dct transform
TX_16X16 = 2, // 16x16 dct transform
TX_32X32 = 3, // 32x32 dct transform
TX_SIZES
} TX_SIZE;
typedef enum {
ONLY_4X4 = 0,
ALLOW_8X8 = 1,
ALLOW_16X16 = 2,
ALLOW_32X32 = 3,
TX_MODE_SELECT = 4,
TX_MODES = 5,
} TX_MODE;
typedef enum {
DCT_DCT = 0, // DCT in both horizontal and vertical
ADST_DCT = 1, // ADST in vertical, DCT in horizontal
DCT_ADST = 2, // DCT in vertical, ADST in horizontal
ADST_ADST = 3 // ADST in both directions
} TX_TYPE;
#endif // VP9_COMMON_VP9_ENUMS_H_


@@ -8,10 +8,8 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vpx_mem/vpx_mem.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_extend.h"
#include "vpx_mem/vpx_mem.h"
static void copy_and_extend_plane(const uint8_t *src, int src_pitch,
uint8_t *dst, int dst_pitch,
@@ -109,14 +107,14 @@ void vp9_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src,
const int src_y_offset = srcy * src->y_stride + srcx;
const int dst_y_offset = srcy * dst->y_stride + srcx;
const int et_uv = ROUND_POWER_OF_TWO(et_y, 1);
const int el_uv = ROUND_POWER_OF_TWO(el_y, 1);
const int eb_uv = ROUND_POWER_OF_TWO(eb_y, 1);
const int er_uv = ROUND_POWER_OF_TWO(er_y, 1);
const int et_uv = (et_y + 1) >> 1;
const int el_uv = (el_y + 1) >> 1;
const int eb_uv = (eb_y + 1) >> 1;
const int er_uv = (er_y + 1) >> 1;
const int src_uv_offset = ((srcy * src->uv_stride) >> 1) + (srcx >> 1);
const int dst_uv_offset = ((srcy * dst->uv_stride) >> 1) + (srcx >> 1);
const int srch_uv = ROUND_POWER_OF_TWO(srch, 1);
const int srcw_uv = ROUND_POWER_OF_TWO(srcw, 1);
const int srch_uv = (srch + 1) >> 1;
const int srcw_uv = (srcw + 1) >> 1;
copy_and_extend_plane(src->y_buffer + src_y_offset, src->y_stride,
dst->y_buffer + dst_y_offset, dst->y_stride,


@@ -14,13 +14,12 @@
#include "vp9/common/vp9_mvref_common.h"
#include "vp9/common/vp9_sadmxn.h"
static void lower_mv_precision(MV *mv, int allow_hp) {
const int use_hp = allow_hp && vp9_use_mv_hp(mv);
if (!use_hp) {
if (mv->row & 1)
mv->row += (mv->row > 0 ? -1 : 1);
if (mv->col & 1)
mv->col += (mv->col > 0 ? -1 : 1);
static void lower_mv_precision(int_mv *mv, int usehp) {
if (!usehp || !vp9_use_mv_hp(&mv->as_mv)) {
if (mv->as_mv.row & 1)
mv->as_mv.row += (mv->as_mv.row > 0 ? -1 : 1);
if (mv->as_mv.col & 1)
mv->as_mv.col += (mv->as_mv.col > 0 ? -1 : 1);
}
}
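
With high precision off, an odd (1/8-pel) component is nudged one step toward zero so it lands on a 1/4-pel position and its magnitude never grows. A tiny sketch:

/* to_quarter_pel(5) == 4, to_quarter_pel(-5) == -4, to_quarter_pel(6) == 6 */
static short to_quarter_pel(short v) {
  return (short)((v & 1) ? v + (v > 0 ? -1 : 1) : v);
}
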
@@ -32,8 +31,8 @@ void vp9_find_best_ref_mvs(MACROBLOCKD *xd,
int i;
// Make sure all the candidates are properly clamped etc
for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) {
lower_mv_precision(&mvlist[i].as_mv, xd->allow_high_precision_mv);
clamp_mv2(&mvlist[i].as_mv, xd);
lower_mv_precision(&mvlist[i], xd->allow_high_precision_mv);
clamp_mv2(&mvlist[i], xd);
}
*nearest = mvlist[0];
*near = mvlist[1];
@@ -42,8 +41,7 @@ void vp9_find_best_ref_mvs(MACROBLOCKD *xd,
void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
int_mv *dst_nearest,
int_mv *dst_near,
int block_idx, int ref_idx,
int mi_row, int mi_col) {
int block_idx, int ref_idx) {
int_mv dst_list[MAX_MV_REF_CANDIDATES];
int_mv mv_list[MAX_MV_REF_CANDIDATES];
MODE_INFO *mi = xd->mode_info_context;
@@ -55,8 +53,7 @@ void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
vp9_find_mv_refs_idx(cm, xd, xd->mode_info_context,
xd->prev_mode_info_context,
mbmi->ref_frame[ref_idx],
mv_list, cm->ref_frame_sign_bias, block_idx,
mi_row, mi_col);
mv_list, cm->ref_frame_sign_bias, block_idx);
dst_list[1].as_int = 0;
if (block_idx == 0) {


@@ -28,20 +28,53 @@ void vp9_find_best_ref_mvs(MACROBLOCKD *xd,
int_mv *nearest,
int_mv *near);
static void mv_bias(int refmb_ref_frame_sign_bias, int refframe,
int_mv *mvp, const int *ref_frame_sign_bias) {
MV xmv = mvp->as_mv;
if (refmb_ref_frame_sign_bias != ref_frame_sign_bias[refframe]) {
xmv.row *= -1;
xmv.col *= -1;
}
mvp->as_mv = xmv;
}
// TODO(jingning): this mv clamping function should be block size dependent.
static void clamp_mv2(MV *mv, const MACROBLOCKD *xd) {
clamp_mv(mv, xd->mb_to_left_edge - LEFT_TOP_MARGIN,
xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN,
xd->mb_to_top_edge - LEFT_TOP_MARGIN,
xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
static void clamp_mv(int_mv *mv,
int mb_to_left_edge,
int mb_to_right_edge,
int mb_to_top_edge,
int mb_to_bottom_edge) {
mv->as_mv.col = clamp(mv->as_mv.col, mb_to_left_edge, mb_to_right_edge);
mv->as_mv.row = clamp(mv->as_mv.row, mb_to_top_edge, mb_to_bottom_edge);
}
static int clamp_mv2(int_mv *mv, const MACROBLOCKD *xd) {
int_mv tmp_mv;
tmp_mv.as_int = mv->as_int;
clamp_mv(mv,
xd->mb_to_left_edge - LEFT_TOP_MARGIN,
xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN,
xd->mb_to_top_edge - LEFT_TOP_MARGIN,
xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
return tmp_mv.as_int != mv->as_int;
}
static int check_mv_bounds(int_mv *mv,
int mb_to_left_edge, int mb_to_right_edge,
int mb_to_top_edge, int mb_to_bottom_edge) {
return mv->as_mv.col < mb_to_left_edge ||
mv->as_mv.col > mb_to_right_edge ||
mv->as_mv.row < mb_to_top_edge ||
mv->as_mv.row > mb_to_bottom_edge;
}
void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *pc,
MACROBLOCKD *xd,
int_mv *dst_nearest,
int_mv *dst_near,
int block_idx, int ref_idx,
int mi_row, int mi_col);
int block_idx, int ref_idx);
static MB_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, int b) {
// FIXME(rbultje, jingning): temporary hack because jenkins doesn't
@@ -50,16 +83,16 @@ static MB_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, int b) {
/* On L edge, get from MB to left of us */
--cur_mb;
if (is_inter_block(&cur_mb->mbmi)) {
if (cur_mb->mbmi.ref_frame[0] != INTRA_FRAME) {
return DC_PRED;
} else if (cur_mb->mbmi.sb_type < BLOCK_8X8) {
return (cur_mb->bmi + 1 + b)->as_mode;
} else if (cur_mb->mbmi.sb_type < BLOCK_SIZE_SB8X8) {
return ((cur_mb->bmi + 1 + b)->as_mode.first);
} else {
return cur_mb->mbmi.mode;
}
}
assert(b == 1 || b == 3);
return (cur_mb->bmi + b - 1)->as_mode;
return (cur_mb->bmi + b - 1)->as_mode.first;
}
static MB_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb,
@@ -68,16 +101,16 @@ static MB_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb,
/* On top edge, get from MB above us */
cur_mb -= mi_stride;
if (is_inter_block(&cur_mb->mbmi)) {
if (cur_mb->mbmi.ref_frame[0] != INTRA_FRAME) {
return DC_PRED;
} else if (cur_mb->mbmi.sb_type < BLOCK_8X8) {
return (cur_mb->bmi + 2 + b)->as_mode;
} else if (cur_mb->mbmi.sb_type < BLOCK_SIZE_SB8X8) {
return ((cur_mb->bmi + 2 + b)->as_mode.first);
} else {
return cur_mb->mbmi.mode;
}
}
return (cur_mb->bmi + b - 2)->as_mode;
return (cur_mb->bmi + b - 2)->as_mode.first;
}
#endif // VP9_COMMON_VP9_FINDNEARMV_H_


@@ -124,7 +124,9 @@ void vp9_short_idct4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
// Rows
for (i = 0; i < 4; ++i) {
vp9_idct4_1d(input, outptr);
for (j = 0; j < 4; ++j)
temp_in[j] = input[j];
vp9_idct4_1d(temp_in, outptr);
input += 4;
outptr += 4;
}
@@ -156,6 +158,23 @@ void vp9_short_idct4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
}
}
void vp9_dc_only_idct_add_c(int input_dc, uint8_t *pred_ptr,
uint8_t *dst_ptr, int pitch, int stride) {
int a1;
int r, c;
int16_t out = dct_const_round_shift(input_dc * cospi_16_64);
out = dct_const_round_shift(out * cospi_16_64);
a1 = ROUND_POWER_OF_TWO(out, 4);
for (r = 0; r < 4; r++) {
for (c = 0; c < 4; c++)
dst_ptr[c] = clip_pixel(a1 + pred_ptr[c]);
dst_ptr += stride;
pred_ptr += pitch;
}
}
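For the DC-only path above: cospi_16_64 = round(16384 * cos(pi/4)) = 11585, so each dct_const_round_shift(x * cospi_16_64) is roughly x / sqrt(2); applying it twice plus ROUND_POWER_OF_TWO(out, 4) mirrors the row pass, column pass, and final shift of the full 4x4 inverse for a DC-only block. For example, input_dc = 64 gives 45, then 32, then a1 = (32 + 8) >> 4 = 2.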
static void idct8_1d(int16_t *input, int16_t *output) {
int16_t step1[8], step2[8];
int temp1, temp2;
@@ -225,19 +244,6 @@ void vp9_short_idct8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
}
}
void vp9_short_idct8x8_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
int i, j;
int a1;
int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
out = dct_const_round_shift(out * cospi_16_64);
a1 = ROUND_POWER_OF_TWO(out, 5);
for (j = 0; j < 8; ++j) {
for (i = 0; i < 8; ++i)
dest[i] = clip_pixel(dest[i] + a1);
dest += dest_stride;
}
}
static void iadst4_1d(int16_t *input, int16_t *output) {
int s0, s1, s2, s3, s4, s5, s6, s7;
@@ -422,11 +428,12 @@ void vp9_short_iht8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride,
void vp9_short_idct10_8x8_add_c(int16_t *input, uint8_t *dest,
int dest_stride) {
int16_t out[8 * 8] = { 0 };
int16_t out[8 * 8];
int16_t *outptr = out;
int i, j;
int16_t temp_in[8], temp_out[8];
vpx_memset(out, 0, sizeof(out));
// First transform rows
// only the first 4 rows have non-zero coefs
for (i = 0; i < 4; ++i) {
@@ -446,6 +453,12 @@ void vp9_short_idct10_8x8_add_c(int16_t *input, uint8_t *dest,
}
}
void vp9_short_idct1_8x8_c(int16_t *input, int16_t *output) {
int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
out = dct_const_round_shift(out * cospi_16_64);
output[0] = ROUND_POWER_OF_TWO(out, 5);
}
static void idct16_1d(int16_t *input, int16_t *output) {
int16_t step1[16], step2[16];
int temp1, temp2;
@@ -522,7 +535,6 @@ static void idct16_1d(int16_t *input, int16_t *output) {
step1[14] = -step2[14] + step2[15];
step1[15] = step2[14] + step2[15];
// stage 4
temp1 = (step1[0] + step1[1]) * cospi_16_64;
temp2 = (step1[0] - step1[1]) * cospi_16_64;
step2[0] = dct_const_round_shift(temp1);
@@ -840,13 +852,15 @@ void vp9_short_iht16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride,
void vp9_short_idct10_16x16_add_c(int16_t *input, uint8_t *dest,
int dest_stride) {
int16_t out[16 * 16] = { 0 };
int16_t out[16 * 16];
int16_t *outptr = out;
int i, j;
int16_t temp_in[16], temp_out[16];
// First transform rows. Since all non-zero dct coefficients are in
// upper-left 4x4 area, we only need to calculate first 4 rows here.
/* First transform rows. Since all non-zero dct coefficients are in
* upper-left 4x4 area, we only need to calculate first 4 rows here.
*/
vpx_memset(out, 0, sizeof(out));
for (i = 0; i < 4; ++i) {
idct16_1d(input, outptr);
input += 16;
@@ -864,18 +878,10 @@ void vp9_short_idct10_16x16_add_c(int16_t *input, uint8_t *dest,
}
}
void vp9_short_idct16x16_1_add_c(int16_t *input, uint8_t *dest,
int dest_stride) {
int i, j;
int a1;
void vp9_short_idct1_16x16_c(int16_t *input, int16_t *output) {
int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
out = dct_const_round_shift(out * cospi_16_64);
a1 = ROUND_POWER_OF_TWO(out, 6);
for (j = 0; j < 16; ++j) {
for (i = 0; i < 16; ++i)
dest[i] = clip_pixel(dest[i] + a1);
dest += dest_stride;
}
output[0] = ROUND_POWER_OF_TWO(out, 6);
}
static void idct32_1d(int16_t *input, int16_t *output) {
@@ -1274,3 +1280,31 @@ void vp9_short_idct1_32x32_c(int16_t *input, int16_t *output) {
out = dct_const_round_shift(out * cospi_16_64);
output[0] = ROUND_POWER_OF_TWO(out, 6);
}
void vp9_short_idct10_32x32_add_c(int16_t *input, uint8_t *dest,
int dest_stride) {
int16_t out[32 * 32];
int16_t *outptr = out;
int i, j;
int16_t temp_in[32], temp_out[32];
/* First transform rows. Since all non-zero dct coefficients are in
* upper-left 4x4 area, we only need to calculate first 4 rows here.
*/
vpx_memset(out, 0, sizeof(out));
for (i = 0; i < 4; ++i) {
idct32_1d(input, outptr);
input += 32;
outptr += 32;
}
// Columns
for (i = 0; i < 32; ++i) {
for (j = 0; j < 32; ++j)
temp_in[j] = out[j * 32 + i];
idct32_1d(temp_in, temp_out);
for (j = 0; j < 32; ++j)
dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+ dest[j * dest_stride + i]);
}
}


@@ -22,15 +22,10 @@
#define DCT_CONST_BITS 14
#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))
#define WHT_UPSCALE_FACTOR 2
#define pair_set_epi16(a, b) \
_mm_set1_epi32(((uint16_t)(a)) + (((uint16_t)(b)) << 16))
// Constants:
// for (int i = 1; i< 32; ++i)
// printf("static const int cospi_%d_64 = %.0f;\n", i,
// round(16384 * cos(i*M_PI/64)));
// Constants are round(16384 * cos(k*Pi/64)) where k = 1 to 31.
// Note: sin(k*Pi/64) = cos((32-k)*Pi/64)
static const int cospi_1_64 = 16364;
static const int cospi_2_64 = 16305;


@@ -0,0 +1,253 @@
/*
* Copyright (c) 2012 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vp9/common/vp9_onyxc_int.h"
#define MAX_REGIONS 24000
#ifndef NULL
#define NULL 0
#endif
#define min_mbs_in_region 3
// this linked list structure holds equivalences for connected
// component labeling
struct list_el {
int label;
int seg_value;
int count;
struct list_el *next;
};
typedef struct list_el item;
// connected color segments
typedef struct {
int min_x;
int min_y;
int max_x;
int max_y;
int64_t sum_x;
int64_t sum_y;
int pixels;
int seg_value;
int label;
} segment_info;
typedef enum {
SEGMENT_MODE,
SEGMENT_MV,
SEGMENT_REFFRAME,
SEGMENT_SKIPPED
} SEGMENT_TYPE;
// this merges the two equivalence lists and
// then makes sure that every label points to the same
// equivalence list
void merge(item *labels, int u, int v) {
item *a = labels[u].next;
item *b = labels[v].next;
item c;
item *it = &c;
int count;
// check if they are already merged
if (u == v || a == b)
return;
count = a->count + b->count;
// merge 2 sorted linked lists.
while (a != NULL && b != NULL) {
if (a->label < b->label) {
it->next = a;
a = a->next;
} else {
it->next = b;
b = b->next;
}
it = it->next;
}
if (a == NULL)
it->next = b;
else
it->next = a;
it = c.next;
// make sure every equivalence in the linked list points to this new ll
while (it != NULL) {
labels[it->label].next = c.next;
it = it->next;
}
c.next->count = count;
}
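A hedged trace of merge() on hypothetical labels: suppose labels 2 and 5 were found equivalent, holding sorted equivalence lists {2} and {5, 7}.
// merge(labels, 2, 5):
//   splices the two sorted lists into {2, 5, 7}, then walks the merged
//   list so labels[2].next, labels[5].next and labels[7].next all point
//   at the same head, whose count becomes the sum of both counts.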
void segment_via_mode_info(VP9_COMMON *oci, int how) {
MODE_INFO *mi = oci->mi;
int i, j;
int mb_index = 0;
int label = 1;
int pitch = oci->mb_cols;
// holds linked list equivalences
// the max should probably be allocated at a higher level in oci
item equivalences[MAX_REGIONS];
int eq_ptr = 0;
item labels[MAX_REGIONS];
segment_info segments[MAX_REGIONS];
int label_count = 1;
int labeling[400 * 300];
int *lp = labeling;
label_count = 1;
memset(labels, 0, sizeof(labels));
memset(segments, 0, sizeof(segments));
/* Go through each macroblock first pass labelling */
for (i = 0; i < oci->mb_rows; i++, lp += pitch) {
for (j = 0; j < oci->mb_cols; j++) {
// above seg_value (a), left seg_value (l), this block's seg_value (n)
int a = -1, l = -1, n = -1;
// above label, left label
int al = -1, ll = -1;
if (i) {
al = lp[j - pitch];
a = labels[al].next->seg_value;
}
if (j) {
ll = lp[j - 1];
l = labels[ll].next->seg_value;
}
// select which mode-info field the implicit segmentation keys on
switch (how) {
case SEGMENT_MODE:
n = mi[mb_index].mbmi.mode;
break;
case SEGMENT_MV:
n = mi[mb_index].mbmi.mv[0].as_int;
if (mi[mb_index].mbmi.ref_frame[0] == INTRA_FRAME)
n = -9999999;
break;
case SEGMENT_REFFRAME:
n = mi[mb_index].mbmi.ref_frame[0];
break;
case SEGMENT_SKIPPED:
n = mi[mb_index].mbmi.mb_skip_coeff;
break;
}
// above and left both have the same seg_value
if (n == a && n == l) {
// pick the lowest label
lp[j] = (al < ll ? al : ll);
labels[lp[j]].next->count++;
// merge the above and left equivalencies
merge(labels, al, ll);
}
// this matches above seg_value
else if (n == a) {
// give it the same label as above
lp[j] = al;
labels[al].next->count++;
}
// this matches left seg_value
else if (n == l) {
// give it the same label as left
lp[j] = ll;
labels[ll].next->count++;
} else {
// new label doesn't match either
item *e = &labels[label];
item *nl = &equivalences[eq_ptr++];
lp[j] = label;
nl->label = label;
nl->next = 0;
nl->seg_value = n;
nl->count = 1;
e->next = nl;
label++;
}
mb_index++;
}
mb_index++;
}
lp = labeling;
// give new labels to regions
for (i = 1; i < label; i++)
if (labels[i].next->count > min_mbs_in_region &&
labels[labels[i].next->label].label == 0) {
segment_info *cs = &segments[label_count];
cs->label = label_count;
labels[labels[i].next->label].label = label_count++;
labels[labels[i].next->label].seg_value = labels[i].next->seg_value;
cs->seg_value = labels[labels[i].next->label].seg_value;
cs->min_x = oci->mb_cols;
cs->min_y = oci->mb_rows;
cs->max_x = 0;
cs->max_y = 0;
cs->sum_x = 0;
cs->sum_y = 0;
cs->pixels = 0;
}
lp = labeling;
// this is just to gather stats...
for (i = 0; i < oci->mb_rows; i++, lp += pitch) {
for (j = 0; j < oci->mb_cols; j++) {
const int old_lab = labels[lp[j]].next->label;
const int lab = labels[old_lab].label;
segment_info *cs = &segments[lab];
cs->min_x = MIN(cs->min_x, j);
cs->max_x = MAX(cs->max_x, j);
cs->min_y = MIN(cs->min_y, i);
cs->max_y = MAX(cs->max_y, i);
cs->sum_x += j;
cs->sum_y += i;
cs->pixels++;
lp[j] = lab;
mb_index++;
}
mb_index++;
}
{
lp = labeling;
printf("labelling \n");
mb_index = 0;
for (i = 0; i < oci->mb_rows; i++, lp += pitch) {
for (j = 0; j < oci->mb_cols; j++) {
printf("%4d", lp[j]);
}
printf(" ");
for (j = 0; j < oci->mb_cols; j++, mb_index++) {
// printf("%3d",mi[mb_index].mbmi.mode );
printf("%4d:%4d", mi[mb_index].mbmi.mv[0].as_mv.row,
mi[mb_index].mbmi.mv[0].as_mv.col);
}
printf("\n");
++mb_index;
}
printf("\n");
}
}


@@ -16,12 +16,6 @@
#include "vp9/common/vp9_seg_common.h"
struct loop_filter_info {
const uint8_t *mblim;
const uint8_t *lim;
const uint8_t *hev_thr;
};
static void lf_init_lut(loop_filter_info_n *lfi) {
lfi->mode_lf_lut[DC_PRED] = 0;
lfi->mode_lf_lut[D45_PRED] = 0;
@@ -39,13 +33,18 @@ static void lf_init_lut(loop_filter_info_n *lfi) {
lfi->mode_lf_lut[NEWMV] = 1;
}
static void update_sharpness(loop_filter_info_n *const lfi, int sharpness_lvl) {
int lvl;
void vp9_loop_filter_update_sharpness(loop_filter_info_n *lfi,
int sharpness_lvl) {
int i;
// For each possible value for the loop filter fill out limits
for (lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++) {
// Set loop filter parameters that control sharpness.
int block_inside_limit = lvl >> ((sharpness_lvl > 0) + (sharpness_lvl > 4));
/* For each possible value for the loop filter fill out limits */
for (i = 0; i <= MAX_LOOP_FILTER; i++) {
int filt_lvl = i;
int block_inside_limit = 0;
/* Set loop filter parameters that control sharpness. */
block_inside_limit = filt_lvl >> (sharpness_lvl > 0);
block_inside_limit = block_inside_limit >> (sharpness_lvl > 4);
if (sharpness_lvl > 0) {
if (block_inside_limit > (9 - sharpness_lvl))
@@ -55,19 +54,21 @@ static void update_sharpness(loop_filter_info_n *const lfi, int sharpness_lvl) {
if (block_inside_limit < 1)
block_inside_limit = 1;
vpx_memset(lfi->lim[lvl], block_inside_limit, SIMD_WIDTH);
vpx_memset(lfi->mblim[lvl], (2 * (lvl + 2) + block_inside_limit),
vpx_memset(lfi->lim[i], block_inside_limit, SIMD_WIDTH);
vpx_memset(lfi->blim[i], (2 * filt_lvl + block_inside_limit),
SIMD_WIDTH);
vpx_memset(lfi->mblim[i], (2 * (filt_lvl + 2) + block_inside_limit),
SIMD_WIDTH);
}
}
void vp9_loop_filter_init(VP9_COMMON *cm, struct loopfilter *lf) {
void vp9_loop_filter_init(VP9_COMMON *cm) {
loop_filter_info_n *lfi = &cm->lf_info;
int i;
// init limits for given sharpness
update_sharpness(lfi, lf->sharpness_level);
lf->last_sharpness_level = lf->sharpness_level;
vp9_loop_filter_update_sharpness(lfi, cm->sharpness_level);
cm->last_sharpness_level = cm->sharpness_level;
// init LUT for lvl and hev thr picking
lf_init_lut(lfi);
@@ -77,69 +78,98 @@ void vp9_loop_filter_init(VP9_COMMON *cm, struct loopfilter *lf) {
vpx_memset(lfi->hev_thr[i], i, SIMD_WIDTH);
}
void vp9_loop_filter_frame_init(VP9_COMMON *const cm, MACROBLOCKD *const xd,
void vp9_loop_filter_frame_init(VP9_COMMON *cm,
MACROBLOCKD *xd,
int default_filt_lvl) {
int seg_id;
int seg, // segment number
ref, // index in ref_lf_deltas
mode; // index in mode_lf_deltas
// n_shift selects the scaling of lf_deltas: deltas are multiplied
// by 1 (n_shift == 0) when filter_lvl is between 0 and 31, and
// by 2 (n_shift == 1) when filter_lvl is between 32 and 63
const int n_shift = default_filt_lvl >> 5;
loop_filter_info_n *const lfi = &cm->lf_info;
struct loopfilter *const lf = &xd->lf;
struct segmentation *const seg = &xd->seg;
int n_shift = default_filt_lvl >> 5;
// update limits if sharpness has changed
if (lf->last_sharpness_level != lf->sharpness_level) {
update_sharpness(lfi, lf->sharpness_level);
lf->last_sharpness_level = lf->sharpness_level;
loop_filter_info_n *lfi = &cm->lf_info;
/* update limits if sharpness has changed */
// printf("vp9_loop_filter_frame_init %d\n", default_filt_lvl);
// printf("sharpness level: %d [%d]\n",
// cm->sharpness_level, cm->last_sharpness_level);
if (cm->last_sharpness_level != cm->sharpness_level) {
vp9_loop_filter_update_sharpness(lfi, cm->sharpness_level);
cm->last_sharpness_level = cm->sharpness_level;
}
for (seg_id = 0; seg_id < MAX_SEGMENTS; seg_id++) {
int lvl_seg = default_filt_lvl, ref, mode, intra_lvl;
for (seg = 0; seg < MAX_MB_SEGMENTS; seg++) {
int lvl_seg = default_filt_lvl;
int lvl_ref, lvl_mode;
// Set the baseline filter values for each segment
if (vp9_segfeature_active(&xd->seg, seg_id, SEG_LVL_ALT_LF)) {
const int data = vp9_get_segdata(seg, seg_id, SEG_LVL_ALT_LF);
lvl_seg = seg->abs_delta == SEGMENT_ABSDATA
? data
: clamp(default_filt_lvl + data, 0, MAX_LOOP_FILTER);
if (vp9_segfeature_active(xd, seg, SEG_LVL_ALT_LF)) {
/* Abs value */
if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA) {
lvl_seg = vp9_get_segdata(xd, seg, SEG_LVL_ALT_LF);
} else { /* Delta Value */
lvl_seg += vp9_get_segdata(xd, seg, SEG_LVL_ALT_LF);
lvl_seg = clamp(lvl_seg, 0, 63);
}
}
if (!lf->mode_ref_delta_enabled) {
// we could get rid of this if we assume that deltas are set to
// zero when not in use; encoder always uses deltas
vpx_memset(lfi->lvl[seg_id][0], lvl_seg, 4 * 4);
if (!xd->mode_ref_lf_delta_enabled) {
/* we could get rid of this if we assume that deltas are set to
* zero when not in use; encoder always uses deltas
*/
vpx_memset(lfi->lvl[seg][0], lvl_seg, 4 * 4);
continue;
}
intra_lvl = lvl_seg + (lf->ref_deltas[INTRA_FRAME] << n_shift);
lfi->lvl[seg_id][INTRA_FRAME][0] = clamp(intra_lvl, 0, MAX_LOOP_FILTER);
lvl_ref = lvl_seg;
for (ref = LAST_FRAME; ref < MAX_REF_FRAMES; ++ref)
for (mode = 0; mode < MAX_MODE_LF_DELTAS; ++mode) {
const int inter_lvl = lvl_seg + (lf->ref_deltas[ref] << n_shift)
+ (lf->mode_deltas[mode] << n_shift);
lfi->lvl[seg_id][ref][mode] = clamp(inter_lvl, 0, MAX_LOOP_FILTER);
/* INTRA_FRAME */
ref = INTRA_FRAME;
/* Apply delta for reference frame */
lvl_ref += xd->ref_lf_deltas[ref] << n_shift;
mode = 0; /* all the rest of Intra modes */
lvl_mode = lvl_ref;
lfi->lvl[seg][ref][mode] = clamp(lvl_mode, 0, 63);
/* LAST, GOLDEN, ALT */
for (ref = 1; ref < MAX_REF_FRAMES; ref++) {
int lvl_ref = lvl_seg;
/* Apply delta for reference frame */
lvl_ref += xd->ref_lf_deltas[ref] << n_shift;
/* Apply delta for Inter modes */
for (mode = 0; mode < MAX_MODE_LF_DELTAS; mode++) {
lvl_mode = lvl_ref + (xd->mode_lf_deltas[mode] << n_shift);
lfi->lvl[seg][ref][mode] = clamp(lvl_mode, 0, 63);
}
}
}
}
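As a worked example of the delta arithmetic above (values hypothetical): with default_filt_lvl = 36, n_shift = 36 >> 5 = 1; a reference delta of +2 and a mode delta of -1 then give 36 + 2*2 - 1*2 = 38, which clamp(..., 0, 63) leaves unchanged.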
static int build_lfi(const loop_filter_info_n *const lfi_n,
const MB_MODE_INFO *const mbmi,
struct loop_filter_info *const lfi) {
const int seg = mbmi->segment_id;
const int ref = mbmi->ref_frame[0];
const int mode = lfi_n->mode_lf_lut[mbmi->mode];
const int filter_level = lfi_n->lvl[seg][ref][mode];
static int build_lfi(const VP9_COMMON *cm, const MB_MODE_INFO *mbmi,
struct loop_filter_info *lfi) {
const loop_filter_info_n *lfi_n = &cm->lf_info;
int mode = mbmi->mode;
int mode_index = lfi_n->mode_lf_lut[mode];
int seg = mbmi->segment_id;
int ref_frame = mbmi->ref_frame[0];
int filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
if (filter_level > 0) {
if (filter_level) {
const int hev_index = filter_level >> 4;
lfi->mblim = lfi_n->mblim[filter_level];
lfi->blim = lfi_n->blim[filter_level];
lfi->lim = lfi_n->lim[filter_level];
lfi->hev_thr = lfi_n->hev_thr[filter_level >> 4];
lfi->hev_thr = lfi_n->hev_thr[hev_index];
return 1;
} else {
return 0;
}
return 0;
}
static void filter_selectively_vert(uint8_t *s, int pitch,
@@ -191,22 +221,14 @@ static void filter_selectively_horiz(uint8_t *s, int pitch,
int only_4x4_1,
const struct loop_filter_info *lfi) {
unsigned int mask;
int count;
for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int;
mask; mask >>= count) {
count = 1;
mask; mask >>= 1) {
if (mask & 1) {
if (!only_4x4_1) {
if (mask_16x16 & 1) {
if ((mask_16x16 & 3) == 3) {
vp9_mb_lpf_horizontal_edge_w(s, pitch, lfi->mblim, lfi->lim,
lfi->hev_thr, 2);
count = 2;
} else {
vp9_mb_lpf_horizontal_edge_w(s, pitch, lfi->mblim, lfi->lim,
lfi->hev_thr, 1);
}
vp9_mb_lpf_horizontal_edge_w(s, pitch, lfi->mblim, lfi->lim,
lfi->hev_thr);
assert(!(mask_8x8 & 1));
assert(!(mask_4x4 & 1));
assert(!(mask_4x4_int & 1));
@@ -227,43 +249,42 @@ static void filter_selectively_horiz(uint8_t *s, int pitch,
vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim,
lfi->lim, lfi->hev_thr, 1);
}
s += 8 * count;
lfi += count;
mask_16x16 >>= count;
mask_8x8 >>= count;
mask_4x4 >>= count;
mask_4x4_int >>= count;
s += 8;
lfi++;
mask_16x16 >>= 1;
mask_8x8 >>= 1;
mask_4x4 >>= 1;
mask_4x4_int >>= 1;
}
}
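The count-based variant shown here lets two adjacent 8-pixel groups be filtered in one call: when (mask_16x16 & 3) == 3, vp9_mb_lpf_horizontal_edge_w runs with count = 2 and s, lfi and all four masks advance by two positions at once.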
static void filter_block_plane(VP9_COMMON *const cm,
struct macroblockd_plane *const plane,
const MODE_INFO *mi,
int mi_row, int mi_col) {
const int ss_x = plane->subsampling_x;
const int ss_y = plane->subsampling_y;
const int row_step = 1 << ss_x;
const int col_step = 1 << ss_y;
const int row_step_stride = cm->mode_info_stride * row_step;
struct buf_2d *const dst = &plane->dst;
static void filter_block_plane(VP9_COMMON *cm, MACROBLOCKD *xd,
int plane, int mi_row, int mi_col) {
const int ss_x = xd->plane[plane].subsampling_x;
const int ss_y = xd->plane[plane].subsampling_y;
const int row_step = 1 << xd->plane[plane].subsampling_y;
const int col_step = 1 << xd->plane[plane].subsampling_x;
struct buf_2d * const dst = &xd->plane[plane].dst;
uint8_t* const dst0 = dst->buf;
unsigned int mask_16x16[MI_BLOCK_SIZE] = {0};
unsigned int mask_8x8[MI_BLOCK_SIZE] = {0};
unsigned int mask_4x4[MI_BLOCK_SIZE] = {0};
unsigned int mask_4x4_int[MI_BLOCK_SIZE] = {0};
struct loop_filter_info lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE];
MODE_INFO* const mi0 = xd->mode_info_context;
unsigned int mask_16x16[64 / MI_SIZE] = {0};
unsigned int mask_8x8[64 / MI_SIZE] = {0};
unsigned int mask_4x4[64 / MI_SIZE] = {0};
unsigned int mask_4x4_int[64 / MI_SIZE] = {0};
struct loop_filter_info lfi[64 / MI_SIZE][64 / MI_SIZE];
int r, c;
for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) {
for (r = 0; r < 64 / MI_SIZE && mi_row + r < cm->mi_rows; r += row_step) {
unsigned int mask_16x16_c = 0;
unsigned int mask_8x8_c = 0;
unsigned int mask_4x4_c = 0;
unsigned int border_mask;
// Determine the vertical edges that need filtering
for (c = 0; c < MI_BLOCK_SIZE && mi_col + c < cm->mi_cols; c += col_step) {
for (c = 0; c < 64 / MI_SIZE && mi_col + c < cm->mi_cols; c += col_step) {
const MODE_INFO * const mi = xd->mode_info_context;
const int skip_this = mi[c].mbmi.mb_skip_coeff
&& is_inter_block(&mi[c].mbmi);
&& mi[c].mbmi.ref_frame[0] != INTRA_FRAME;
// left edge of current unit is block/partition edge -> no skip
const int block_edge_left = b_width_log2(mi[c].mbmi.sb_type) ?
!(c & ((1 << (b_width_log2(mi[c].mbmi.sb_type)-1)) - 1)) : 1;
@@ -272,14 +293,14 @@ static void filter_block_plane(VP9_COMMON *const cm,
const int block_edge_above = b_height_log2(mi[c].mbmi.sb_type) ?
!(r & ((1 << (b_height_log2(mi[c].mbmi.sb_type)-1)) - 1)) : 1;
const int skip_this_r = skip_this && !block_edge_above;
const TX_SIZE tx_size = (plane->plane_type == PLANE_TYPE_UV)
? get_uv_tx_size(&mi[c].mbmi)
: mi[c].mbmi.txfm_size;
const TX_SIZE tx_size = plane ? get_uv_tx_size(&mi[c].mbmi)
: mi[c].mbmi.txfm_size;
const int skip_border_4x4_c = ss_x && mi_col + c == cm->mi_cols - 1;
const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;
// Filter level can vary per MI
if (!build_lfi(&cm->lf_info, &mi[c].mbmi, lfi[r] + (c >> ss_x)))
if (!build_lfi(cm, &mi[c].mbmi,
lfi[r] + (c >> xd->plane[plane].subsampling_x)))
continue;
// Build masks based on the transform size of each block
@@ -338,12 +359,13 @@ static void filter_block_plane(VP9_COMMON *const cm,
mask_4x4_c & border_mask,
mask_4x4_int[r], lfi[r]);
dst->buf += 8 * dst->stride;
mi += row_step_stride;
xd->mode_info_context += cm->mode_info_stride * row_step;
}
// Now do horizontal pass
dst->buf = dst0;
for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) {
xd->mode_info_context = mi0;
for (r = 0; r < 64 / MI_SIZE && mi_row + r < cm->mi_rows; r += row_step) {
const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;
const unsigned int mask_4x4_int_r = skip_border_4x4_r ? 0 : mask_4x4_int[r];
@@ -353,41 +375,30 @@ static void filter_block_plane(VP9_COMMON *const cm,
mask_4x4[r],
mask_4x4_int_r, mi_row + r == 0, lfi[r]);
dst->buf += 8 * dst->stride;
xd->mode_info_context += cm->mode_info_stride * row_step;
}
}
void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer,
VP9_COMMON *cm, MACROBLOCKD *xd,
int start, int stop, int y_only) {
const int num_planes = y_only ? 1 : MAX_MB_PLANE;
void vp9_loop_filter_frame(VP9_COMMON *cm,
MACROBLOCKD *xd,
int frame_filter_level,
int y_only) {
int mi_row, mi_col;
for (mi_row = start; mi_row < stop; mi_row += MI_BLOCK_SIZE) {
// Initialize the loop filter for this frame.
vp9_loop_filter_frame_init(cm, xd, frame_filter_level);
for (mi_row = 0; mi_row < cm->mi_rows; mi_row += 64 / MI_SIZE) {
MODE_INFO* const mi = cm->mi + mi_row * cm->mode_info_stride;
for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {
for (mi_col = 0; mi_col < cm->mi_cols; mi_col += 64 / MI_SIZE) {
int plane;
setup_dst_planes(xd, frame_buffer, mi_row, mi_col);
for (plane = 0; plane < num_planes; ++plane) {
filter_block_plane(cm, &xd->plane[plane], mi + mi_col, mi_row, mi_col);
setup_dst_planes(xd, cm->frame_to_show, mi_row, mi_col);
for (plane = 0; plane < (y_only ? 1 : MAX_MB_PLANE); plane++) {
xd->mode_info_context = mi + mi_col;
filter_block_plane(cm, xd, plane, mi_row, mi_col);
}
}
}
}
void vp9_loop_filter_frame(VP9_COMMON *cm, MACROBLOCKD *xd,
int frame_filter_level, int y_only) {
if (!frame_filter_level) return;
vp9_loop_filter_frame_init(cm, xd, frame_filter_level);
vp9_loop_filter_rows(cm->frame_to_show, cm, xd,
0, cm->mi_rows, y_only);
}
int vp9_loop_filter_worker(void *arg1, void *arg2) {
LFWorkerData *const lf_data = (LFWorkerData*)arg1;
(void)arg2;
vp9_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, &lf_data->xd,
lf_data->start, lf_data->stop, lf_data->y_only);
return 1;
}


@@ -13,39 +13,61 @@
#include "vpx_ports/mem.h"
#include "vpx_config.h"
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_seg_common.h"
#define MAX_LOOP_FILTER 63
#define MAX_SHARPNESS 7
#define SIMD_WIDTH 16
// Need to align this structure so when it is declared and
// passed it can be loaded into vector registers.
/* Need to align this structure so when it is declared and
* passed it can be loaded into vector registers.
*/
typedef struct {
DECLARE_ALIGNED(SIMD_WIDTH, uint8_t,
DECLARE_ALIGNED(SIMD_WIDTH, unsigned char,
mblim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
DECLARE_ALIGNED(SIMD_WIDTH, uint8_t,
DECLARE_ALIGNED(SIMD_WIDTH, unsigned char,
blim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
DECLARE_ALIGNED(SIMD_WIDTH, unsigned char,
lim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
DECLARE_ALIGNED(SIMD_WIDTH, uint8_t,
DECLARE_ALIGNED(SIMD_WIDTH, unsigned char,
hev_thr[4][SIMD_WIDTH]);
uint8_t lvl[MAX_SEGMENTS][MAX_REF_FRAMES][MAX_MODE_LF_DELTAS];
uint8_t mode_lf_lut[MB_MODE_COUNT];
unsigned char lvl[MAX_MB_SEGMENTS][4][4];
unsigned char mode_lf_lut[MB_MODE_COUNT];
} loop_filter_info_n;
struct loop_filter_info {
const unsigned char *mblim;
const unsigned char *blim;
const unsigned char *lim;
const unsigned char *hev_thr;
};
#define prototype_loopfilter(sym) \
void sym(uint8_t *src, int pitch, const unsigned char *blimit, \
const unsigned char *limit, const unsigned char *thresh, int count)
#define prototype_loopfilter_block(sym) \
void sym(uint8_t *y, uint8_t *u, uint8_t *v, \
int ystride, int uv_stride, struct loop_filter_info *lfi)
#if ARCH_X86 || ARCH_X86_64
#include "x86/vp9_loopfilter_x86.h"
#endif
typedef void loop_filter_uvfunction(uint8_t *u, /* source pointer */
int p, /* pitch */
const unsigned char *blimit,
const unsigned char *limit,
const unsigned char *thresh,
uint8_t *v);
/* assorted loopfilter functions which get used elsewhere */
struct VP9Common;
struct macroblockd;
void vp9_loop_filter_init(struct VP9Common *cm, struct loopfilter *lf);
void vp9_loop_filter_init(struct VP9Common *cm);
// Update the loop filter for the current frame.
// This should be called before vp9_loop_filter_rows(), vp9_loop_filter_frame()
// calls this function directly.
void vp9_loop_filter_frame_init(struct VP9Common *const cm,
struct macroblockd *const xd,
void vp9_loop_filter_frame_init(struct VP9Common *cm,
struct macroblockd *mbd,
int default_filt_lvl);
void vp9_loop_filter_frame(struct VP9Common *cm,
@@ -53,22 +75,11 @@ void vp9_loop_filter_frame(struct VP9Common *cm,
int filter_level,
int y_only);
// Apply the loop filter to [start, stop) macro block rows in frame_buffer.
void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer,
struct VP9Common *cm, struct macroblockd *xd,
int start, int stop, int y_only);
void vp9_loop_filter_partial_frame(struct VP9Common *cm,
struct macroblockd *mbd,
int default_filt_lvl);
typedef struct LoopFilterWorkerData {
const YV12_BUFFER_CONFIG *frame_buffer;
struct VP9Common *cm;
struct macroblockd xd; // TODO(jzern): most of this is unnecessary to the
// loopfilter. the planes are necessary as their state
// is changed during decode.
int start;
int stop;
int y_only;
} LFWorkerData;
void vp9_loop_filter_update_sharpness(loop_filter_info_n *lfi,
int sharpness_lvl);
// Operates on the rows described by LFWorkerData passed as 'arg1'.
int vp9_loop_filter_worker(void *arg1, void *arg2);
#endif // VP9_COMMON_VP9_LOOPFILTER_H_


@@ -34,44 +34,17 @@ static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit,
return ~mask;
}
static INLINE int8_t flat_mask4(uint8_t thresh,
uint8_t p3, uint8_t p2,
uint8_t p1, uint8_t p0,
uint8_t q0, uint8_t q1,
uint8_t q2, uint8_t q3) {
int8_t mask = 0;
mask |= (abs(p1 - p0) > thresh) * -1;
mask |= (abs(q1 - q0) > thresh) * -1;
mask |= (abs(p2 - p0) > thresh) * -1;
mask |= (abs(q2 - q0) > thresh) * -1;
mask |= (abs(p3 - p0) > thresh) * -1;
mask |= (abs(q3 - q0) > thresh) * -1;
return ~mask;
}
static INLINE int8_t flat_mask5(uint8_t thresh,
uint8_t p4, uint8_t p3,
uint8_t p2, uint8_t p1,
uint8_t p0, uint8_t q0,
uint8_t q1, uint8_t q2,
uint8_t q3, uint8_t q4) {
int8_t mask = ~flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3);
mask |= (abs(p4 - p0) > thresh) * -1;
mask |= (abs(q4 - q0) > thresh) * -1;
return ~mask;
}
// is there high edge variance internal edge: 11111111 yes, 00000000 no
static INLINE int8_t hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0,
uint8_t q0, uint8_t q1) {
static INLINE int8_t hevmask(uint8_t thresh, uint8_t p1, uint8_t p0,
uint8_t q0, uint8_t q1) {
int8_t hev = 0;
hev |= (abs(p1 - p0) > thresh) * -1;
hev |= (abs(q1 - q0) > thresh) * -1;
return hev;
}
static INLINE void filter4(int8_t mask, uint8_t hev, uint8_t *op1,
uint8_t *op0, uint8_t *oq0, uint8_t *oq1) {
static INLINE void filter(int8_t mask, uint8_t hev, uint8_t *op1,
uint8_t *op0, uint8_t *oq0, uint8_t *oq1) {
int8_t filter1, filter2;
const int8_t ps1 = (int8_t) *op1 ^ 0x80;
@@ -95,7 +68,7 @@ static INLINE void filter4(int8_t mask, uint8_t hev, uint8_t *op1,
*op0 = signed_char_clamp(ps0 + filter2) ^ 0x80;
// outer tap adjustments
filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
filter = ((filter1 + 1) >> 1) & ~hev;
*oq1 = signed_char_clamp(qs1 - filter) ^ 0x80;
*op1 = signed_char_clamp(ps1 + filter) ^ 0x80;
@@ -115,8 +88,8 @@ void vp9_loop_filter_horizontal_edge_c(uint8_t *s, int p /* pitch */,
const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
const int8_t mask = filter_mask(*limit, *blimit,
p3, p2, p1, p0, q0, q1, q2, q3);
const int8_t hev = hev_mask(*thresh, p1, p0, q0, q1);
filter4(mask, hev, s - 2 * p, s - 1 * p, s, s + 1 * p);
const int8_t hev = hevmask(*thresh, p1, p0, q0, q1);
filter(mask, hev, s - 2 * p, s - 1 * p, s, s + 1 * p);
++s;
}
}
@@ -135,30 +108,57 @@ void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch,
const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
const int8_t mask = filter_mask(*limit, *blimit,
p3, p2, p1, p0, q0, q1, q2, q3);
const int8_t hev = hev_mask(*thresh, p1, p0, q0, q1);
filter4(mask, hev, s - 2, s - 1, s, s + 1);
const int8_t hev = hevmask(*thresh, p1, p0, q0, q1);
filter(mask, hev, s - 2, s - 1, s, s + 1);
s += pitch;
}
}
static INLINE void filter8(int8_t mask, uint8_t hev, uint8_t flat,
uint8_t *op3, uint8_t *op2,
uint8_t *op1, uint8_t *op0,
uint8_t *oq0, uint8_t *oq1,
uint8_t *oq2, uint8_t *oq3) {
static INLINE int8_t flatmask4(uint8_t thresh,
uint8_t p3, uint8_t p2,
uint8_t p1, uint8_t p0,
uint8_t q0, uint8_t q1,
uint8_t q2, uint8_t q3) {
int8_t flat = 0;
flat |= (abs(p1 - p0) > thresh) * -1;
flat |= (abs(q1 - q0) > thresh) * -1;
flat |= (abs(p0 - p2) > thresh) * -1;
flat |= (abs(q0 - q2) > thresh) * -1;
flat |= (abs(p3 - p0) > thresh) * -1;
flat |= (abs(q3 - q0) > thresh) * -1;
return ~flat;
}
static INLINE signed char flatmask5(uint8_t thresh,
uint8_t p4, uint8_t p3, uint8_t p2,
uint8_t p1, uint8_t p0,
uint8_t q0, uint8_t q1, uint8_t q2,
uint8_t q3, uint8_t q4) {
int8_t flat = 0;
flat |= (abs(p4 - p0) > thresh) * -1;
flat |= (abs(q4 - q0) > thresh) * -1;
flat = ~flat;
return flat & flatmask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3);
}
static INLINE void mbfilter(int8_t mask, uint8_t hev, uint8_t flat,
uint8_t *op3, uint8_t *op2,
uint8_t *op1, uint8_t *op0,
uint8_t *oq0, uint8_t *oq1,
uint8_t *oq2, uint8_t *oq3) {
// use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line
if (flat && mask) {
const uint8_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
// 7-tap filter [1, 1, 1, 2, 1, 1, 1]
*op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3);
*op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3);
*op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3);
*oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3);
*oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3);
*oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3);
*op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3);
*op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3);
*op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3);
*oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3);
*oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3);
*oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3);
} else {
filter4(mask, hev, op1, op0, oq0, oq1);
filter(mask, hev, op1, op0, oq0, oq1);
}
}
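Note on the 7-tap filter in both spellings: edge replication makes the per-output weights sum to 8 (e.g. *op2 weighs p3, p2, p1, p0, q0 as 3, 2, 1, 1, 1), so ROUND_POWER_OF_TWO(sum, 3) is a rounded weighted average; a flat input of all 100s stays at (800 + 4) >> 3 = 100.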
@@ -177,10 +177,11 @@ void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s, int p,
const int8_t mask = filter_mask(*limit, *blimit,
p3, p2, p1, p0, q0, q1, q2, q3);
const int8_t hev = hev_mask(*thresh, p1, p0, q0, q1);
const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
filter8(mask, hev, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
s, s + 1 * p, s + 2 * p, s + 3 * p);
const int8_t hev = hevmask(*thresh, p1, p0, q0, q1);
const int8_t flat = flatmask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
mbfilter(mask, hev, flat,
s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
s, s + 1 * p, s + 2 * p, s + 3 * p);
++s;
}
}
@@ -197,24 +198,23 @@ void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch,
const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
const int8_t mask = filter_mask(*limit, *blimit,
p3, p2, p1, p0, q0, q1, q2, q3);
const int8_t hev = hev_mask(thresh[0], p1, p0, q0, q1);
const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
filter8(mask, hev, flat, s - 4, s - 3, s - 2, s - 1,
s, s + 1, s + 2, s + 3);
const int8_t hev = hevmask(thresh[0], p1, p0, q0, q1);
const int8_t flat = flatmask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
mbfilter(mask, hev, flat, s - 4, s - 3, s - 2, s - 1,
s, s + 1, s + 2, s + 3);
s += pitch;
}
}
static INLINE void filter16(int8_t mask, uint8_t hev,
uint8_t flat, uint8_t flat2,
uint8_t *op7, uint8_t *op6,
uint8_t *op5, uint8_t *op4,
uint8_t *op3, uint8_t *op2,
uint8_t *op1, uint8_t *op0,
uint8_t *oq0, uint8_t *oq1,
uint8_t *oq2, uint8_t *oq3,
uint8_t *oq4, uint8_t *oq5,
uint8_t *oq6, uint8_t *oq7) {
static INLINE void wide_mbfilter(int8_t mask, uint8_t hev,
uint8_t flat, uint8_t flat2,
uint8_t *op7, uint8_t *op6, uint8_t *op5,
uint8_t *op4, uint8_t *op3, uint8_t *op2,
uint8_t *op1, uint8_t *op0, uint8_t *oq0,
uint8_t *oq1, uint8_t *oq2, uint8_t *oq3,
uint8_t *oq4, uint8_t *oq5, uint8_t *oq6,
uint8_t *oq7) {
// use a 15 tap filter [1,1,1,1,1,1,1,2,1,1,1,1,1,1,1] for flat line
if (flat2 && flat && mask) {
const uint8_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4,
p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
@@ -222,7 +222,6 @@ static INLINE void filter16(int8_t mask, uint8_t hev,
const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3,
q4 = *oq4, q5 = *oq5, q6 = *oq6, q7 = *oq7;
// 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
*op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 +
q0, 4);
*op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 +
@@ -252,35 +251,35 @@ static INLINE void filter16(int8_t mask, uint8_t hev,
*oq6 = ROUND_POWER_OF_TWO(p0 +
q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4);
} else {
filter8(mask, hev, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3);
mbfilter(mask, hev, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3);
}
}
void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int p,
const uint8_t *blimit,
const uint8_t *limit,
const uint8_t *thresh,
int count) {
const uint8_t *thresh) {
int i;
// loop filter designed to work using chars so that we can make maximum use
// of 8 bit simd instructions.
for (i = 0; i < 8 * count; ++i) {
for (i = 0; i < 8; ++i) {
const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
const int8_t mask = filter_mask(*limit, *blimit,
p3, p2, p1, p0, q0, q1, q2, q3);
const int8_t hev = hev_mask(*thresh, p1, p0, q0, q1);
const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
const int8_t flat2 = flat_mask5(1,
const int8_t hev = hevmask(*thresh, p1, p0, q0, q1);
const int8_t flat = flatmask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
const int8_t flat2 = flatmask5(1,
s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0,
q0, s[4 * p], s[5 * p], s[6 * p], s[7 * p]);
filter16(mask, hev, flat, flat2,
s - 8 * p, s - 7 * p, s - 6 * p, s - 5 * p,
s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
s, s + 1 * p, s + 2 * p, s + 3 * p,
s + 4 * p, s + 5 * p, s + 6 * p, s + 7 * p);
wide_mbfilter(mask, hev, flat, flat2,
s - 8 * p, s - 7 * p, s - 6 * p, s - 5 * p,
s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
s, s + 1 * p, s + 2 * p, s + 3 * p,
s + 4 * p, s + 5 * p, s + 6 * p, s + 7 * p);
++s;
}
}
@@ -296,14 +295,14 @@ void vp9_mb_lpf_vertical_edge_w_c(uint8_t *s, int p,
const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
const int8_t mask = filter_mask(*limit, *blimit,
p3, p2, p1, p0, q0, q1, q2, q3);
const int8_t hev = hev_mask(*thresh, p1, p0, q0, q1);
const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
const int8_t flat2 = flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0,
q0, s[4], s[5], s[6], s[7]);
const int8_t hev = hevmask(*thresh, p1, p0, q0, q1);
const int8_t flat = flatmask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
const int8_t flat2 = flatmask5(1, s[-8], s[-7], s[-6], s[-5], p0,
q0, s[4], s[5], s[6], s[7]);
filter16(mask, hev, flat, flat2,
s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1,
s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7);
wide_mbfilter(mask, hev, flat, flat2,
s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1,
s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7);
s += p;
}
}

vp9/common/vp9_maskingmv.c

@@ -0,0 +1,803 @@
/*
============================================================================
Name : vp9_maskingmv.c
Author : jimbankoski
Version :
Copyright : Your copyright notice
Description : Hello World in C, Ansi-style
============================================================================
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
unsigned int vp9_sad16x16_sse3(
unsigned char *src_ptr,
int src_stride,
unsigned char *ref_ptr,
int ref_stride,
int max_err);
int vp8_growmaskmb_sse3(
unsigned char *om,
unsigned char *nm);
void vp8_makemask_sse3(
unsigned char *y,
unsigned char *u,
unsigned char *v,
unsigned char *ym,
int yp,
int uvp,
int ys,
int us,
int vs,
int yt,
int ut,
int vt);
unsigned int vp9_sad16x16_unmasked_wmt(
unsigned char *src_ptr,
int src_stride,
unsigned char *ref_ptr,
int ref_stride,
unsigned char *mask);
unsigned int vp9_sad16x16_masked_wmt(
unsigned char *src_ptr,
int src_stride,
unsigned char *ref_ptr,
int ref_stride,
unsigned char *mask);
unsigned int vp8_masked_predictor_wmt(
unsigned char *masked,
unsigned char *unmasked,
int src_stride,
unsigned char *dst_ptr,
int dst_stride,
unsigned char *mask);
unsigned int vp8_masked_predictor_uv_wmt(
unsigned char *masked,
unsigned char *unmasked,
int src_stride,
unsigned char *dst_ptr,
int dst_stride,
unsigned char *mask);
unsigned int vp8_uv_from_y_mask(
unsigned char *ymask,
unsigned char *uvmask);
int yp = 16;
unsigned char sxy[] = {
40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90
};
unsigned char sts[] = {
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
};
unsigned char str[] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
};
unsigned char y[] = {
40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40,
40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40,
40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40,
40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40,
40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40,
60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40,
60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40,
60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40,
40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40,
40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40,
40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40,
40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40,
40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40,
40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40,
40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40,
40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40
};
int uvp = 8;
unsigned char u[] = {
90, 80, 70, 70, 90, 90, 90, 17,
90, 80, 70, 70, 90, 90, 90, 17,
84, 70, 70, 90, 90, 90, 17, 17,
84, 70, 70, 90, 90, 90, 17, 17,
80, 70, 70, 90, 90, 90, 17, 17,
90, 80, 70, 70, 90, 90, 90, 17,
90, 80, 70, 70, 90, 90, 90, 17,
90, 80, 70, 70, 90, 90, 90, 17
};
unsigned char v[] = {
80, 80, 80, 80, 80, 80, 80, 80,
80, 80, 80, 80, 80, 80, 80, 80,
80, 80, 80, 80, 80, 80, 80, 80,
80, 80, 80, 80, 80, 80, 80, 80,
80, 80, 80, 80, 80, 80, 80, 80,
80, 80, 80, 80, 80, 80, 80, 80,
80, 80, 80, 80, 80, 80, 80, 80,
80, 80, 80, 80, 80, 80, 80, 80
};
unsigned char ym[256];
unsigned char uvm[64];
typedef struct {
unsigned char y;
unsigned char yt;
unsigned char u;
unsigned char ut;
unsigned char v;
unsigned char vt;
unsigned char use;
} COLOR_SEG_ELEMENT;
/*
COLOR_SEG_ELEMENT segmentation[]=
{
{ 60,4,80,17,80,10, 1},
{ 40,4,15,10,80,10, 1},
};
*/
COLOR_SEG_ELEMENT segmentation[] = {
{ 79, 44, 92, 44, 237, 60, 1},
};
unsigned char pixel_mask(unsigned char y, unsigned char u, unsigned char v,
COLOR_SEG_ELEMENT sgm[],
int c) {
COLOR_SEG_ELEMENT *s = sgm;
unsigned char m = 0;
int i;
for (i = 0; i < c; i++, s++)
m |= (abs(y - s->y) < s->yt &&
abs(u - s->u) < s->ut &&
abs(v - s->v) < s->vt ? 255 : 0);
return m;
}
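A usage sketch of pixel_mask() against the segmentation[] entry above (field order y, yt, u, ut, v, vt, use):
// With segmentation[0] = {79, 44, 92, 44, 237, 60, 1}:
//   pixel_mask(60, 80, 237, segmentation, 1) == 255
//     since |60-79| = 19 < 44, |80-92| = 12 < 44, |237-237| = 0 < 60;
//   pixel_mask(60, 80, 80, segmentation, 1) == 0
//     since |80-237| = 157 fails the v threshold of 60.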
int neighbors[256][8];
int makeneighbors(void) {
int i, j;
for (i = 0; i < 256; i++) {
int r = (i >> 4), c = (i & 15);
int ni = 0;
for (j = 0; j < 8; j++)
neighbors[i][j] = i;
for (j = 0; j < 256; j++) {
int nr = (j >> 4), nc = (j & 15);
// exclude self and cap ni: an interior cell has 9 qualifying
// positions including j == i, which would write past the 8-entry row
if (j != i && ni < 8 && abs(nr - r) < 2 && abs(nc - c) < 2)
neighbors[i][ni++] = j;
}
}
return 0;
}
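For example, corner cell i = 0 (r = 0, c = 0) gathers {1, 16, 17} on the 16x16 grid, and unused slots keep the preset value i (self), which grow_ymask() then ORs harmlessly; an interior cell such as i = 17 gathers exactly its eight surrounding cells, which is why the guard above caps ni at 8.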
void grow_ymask(unsigned char *ym) {
unsigned char nym[256];
int i, j;
for (i = 0; i < 256; i++) {
nym[i] = ym[i];
for (j = 0; j < 8; j++) {
nym[i] |= ym[neighbors[i][j]];
}
}
for (i = 0; i < 256; i++)
ym[i] = nym[i];
}
void make_mb_mask(unsigned char *y, unsigned char *u, unsigned char *v,
unsigned char *ym, unsigned char *uvm,
int yp, int uvp,
COLOR_SEG_ELEMENT sgm[],
int count) {
int r, c;
unsigned char *oym = ym;
memset(ym, 20, 256);
for (r = 0; r < 8; r++, uvm += 8, u += uvp, v += uvp, y += (yp + yp), ym += 32)
for (c = 0; c < 8; c++) {
int y1 = y[c << 1];
int u1 = u[c];
int v1 = v[c];
int m = pixel_mask(y1, u1, v1, sgm, count);
uvm[c] = m;
ym[c << 1] = uvm[c]; // = pixel_mask(y[c<<1],u[c],v[c],sgm,count);
ym[(c << 1) + 1] = pixel_mask(y[1 + (c << 1)], u[c], v[c], sgm, count);
ym[(c << 1) + 16] = pixel_mask(y[yp + (c << 1)], u[c], v[c], sgm, count);
ym[(c << 1) + 17] = pixel_mask(y[1 + yp + (c << 1)], u[c], v[c], sgm, count);
}
grow_ymask(oym);
}
int masked_sad(unsigned char *src, int p, unsigned char *dst, int dp,
unsigned char *ym) {
int i, j;
unsigned sad = 0;
for (i = 0; i < 16; i++, src += p, dst += dp, ym += 16)
for (j = 0; j < 16; j++)
if (ym[j])
sad += abs(src[j] - dst[j]);
return sad;
}
int compare_masks(unsigned char *sym, unsigned char *ym) {
int i, j;
unsigned sad = 0;
for (i = 0; i < 16; i++, sym += 16, ym += 16)
for (j = 0; j < 16; j++)
sad += (sym[j] != ym[j] ? 1 : 0);
return sad;
}
int unmasked_sad(unsigned char *src, int p, unsigned char *dst, int dp,
unsigned char *ym) {
int i, j;
unsigned sad = 0;
for (i = 0; i < 16; i++, src += p, dst += dp, ym += 16)
for (j = 0; j < 16; j++)
if (!ym[j])
sad += abs(src[j] - dst[j]);
return sad;
}
int masked_motion_search(unsigned char *y, unsigned char *u, unsigned char *v,
int yp, int uvp,
unsigned char *dy, unsigned char *du, unsigned char *dv,
int dyp, int duvp,
COLOR_SEG_ELEMENT sgm[],
int count,
int *mi,
int *mj,
int *ui,
int *uj,
int *wm) {
int i, j;
unsigned char ym[256];
unsigned char uvm[64];
unsigned char dym[256];
unsigned char duvm[64];
unsigned int e = 0;
int beste = 256;
int bmi = -32, bmj = -32;
int bui = -32, buj = -32;
int beste1 = 256;
int bmi1 = -32, bmj1 = -32;
int bui1 = -32, buj1 = -32;
int obeste;
// strategy 1: find the best unmasked mv first, then the best masked mv
beste = 0xffffffff;
// find best unmasked mv
for (i = -32; i < 32; i++) {
unsigned char *dyz = i * dyp + dy;
unsigned char *duz = i / 2 * duvp + du;
unsigned char *dvz = i / 2 * duvp + dv;
for (j = -32; j < 32; j++) {
// 0,0 masked destination
make_mb_mask(dyz + j, duz + j / 2, dvz + j / 2, dym, duvm, dyp, duvp, sgm, count);
e = unmasked_sad(y, yp, dyz + j, dyp, dym);
if (e < beste) {
bui = i;
buj = j;
beste = e;
}
}
}
// bui=0;buj=0;
// best mv masked destination
make_mb_mask(dy + bui * dyp + buj, du + bui / 2 * duvp + buj / 2, dv + bui / 2 * duvp + buj / 2,
dym, duvm, dyp, duvp, sgm, count);
obeste = beste;
beste = 0xffffffff;
// find best masked
for (i = -32; i < 32; i++) {
unsigned char *dyz = i * dyp + dy;
for (j = -32; j < 32; j++) {
e = masked_sad(y, yp, dyz + j, dyp, dym);
if (e < beste) {
bmi = i;
bmj = j;
beste = e;
}
}
}
beste1 = beste + obeste;
bmi1 = bmi;
bmj1 = bmj;
bui1 = bui;
buj1 = buj;
beste = 0xffffffff;
// source mask
make_mb_mask(y, u, v, ym, uvm, yp, uvp, sgm, count);
// find best mask
for (i = -32; i < 32; i++) {
unsigned char *dyz = i * dyp + dy;
unsigned char *duz = i / 2 * duvp + du;
unsigned char *dvz = i / 2 * duvp + dv;
for (j = -32; j < 32; j++) {
// 0,0 masked destination
make_mb_mask(dyz + j, duz + j / 2, dvz + j / 2, dym, duvm, dyp, duvp, sgm, count);
e = compare_masks(ym, dym);
if (e < beste) {
bmi = i;
bmj = j;
beste = e;
}
}
}
// best mv masked destination
make_mb_mask(dy + bmi * dyp + bmj, du + bmi / 2 * duvp + bmj / 2, dv + bmi / 2 * duvp + bmj / 2,
dym, duvm, dyp, duvp, sgm, count);
obeste = masked_sad(y, yp, dy + bmi * dyp + bmj, dyp, dym);
beste = 0xffffffff;
// find best unmasked mv
for (i = -32; i < 32; i++) {
unsigned char *dyz = i * dyp + dy;
for (j = -32; j < 32; j++) {
e = unmasked_sad(y, yp, dyz + j, dyp, dym);
if (e < beste) {
bui = i;
buj = j;
beste = e;
}
}
}
beste += obeste;
if (beste < beste1) {
*mi = bmi;
*mj = bmj;
*ui = bui;
*uj = buj;
*wm = 1;
} else {
*mi = bmi1;
*mj = bmj1;
*ui = bui1;
*uj = buj1;
*wm = 0;
}
return 0;
}
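In outline, the search evaluates two orderings and keeps the cheaper combined SAD: (1) find the best unmasked motion vector, build the color mask at that destination, then search again for the best masked vector; (2) build the mask from the source block, find the vector whose destination mask best matches it, then find the best unmasked vector for the remainder. *wm records which ordering won (1 means the mask-first ordering).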
int predict(unsigned char *src, int p, unsigned char *dst, int dp,
unsigned char *ym, unsigned char *prd) {
int i, j;
for (i = 0; i < 16; i++, src += p, dst += dp, ym += 16, prd += 16)
for (j = 0; j < 16; j++)
prd[j] = (ym[j] ? src[j] : dst[j]);
return 0;
}
int fast_masked_motion_search(unsigned char *y, unsigned char *u, unsigned char *v,
int yp, int uvp,
unsigned char *dy, unsigned char *du, unsigned char *dv,
int dyp, int duvp,
COLOR_SEG_ELEMENT sgm[],
int count,
int *mi,
int *mj,
int *ui,
int *uj,
int *wm) {
int i, j;
unsigned char ym[256];
unsigned char ym2[256];
unsigned char uvm[64];
unsigned char dym2[256];
unsigned char dym[256];
unsigned char duvm[64];
unsigned int e = 0;
int beste = 256;
int bmi = -32, bmj = -32;
int bui = -32, buj = -32;
int beste1 = 256;
int bmi1 = -32, bmj1 = -32;
int bui1 = -32, buj1 = -32;
int obeste;
// strategy 1: find the best unmasked mv first, then the best masked mv
beste = 0xffffffff;
#if 0
for (i = 0; i < 16; i++) {
unsigned char *dy = i * yp + y;
for (j = 0; j < 16; j++)
printf("%2x", dy[j]);
printf("\n");
}
printf("\n");
for (i = -32; i < 48; i++) {
unsigned char *dyz = i * dyp + dy;
for (j = -32; j < 48; j++)
printf("%2x", dyz[j]);
printf("\n");
}
#endif
// find best unmasked mv
for (i = -32; i < 32; i++) {
unsigned char *dyz = i * dyp + dy;
unsigned char *duz = i / 2 * duvp + du;
unsigned char *dvz = i / 2 * duvp + dv;
for (j = -32; j < 32; j++) {
// 0,0 masked destination
vp8_makemask_sse3(dyz + j, duz + j / 2, dvz + j / 2, dym, dyp, duvp,
sgm[0].y, sgm[0].u, sgm[0].v,
sgm[0].yt, sgm[0].ut, sgm[0].vt);
vp8_growmaskmb_sse3(dym, dym2);
e = vp9_sad16x16_unmasked_wmt(y, yp, dyz + j, dyp, dym2);
if (e < beste) {
bui = i;
buj = j;
beste = e;
}
}
}
// bui=0;buj=0;
// best mv masked destination
vp8_makemask_sse3(dy + bui * dyp + buj, du + bui / 2 * duvp + buj / 2, dv + bui / 2 * duvp + buj / 2,
dym, dyp, duvp,
sgm[0].y, sgm[0].u, sgm[0].v,
sgm[0].yt, sgm[0].ut, sgm[0].vt);
vp8_growmaskmb_sse3(dym, dym2);
obeste = beste;
beste = 0xffffffff;
// find best masked
for (i = -32; i < 32; i++) {
unsigned char *dyz = i * dyp + dy;
for (j = -32; j < 32; j++) {
e = vp9_sad16x16_masked_wmt(y, yp, dyz + j, dyp, dym2);
if (e < beste) {
bmi = i;
bmj = j;
beste = e;
}
}
}
beste1 = beste + obeste;
bmi1 = bmi;
bmj1 = bmj;
bui1 = bui;
buj1 = buj;
// source mask
vp8_makemask_sse3(y, u, v,
ym, yp, uvp,
sgm[0].y, sgm[0].u, sgm[0].v,
sgm[0].yt, sgm[0].ut, sgm[0].vt);
vp8_growmaskmb_sse3(ym, ym2);
// find best mask
for (i = -32; i < 32; i++) {
unsigned char *dyz = i * dyp + dy;
unsigned char *duz = i / 2 * duvp + du;
unsigned char *dvz = i / 2 * duvp + dv;
for (j = -32; j < 32; j++) {
// 0,0 masked destination
vp8_makemask_sse3(dyz + j, duz + j / 2, dvz + j / 2, dym, dyp, duvp,
sgm[0].y, sgm[0].u, sgm[0].v,
sgm[0].yt, sgm[0].ut, sgm[0].vt);
vp8_growmaskmb_sse3(dym, dym2);
e = compare_masks(ym2, dym2);
if (e < beste) {
bmi = i;
bmj = j;
beste = e;
}
}
}
vp8_makemask_sse3(dy + bmi * dyp + bmj, du + bmi / 2 * duvp + bmj / 2, dv + bmi / 2 * duvp + bmj / 2,
dym, dyp, duvp,
sgm[0].y, sgm[0].u, sgm[0].v,
sgm[0].yt, sgm[0].ut, sgm[0].vt);
vp8_growmaskmb_sse3(dym, dym2);
obeste = vp9_sad16x16_masked_wmt(y, yp, dy + bmi * dyp + bmj, dyp, dym2);
beste = 0xffffffff;
// find best unmasked mv
for (i = -32; i < 32; i++) {
unsigned char *dyz = i * dyp + dy;
for (j = -32; j < 32; j++) {
e = vp9_sad16x16_unmasked_wmt(y, yp, dyz + j, dyp, dym2);
if (e < beste) {
bui = i;
buj = j;
beste = e;
}
}
}
beste += obeste;
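// Keep whichever hypothesis has the lower combined SAD; *wm = 1
// selects the mask-first result, *wm = 0 the unmasked-first one.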
if (beste < beste1) {
*mi = bmi;
*mj = bmj;
*ui = bui;
*uj = buj;
*wm = 1;
} else {
*mi = bmi1;
*mj = bmj1;
*ui = bui1;
*uj = buj1;
*wm = 0;
beste = beste1;
}
return beste;
}
int predict_all(unsigned char *ym, unsigned char *um, unsigned char *vm,
int ymp, int uvmp,
unsigned char *yp, unsigned char *up, unsigned char *vp,
int ypp, int uvpp,
COLOR_SEG_ELEMENT sgm[],
int count,
int mi,
int mj,
int ui,
int uj,
int wm) {
int i, j;
unsigned char dym[256];
unsigned char dym2[256];
unsigned char duvm[64];
unsigned char *yu = ym, *uu = um, *vu = vm;
unsigned char *dym3 = dym2;
ym += mi * ymp + mj;
um += mi / 2 * uvmp + mj / 2;
vm += mi / 2 * uvmp + mj / 2;
yu += ui * ymp + uj;
uu += ui / 2 * uvmp + uj / 2;
vu += ui / 2 * uvmp + uj / 2;
// best mv masked destination
if (wm)
vp8_makemask_sse3(ym, um, vm, dym, ymp, uvmp,
sgm[0].y, sgm[0].u, sgm[0].v,
sgm[0].yt, sgm[0].ut, sgm[0].vt);
else
vp8_makemask_sse3(yu, uu, vu, dym, ymp, uvmp,
sgm[0].y, sgm[0].u, sgm[0].v,
sgm[0].yt, sgm[0].ut, sgm[0].vt);
vp8_growmaskmb_sse3(dym, dym2);
vp8_masked_predictor_wmt(ym, yu, ymp, yp, ypp, dym3);
vp8_uv_from_y_mask(dym3, duvm);
vp8_masked_predictor_uv_wmt(um, uu, uvmp, up, uvpp, duvm);
vp8_masked_predictor_uv_wmt(vm, vu, uvmp, vp, uvpp, duvm);
return 0;
}
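Taken together, the two entry points are used per macroblock roughly as follows; this is a sketch with invented buffer names (src_*, rec_*, pred_*) and an example segment literal borrowed from the disabled test code, while mainz() below does the real version with several candidate segments.
// Hedged sketch: search one 16x16 block against the previous
// reconstruction, then build its masked/unmasked prediction.
COLOR_SEG_ELEMENT seg = { 60, 4, 80, 17, 80, 10, 1 };  // example values
int mi, mj, ui, uj, wm;
fast_masked_motion_search(src_y, src_u, src_v, y_stride, uv_stride,
                          rec_y, rec_u, rec_v, y_stride, uv_stride,
                          &seg, 1, &mi, &mj, &ui, &uj, &wm);
predict_all(rec_y, rec_u, rec_v, y_stride, uv_stride,
            pred_y, pred_u, pred_v, y_stride, uv_stride,
            &seg, 1, mi, mj, ui, uj, wm);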
unsigned char f0p[1280 * 720 * 3 / 2];
unsigned char f1p[1280 * 720 * 3 / 2];
unsigned char prd[1280 * 720 * 3 / 2];
unsigned char msk[1280 * 720 * 3 / 2];
int mainz(int argc, char *argv[]) {
FILE *f, *g;
int w, h, y_stride, uv_stride;
int r, c;
unsigned char *f0 = f0p, *f1 = f1p, *t;
unsigned char ym[256], uvm[64];
unsigned char ym2[256], uvm2[64];
unsigned char ym3[256], uvm3[64];
int a, b;
COLOR_SEG_ELEMENT last = { 20, 20, 20, 20, 230, 20, 1}, best;
if (argc < 5) {
fprintf(stderr, "usage: %s <in.yuv> <out.yuv> <width> <height>\n", argv[0]);
return 1;
}
f = fopen(argv[1], "rb");
g = fopen(argv[2], "wb");
if (!f || !g)
return 1;
w = atoi(argv[3]);
h = atoi(argv[4]);
y_stride = w;
uv_stride = w / 2;
#if 0
makeneighbors();
COLOR_SEG_ELEMENT segmentation[] = {
{ 60, 4, 80, 17, 80, 10, 1},
{ 40, 4, 15, 10, 80, 10, 1},
};
make_mb_mask(y, u, v, ym2, uvm2, 16, 8, segmentation, 1);
vp8_makemask_sse3(y, u, v, ym, (int) 16, (int) 8,
(int) segmentation[0].y, (int) segmentation[0].u, (int) segmentation[0].v,
segmentation[0].yt, segmentation[0].ut, segmentation[0].vt);
vp8_growmaskmb_sse3(ym, ym3);
a = vp9_sad16x16_masked_wmt(str, 16, sts, 16, ym3);
b = vp9_sad16x16_unmasked_wmt(str, 16, sts, 16, ym3);
vp8_masked_predictor_wmt(str, sts, 16, ym, 16, ym3);
vp8_uv_from_y_mask(ym3, uvm3);
return 4;
#endif
makeneighbors();
memset(prd, 128, w * h * 3 / 2);
if (fread(f0, w * h * 3 / 2, 1, f) != 1) {
fclose(f);
fclose(g);
return 1;
}
// Read the next frame up front so a short or failed read ends the
// loop instead of processing stale data.
while (fread(f1, w * h * 3 / 2, 1, f) == 1) {
unsigned char *ys = f1, *yd = f0, *yp = prd;
unsigned char *us = f1 + w * h, *ud = f0 + w * h, *up = prd + w * h;
unsigned char *vs = f1 + w * h * 5 / 4, *vd = f0 + w * h * 5 / 4, *vp = prd + w * h * 5 / 4;
ys += 32 * y_stride;
yd += 32 * y_stride;
yp += 32 * y_stride;
us += 16 * uv_stride;
ud += 16 * uv_stride;
up += 16 * uv_stride;
vs += 16 * uv_stride;
vd += 16 * uv_stride;
vp += 16 * uv_stride;
for (r = 32; r < h - 32; r += 16,
ys += 16 * w, yd += 16 * w, yp += 16 * w,
us += 8 * uv_stride, ud += 8 * uv_stride, up += 8 * uv_stride,
vs += 8 * uv_stride, vd += 8 * uv_stride, vp += 8 * uv_stride) {
for (c = 32; c < w - 32; c += 16) {
int mi, mj, ui, uj, wm;
int bmi, bmj, bui, buj, bwm;
unsigned char ym[256];
if (vp9_sad16x16_sse3(ys + c, y_stride, yd + c, y_stride, 0xffff) == 0)
bmi = bmj = bui = buj = bwm = 0;
else {
COLOR_SEG_ELEMENT cs[5];
int j;
unsigned int beste = 0xffffffff;
unsigned int bestj = 0;
// try color from last mb segmentation
cs[0] = last;
// try color segs from 4 pixels in mb recon as segmentation
cs[1].y = yd[c + y_stride + 1];
cs[1].u = ud[c / 2 + uv_stride];
cs[1].v = vd[c / 2 + uv_stride];
cs[1].yt = cs[1].ut = cs[1].vt = 20;
cs[2].y = yd[c + w + 14];
cs[2].u = ud[c / 2 + uv_stride + 7];
cs[2].v = vd[c / 2 + uv_stride + 7];
cs[2].yt = cs[2].ut = cs[2].vt = 20;
cs[3].y = yd[c + w * 14 + 1];
cs[3].u = ud[c / 2 + uv_stride * 7];
cs[3].v = vd[c / 2 + uv_stride * 7];
cs[3].yt = cs[3].ut = cs[3].vt = 20;
cs[4].y = yd[c + w * 14 + 14];
cs[4].u = ud[c / 2 + uv_stride * 7 + 7];
cs[4].v = vd[c / 2 + uv_stride * 7 + 7];
cs[4].yt = cs[4].ut = cs[4].vt = 20;
for (j = 0; j < 5; j++) {
int e;
e = fast_masked_motion_search(
ys + c, us + c / 2, vs + c / 2, y_stride, uv_stride,
yd + c, ud + c / 2, vd + c / 2, y_stride, uv_stride,
&cs[j], 1, &mi, &mj, &ui, &uj, &wm);
if (e < beste) {
bmi = mi;
bmj = mj;
bui = ui;
buj = uj;
bwm = wm;
bestj = j;
beste = e;
}
}
best = cs[bestj];
last = best;
}
predict_all(yd + c, ud + c / 2, vd + c / 2, w, uv_stride,
yp + c, up + c / 2, vp + c / 2, w, uv_stride,
&best, 1, bmi, bmj, bui, buj, bwm);
}
}
fwrite(prd, w * h * 3 / 2, 1, g);
t = f0;
f0 = f1;
f1 = t;
}
fclose(f);
fclose(g);
return 0;
}
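The pointer arithmetic above assumes planar I420 input: each frame is w * h * 3 / 2 bytes with the planes at fixed offsets. A sketch of the layout mainz() relies on (w and h as read from the command line):
// Hedged sketch of the I420 frame layout used by this tool.
unsigned char *frame = f0p;                // one whole frame buffer
unsigned char *y_plane = frame;            // w*h luma bytes
unsigned char *u_plane = frame + w * h;    // (w/2)*(h/2) chroma bytes
unsigned char *v_plane = frame + w * h * 5 / 4;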

28
vp9/common/vp9_mbpitch.c Normal file

@@ -0,0 +1,28 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vp9/common/vp9_blockd.h"
void vp9_setup_block_dptrs(MACROBLOCKD *mb,
int subsampling_x, int subsampling_y) {
int i;
for (i = 0; i < MAX_MB_PLANE; i++) {
mb->plane[i].plane_type = i ? PLANE_TYPE_UV : PLANE_TYPE_Y_WITH_DC;
mb->plane[i].subsampling_x = i ? subsampling_x : 0;
mb->plane[i].subsampling_y = i ? subsampling_y : 0;
}
#if CONFIG_ALPHA
// TODO(jkoleszar): Using the Y w/h for now
mb->plane[3].subsampling_x = 0;
mb->plane[3].subsampling_y = 0;
#endif
}
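For reference, the subsampling arguments are log2 shift factors, assuming the usual VP9 convention (1 means the chroma plane is halved in that direction). A hedged usage sketch:
// Hedged sketch: chroma subsampling per format.
MACROBLOCKD mb;
vp9_setup_block_dptrs(&mb, 1, 1);  // 4:2:0, halved in both directions
vp9_setup_block_dptrs(&mb, 1, 0);  // 4:2:2, halved horizontally only
vp9_setup_block_dptrs(&mb, 0, 0);  // 4:4:4, full-resolution chroma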

23
vp9/common/vp9_modecont.c Normal file

@@ -0,0 +1,23 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vp9/common/vp9_modecont.h"
const vp9_prob vp9_default_inter_mode_probs[INTER_MODE_CONTEXTS]
[VP9_INTER_MODES - 1] = {
{2, 173, 34}, // 0 = both zero mv
{7, 145, 85}, // 1 = one zero mv + one predicted mv
{7, 166, 63}, // 2 = two predicted mvs
{7, 94, 66}, // 3 = one predicted/zero and one new mv
{8, 64, 46}, // 4 = two new mvs
{17, 81, 31}, // 5 = one intra neighbour + x
{25, 29, 30}, // 6 = two intra neighbours
};
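A note on reading these tables, assuming the usual VP9 convention that a vp9_prob p encodes a probability of p/256 for a tree node's 0-branch:
// Hedged sketch: convert the first node probability of the
// "both zero mv" context to a real probability.
double p = vp9_default_inter_mode_probs[0][0] / 256.0;  // 2/256, ~0.8%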

19
vp9/common/vp9_modecont.h Normal file

@@ -0,0 +1,19 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP9_COMMON_VP9_MODECONT_H_
#define VP9_COMMON_VP9_MODECONT_H_
#include "vp9/common/vp9_entropy.h"
extern const vp9_prob vp9_default_inter_mode_probs[INTER_MODE_CONTEXTS]
[VP9_INTER_MODES - 1];
#endif // VP9_COMMON_VP9_MODECONT_H_


@@ -0,0 +1,128 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vp9/common/vp9_entropymode.h"
const vp9_prob vp9_kf_default_bmode_probs[VP9_INTRA_MODES]
[VP9_INTRA_MODES]
[VP9_INTRA_MODES - 1] = {
{ /* above = dc */
{ 137, 30, 42, 148, 151, 207, 70, 52, 91 } /* left = dc */,
{ 92, 45, 102, 136, 116, 180, 74, 90, 100 } /* left = v */,
{ 73, 32, 19, 187, 222, 215, 46, 34, 100 } /* left = h */,
{ 91, 30, 32, 116, 121, 186, 93, 86, 94 } /* left = d45 */,
{ 72, 35, 36, 149, 68, 206, 68, 63, 105 } /* left = d135 */,
{ 73, 31, 28, 138, 57, 124, 55, 122, 151 } /* left = d117 */,
{ 67, 23, 21, 140, 126, 197, 40, 37, 171 } /* left = d153 */,
{ 86, 27, 28, 128, 154, 212, 45, 43, 53 } /* left = d27 */,
{ 74, 32, 27, 107, 86, 160, 63, 134, 102 } /* left = d63 */,
{ 59, 67, 44, 140, 161, 202, 78, 67, 119 } /* left = tm */
}, { /* above = v */
{ 63, 36, 126, 146, 123, 158, 60, 90, 96 } /* left = dc */,
{ 43, 46, 168, 134, 107, 128, 69, 142, 92 } /* left = v */,
{ 44, 29, 68, 159, 201, 177, 50, 57, 77 } /* left = h */,
{ 58, 38, 76, 114, 97, 172, 78, 133, 92 } /* left = d45 */,
{ 46, 41, 76, 140, 63, 184, 69, 112, 57 } /* left = d135 */,
{ 38, 32, 85, 140, 46, 112, 54, 151, 133 } /* left = d117 */,
{ 39, 27, 61, 131, 110, 175, 44, 75, 136 } /* left = d153 */,
{ 52, 30, 74, 113, 130, 175, 51, 64, 58 } /* left = d27 */,
{ 47, 35, 80, 100, 74, 143, 64, 163, 74 } /* left = d63 */,
{ 36, 61, 116, 114, 128, 162, 80, 125, 82 } /* left = tm */
}, { /* above = h */
{ 82, 26, 26, 171, 208, 204, 44, 32, 105 } /* left = dc */,
{ 55, 44, 68, 166, 179, 192, 57, 57, 108 } /* left = v */,
{ 42, 26, 11, 199, 241, 228, 23, 15, 85 } /* left = h */,
{ 68, 42, 19, 131, 160, 199, 55, 52, 83 } /* left = d45 */,
{ 58, 50, 25, 139, 115, 232, 39, 52, 118 } /* left = d135 */,
{ 50, 35, 33, 153, 104, 162, 64, 59, 131 } /* left = d117 */,
{ 44, 24, 16, 150, 177, 202, 33, 19, 156 } /* left = d153 */,
{ 55, 27, 12, 153, 203, 218, 26, 27, 49 } /* left = d27 */,
{ 53, 49, 21, 110, 116, 168, 59, 80, 76 } /* left = d63 */,
{ 38, 72, 19, 168, 203, 212, 50, 50, 107 } /* left = tm */
}, { /* above = d45 */
{ 103, 26, 36, 129, 132, 201, 83, 80, 93 } /* left = dc */,
{ 59, 38, 83, 112, 103, 162, 98, 136, 90 } /* left = v */,
{ 62, 30, 23, 158, 200, 207, 59, 57, 50 } /* left = h */,
{ 67, 30, 29, 84, 86, 191, 102, 91, 59 } /* left = d45 */,
{ 60, 32, 33, 112, 71, 220, 64, 89, 104 } /* left = d135 */,
{ 53, 26, 34, 130, 56, 149, 84, 120, 103 } /* left = d117 */,
{ 53, 21, 23, 133, 109, 210, 56, 77, 172 } /* left = d153 */,
{ 77, 19, 29, 112, 142, 228, 55, 66, 36 } /* left = d27 */,
{ 61, 29, 29, 93, 97, 165, 83, 175, 162 } /* left = d63 */,
{ 47, 47, 43, 114, 137, 181, 100, 99, 95 } /* left = tm */
}, { /* above = d135 */
{ 69, 23, 29, 128, 83, 199, 46, 44, 101 } /* left = dc */,
{ 53, 40, 55, 139, 69, 183, 61, 80, 110 } /* left = v */,
{ 40, 29, 19, 161, 180, 207, 43, 24, 91 } /* left = h */,
{ 60, 34, 19, 105, 61, 198, 53, 64, 89 } /* left = d45 */,
{ 52, 31, 22, 158, 40, 209, 58, 62, 89 } /* left = d135 */,
{ 44, 31, 29, 147, 46, 158, 56, 102, 198 } /* left = d117 */,
{ 35, 19, 12, 135, 87, 209, 41, 45, 167 } /* left = d153 */,
{ 55, 25, 21, 118, 95, 215, 38, 39, 66 } /* left = d27 */,
{ 51, 38, 25, 113, 58, 164, 70, 93, 97 } /* left = d63 */,
{ 47, 54, 34, 146, 108, 203, 72, 103, 151 } /* left = tm */
}, { /* above = d117 */
{ 64, 19, 37, 156, 66, 138, 49, 95, 133 } /* left = dc */,
{ 46, 27, 80, 150, 55, 124, 55, 121, 135 } /* left = v */,
{ 36, 23, 27, 165, 149, 166, 54, 64, 118 } /* left = h */,
{ 53, 21, 36, 131, 63, 163, 60, 109, 81 } /* left = d45 */,
{ 40, 26, 35, 154, 40, 185, 51, 97, 123 } /* left = d135 */,
{ 35, 19, 34, 179, 19, 97, 48, 129, 124 } /* left = d117 */,
{ 36, 20, 26, 136, 62, 164, 33, 77, 154 } /* left = d153 */,
{ 45, 18, 32, 130, 90, 157, 40, 79, 91 } /* left = d27 */,
{ 45, 26, 28, 129, 45, 129, 49, 147, 123 } /* left = d63 */,
{ 38, 44, 51, 136, 74, 162, 57, 97, 121 } /* left = tm */
}, { /* above = d153 */
{ 75, 17, 22, 136, 138, 185, 32, 34, 166 } /* left = dc */,
{ 56, 39, 58, 133, 117, 173, 48, 53, 187 } /* left = v */,
{ 35, 21, 12, 161, 212, 207, 20, 23, 145 } /* left = h */,
{ 56, 29, 19, 117, 109, 181, 55, 68, 112 } /* left = d45 */,
{ 47, 29, 17, 153, 64, 220, 59, 51, 114 } /* left = d135 */,
{ 46, 16, 24, 136, 76, 147, 41, 64, 172 } /* left = d117 */,
{ 34, 17, 11, 108, 152, 187, 13, 15, 209 } /* left = d153 */,
{ 51, 24, 14, 115, 133, 209, 32, 26, 104 } /* left = d27 */,
{ 55, 30, 18, 122, 79, 179, 44, 88, 116 } /* left = d63 */,
{ 37, 49, 25, 129, 168, 164, 41, 54, 148 } /* left = tm */
}, { /* above = d27 */
{ 82, 22, 32, 127, 143, 213, 39, 41, 70 } /* left = dc */,
{ 62, 44, 61, 123, 105, 189, 48, 57, 64 } /* left = v */,
{ 47, 25, 17, 175, 222, 220, 24, 30, 86 } /* left = h */,
{ 68, 36, 17, 106, 102, 206, 59, 74, 74 } /* left = d45 */,
{ 57, 39, 23, 151, 68, 216, 55, 63, 58 } /* left = d135 */,
{ 49, 30, 35, 141, 70, 168, 82, 40, 115 } /* left = d117 */,
{ 51, 25, 15, 136, 129, 202, 38, 35, 139 } /* left = d153 */,
{ 68, 26, 16, 111, 141, 215, 29, 28, 28 } /* left = d27 */,
{ 59, 39, 19, 114, 75, 180, 77, 104, 42 } /* left = d63 */,
{ 40, 61, 26, 126, 152, 206, 61, 59, 93 } /* left = tm */
}, { /* above = d63 */
{ 78, 23, 39, 111, 117, 170, 74, 124, 94 } /* left = dc */,
{ 48, 34, 86, 101, 92, 146, 78, 179, 134 } /* left = v */,
{ 47, 22, 24, 138, 187, 178, 68, 69, 59 } /* left = h */,
{ 56, 25, 33, 105, 112, 187, 95, 177, 129 } /* left = d45 */,
{ 48, 31, 27, 114, 63, 183, 82, 116, 56 } /* left = d135 */,
{ 43, 28, 37, 121, 63, 123, 61, 192, 169 } /* left = d117 */,
{ 42, 17, 24, 109, 97, 177, 56, 76, 122 } /* left = d153 */,
{ 58, 18, 28, 105, 139, 182, 70, 92, 63 } /* left = d27 */,
{ 46, 23, 32, 74, 86, 150, 67, 183, 88 } /* left = d63 */,
{ 36, 38, 48, 92, 122, 165, 88, 137, 91 } /* left = tm */
}, { /* above = tm */
{ 65, 70, 60, 155, 159, 199, 61, 60, 81 } /* left = dc */,
{ 44, 78, 115, 132, 119, 173, 71, 112, 93 } /* left = v */,
{ 39, 38, 21, 184, 227, 206, 42, 32, 64 } /* left = h */,
{ 58, 47, 36, 124, 137, 193, 80, 82, 78 } /* left = d45 */,
{ 49, 50, 35, 144, 95, 205, 63, 78, 59 } /* left = d135 */,
{ 41, 53, 52, 148, 71, 142, 65, 128, 51 } /* left = d117 */,
{ 40, 36, 28, 143, 143, 202, 40, 55, 137 } /* left = d153 */,
{ 52, 34, 29, 129, 183, 227, 42, 35, 43 } /* left = d27 */,
{ 42, 44, 44, 104, 105, 164, 64, 130, 80 } /* left = d63 */,
{ 43, 81, 53, 140, 169, 204, 68, 84, 72 } /* left = tm */
}
};
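These key-frame probabilities are conditioned on the intra modes of the above and left neighbours; each [above][left] entry holds the VP9_INTRA_MODES - 1 = 9 node probabilities for the intra mode tree. A usage sketch (above_mode and left_mode are hypothetical variables holding the neighbours' modes):
// Hedged sketch: select the probability set for the current block.
const vp9_prob *probs = vp9_kf_default_bmode_probs[above_mode][left_mode];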


@@ -13,8 +13,6 @@
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_common.h"
typedef struct {
int16_t row;
int16_t col;
@@ -25,15 +23,14 @@ typedef union int_mv {
MV as_mv;
} int_mv; /* facilitates faster equality tests and copies */
-typedef struct {
+struct mv32 {
   int32_t row;
   int32_t col;
-} MV32;
+};
-static void clamp_mv(MV *mv, int min_col, int max_col,
-                     int min_row, int max_row) {
-  mv->col = clamp(mv->col, min_col, max_col);
-  mv->row = clamp(mv->row, min_row, max_row);
-}
+typedef union int_mv32 {
+  uint64_t as_int;
+  struct mv32 as_mv;
+} int_mv32; /* facilitates faster equality tests and copies */
#endif // VP9_COMMON_VP9_MV_H_


@@ -11,285 +11,296 @@
#include "vp9/common/vp9_mvref_common.h"
#define MVREF_NEIGHBOURS 8
typedef enum {
BOTH_ZERO = 0,
ZERO_PLUS_PREDICTED = 1,
BOTH_PREDICTED = 2,
NEW_PLUS_NON_INTRA = 3,
BOTH_NEW = 4,
INTRA_PLUS_NON_INTRA = 5,
BOTH_INTRA = 6,
INVALID_CASE = 9
} motion_vector_context;
// This is used to figure out a context for the ref blocks. The code flattens
// an array that would have 3 possible counts (0, 1 & 2) for 3 choices by
// adding 9 for each intra block, 3 for each zero mv and 1 for each new
// motion vector. This single number is then converted into a context
// with a single lookup ( counter_to_context ).
static const int mode_2_counter[MB_MODE_COUNT] = {
9, // DC_PRED
9, // V_PRED
9, // H_PRED
9, // D45_PRED
9, // D135_PRED
9, // D117_PRED
9, // D153_PRED
9, // D27_PRED
9, // D63_PRED
9, // TM_PRED
0, // NEARESTMV
0, // NEARMV
3, // ZEROMV
1, // NEWMV
};
// There are 3^3 different combinations of 3 counts that can be either 0,1 or
// 2. However the actual count can never be greater than 2 so the highest
// counter we need is 18. 9 is an invalid counter that's never used.
static const int counter_to_context[19] = {
BOTH_PREDICTED, // 0
NEW_PLUS_NON_INTRA, // 1
BOTH_NEW, // 2
ZERO_PLUS_PREDICTED, // 3
NEW_PLUS_NON_INTRA, // 4
INVALID_CASE, // 5
BOTH_ZERO, // 6
INVALID_CASE, // 7
INVALID_CASE, // 8
INTRA_PLUS_NON_INTRA, // 9
INTRA_PLUS_NON_INTRA, // 10
INVALID_CASE, // 11
INTRA_PLUS_NON_INTRA, // 12
INVALID_CASE, // 13
INVALID_CASE, // 14
INVALID_CASE, // 15
INVALID_CASE, // 16
INVALID_CASE, // 17
BOTH_INTRA // 18
};
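A worked example of the flattening described above: with one ZEROMV neighbour and one NEWMV neighbour the counter is 3 + 1 = 4, which the lookup maps to NEW_PLUS_NON_INTRA. Values come straight from the two tables.
// Worked example of the counter-to-context flattening.
#include <assert.h>
int counter = mode_2_counter[ZEROMV] + mode_2_counter[NEWMV];  // 3 + 1
assert(counter_to_context[counter] == NEW_PLUS_NON_INTRA);     // index 4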
static const int mv_ref_blocks[BLOCK_SIZE_TYPES][MVREF_NEIGHBOURS][2] = {
// 4X4
static int mv_ref_blocks[BLOCK_SIZE_TYPES][MVREF_NEIGHBOURS][2] = {
// SB4X4
{{0, -1}, {-1, 0}, {-1, -1}, {0, -2}, {-2, 0}, {-1, -2}, {-2, -1}, {-2, -2}},
// 4X8
// SB4X8
{{0, -1}, {-1, 0}, {-1, -1}, {0, -2}, {-2, 0}, {-1, -2}, {-2, -1}, {-2, -2}},
// 8X4
// SB8X4
{{0, -1}, {-1, 0}, {-1, -1}, {0, -2}, {-2, 0}, {-1, -2}, {-2, -1}, {-2, -2}},
// 8X8
// SB8X8
{{0, -1}, {-1, 0}, {-1, -1}, {0, -2}, {-2, 0}, {-1, -2}, {-2, -1}, {-2, -2}},
// 8X16
// SB8X16
{{-1, 0}, {0, -1}, {-1, 1}, {-1, -1}, {-2, 0}, {0, -2}, {-1, -2}, {-2, -1}},
// 16X8
// SB16X8
{{0, -1}, {-1, 0}, {1, -1}, {-1, -1}, {0, -2}, {-2, 0}, {-2, -1}, {-1, -2}},
// 16X16
// SB16X16
{{0, -1}, {-1, 0}, {1, -1}, {-1, 1}, {-1, -1}, {0, -3}, {-3, 0}, {-3, -3}},
// 16X32
// SB16X32
{{-1, 0}, {0, -1}, {-1, 2}, {-1, -1}, {1, -1}, {-3, 0}, {0, -3}, {-3, -3}},
// 32X16
// SB32X16
{{0, -1}, {-1, 0}, {2, -1}, {-1, -1}, {-1, 1}, {0, -3}, {-3, 0}, {-3, -3}},
// 32X32
// SB32X32
{{1, -1}, {-1, 1}, {2, -1}, {-1, 2}, {-1, -1}, {0, -3}, {-3, 0}, {-3, -3}},
// 32X64
// SB32X64
{{-1, 0}, {0, -1}, {-1, 4}, {2, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-1, 2}},
// 64X32
// SB64X32
{{0, -1}, {-1, 0}, {4, -1}, {-1, 2}, {-1, -1}, {0, -3}, {-3, 0}, {2, -1}},
// 64X64
// SB64X64
{{3, -1}, {-1, 3}, {4, -1}, {-1, 4}, {-1, -1}, {0, -1}, {-1, 0}, {6, -1}}
};
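Each entry above is a {col, row} offset in mode-info units from the current block, so {0, -1} is the block directly above and {-1, 0} the block to the left. A sketch of the indexing as used inside vp9_find_mv_refs_idx() below (here, xd, i and mv_ref_search as in that function):
// Hedged sketch: locate the i-th candidate's mode info.
const MODE_INFO *cand = here + mv_ref_search[i][0] +
                        mv_ref_search[i][1] * xd->mode_info_stride;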
static const int idx_n_column_to_subblock[4][2] = {
{1, 2},
{1, 3},
{3, 2},
{3, 3}
};
// clamp_mv_ref
#define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units
static void clamp_mv_ref(MV *mv, const MACROBLOCKD *xd) {
clamp_mv(mv, xd->mb_to_left_edge - MV_BORDER,
xd->mb_to_right_edge + MV_BORDER,
xd->mb_to_top_edge - MV_BORDER,
xd->mb_to_bottom_edge + MV_BORDER);
static void clamp_mv_ref(const MACROBLOCKD *xd, int_mv *mv) {
mv->as_mv.col = clamp(mv->as_mv.col, xd->mb_to_left_edge - MV_BORDER,
xd->mb_to_right_edge + MV_BORDER);
mv->as_mv.row = clamp(mv->as_mv.row, xd->mb_to_top_edge - MV_BORDER,
xd->mb_to_bottom_edge + MV_BORDER);
}
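MV_BORDER is 16 << 3 = 128 in 1/8-pel units, i.e. a candidate may point at most 16 whole pels beyond the block edges before clamping. A small worked example (edge values invented):
// Hedged example: with mb_to_top_edge == -256 (1/8 pel), a candidate
// row of -500 is clamped to -256 - 128 = -384.
MV mv = { -500, 0 };
mv.row = clamp(mv.row, -256 - MV_BORDER, 0 + MV_BORDER);  // -> -384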
// This function returns either the appropriate sub-block mv or the
// block's mv, depending on whether block_size < 8x8 and
// check_sub_blocks is set.
static INLINE int_mv get_sub_block_mv(const MODE_INFO *candidate,
int check_sub_blocks, int which_mv,
int search_col, int block_idx) {
return (check_sub_blocks && candidate->mbmi.sb_type < BLOCK_8X8
? candidate->bmi[idx_n_column_to_subblock[block_idx][search_col == 0]]
.as_mv[which_mv]
: candidate->mbmi.mv[which_mv]);
// Gets a candidate reference motion vector from the given mode info
// structure if one exists that matches the given reference frame.
static int get_matching_candidate(const MODE_INFO *candidate_mi,
MV_REFERENCE_FRAME ref_frame,
int_mv *c_mv, int block_idx) {
if (ref_frame == candidate_mi->mbmi.ref_frame[0]) {
if (block_idx >= 0 && candidate_mi->mbmi.sb_type < BLOCK_SIZE_SB8X8)
c_mv->as_int = candidate_mi->bmi[block_idx].as_mv[0].as_int;
else
c_mv->as_int = candidate_mi->mbmi.mv[0].as_int;
} else if (ref_frame == candidate_mi->mbmi.ref_frame[1]) {
if (block_idx >= 0 && candidate_mi->mbmi.sb_type < BLOCK_SIZE_SB8X8)
c_mv->as_int = candidate_mi->bmi[block_idx].as_mv[1].as_int;
else
c_mv->as_int = candidate_mi->mbmi.mv[1].as_int;
} else {
return 0;
}
return 1;
}
// Gets candidate reference motion vector(s) from the given mode info
// structure if they exist and do NOT match the given reference frame.
static void get_non_matching_candidates(const MODE_INFO *candidate_mi,
MV_REFERENCE_FRAME ref_frame,
MV_REFERENCE_FRAME *c_ref_frame,
int_mv *c_mv,
MV_REFERENCE_FRAME *c2_ref_frame,
int_mv *c2_mv) {
c_mv->as_int = 0;
c2_mv->as_int = 0;
*c_ref_frame = INTRA_FRAME;
*c2_ref_frame = INTRA_FRAME;
// If the first candidate is not valid, neither will the second be.
if (candidate_mi->mbmi.ref_frame[0] > INTRA_FRAME) {
// First candidate
if (candidate_mi->mbmi.ref_frame[0] != ref_frame) {
*c_ref_frame = candidate_mi->mbmi.ref_frame[0];
c_mv->as_int = candidate_mi->mbmi.mv[0].as_int;
}
// Second candidate
if ((candidate_mi->mbmi.ref_frame[1] > INTRA_FRAME) &&
(candidate_mi->mbmi.ref_frame[1] != ref_frame) &&
(candidate_mi->mbmi.mv[1].as_int != candidate_mi->mbmi.mv[0].as_int)) {
*c2_ref_frame = candidate_mi->mbmi.ref_frame[1];
c2_mv->as_int = candidate_mi->mbmi.mv[1].as_int;
}
}
}
// Performs mv sign inversion if indicated by the reference frame combination.
static INLINE int_mv scale_mv(const MODE_INFO *candidate, const int which_mv,
const MV_REFERENCE_FRAME this_ref_frame,
const int *ref_sign_bias) {
int_mv return_mv = candidate->mbmi.mv[which_mv];
static void scale_mv(MACROBLOCKD *xd, MV_REFERENCE_FRAME this_ref_frame,
MV_REFERENCE_FRAME candidate_ref_frame,
int_mv *candidate_mv, int *ref_sign_bias) {
// Sign inversion where appropriate.
if (ref_sign_bias[candidate->mbmi.ref_frame[which_mv]] !=
ref_sign_bias[this_ref_frame]) {
return_mv.as_mv.row *= -1;
return_mv.as_mv.col *= -1;
if (ref_sign_bias[candidate_ref_frame] != ref_sign_bias[this_ref_frame]) {
candidate_mv->as_mv.row = -candidate_mv->as_mv.row;
candidate_mv->as_mv.col = -candidate_mv->as_mv.col;
}
return return_mv;
}
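A worked example of the sign inversion (reference frame variables cand_ref and this_ref are hypothetical): if the two reference frames sit on opposite sides of the current frame, their sign biases differ and the candidate is negated so it predicts in a consistent direction.
// Hedged example: negating a candidate mv when sign biases differ.
int_mv cand;
cand.as_mv.row = 4;
cand.as_mv.col = -6;
if (ref_sign_bias[cand_ref] != ref_sign_bias[this_ref]) {
  cand.as_mv.row = -cand.as_mv.row;  // -> -4
  cand.as_mv.col = -cand.as_mv.col;  // ->  6
}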
// This macro is used to add a motion vector to the mv_ref list if it isn't
// already in the list. If it's the second motion vector it will also
// skip all additional processing and jump to done!
#define ADD_MV_REF_LIST(MV) \
if (refmv_count) { \
if ((MV).as_int != mv_ref_list[0].as_int) { \
mv_ref_list[refmv_count] = (MV); \
goto Done; \
} \
} else { \
mv_ref_list[refmv_count++] = (MV); \
// Add a candidate mv.
// Discard if it has already been seen.
static void add_candidate_mv(int_mv *mv_list, int *mv_scores,
int *candidate_count, int_mv candidate_mv,
int weight) {
if (*candidate_count == 0) {
mv_list[0].as_int = candidate_mv.as_int;
mv_scores[0] = weight;
*candidate_count += 1;
} else if ((*candidate_count == 1) &&
(candidate_mv.as_int != mv_list[0].as_int)) {
mv_list[1].as_int = candidate_mv.as_int;
mv_scores[1] = weight;
*candidate_count += 1;
}
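A usage sketch of the deduplication (mv_a and mv_b are hypothetical, distinct motion vectors): at most MAX_MV_REF_CANDIDATES = 2 distinct candidates are kept, and a repeat of the first is silently dropped.
// Hedged sketch of add_candidate_mv() behaviour.
int_mv list[MAX_MV_REF_CANDIDATES];
int scores[MAX_MV_REF_CANDIDATES];
int count = 0;
add_candidate_mv(list, scores, &count, mv_a, 16);  // count == 1
add_candidate_mv(list, scores, &count, mv_a, 16);  // duplicate, ignored
add_candidate_mv(list, scores, &count, mv_b, 1);   // count == 2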
// If either reference frame is different, not INTRA, and they
// are different from each other, scale and add the mv to our list.
#define IF_DIFF_REF_FRAME_ADD_MV(CANDIDATE) \
if ((CANDIDATE)->mbmi.ref_frame[0] != ref_frame) { \
ADD_MV_REF_LIST(scale_mv((CANDIDATE), 0, ref_frame, ref_sign_bias)); \
} \
if ((CANDIDATE)->mbmi.ref_frame[1] != ref_frame && \
(CANDIDATE)->mbmi.ref_frame[1] > INTRA_FRAME && \
(CANDIDATE)->mbmi.mv[1].as_int != (CANDIDATE)->mbmi.mv[0].as_int) { \
ADD_MV_REF_LIST(scale_mv((CANDIDATE), 1, ref_frame, ref_sign_bias)); \
}
// Checks that the given mi_row, mi_col and search point
// are inside the borders of the tile.
static INLINE int is_inside(int mi_col, int mi_row, int cur_tile_mi_col_start,
const int mv_ref[2]) {
// Check that the candidate is within the border. We only need to check
// the left side because all the positive right side ones are for blocks that
// are large enough to support the + value they have within their border.
return !(mi_row + mv_ref[1] < 0 ||
mi_col + mv_ref[0] < cur_tile_mi_col_start);
}
// This function searches the neighbourhood of a given MB/SB
// to try and find candidate reference vectors.
//
void vp9_find_mv_refs_idx(VP9_COMMON *cm, MACROBLOCKD *xd, MODE_INFO *here,
const MODE_INFO *lf_here,
const MV_REFERENCE_FRAME ref_frame,
int_mv *mv_ref_list, const int *ref_sign_bias,
const int block_idx,
const int mi_row, const int mi_col) {
int idx;
MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
MODE_INFO *lf_here, MV_REFERENCE_FRAME ref_frame,
int_mv *mv_ref_list, int *ref_sign_bias,
int block_idx) {
int i;
MODE_INFO *candidate_mi;
MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
int_mv c_refmv;
int_mv c2_refmv;
MV_REFERENCE_FRAME c_ref_frame;
MV_REFERENCE_FRAME c2_ref_frame;
int candidate_scores[MAX_MV_REF_CANDIDATES];
int refmv_count = 0;
const int (*mv_ref_search)[2] = mv_ref_blocks[mbmi->sb_type];
const MODE_INFO *candidate;
const int check_sub_blocks = block_idx >= 0;
int different_ref_found = 0;
int context_counter = 0;
int split_count = 0;
int (*mv_ref_search)[2];
const int mi_col = get_mi_col(xd);
const int mi_row = get_mi_row(xd);
int intra_count = 0;
int zero_count = 0;
int newmv_count = 0;
int x_idx = 0, y_idx = 0;
// Blank the reference vector list
vpx_memset(mv_ref_list, 0, sizeof(*mv_ref_list) * MAX_MV_REF_CANDIDATES);
// Blank the reference vector lists and other local structures.
vpx_memset(mv_ref_list, 0, sizeof(int_mv) * MAX_MV_REF_CANDIDATES);
vpx_memset(candidate_scores, 0, sizeof(candidate_scores));
// The nearest 2 blocks are treated differently:
// if the size < 8x8 we get the mv from the bmi substructure,
// and we also need to keep a mode count.
for (idx = 0; idx < 2; ++idx) {
const int *mv_ref = mv_ref_search[idx];
mv_ref_search = mv_ref_blocks[mbmi->sb_type];
if (mbmi->sb_type < BLOCK_SIZE_SB8X8) {
x_idx = block_idx & 1;
y_idx = block_idx >> 1;
}
if (!is_inside(mi_col, mi_row, cm->cur_tile_mi_col_start, mv_ref))
continue;
// We first scan for candidate vectors that match the current reference frame
// Look at the nearest neighbours
for (i = 0; i < 2; ++i) {
const int mi_search_col = mi_col + mv_ref_search[i][0];
const int mi_search_row = mi_row + mv_ref_search[i][1];
if ((mi_search_col >= cm->cur_tile_mi_col_start) &&
(mi_search_col < cm->cur_tile_mi_col_end) &&
(mi_search_row >= 0) && (mi_search_row < cm->mi_rows)) {
int b;
candidate = here + mv_ref[0] + mv_ref[1] * xd->mode_info_stride;
candidate_mi = here + mv_ref_search[i][0] +
(mv_ref_search[i][1] * xd->mode_info_stride);
// Keep counts for entropy encoding.
context_counter += mode_2_counter[candidate->mbmi.mode];
if (block_idx >= 0) {
if (mv_ref_search[i][0])
b = 1 + y_idx * 2;
else
b = 2 + x_idx;
} else {
b = -1;
}
if (get_matching_candidate(candidate_mi, ref_frame, &c_refmv, b)) {
add_candidate_mv(mv_ref_list, candidate_scores,
&refmv_count, c_refmv, 16);
}
split_count += (candidate_mi->mbmi.sb_type < BLOCK_SIZE_SB8X8 &&
candidate_mi->mbmi.ref_frame[0] != INTRA_FRAME);
// Check if the candidate comes from the same reference frame.
if (candidate->mbmi.ref_frame[0] == ref_frame) {
ADD_MV_REF_LIST(get_sub_block_mv(candidate, check_sub_blocks, 0,
mv_ref[0], block_idx));
different_ref_found = candidate->mbmi.ref_frame[1] != ref_frame;
} else {
different_ref_found = 1;
if (candidate->mbmi.ref_frame[1] == ref_frame) {
// Add second motion vector if it has the same ref_frame.
ADD_MV_REF_LIST(get_sub_block_mv(candidate, check_sub_blocks, 1,
mv_ref[0], block_idx));
// Count the number of neighbours coded intra, zeromv and newmv
intra_count += (candidate_mi->mbmi.mode < NEARESTMV);
zero_count += (candidate_mi->mbmi.mode == ZEROMV);
newmv_count += (candidate_mi->mbmi.mode >= NEWMV);
}
}
// More distant neighbours
for (i = 2; (i < MVREF_NEIGHBOURS) &&
(refmv_count < MAX_MV_REF_CANDIDATES); ++i) {
const int mi_search_col = mi_col + mv_ref_search[i][0];
const int mi_search_row = mi_row + mv_ref_search[i][1];
if ((mi_search_col >= cm->cur_tile_mi_col_start) &&
(mi_search_col < cm->cur_tile_mi_col_end) &&
(mi_search_row >= 0) && (mi_search_row < cm->mi_rows)) {
candidate_mi = here + mv_ref_search[i][0] +
(mv_ref_search[i][1] * xd->mode_info_stride);
if (get_matching_candidate(candidate_mi, ref_frame, &c_refmv, -1)) {
add_candidate_mv(mv_ref_list, candidate_scores,
&refmv_count, c_refmv, 16);
}
}
}
// Check the rest of the neighbors in much the same way
// as before except we don't need to keep track of sub blocks or
// mode counts.
for (; idx < MVREF_NEIGHBOURS; ++idx) {
const int *mv_ref = mv_ref_search[idx];
if (!is_inside(mi_col, mi_row, cm->cur_tile_mi_col_start, mv_ref))
continue;
// Look in the last frame if it exists
if (lf_here && (refmv_count < MAX_MV_REF_CANDIDATES)) {
candidate_mi = lf_here;
if (get_matching_candidate(candidate_mi, ref_frame, &c_refmv, -1)) {
add_candidate_mv(mv_ref_list, candidate_scores,
&refmv_count, c_refmv, 16);
}
}
candidate = here + mv_ref[0] + mv_ref[1] * xd->mode_info_stride;
// If we have not found enough candidates consider ones where the
// reference frame does not match. Break out when we have
// MAX_MV_REF_CANDIDATES candidates.
// Look first at spatial neighbours
for (i = 0; (i < MVREF_NEIGHBOURS) &&
(refmv_count < MAX_MV_REF_CANDIDATES); ++i) {
const int mi_search_col = mi_col + mv_ref_search[i][0];
const int mi_search_row = mi_row + mv_ref_search[i][1];
if ((mi_search_col >= cm->cur_tile_mi_col_start) &&
(mi_search_col < cm->cur_tile_mi_col_end) &&
(mi_search_row >= 0) && (mi_search_row < cm->mi_rows)) {
candidate_mi = here + mv_ref_search[i][0] +
(mv_ref_search[i][1] * xd->mode_info_stride);
if (candidate->mbmi.ref_frame[0] == ref_frame) {
ADD_MV_REF_LIST(candidate->mbmi.mv[0]);
different_ref_found = candidate->mbmi.ref_frame[1] != ref_frame;
} else {
different_ref_found = 1;
if (candidate->mbmi.ref_frame[1] == ref_frame) {
ADD_MV_REF_LIST(candidate->mbmi.mv[1]);
get_non_matching_candidates(candidate_mi, ref_frame,
&c_ref_frame, &c_refmv,
&c2_ref_frame, &c2_refmv);
if (c_ref_frame != INTRA_FRAME) {
scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias);
add_candidate_mv(mv_ref_list, candidate_scores,
&refmv_count, c_refmv, 1);
}
if (c2_ref_frame != INTRA_FRAME) {
scale_mv(xd, ref_frame, c2_ref_frame, &c2_refmv, ref_sign_bias);
add_candidate_mv(mv_ref_list, candidate_scores,
&refmv_count, c2_refmv, 1);
}
}
}
// Check the last frame's mode and mv info.
if (lf_here != NULL) {
if (lf_here->mbmi.ref_frame[0] == ref_frame) {
ADD_MV_REF_LIST(lf_here->mbmi.mv[0]);
} else if (lf_here->mbmi.ref_frame[1] == ref_frame) {
ADD_MV_REF_LIST(lf_here->mbmi.mv[1]);
// Look at the last frame if it exists
if (lf_here && (refmv_count < MAX_MV_REF_CANDIDATES)) {
candidate_mi = lf_here;
get_non_matching_candidates(candidate_mi, ref_frame,
&c_ref_frame, &c_refmv,
&c2_ref_frame, &c2_refmv);
if (c_ref_frame != INTRA_FRAME) {
scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias);
add_candidate_mv(mv_ref_list, candidate_scores,
&refmv_count, c_refmv, 1);
}
if (c2_ref_frame != INTRA_FRAME) {
scale_mv(xd, ref_frame, c2_ref_frame, &c2_refmv, ref_sign_bias);
add_candidate_mv(mv_ref_list, candidate_scores,
&refmv_count, c2_refmv, 1);
}
}
// Since we couldn't find 2 mvs from the same reference frame,
// go back through the neighbors and find motion vectors from
// different reference frames.
if (different_ref_found) {
for (idx = 0; idx < MVREF_NEIGHBOURS; ++idx) {
const int *mv_ref = mv_ref_search[idx];
if (!is_inside(mi_col, mi_row, cm->cur_tile_mi_col_start, mv_ref))
continue;
candidate = here + mv_ref[0] + mv_ref[1] * xd->mode_info_stride;
// If the candidate is INTRA we don't want to consider its mv.
if (!is_inter_block(&candidate->mbmi))
continue;
IF_DIFF_REF_FRAME_ADD_MV(candidate);
if (!intra_count) {
if (!newmv_count) {
// 0 = both zero mv
// 1 = one zero mv + one predicted mv
// 2 = two predicted mvs
mbmi->mb_mode_context[ref_frame] = 2 - zero_count;
} else {
// 3 = one predicted/zero and one new mv
// 4 = two new mvs
mbmi->mb_mode_context[ref_frame] = 2 + newmv_count;
}
} else {
// 5 = one intra neighbour + x
// 6 = two intra neighbours
mbmi->mb_mode_context[ref_frame] = 4 + intra_count;
}
// Since we still don't have a candidate, we'll try the last frame.
if (lf_here != NULL && is_inter_block(&lf_here->mbmi)) {
IF_DIFF_REF_FRAME_ADD_MV(lf_here);
}
Done:
mbmi->mb_mode_context[ref_frame] = counter_to_context[context_counter];
// Clamp vectors
for (idx = 0; idx < MAX_MV_REF_CANDIDATES; ++idx)
clamp_mv_ref(&mv_ref_list[idx].as_mv, xd);
for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) {
clamp_mv_ref(xd, &mv_ref_list[i]);
}
}
#undef ADD_MV_REF_LIST
#undef IF_DIFF_REF_FRAME_ADD_MV


@@ -17,13 +17,11 @@
void vp9_find_mv_refs_idx(VP9_COMMON *cm,
MACROBLOCKD *xd,
MODE_INFO *here,
const MODE_INFO *lf_here,
const MV_REFERENCE_FRAME ref_frame,
MODE_INFO *lf_here,
MV_REFERENCE_FRAME ref_frame,
int_mv *mv_ref_list,
const int *ref_sign_bias,
const int block_idx,
const int mi_row,
const int mi_col);
int *ref_sign_bias,
int block_idx);
static INLINE void vp9_find_mv_refs(VP9_COMMON *cm,
MACROBLOCKD *xd,
@@ -31,10 +29,9 @@ static INLINE void vp9_find_mv_refs(VP9_COMMON *cm,
MODE_INFO *lf_here,
MV_REFERENCE_FRAME ref_frame,
int_mv *mv_ref_list,
int *ref_sign_bias,
int mi_row, int mi_col) {
int *ref_sign_bias) {
vp9_find_mv_refs_idx(cm, xd, here, lf_here, ref_frame,
mv_ref_list, ref_sign_bias, -1, mi_row, mi_col);
mv_ref_list, ref_sign_bias, -1);
}
#endif // VP9_COMMON_VP9_MVREF_COMMON_H_

Some files were not shown because too many files have changed in this diff.