Compare commits


89 Commits

Author SHA1 Message Date
Deb Mukherjee
6b0c636b96 Cosmetic changes in the supertx expt
Converts most negative !CONFIG_SUPERTX checks to positive ones.

Change-Id: I80b7f8c5d3483a7861f0de7fc7ebc425b9c68766
2014-11-10 17:34:00 -08:00
Yue Chen
b0aa2db2e7 Fixing rd loop bugs in supertx+ext_tx experiment
Change-Id: I891b108e591e01d5c7d588dec0bcc4b323d0b6a8
2014-11-07 16:43:12 -08:00
Yue Chen
95d0f87d6e Fixing sub-optimal rdloop when testing supertx on 8x4/4x8 blocks
Remove the early termination in vp9_rd_pick_inter_mode_sub8x8() in
order to complete mode selection for 8x4/4x8 blocks, which will
try supertx in a higher-level function.

Change-Id: I457505257332f70d9cd8d22db52ad32ff15f7f87
2014-10-20 21:52:55 -07:00
Yue Chen
c741a4fe03 Fixing skip flag bugs in recent experiments
Bugs were in vp9_rdopt.c
Also did minor clean-ups in vp9_encodeframe.c

Change-Id: I6fec18e349cd0b810b0772e506927b423db077b6
2014-10-20 14:51:20 -07:00
Deb Mukherjee
430c389f16 Miscellaneous fixes for recent experiments
Various cleanups for ext-tx, supertx, copy-coding experiments.

Change-Id: I8703d5fee57b1310d8d1aa1f26908e9a427b0502
2014-10-14 15:59:17 -07:00
Yue Chen
d8b0d40bf6 Allow blocks to directly copy motion information from neighbors
A new set of prediction modes, called copy modes, is implemented
to allow blocks to directly copy motion information from a neighbor.
The motivation is to create regions of arbitrary shape in which
blocks share the same motion parameters, and hence to save bits spent
on duplicated side information.
Compression gain:
derf: 0.894%; stdhd: 1.513%.

Change-Id: I5e026b12c902bc6985c199ec38f1b3b67ac7d930
2014-09-03 11:22:36 -07:00
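For illustration, a copy mode reduces to inheriting a neighbor's motion parameters instead of signaling them again. A minimal C sketch, with hypothetical type and function names rather than the experiment's actual API:

/* Motion parameters a block would otherwise have to signal. */
typedef struct {
  int mv_row, mv_col;  /* motion vector */
  int ref_frame;       /* reference frame index */
} MotionInfo;

/* A copy mode only signals *which* neighbor to inherit from, so the
 * duplicated side information costs no extra bits. */
typedef enum { COPY_LEFT, COPY_ABOVE } CopyMode;

static MotionInfo copy_motion_from_neighbor(const MotionInfo *left,
                                            const MotionInfo *above,
                                            CopyMode mode) {
  return (mode == COPY_LEFT) ? *left : *above;
}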
Yue Chen
a4dfcd9a2d Implementing transform overlapping multiple blocks
We removed the restriction that transform blocks could not exceed
the size of prediction blocks. Smoothing masks are applied to reduce
the discontinuity between prediction blocks in order to realize the
efficiency of large transforms.
A 0.997%/0.895% bit-rate reduction is achieved on the derf/stdhd sets.

Change-Id: I8db241bab9fe74d864809e95f76b771ee59a2def
2014-08-15 16:56:11 -07:00
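A minimal sketch of the smoothing-mask idea, assuming a simple linear cross-fade over a one-dimensional overlap; the experiment's actual masks and 2-D block geometry are more involved:

#include <stdint.h>

/* Blend two predictions across a seam so that a transform spanning both
 * blocks sees a smooth residual. 'overlap' is the seam width in pixels. */
static void smooth_across_seam(const uint8_t *pred_a, const uint8_t *pred_b,
                               uint8_t *out, int overlap) {
  for (int i = 0; i < overlap; ++i) {
    const int w_b = (i * 256) / overlap;  /* 0..255 ramp toward pred_b */
    const int w_a = 256 - w_b;
    out[i] = (uint8_t)((w_a * pred_a[i] + w_b * pred_b[i] + 128) >> 8);
  }
}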
Yue Chen
be17f1b338 Merge "Migrating old experiments into new playground branch and speedup" into playground 2014-06-26 15:36:44 -07:00
Yue Chen
ffdad39324 Merge "Squash commits from master to playground" into playground 2014-06-26 15:36:24 -07:00
Yue Chen
07ac101806 Migrating old experiments into new playground branch and speedup
The old interintra experiment is slow (speed is 0.2x the original
codec at speed 0).
We use the best inter mode to skip some reference frames and the
NEWMV search when searching for the best joint mode.
Quality drop: ~0.1% derf. Speed: 0.36x head.

Change-Id: If10453448284f86c14a0a41f20aeaf9ac838fa32
2014-06-26 14:22:53 -07:00
Yue Chen
a49d80bfc8 Squash commits from master to playground
Moving RD-opt related code from vp9_encoder.h to vp9_rdopt.h.

Squashed-Change-Id: I8fab776c8801e19d3f5027ed55a6aa69eee951de

gen_msvs_proj: fix in tree configure under cygwin

strip trailing '/' from paths, this is later converted to '\' which
causes execution errors for obj_int_extract/yasm. vs10+ wasn't affected
by this issue, but make the same change for consistency.

gen_msvs_proj:
+ add missing '"' to obj_int_extract call
  unlike gen_msvs_vcproj, the block is duplicated
  missed in: 1e3d9b9 build/msvs: fix builds in source dirs with spaces

Squashed-Change-Id: I76208e6cdc66dc5a0a7ffa8aa1edbefe31e4b130

Improve vp9_rb_bytes_read

Squashed-Change-Id: I69eba120eb3d8ec43b5552451c8a9bd009390795

Removing decode_one_iter() function.

When a superframe index is available, we rely on it completely and use
the frame size values from the index.

Squashed-Change-Id: I0011d08b223303a8b912c2bcc8a02b74d0426ee0

iosbuild.sh: Add vpx_config.h and vpx_version.h to VPX.framework.

- Rename build_targets to build_framework
- Add functions for creating the vpx_config shim and obtaining
  preproc symbols.

Squashed-Change-Id: Ieca6938b9779077eefa26bf4cfee64286d1840b0

Implemented vp9_denoiser_{alloc,free}()

Squashed-Change-Id: I79eba79f7c52eec19ef2356278597e06620d5e27

Update running avg for VP9 denoiser

Squashed-Change-Id: I9577d648542064052795bf5770428fbd5c276b7b

Changed buf_2ds in vp9 denoiser to YV12 buffers

Changed alloc, free, and running average code as necessary.

Squashed-Change-Id: Ifc4d9ccca462164214019963b3768a457791b9c1

sse4 regular quantize

Squashed-Change-Id: Ibd95df0adf9cc9143006ee9032b4cb2ebfd5dd1b

Modify non-rd intra mode checking

Speed 6 uses a small tx size, namely 8x8. max_intra_bsize needs to
be modified accordingly to ensure valid intra mode checking. A Borg
test on the RTC set showed an overall PSNR gain of 0.335% at speed -6.

This also changes speed -5 encoding by allowing DC_PRED checking
for block32x32. A Borg test on the RTC set showed a slight PSNR gain
of 0.145% and no noticeable speed change.

Squashed-Change-Id: I1502978d8fbe265b3bb235db0f9c35ba0703cd45

Implemented COPY_BLOCK case for vp9 denoiser

Squashed-Change-Id: Ie89ad1e3aebbd474e1a0db69c1961b4d1ddcd33e

Improved vp9 denoiser running avg update.

Squashed-Change-Id: Ie0aa41fb7957755544321897b3bb2dd92f392027

Separate rate-distortion modeling for DC and AC coefficients

This is the first step in reworking the rate-distortion modeling used
in the rtc coding mode. The overall goal is to customize the modeling
for the statistics encountered in rtc coding.

This commit makes the encoder perform rate-distortion modeling for
DC and AC coefficients separately. No speed changes were observed.
The coding performance for pedestrian_area_1080p is largely
improved:

speed -5, from 79558 b/f, 37.871 dB -> 79598 b/f, 38.600 dB
speed -6, from 79515 b/f, 37.822 dB -> 79544 b/f, 38.130 dB

Overall performance for rtc set at speed -6 is improved by 0.67%.

Squashed-Change-Id: I9153444567e5f75ccdcaac043c2365992c005c0c
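The shape of this change can be sketched as below. The model is a placeholder, not the encoder's fitted curves; the point is that each band gets its own rate/distortion estimate, so each can later be fitted to its own statistics:

#include <stdint.h>

/* Illustrative split model: sse_dc/sse_ac are residual energies in the
 * DC and AC bands, qstep (> 0) is the quantizer step size. */
static void model_rd_dc_ac(int64_t sse_dc, int64_t sse_ac, int qstep,
                           int *rate, int64_t *dist) {
  const int64_t q2 = (int64_t)qstep * qstep;
  const int rate_dc = (int)(sse_dc / q2);        /* placeholder rate model */
  const int rate_ac = (int)(sse_ac / q2);
  const int64_t dist_dc = sse_dc * qstep / 128;  /* placeholder dist model */
  const int64_t dist_ac = sse_ac * qstep / 128;
  *rate = rate_dc + rate_ac;
  *dist = dist_dc + dist_ac;
}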

Add superframe support for frame parallel decoding.

A superframe is a group of frames bundled as one frame. It is mostly
used to combine one or more non-displayable frames and one displayable
frame.

For frame-parallel decoding, the libvpx decoder will only support
decoding one normal frame or a superframe with a superframe index.

If an application passes a superframe without a superframe index, or a
chunk of displayable frames without a superframe index, to the libvpx
decoder, libvpx will not decode it in frame-parallel mode. But the
libvpx decoder can still decode it in serial mode.

Squashed-Change-Id: I04c9f2c828373d64e880a8c7bcade5307015ce35
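For reference, a VP9 superframe index sits at the very end of the chunk: a marker byte whose top three bits are 110 encodes the per-frame size width and the frame count, followed by the little-endian frame sizes and the marker byte repeated. A parsing sketch written from that layout (libvpx's own parser differs in detail):

#include <stddef.h>
#include <stdint.h>

/* Returns the number of frames in the superframe, or 0 if no valid
 * index is present. 'sizes' must have room for 8 entries. */
static int parse_superframe_index(const uint8_t *data, size_t data_sz,
                                  uint32_t sizes[8]) {
  if (data_sz < 1) return 0;
  const uint8_t marker = data[data_sz - 1];
  if ((marker & 0xe0) != 0xc0) return 0;       /* 110xxxxx marker byte */
  const int frames = (marker & 0x7) + 1;       /* 1..8 frames */
  const int mag = ((marker >> 3) & 0x3) + 1;   /* 1..4 bytes per size */
  const size_t index_sz = 2 + (size_t)mag * frames;
  if (data_sz < index_sz) return 0;
  const uint8_t *x = data + data_sz - index_sz;
  if (*x++ != marker) return 0;  /* index must open and close with marker */
  for (int i = 0; i < frames; ++i) {
    uint32_t sz = 0;
    for (int b = 0; b < mag; ++b) sz |= (uint32_t)(*x++) << (b * 8);
    sizes[i] = sz;
  }
  return frames;
}

This also illustrates the "error check vp9 superframe parsing" change further down: a chunk whose last byte merely looks like a marker must still carry a complete index that opens and closes with the same byte.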

Fixes in VP9 alloc, free, and COPY_FRAME case

Squashed-Change-Id: I1216f17e2206ef521fe219b6d72d8e41d1ba1147

Remove labels from quantize

Use break instead of goto for early exit. Unbreaks Visual Studio
builds.

Squashed-Change-Id: I96dee43a3c82145d4abe0d6a99af6e6e1a3991b5

Added CFLAG for outputting vp9 denoised signal

Squashed-Change-Id: Iab9b4e11cad927f3282e486c203564e1a658f377

Allow key frame more flexibility in mode search

This commit allows the key frame to search through more prediction
modes and more flexible block sizes. No speed change observed. The
coding performance for rtc set is improved by 1.7% for speed -5 and
3.0% for speed -6.

Squashed-Change-Id: Ifd1bc28558017851b210b4004f2d80838938bcc5

VP9 denoiser bugfixes

s/stdint.h/vpx\/vpx_int.h

Added missing 'break;'s

Also included other minor changes, mostly cosmetic.

Squashed-Change-Id: I852bba3e85e794f1d4af854c45c16a23a787e6a3

Don't return value for void functions

Clears "warning: 'return' with a value, in function returning void"

Squashed-Change-Id: I93972610d67e243ec772a1021d2fdfcfc689c8c2

Include type defines

Clears error: unknown type name 'uint8_t'

Squashed-Change-Id: I9b6eff66a5c69bc24aeaeb5ade29255a164ef0e2

Validate error checking code in decoder.

This patch adds a mechanism for ensuring error checking on invalid
files by creating a unit test that runs the decoder and checks that
the error code matches what's expected for each frame.

Disabled for now as this unit test will segfault with existing code.

Squashed-Change-Id: I896f9686d9ebcbf027426933adfbea7b8c5d956e

Introduce FrameWorker for decoding.

When decoding in serial mode, there will be only
one FrameWorker doing decoding. When decoding in
parallel mode, there will be several FrameWorkers
doing decoding in parallel.

Squashed-Change-Id: If53fc5c49c7a0bf5e773f1ce7008b8a62fdae257

Add back libmkv ebml writer files.

Another project in ChromeOS is using these files. To make libvpx
rolls simpler, add these files back until the other project removes
the dependency.

crbug.com/387246 tracking bug to remove dependency.

Squashed-Change-Id: If9c197081c845c4a4e5c5488d4e0190380bcb1e4

Added a test vector that tests more show-existing frames.

Squashed-Change-Id: I0ddd7dd55313ee62d231ed4b9040e08c3761b3fe

fix peek_si to enable 1 byte show existing frames.

The test for this is in the test vector code (show existing frames
will fail). I can't check it in disabled, as I'm changing the generic
test code to do this:

https://gerrit.chromium.org/gerrit/#/c/70569/

Squashed-Change-Id: I5ab324f0cb7df06316a949af0f7fc089f4a3d466

Fix bug in error handling that causes segfault

See: https://code.google.com/p/chromium/issues/detail?id=362697

The code properly catches an invalid stream but segfaults instead of
returning an error, due to a buffer not having been initialized. This
change fixes that.

Squashed-Change-Id: I695595e742cb08807e1dfb2f00bc097b3eae3a9b

Revert 3 patches from Hangyu to get Chrome to build:

Avoids failures:
MSE_ClearKey/EncryptedMediaTest.Playback_VP9Video_WebM/0
MSE_ClearKey_Prefixed/EncryptedMediaTest.Playback_VP9Video_WebM/0
MSE_ExternalClearKey_Prefixed/EncryptedMediaTest.Playback_VP9Video_WebM/0
MSE_ExternalClearKey/EncryptedMediaTest.Playback_VP9Video_WebM/0
MSE_ExternalClearKeyDecryptOnly/EncryptedMediaTest.Playback_VP9Video_WebM/0
MSE_ExternalClearKeyDecryptOnly_Prefixed/EncryptedMediaTest.Playback_VP9Video_WebM/0
SRC_ExternalClearKey/EncryptedMediaTest.Playback_VP9Video_WebM/0
SRC_ExternalClearKey_Prefixed/EncryptedMediaTest.Playback_VP9Video_WebM/0
SRC_ClearKey_Prefixed/EncryptedMediaTest.Playback_VP9Video_WebM/0

Patches are
This reverts commit 9bc040859b
This reverts commit 6f5aba069a
This reverts commit 9bc040859b

I1f250441	Revert "Refactor the vp9_get_frame code for frame parallel."
Ibfdddce5	Revert "Delay decreasing reference count in frame-parallel decoding."
I00ce6771	Revert "Introduce FrameWorker for decoding."

Need better testing in libvpx for these commits

Squashed-Change-Id: Ifa1f279b0cabf4b47c051ec26018f9301c1e130e

error check vp9 superframe parsing

This patch ensures that the last byte of a chunk that contains a
valid superframe marker byte actually has a proper superframe index.
If not, it returns an error.

As part of doing that, the file vp90-2-15-fuzz-flicker.webm now fails
to decode properly and moves from the test vector suite to the
invalid-file tests.

Squashed-Change-Id: I5f1da7eb37282ec0c6394df5c73251a2df9c1744

Remove unused vp9_init_quant_tables function

This function is not effectively used, so it has been removed.

Squashed-Change-Id: I2e8e48fa07c7518931690f3b04bae920cb360e49

Actually skip blocks in skip segments in non-rd encoder.

Copy split from macroblock to pick mode context so it doesn't get lost.

Squashed-Change-Id: Ie37aa12558dbe65c4f8076cf808250fffb7f27a8

Add Check for Peek Stream validity to decoder test.

Squashed-Change-Id: I9b745670a9f842582c47e6001dc77480b31fb6a1

Allocate buffers based on correct chroma format

The encoder currently allocates frame buffers before
it establishes what the chroma sub-sampling factor is,
always allocating based on the 4:4:4 format.

This patch detects the chroma format as early as
possible, allowing the encoder to allocate buffers of
the correct size.

Future patches will change the encoder to allocate
frame buffers on demand to further reduce the memory
profile of the encoder and rationalize the buffer
management in the encoder and decoder.

Squashed-Change-Id: Ifd41dd96e67d0011719ba40fada0bae74f3a0d57
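The memory cost of the 4:4:4 assumption is easy to quantify. A rough calculation for one 8-bit frame, ignoring libvpx's alignment and border padding:

#include <stddef.h>

/* ss_x/ss_y are the chroma sub-sampling shifts:
 * 4:2:0 -> (1, 1), 4:2:2 -> (1, 0), 4:4:4 -> (0, 0). */
static size_t frame_bytes(int width, int height, int ss_x, int ss_y) {
  const size_t luma = (size_t)width * height;
  const size_t chroma = (size_t)(width >> ss_x) * (height >> ss_y);
  return luma + 2 * chroma;  /* Y + U + V */
}

A 4:4:4 allocation reserves 3*w*h bytes, while a 4:2:0 stream needs only 1.5*w*h, so detecting the format early halves the frame-buffer footprint for the common case.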

Fork vp9_rd_pick_inter_mode_sb_seg_skip

Squashed-Change-Id: I549868725b789f0f4f89828005a65972c20df888

Switch active map implementation to segment based.

Squashed-Change-Id: Ibb841a1fa4d08d164cf5461246ec290f582b1f80

Experiment for mid group second arf.

This patch implements a mechanism for inserting a second
arf at the mid position of arf groups.

It is currently disabled by default using the flag multi_arf_enabled.

Results are currently down somewhat in initial testing if
multi-arf is enabled. Most of the loss is attributable to the
fact that the code to preserve the previous golden frame
(in the arf buffer) when coding an overlay frame is
currently disabled in the multi-arf case.

Squashed-Change-Id: I1d777318ca09f147db2e8c86d7315fe86168c865

Clean out old CONFIG_MULTIPLE_ARF code.

Remove the old experimental multi arf code that was under
the flag CONFIG_MULTIPLE_ARF.

Squashed-Change-Id: Ib24865abc11691d6ac8cb0434ada1da674368a61

Fix some bugs in multi-arf

Fix some bugs relating to the use of buffers
in the overlay frames.

Fix a bug where a mid-sequence overlay was
propagating large partition and transform sizes into
the subsequent frame because:
  sf->last_partitioning_redo_frequency > 1 and
  sf->tx_size_search_method == USE_LARGESTALL

Squashed-Change-Id: Ibf9ef39a5a5150f8cbdd2c9275abb0316c67873a

Further dual arf changes: multi_arf_allowed.

Add multi_arf_allowed flag.
Re-initialize buffer indices every kf.
Add some const indicators.

Squashed-Change-Id: If86c39153517c427182691d2d4d4b7e90594be71

Fixed VP9 denoiser COPY_BLOCK case

Now copies the src to the correct location in the running average buffer.

Squashed-Change-Id: I9c83c96dc7a97f42c8df16ab4a9f18b733181f34

Fix test on maximum downscaling limits

There is a normative scaling range of (x1/2, x16)
for VP9. This patch fixes the maximum downscaling
tests that are applied in the convolve function.

The code used a maximum downscaling limit of x1/5
for historic reasons related to the scalable
coding work. Since the downsampling in this
application is non-normative it will revert to
using a separate non-normative scaler.

Squashed-Change-Id: Ide80ed712cee82fe5cb3c55076ac428295a6019f

Add unit test to test user_priv parameter.

Squashed-Change-Id: I6ba6171e43e0a43331ee0a7b698590b143979c44

vp9: check tile column count

the max is 6. there are assumptions throughout the decode regarding
this; fixes a crash with a fuzzed bitstream

$ zzuf -s 5861 -r 0.01:0.05 -b 6- \
  < vp90-2-00-quantizer-00.webm.ivf \
  | dd of=invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.ivf \
    bs=1 count=81883

Squashed-Change-Id: I6af41bb34252e88bc156a4c27c80d505d45f5642

Adjust arf Q limits with multi-arf.

Adjust enforced minimum arf Q deltas for non primary arfs
in the middle of an arf/gf group.

Squashed-Change-Id: Ie8034ffb3ac00f887d74ae1586d4cac91d6cace2

Dual ARF changes: Buffer index selection.

Add indirection to the selection of buffer indices.
This is to help simplify things in the future if we
have other codec features that switch indices.

Limit the max GF interval for static sections to fit
the gf_group structures.

Squashed-Change-Id: I38310daaf23fd906004c0e8ee3e99e15570f84cb

Reuse inter prediction result in real-time speed 6

In real-time speed 6, no partition search is done. The inter
prediction results obtained from picking the mode can be reused in
the following encoding process. A speed feature reuse_inter_pred_sby
is added to enable the reuse only in speed 6.

This patch doesn't change encoding result. RTC set tests showed
that the encoding speed gain is 2% - 5%.

Squashed-Change-Id: I3884780f64ef95dd8be10562926542528713b92c

Add vp9_ prefix to mv_pred and setup_pred_block functions

Make these two functions accessible by both RD and non-RD coding
modes.

Squashed-Change-Id: Iecb39dbf3d65436286ea3c7ffaa9920d0b3aff85

Replace cpi->common with preset variable cm

This commit replaces a few use cases of cpi->common with preset
variable cm, to avoid unnecessary pointer fetch in the non-RD
coding mode.

Squashed-Change-Id: I4038f1c1a47373b8fd7bc5d69af61346103702f6

[spatial svc]Implement lag in frames for spatial svc

Squashed-Change-Id: I930dced169c9d53f8044d2754a04332138347409

[spatial svc]Don't skip motion search in first pass encoding

Squashed-Change-Id: Ia6bcdaf5a5b80e68176f60d8d00e9b5cf3f9bfe3

decode_test_driver: fix type size warning

like vpx_codec_decode(), vpx_codec_peek_stream_info() takes an unsigned
int, not size_t, parameter for buffer size

Squashed-Change-Id: I4ce0e1fbbde461c2e1b8fcbaac3cd203ed707460

decode_test_driver: check HasFailure() in RunLoop

avoids unnecessary errors due to e.g., read (Next()) failures

Squashed-Change-Id: I70b1d09766456f1c55367d98299b5abd7afff842

Allow lossless breakout in non-rd mode decision.

This is very helpful for large moving windows in screencasts.

Squashed-Change-Id: I91b5f9acb133281ee85ccd8f843e6bae5cadefca

Revert "Revert 3 patches from Hangyu to get Chrome to build:"

This patch reverts the previous revert from Jim and also adds a
variable user_priv in the FrameWorker to save the user_priv
passed from the application. In the decoder_get_frame function,
the user_priv will be bound to the img. This change is needed
or it will fail the unit test added here:
https://gerrit.chromium.org/gerrit/#/c/70610/

This reverts commit 9be46e4565.

Squashed-Change-Id: I376d9a12ee196faffdf3c792b59e6137c56132c1

test.mk: remove renamed file

vp90-2-15-fuzz-flicker.webm was renamed in:
c3db2d8 error check vp9 superframe parsing

Squashed-Change-Id: I229dd6ca4c662802c457beea0f7b4128153a65dc

vp9cx.mk: move avx c files outside of x86inc block

same reasoning as:
9f3a0db vp9_rtcd: correct avx2 references

these are all intrinsics, so don't depend on x86inc.asm

Squashed-Change-Id: I915beaef318a28f64bfa5469e5efe90e4af5b827

Dual arf: Name changes.

Cosmetic patch only in response to comments on
previous patches suggesting a couple of name changes
for consistency and clarity.

Squashed-Change-Id: Ida3a359b0d5755345660d304a7697a3a3686b2a3

Make non-RD intra mode search txfm size dependent

This commit fixes a potential issue in the non-RD mode decision
flow that only checks part of the block to estimate the cost. It
was due to the use of a fixed transform size in place of the
largest transform block size. This commit enables per transform
block cost estimation of the intra prediction mode in the non-RD
mode decision.

Squashed-Change-Id: I14ff92065e193e3e731c2bbf7ec89db676f1e132
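The fix amounts to accumulating cost one transform block at a time instead of measuring a single fixed-size transform. An illustrative sketch, where the callback stands in for predicting and transforming one tx block (names are hypothetical):

#include <stdint.h>

/* Sum per-tx-block costs over a bw x bh pixel block; tx_px is the
 * transform size in pixels (e.g., 8 for an 8x8 transform). */
static int64_t intra_mode_cost(int bw, int bh, int tx_px,
                               int64_t (*tx_block_cost)(int x, int y)) {
  int64_t total = 0;
  for (int y = 0; y < bh; y += tx_px)
    for (int x = 0; x < bw; x += tx_px)
      total += tx_block_cost(x, y);  /* predict + transform this block */
  return total;
}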

Fix quality regression for multi arf off case.

Bug introduced during multiple iterations on: I3831*

gf_group->arf_update_idx[] cannot currently be used
to select the arf buffer index if buffer flipping on overlays
is enabled (still currently the case when multi arf OFF).

Squashed-Change-Id: I4ce9ea08f1dd03ac3ad8b3e27375a91ee1d964dc

Enable real-time version reference motion vector search

This commit enables a fast reference motion vector search scheme.
It checks the nearest top and left neighboring blocks to decide the
most probable predicted motion vector. If it finds that the two have
the same motion vector, it skips searching the exterior range for
the second most probable motion vector and correspondingly skips
the check for NEARMV.

The runtime of speed -5 goes down:
pedestrian at 1080p 29377 ms -> 27783 ms
vidyo at 720p       11830 ms -> 10990 ms
i.e., a 6%-8% speed-up.

For the rtc set, the compression performance
goes down by about 1.3% for both speed -5 and -6.

Squashed-Change-Id: I2a7794fa99734f739f8b30519ad4dfd511ab91a5
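The scheme can be sketched as follows; the names are hypothetical stand-ins, not libvpx's actual functions:

#include <stdbool.h>

typedef struct { int row, col; } MV;

/* If the nearest top and left neighbors agree, take their motion vector
 * as the sole candidate and let the caller skip both the exterior search
 * for a second candidate and the NEARMV check. */
static bool fast_ref_mv(const MV *left, const MV *above, MV *nearest) {
  if (left->row == above->row && left->col == above->col) {
    *nearest = *left;
    return true;
  }
  return false;  /* fall back to the full reference MV search */
}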

Add const mark to const values in non-RD coding mode

Squashed-Change-Id: I65209fd1e06fc06833f6647cb028b414391a7017

Change-Id: Ic0be67ac9ef48f64a8878a0b8f1b336f136bceac
2014-06-26 14:22:05 -07:00
Debargha Mukherjee
77a29953c5 Revert "Migrating old experiments into new playground branch"
This reverts commit 1a4b017fad

Change-Id: I7f54cf0489e592887b61eb3f7bda90f757b0aad7
2014-06-26 12:46:51 -07:00
Yue Chen
1a4b017fad Migrating old experiments into new playground branch
Change-Id: I28dc4acdf5415a1ea3d88213022d9e3d4fd5db46
2014-06-23 16:35:38 -07:00
Pengchong Jin
bed7cf2eeb Merge "skip the unnecessary motion search in the first pass" 2014-06-17 12:08:42 -07:00
Tom Finegan
c19046a795 Merge "iosbuild.sh: Add missing function comments." 2014-06-17 10:34:36 -07:00
Marco Paniconi
73e4e8b2bf Merge "vp8 denoising: add bias factor to zero_mv sse." 2014-06-17 08:34:30 -07:00
Jingning Han
6cfb854eef Merge "Fix C versions of DC calculation functions" 2014-06-16 18:33:21 -07:00
James Zern
88df435d6b Merge "vp9_rtcd: correct avx2 references" 2014-06-16 17:39:13 -07:00
Tom Finegan
66bacc025a iosbuild.sh: Add missing function comments.
Change-Id: Ib23a59475d566a7b7f44071614d730ceecfcfa60
2014-06-16 17:04:38 -07:00
Tom Finegan
54547f6827 Merge "example tests: Make failures due to incorrect bin path easier to diagnose." 2014-06-16 16:16:49 -07:00
Marco Paniconi
d924640663 vp8 denoising: add bias factor to zero_mv sse.
Change-Id: I95818754424e89f0d56c6d9c0c5709e6f84fa46a
2014-06-16 15:22:15 -07:00
Dmitry Kovalev
3f3199e73a Merge "vp9_pickmode.c: fix vs12 compiler warnings" 2014-06-16 12:07:48 -07:00
Tom Finegan
12672c24d3 example tests: Make failures due to incorrect bin path easier to diagnose.
Add elog() to tools_common, and checks for the executable in each test.

Change-Id: I3c6334eca62c0b56040d91145abbe1bf5f9c763c
2014-06-16 10:57:00 -07:00
Alex Converse
f7869c8116 Merge "Add non420 support to vp9_extend_frame_borders." 2014-06-16 10:33:31 -07:00
Jingning Han
d203203cc5 Merge "Fix out of boundary memory read in fuzz test on vpxdec" 2014-06-16 10:27:30 -07:00
Pengchong Jin
cdc954fdc8 skip the unnecessary motion search in the first pass
This patch allows the VP9 encoder to skip the unnecessary
motion search in the first pass. It computes the motion error
of 0,0 motion using the last source frame as the reference,
and skips the further motion search if this error is small.

A Borg test shows that overall the patch gives a PSNR gain (derf
-0.001%, yt 0.341%, hd 0.282%). Individual clips may have a PSNR gain
or loss. The best PSNR performance is 7.347% and the worst is -0.662%.
The first-pass encoding speedup for slideshow clips is over 30%.

Change-Id: I4cac4dbd911f277ee858e161f3ca652c771344fe
2014-06-16 10:16:27 -07:00
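The core of the shortcut in sketch form; the 16x16 block matches the first-pass coding unit, and the threshold is a hypothetical placeholder:

#include <stdint.h>

static int64_t sse_16x16(const uint8_t *a, int a_stride,
                         const uint8_t *b, int b_stride) {
  int64_t sse = 0;
  for (int y = 0; y < 16; ++y)
    for (int x = 0; x < 16; ++x) {
      const int d = a[y * a_stride + x] - b[y * b_stride + x];
      sse += d * d;
    }
  return sse;
}

#define ZERO_MV_SKIP_THRESH 64  /* hypothetical threshold */

/* Compare against the previous *source* frame; on static content
 * (e.g., slides) the 0,0 error is tiny and the search can be skipped. */
static int skip_first_pass_search(const uint8_t *src, int src_stride,
                                  const uint8_t *last_src, int last_stride) {
  return sse_16x16(src, src_stride, last_src, last_stride) <
         ZERO_MV_SKIP_THRESH;
}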
unknown
45648532bc vp9_pickmode.c: fix vs12 compiler warnings
Change-Id: I5042b76a7050c121bf960ecb20c79d35adcc4cd5
2014-06-15 12:47:48 -07:00
Tom Finegan
95fb9008f8 Merge "iosbuild.sh: Move to build/make and tidy up a bit." 2014-06-13 16:31:56 -07:00
Tom Finegan
1e6b80f34b Merge "test/tools_common.sh: Log all shared variables in verbose mode." 2014-06-13 16:31:49 -07:00
Jingning Han
6b0bc34b62 Fix C versions of DC calculation functions
This commit fixes the scaling factors used in the C versions of the
DC calculation functions.

Change-Id: Iab41108c2bb93c2f2e78667214f3a772a2b707b5
2014-06-13 16:09:40 -07:00
hkuang
40070a7d00 Merge "Delay decreasing reference count in frame-parallel decoding." 2014-06-13 15:28:24 -07:00
Yunqing Wang
feaae409c8 Merge "Revert "skip unnecessary motion search in the first pass"" 2014-06-13 15:21:26 -07:00
Marco Paniconi
c153903660 Merge "Allow for deblocking temporal-denoised signal." 2014-06-13 14:51:59 -07:00
Marco Paniconi
d08b2ba172 Allow for deblocking temporal-denoised signal.
Allow for an option to selectively apply the deblocking loop filter to
the denoised raw block, based on the denoised state (no filter, filter
with zero motion, or filter with non-zero motion) of the current block
and its upper and left denoised blocks.
This helps to reduce some blocking artifacts from the
motion-compensated denoising.

Change-Id: I0ac4e70076df69a98c5391979e739a2681e24ae6
2014-06-13 14:34:05 -07:00
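The decision logic can be sketched as follows; the states follow the commit message, while the function and the exact rule are illustrative:

/* Denoised state of a block. */
typedef enum {
  DENOISE_NO_FILTER,
  DENOISE_FILTER_ZERO_MV,
  DENOISE_FILTER_NONZERO_MV
} DenoiseState;

/* Deblock the denoised block only when it or a neighbor was
 * motion-compensated, where seams are most visible. */
static int should_deblock_denoised(DenoiseState cur, DenoiseState up,
                                   DenoiseState left) {
  return cur == DENOISE_FILTER_NONZERO_MV ||
         up == DENOISE_FILTER_NONZERO_MV ||
         left == DENOISE_FILTER_NONZERO_MV;
}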
Dmitry Kovalev
bcfbd2f948 Replacing RC_MODE with vpx_rc_mode.
Both enums are identical.
Change-Id: I06653f9c90a2d3a2dd5c741e75b17ee7d066a56f
2014-06-13 12:22:35 -07:00
Tom Finegan
1557d4b0a2 test/tools_common.sh: Log all shared variables in verbose mode.
Several variables were not being logged.

Change-Id: I2f2ded19470a73c4551bbb1abbd6e4d27bd59fac
2014-06-13 11:18:43 -07:00
Jingning Han
1ba1871786 Fix out of boundary memory read in fuzz test on vpxdec
This commit fixes frame header decoding for the superframe index, to
prevent an out-of-bounds memory read triggered by a fuzz test
vector. It resolves a Chromium security violation issue,
crbug.com/376802.

The issue was introduced in the change:

Add VPXD_SET_DECRYPTOR support to the VP9 decoder.
cl-id I88f86c8ff9af34e0b6531028b691921b54c2fc48

where the buffer was read before the validation check on the index
offset was applied.

A test vector is added accordingly.

Change-Id: I41c988e776bbdd1033312a668e03a3dbcf44ca99
2014-06-13 11:10:36 -07:00
Tom Finegan
10db0f9f31 iosbuild.sh: Move to build/make and tidy up a bit.
Change-Id: I625beea9db4b5f6db8f177c580a3adeac760a662
2014-06-13 11:03:56 -07:00
Paul Wilkins
af8d4054d6 Revert "skip unnecessary motion search in the first pass"
This patch appears to have introduced non-determinism and/or a
mismatch between debug and release builds.

This reverts commit 5daef90efc.

Change-Id: I80081e55cfeaaa821b510b58a4e6e6328003c7da
2014-06-13 18:53:36 +01:00
hkuang
e4c5f7e2b6 Delay decreasing reference count in frame-parallel decoding.
The current decoding scheme decreases the reference count
of the output frame when decoding finishes. The application
can then copy the frame from the decoder buffer to the application
buffer. In frame-parallel decoding, a decoded frame will not be
output until several frames later, depending on the number of
threads. So the decoded frame's reference count should be decreased
only after the application finishes copying the frame out. But due
to the limitation of vpx_codec_get_frame, the decoder cannot know
when the application has finished. So use an index, last_show_frame,
to release the last output frame's reference count.

Change-Id: I403ee0d01148ac1182e5a2d87cf7dcc302b51e63
2014-06-13 10:53:33 -07:00
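A sketch of the last_show_frame idea; the structure and names are illustrative, not libvpx's actual decoder state:

/* Hold the previously output buffer's reference until the next
 * get-frame call, when the application is known to be done with it. */
typedef struct {
  int ref_count[16];    /* per-buffer reference counts */
  int last_show_frame;  /* previously output buffer index, -1 if none */
} DecoderState;

static void output_frame(DecoderState *s, int new_idx) {
  if (s->last_show_frame >= 0)
    --s->ref_count[s->last_show_frame];  /* app has copied the old frame */
  s->last_show_frame = new_idx;          /* keep a ref on the new one */
}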
Johann
39e28f9f1a Merge "Use lrand48 on Android" 2014-06-13 10:51:49 -07:00
Tom Finegan
4416b448de Merge "Add VPX.framework built script." 2014-06-13 10:36:49 -07:00
Tim Kopp
123cd3a52c Merge "Added skeleton for VP9 denoiser" 2014-06-13 09:44:39 -07:00
Paul Wilkins
3082565b8d Merge "Cleaning up accumulate_frame_motion_stats()." 2014-06-13 02:27:03 -07:00
Tom Finegan
c41d3584fd Add VPX.framework built script.
Builds a framework for use on iOS targets.

Change-Id: I2e42d77d1d83e15dc7e84232255da26809ee31bb
2014-06-12 20:48:58 -07:00
Johann
79afb5eb41 Use lrand48 on Android
When building x86 assembly, use lrand48 instead of the
undocumented inlined _rand function.

Android now supports rand()
https://android-review.googlesource.com/97731
but only for new versions. Original workaround:
https://gerrit.chromium.org/gerrit/15744

Change-Id: I130566837d5bfc9e54187ebe9807350d1a7dab2a
2014-06-12 19:57:25 -07:00
Dmitry Kovalev
7336903545 Merge "Adding MV_SPEED_FEATURES struct." 2014-06-12 17:15:33 -07:00
Tim Kopp
ab8bfb077b Added skeleton for VP9 denoiser
Change-Id: Iccf6ede4c4f85646b0f8daec47050ce93e267c90
2014-06-12 15:12:22 -07:00
Tim Kopp
c49fda2615 Merge "Added OUTPUT_YUV_DENOISED CFLAG to VP8 encoder" 2014-06-12 15:10:53 -07:00
hkuang
c32a3b8e25 Merge "Initially add frame_parallel_decode flag." 2014-06-12 15:01:38 -07:00
Dmitry Kovalev
48f0935b81 Merge "Removing unused ssim_weighted_pred_err field from FIRSTPASS_STATS." 2014-06-12 14:16:18 -07:00
Dmitry Kovalev
4ff1a614f1 Adding MV_SPEED_FEATURES struct.
Moving all motion vector related speed parameters from SPEED_FEATURES to
MV_SPEED_FEATURES.

Change-Id: I3e9af0039c7162f8671878c5920bce3cb256a84e
2014-06-12 14:15:27 -07:00
Dmitry Kovalev
c90cd4d572 Merge "Moving full_pixel_search() to vp9_mcomp.c." 2014-06-12 14:12:45 -07:00
Dmitry Kovalev
ab449cd9ba Merge "Adding is_altref_enabled() function." 2014-06-12 13:24:42 -07:00
Dmitry Kovalev
f80a346e0e Merge "Replacing txfm_size with tx_size." 2014-06-12 13:07:11 -07:00
Dmitry Kovalev
442cbf565d Moving full_pixel_search() to vp9_mcomp.c.
Change-Id: I12389f801ebd3bd2ae3bf31e125433bfb429ee65
2014-06-12 13:06:37 -07:00
Dmitry Kovalev
86583b2bec Adding is_altref_enabled() function.
Change-Id: I54cdb4ce11590511e6f86bc2fd55771f1c18a20a
2014-06-12 12:13:20 -07:00
Jingning Han
d5ae43318e Merge "Fast computation path for forward transform and quantization" 2014-06-12 11:59:52 -07:00
Dmitry Kovalev
4345d12d28 Replacing txfm_size with tx_size.
Change-Id: Ifa6374e9db5919322733b656e0865f5f19ee6f2c
2014-06-12 11:57:26 -07:00
Dmitry Kovalev
eaeda536a4 Removing unused ssim_weighted_pred_err field from FIRSTPASS_STATS.
Change-Id: Ia8c7e3905ac21732cb6b8099eaf8df72c7e36b73
2014-06-12 11:28:54 -07:00
Jingning Han
ccba289f8d Fast computation path for forward transform and quantization
This commit enables a fast computational path for the forward
transform. It checks the sse and variance of the prediction
residuals and decides if the quantized coefficients are all
zero, dc only, or more. It then selects the corresponding coding
path in the forward transformation and quantization stage.

It is currently enabled in the rtc coding mode. Will do it for the
rd coding mode next.

In speed -6, the runtime for pedestrian_area 1080p at 1000 kbps
goes down from 14234 ms to 13704 ms, i.e., about 4% speed-up.
Overall coding performance for rtc set is changed by -0.18%.

Change-Id: I0452da1786d59bc8bcbe0a35fdae9f623d1d44e1
2014-06-12 11:10:54 -07:00
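In sketch form the classification looks like this; the inputs and thresholds are placeholders rather than the encoder's actual criteria:

#include <stdint.h>

typedef enum { TX_PATH_SKIP, TX_PATH_DC_ONLY, TX_PATH_FULL } TxPath;

/* 'sse' is the residual energy, 'var' the residual variance (sse minus
 * the DC contribution), 'dead_zone' a quantizer-derived threshold. */
static TxPath choose_tx_path(int64_t sse, int64_t var, int64_t dead_zone) {
  if (sse < dead_zone) return TX_PATH_SKIP;     /* all coeffs quantize to 0 */
  if (var < dead_zone) return TX_PATH_DC_ONLY;  /* AC energy negligible */
  return TX_PATH_FULL;
}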
Alex Converse
893433be31 Merge "Fix SEG_LVL_SKIP in non-RD inter mode selection." 2014-06-12 10:38:06 -07:00
Alex Converse
130d9ade25 Merge "Fix SEG_LVL_SKIP in RD inter mode selection." 2014-06-12 10:37:20 -07:00
Yunqing Wang
f9d1e66f6a Merge "skip unnecessary motion search in the first pass" 2014-06-12 09:43:47 -07:00
Pengchong Jin
5daef90efc skip unnecessary motion search in the first pass
This patch allows the encoder to skip the
unnecessary motion search in the first pass. It
calculates the error of the zero motion vector using
the last source frame as the reference and skips
further motion search in the first pass if the error
is small.

The first-pass encoding speedup for slideshow
videos is over 30%. A Borg test shows the overall PSNR
performance remains approximately the same (derf -0.009,
hd 0.387, yt 0.021, stdhd 0.065). Individual clips may
have either a PSNR gain or loss. The worst PSNR performance
is from the yt set, with a PSNR loss of -1.1.

Change-Id: I08b2ab110b695e4689573b2567fa531b6457616e
2014-06-12 08:55:52 -07:00
Minghai Shang
686b54adcb Merge "[spatial svc]Combine first and second pass test to keep stats data in memory." 2014-06-11 18:12:38 -07:00
Alex Converse
6c3f311ba2 Fix SEG_LVL_SKIP in non-RD inter mode selection.
Add a set_mode_info_seg_skip function that fills the requisite mode info.

Change-Id: I460b1b6845d720d9b09ed5b64df0ea0aac443f62
2014-06-11 17:53:26 -07:00
Alex Converse
b0a8057f67 Fix SEG_LVL_SKIP in RD inter mode selection.
* Only use ZEROMV, disallowing the intra modes that were previously
  tested.
* Score rate and distortion as zero.

Change-Id: Ifcf99e272095725f11da1dcd26bd0f850683e680
2014-06-11 17:52:15 -07:00
hkuang
537cb06036 Initially add frame_parallel_decode flag.
Stub flag temporarily set to 0 until the frame-parallel
decoding implementation is finished.

Change-Id: I8ab768138e8f8f8eb809875703b2502ea0fe7cea
2014-06-11 17:29:29 -07:00
Tom Finegan
5d35bc686b Merge "Add target armv7s-darwin-gcc." 2014-06-11 12:13:11 -07:00
Alex Converse
e26adb8ab9 Add non420 support to vp9_extend_frame_borders.
Fixes an encoder/decoder mismatch problem.

Change-Id: I573b3a2b7ba2171a1a380ff201b082b084e7ade1
2014-06-11 12:02:59 -07:00
Minghai Shang
6b74776635 [spatial svc]Combine first and second pass test to keep stats data in memory.
Change-Id: Idccbfe35bebe6f05655bd54da7d8b616b1bffe03
2014-06-11 10:44:58 -07:00
Tom Finegan
cd2088b44f Add target armv7s-darwin-gcc.
Really just armv7. This is a convenience target intended to make iOS
development with libvpx easier. Xcode projects with default settings
will fail to build when a framework lacks armv7s support when
targeting iOS 7.

Change-Id: I7eb80d52eec25501febc0d2c3c0b4ed964b8ed5b
2014-06-10 18:52:58 -07:00
Dmitry Kovalev
e6fadb5ba8 Merge "Cleaning up vp9_variance_mmx.c." 2014-06-10 17:27:12 -07:00
Dmitry Kovalev
4a8103d6c2 Merge "Removing two unused TX_SIZE_SEARCH_METHOD members." 2014-06-10 17:26:41 -07:00
James Zern
9f3a0dbb5e vp9_rtcd: correct avx2 references
s/"\$avx2_x86inc"/"avx2"/

avx2 code is all intrinsics and as a result doesn't rely on x86inc.asm

Change-Id: I76ad39474d8a00658f3e43131830ef0f4f34772a
2014-06-10 16:26:36 -07:00
James Zern
cbce09ce62 Merge changes I6abc0657,I8224fba2,I04f64a45,I5d49d119,I76b4d171,I88c11ac3
* changes:
  vp9_sub_pixel_*variance*: disable avx2 variants
  vp9_sad*x4d: disable avx2 variants
  vp9_f(dct|ht): disable avx2 variants
  convolve: disable avx2 variants
  fdct8x8_test: add missing avx2 functions
  dct4x4_test: add missing avx2 functions
2014-06-10 16:14:45 -07:00
James Zern
520cb3f39f vp9_sub_pixel_*variance*: disable avx2 variants
tests failing under Win32/Win64

+ variance_test: add missing avx2 functions (partially disabled)

Change-Id: I6abc0657ea076379ab9ca65c12678b9ea199849d
2014-06-10 16:11:15 -07:00
James Zern
d3ff009d84 vp9_sad*x4d: disable avx2 variants
tests failing under Win32/Win64

+ sad_test: add missing avx2 functions (disabled)

Change-Id: I8224fba2b270f6039ab1877d71e1e512f0081856
2014-06-10 16:10:12 -07:00
James Zern
3659fbd38c Merge "Makefile: skip .d inclusion for 'testdata' target" 2014-06-10 15:58:59 -07:00
Dmitry Kovalev
bc93f425d0 Removing two unused TX_SIZE_SEARCH_METHOD members.
Change-Id: I33a38bb9f46e7ef509bbbf0cfd7bc3ea5072d022
2014-06-10 11:08:30 -07:00
James Zern
dd9f502933 vp9_f(dct|ht): disable avx2 variants
tests failing under Win32/Win64

+ dct16x16_test: add missing avx2 functions (partially disabled)

exercises the forward transforms
no idct/iht implementations, so the c-code is used

Change-Id: I04f64a457fa0828a00f32b5c9fe4f55294f21f61
2014-06-09 18:48:11 -07:00
James Zern
5704578f5f convolve: disable avx2 variants
tests failing under Win32/Win64

Change-Id: I5d49d11911bcda3a832b14efe5500d22597bedcf
2014-06-09 18:42:03 -07:00
James Zern
0d6267ca39 fdct8x8_test: add missing avx2 functions
exercises the forward transforms
no idct/iht implementations, so the c-code is used

Change-Id: I76b4d1712f10225c1ffa5ffb0ed9a551e68b93b4
2014-06-09 18:42:03 -07:00
James Zern
b8395a87c2 dct4x4_test: add missing avx2 functions
exercises the forward transforms
no idct/iht implementations, so the c-code is used

Change-Id: I88c11ac37ac6456de9b4b3ec5da059faf4ad6066
2014-06-09 18:42:03 -07:00
James Zern
4f83315d18 Makefile: skip .d inclusion for 'testdata' target
avoids some unnecessary work when pulling testdata in a freshly
configured tree

Change-Id: Ib43379b0e1549107f3f1b227afca49a899bcc3ab
2014-06-07 20:24:19 -07:00
Tim Kopp
05e8c61903 Added OUTPUT_YUV_DENOISED CFLAG to VP8 encoder
When this compiler flag is enabled, the encoder will write a denoised,
uncompressed version of the input to denoised.yuv.
Change-Id: Ie0247f76b23219d95fe97dd70f23e097d742c249
2014-06-05 11:18:32 -07:00
Dmitry Kovalev
6cf3d68fe5 Cleaning up accumulate_frame_motion_stats().
Change-Id: I9986f3fd23c5e0677068af768eae0def3db9782f
2014-06-03 10:36:29 -07:00
Dmitry Kovalev
ac3d97f124 Cleaning up vp9_variance_mmx.c.
Change-Id: I42d83f91e272c92daed604c233f74439fe6307c5
2014-05-28 12:03:55 -07:00
138 changed files with 12466 additions and 2915 deletions

README (3 lines added)

@@ -62,6 +62,7 @@ COMPILING THE APPLICATIONS/LIBRARIES:
armv7-none-rvct
armv7-win32-vs11
armv7-win32-vs12
armv7s-darwin-gcc
mips32-linux-gcc
ppc32-darwin8-gcc
ppc32-darwin9-gcc
@@ -79,6 +80,7 @@ COMPILING THE APPLICATIONS/LIBRARIES:
x86-darwin11-gcc
x86-darwin12-gcc
x86-darwin13-gcc
x86-iphonesimulator-gcc
x86-linux-gcc
x86-linux-icc
x86-os2-gcc
@@ -95,6 +97,7 @@ COMPILING THE APPLICATIONS/LIBRARIES:
x86_64-darwin11-gcc
x86_64-darwin12-gcc
x86_64-darwin13-gcc
x86_64-iphonesimulator-gcc
x86_64-linux-gcc
x86_64-linux-icc
x86_64-solaris-gcc


@@ -330,7 +330,10 @@ endef
ifneq ($(target),)
include $(SRC_PATH_BARE)/$(target:-$(TOOLCHAIN)=).mk
endif
ifeq ($(filter %clean,$(MAKECMDGOALS)),)
skip_deps := $(filter %clean,$(MAKECMDGOALS))
skip_deps += $(findstring testdata,$(MAKECMDGOALS))
ifeq ($(strip $(skip_deps)),)
# Older versions of make don't like -include directives with no arguments
ifneq ($(filter %.d,$(OBJS-yes:.o=.d)),)
-include $(filter %.d,$(OBJS-yes:.o=.d))


@@ -802,7 +802,7 @@ process_common_toolchain() {
armv8)
soft_enable neon
;;
armv7)
armv7|armv7s)
soft_enable neon
soft_enable neon_asm
soft_enable media
@@ -831,7 +831,7 @@ process_common_toolchain() {
arch_int=${arch_int%%te}
check_add_asflags --defsym ARCHITECTURE=${arch_int}
tune_cflags="-mtune="
if [ ${tgt_isa} = "armv7" ]; then
if [ ${tgt_isa} = "armv7" ] || [ ${tgt_isa} = "armv7s" ]; then
if [ -z "${float_abi}" ]; then
check_cpp <<EOF && float_abi=hard || float_abi=softfp
#ifndef __ARM_PCS_VFP


@@ -137,7 +137,9 @@ for opt in "$@"; do
;;
--lib) proj_kind="lib"
;;
--src-path-bare=*) src_path_bare=$(fix_path "$optval")
--src-path-bare=*)
src_path_bare=$(fix_path "$optval")
src_path_bare=${src_path_bare%/}
;;
--static-crt) use_static_runtime=true
;;
@@ -151,9 +153,9 @@ for opt in "$@"; do
esac
;;
-I*)
opt="${opt%/}"
opt=${opt##-I}
opt=$(fix_path "$opt")
opt="${opt%/}"
incs="${incs}${incs:+;}&quot;${opt}&quot;"
yasmincs="${yasmincs} -I&quot;${opt}&quot;"
;;
@@ -414,7 +416,7 @@ generate_vcproj() {
vpx)
tag Tool \
Name="VCPreBuildEventTool" \
CommandLine="call obj_int_extract.bat $src_path_bare $plat_no_ws\\\$(ConfigurationName)" \
CommandLine="call obj_int_extract.bat &quot;$src_path_bare&quot; $plat_no_ws\\\$(ConfigurationName)" \
tag Tool \
Name="VCCLCompilerTool" \


@@ -157,7 +157,9 @@ for opt in "$@"; do
;;
--lib) proj_kind="lib"
;;
--src-path-bare=*) src_path_bare=$(fix_path "$optval")
--src-path-bare=*)
src_path_bare=$(fix_path "$optval")
src_path_bare=${src_path_bare%/}
;;
--static-crt) use_static_runtime=true
;;
@@ -173,9 +175,9 @@ for opt in "$@"; do
esac
;;
-I*)
opt="${opt%/}"
opt=${opt##-I}
opt=$(fix_path "$opt")
opt="${opt%/}"
incs="${incs}${incs:+;}&quot;${opt}&quot;"
yasmincs="${yasmincs} -I&quot;${opt}&quot;"
;;

build/make/iosbuild.sh (new executable file, 244 lines)

@@ -0,0 +1,244 @@
#!/bin/sh
##
## Copyright (c) 2014 The WebM project authors. All Rights Reserved.
##
## Use of this source code is governed by a BSD-style license
## that can be found in the LICENSE file in the root of the source
## tree. An additional intellectual property rights grant can be found
## in the file PATENTS. All contributing project authors may
## be found in the AUTHORS file in the root of the source tree.
##
##
## This script generates 'VPX.framework'. An iOS app can encode and decode VPx
## video by including 'VPX.framework'.
##
## Run iosbuild.sh to create 'VPX.framework' in the current directory.
##
set -e
devnull='> /dev/null 2>&1'
BUILD_ROOT="_iosbuild"
DIST_DIR="_dist"
FRAMEWORK_DIR="VPX.framework"
HEADER_DIR="${FRAMEWORK_DIR}/Headers/vpx"
MAKE_JOBS=1
LIBVPX_SOURCE_DIR=$(dirname "$0" | sed -e s,/build/make,,)
LIPO=$(xcrun -sdk iphoneos${SDK} -find lipo)
ORIG_PWD="$(pwd)"
TARGETS="armv6-darwin-gcc
armv7-darwin-gcc
armv7s-darwin-gcc
x86-iphonesimulator-gcc
x86_64-iphonesimulator-gcc"
# Configures for the target specified by $1, and invokes make with the dist
# target using $DIST_DIR as the distribution output directory.
build_target() {
local target="$1"
local old_pwd="$(pwd)"
vlog "***Building target: ${target}***"
mkdir "${target}"
cd "${target}"
eval "../../${LIBVPX_SOURCE_DIR}/configure" --target="${target}" \
--disable-docs ${devnull}
export DIST_DIR
eval make -j ${MAKE_JOBS} dist ${devnull}
cd "${old_pwd}"
vlog "***Done building target: ${target}***"
}
# Returns the preprocessor symbol for the target specified by $1.
target_to_preproc_symbol() {
target="$1"
case "${target}" in
armv6-*)
echo "__ARM_ARCH_6__"
;;
armv7-*)
echo "__ARM_ARCH_7__"
;;
armv7s-*)
echo "__ARM_ARCH_7S__"
;;
x86-*)
echo "__i386__"
;;
x86_64-*)
echo "__x86_64__"
;;
*)
echo "#error ${target} unknown/unsupported"
return 1
;;
esac
}
# Create a vpx_config.h shim that, based on preprocessor settings for the
# current target CPU, includes the real vpx_config.h for the current target.
# $1 is the list of targets.
create_vpx_framework_config_shim() {
local targets="$1"
local config_file="${HEADER_DIR}/vpx_config.h"
local preproc_symbol=""
local target=""
local include_guard="VPX_FRAMEWORK_HEADERS_VPX_VPX_CONFIG_H_"
local file_header="/*
* Copyright (c) $(date +%Y) The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
/* GENERATED FILE: DO NOT EDIT! */
#ifndef ${include_guard}
#define ${include_guard}
#if defined"
printf "%s" "${file_header}" > "${config_file}"
for target in ${targets}; do
preproc_symbol=$(target_to_preproc_symbol "${target}")
printf " ${preproc_symbol}\n" >> "${config_file}"
printf "#include \"VPX/vpx/${target}/vpx_config.h\"\n" >> "${config_file}"
printf "#elif defined" >> "${config_file}"
mkdir "${HEADER_DIR}/${target}"
cp -p "${BUILD_ROOT}/${target}/vpx_config.h" "${HEADER_DIR}/${target}"
done
# Consume the last line of output from the loop: We don't want it.
sed -i '' -e '$d' "${config_file}"
printf "#endif\n\n" >> "${config_file}"
printf "#endif // ${include_guard}" >> "${config_file}"
}
# Configures and builds each target specified by $1, and then builds
# VPX.framework.
build_framework() {
local lib_list=""
local targets="$1"
local target=""
local target_dist_dir=""
# Clean up from previous build(s).
rm -rf "${BUILD_ROOT}" "${FRAMEWORK_DIR}"
# Create output dirs.
mkdir -p "${BUILD_ROOT}"
mkdir -p "${HEADER_DIR}"
cd "${BUILD_ROOT}"
for target in ${targets}; do
build_target "${target}"
target_dist_dir="${BUILD_ROOT}/${target}/${DIST_DIR}"
lib_list="${lib_list} ${target_dist_dir}/lib/libvpx.a"
done
cd "${ORIG_PWD}"
# The basic libvpx API includes are all the same; just grab the most recent
# set.
cp -p "${target_dist_dir}"/include/vpx/* "${HEADER_DIR}"
# Build the fat library.
${LIPO} -create ${lib_list} -output ${FRAMEWORK_DIR}/VPX
# Create the vpx_config.h shim that allows usage of vpx_config.h from
# within VPX.framework.
create_vpx_framework_config_shim "${targets}"
# Copy in vpx_version.h.
cp -p "${BUILD_ROOT}/${target}/vpx_version.h" "${HEADER_DIR}"
vlog "Created fat library ${FRAMEWORK_DIR}/VPX containing:"
for lib in ${lib_list}; do
vlog " $(echo ${lib} | awk -F / '{print $2, $NF}')"
done
# TODO(tomfinegan): Verify that expected targets are included within
# VPX.framework/VPX via lipo -info.
}
# Trap function. Cleans up the subtree used to build all targets contained in
# $TARGETS.
cleanup() {
cd "${ORIG_PWD}"
if [ "${PRESERVE_BUILD_OUTPUT}" != "yes" ]; then
rm -rf "${BUILD_ROOT}"
fi
}
iosbuild_usage() {
cat << EOF
Usage: ${0##*/} [arguments]
--help: Display this message and exit.
--jobs: Number of make jobs.
--preserve-build-output: Do not delete the build directory.
--show-build-output: Show output from each library build.
--verbose: Output information about the environment and each stage of the
build.
EOF
}
vlog() {
if [ "${VERBOSE}" = "yes" ]; then
echo "$@"
fi
}
trap cleanup EXIT
# Parse the command line.
while [ -n "$1" ]; do
case "$1" in
--help)
iosbuild_usage
exit
;;
--jobs)
MAKE_JOBS="$2"
shift
;;
--preserve-build-output)
PRESERVE_BUILD_OUTPUT=yes
;;
--show-build-output)
devnull=
;;
--verbose)
VERBOSE=yes
;;
*)
iosbuild_usage
exit 1
;;
esac
shift
done
if [ "${VERBOSE}" = "yes" ]; then
cat << EOF
BUILD_ROOT=${BUILD_ROOT}
DIST_DIR=${DIST_DIR}
FRAMEWORK_DIR=${FRAMEWORK_DIR}
HEADER_DIR=${HEADER_DIR}
MAKE_JOBS=${MAKE_JOBS}
PRESERVE_BUILD_OUTPUT=${PRESERVE_BUILD_OUTPUT}
LIBVPX_SOURCE_DIR=${LIBVPX_SOURCE_DIR}
LIPO=${LIPO}
ORIG_PWD=${ORIG_PWD}
TARGETS="${TARGETS}"
EOF
fi
build_framework "${TARGETS}"

configure (vendored, 10 lines changed)

@@ -103,6 +103,7 @@ all_platforms="${all_platforms} armv7-linux-gcc" #neon Cortex-A8
all_platforms="${all_platforms} armv7-none-rvct" #neon Cortex-A8
all_platforms="${all_platforms} armv7-win32-vs11"
all_platforms="${all_platforms} armv7-win32-vs12"
all_platforms="${all_platforms} armv7s-darwin-gcc"
all_platforms="${all_platforms} mips32-linux-gcc"
all_platforms="${all_platforms} ppc32-darwin8-gcc"
all_platforms="${all_platforms} ppc32-darwin9-gcc"
@@ -271,7 +272,14 @@ EXPERIMENT_LIST="
alpha
multiple_arf
spatial_svc
transcode
denoising
masked_interinter
interintra
masked_interintra
filterintra
ext_tx
supertx
copy_coding
"
CONFIG_LIST="
external_build


@@ -296,6 +296,7 @@ int main(int argc, const char **argv) {
int frame_duration = 1; /* 1 timebase tick per frame */
FILE *infile = NULL;
int end_of_stream = 0;
int frame_size;
memset(&svc_ctx, 0, sizeof(svc_ctx));
svc_ctx.log_print = 1;
@@ -351,11 +352,10 @@ int main(int argc, const char **argv) {
die_codec(&codec, "Failed to encode frame");
}
if (!(app_input.passes == 2 && app_input.pass == 1)) {
if (vpx_svc_get_frame_size(&svc_ctx) > 0) {
while ((frame_size = vpx_svc_get_frame_size(&svc_ctx)) > 0) {
vpx_video_writer_write_frame(writer,
vpx_svc_get_buffer(&svc_ctx),
vpx_svc_get_frame_size(&svc_ctx),
pts);
frame_size, pts);
}
}
if (vpx_svc_get_rc_stats_buffer_size(&svc_ctx) > 0) {


@@ -645,6 +645,26 @@ INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest, ::testing::Values(
#endif
#if HAVE_AVX2
// TODO(jzern): these prototypes can be removed after the avx2 versions are
// reenabled in vp9_rtcd_defs.pl.
extern "C" {
void vp9_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h);
void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h);
void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h);
}
const ConvolveFunctions convolve8_avx2(
vp9_convolve8_horiz_avx2, vp9_convolve8_avg_horiz_ssse3,
vp9_convolve8_vert_avx2, vp9_convolve8_avg_vert_ssse3,
@@ -655,8 +675,10 @@ INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest, ::testing::Values(
make_tuple(8, 4, &convolve8_avx2),
make_tuple(4, 8, &convolve8_avx2),
make_tuple(8, 8, &convolve8_avx2),
make_tuple(8, 16, &convolve8_avx2)));
INSTANTIATE_TEST_CASE_P(DISABLED_AVX2, ConvolveTest, ::testing::Values(
make_tuple(16, 8, &convolve8_avx2),
make_tuple(8, 16, &convolve8_avx2),
make_tuple(16, 16, &convolve8_avx2),
make_tuple(32, 16, &convolve8_avx2),
make_tuple(16, 32, &convolve8_avx2),


@@ -606,4 +606,29 @@ INSTANTIATE_TEST_CASE_P(
::testing::Values(
make_tuple(&vp9_fdct16x16_c, &vp9_idct16x16_256_add_ssse3, 0)));
#endif
#if HAVE_AVX2
// TODO(jzern): these prototypes can be removed after the avx2 versions are
// reenabled in vp9_rtcd_defs.pl.
extern "C" {
void vp9_fdct16x16_avx2(const int16_t *input, int16_t *output, int stride);
void vp9_fht16x16_avx2(const int16_t *input, int16_t *output, int stride,
int tx_type);
}
INSTANTIATE_TEST_CASE_P(
DISABLED_AVX2, Trans16x16DCT,
::testing::Values(
make_tuple(&vp9_fdct16x16_avx2,
&vp9_idct16x16_256_add_c, 0)));
INSTANTIATE_TEST_CASE_P(
AVX2, Trans16x16HT,
::testing::Values(
make_tuple(&vp9_fht16x16_avx2, &vp9_iht16x16_256_add_c, 3)));
INSTANTIATE_TEST_CASE_P(
DISABLED_AVX2, Trans16x16HT,
::testing::Values(
make_tuple(&vp9_fht16x16_avx2, &vp9_iht16x16_256_add_c, 0),
make_tuple(&vp9_fht16x16_avx2, &vp9_iht16x16_256_add_c, 1),
make_tuple(&vp9_fht16x16_avx2, &vp9_iht16x16_256_add_c, 2)));
#endif
} // namespace


@@ -15,13 +15,27 @@
namespace libvpx_test {
const char kVP8Name[] = "WebM Project VP8";
vpx_codec_err_t Decoder::PeekStream(const uint8_t *cxdata, size_t size,
vpx_codec_stream_info_t *stream_info) {
return vpx_codec_peek_stream_info(CodecInterface(),
cxdata, static_cast<unsigned int>(size),
stream_info);
}
vpx_codec_err_t Decoder::DecodeFrame(const uint8_t *cxdata, size_t size) {
return DecodeFrame(cxdata, size, NULL);
}
vpx_codec_err_t Decoder::DecodeFrame(const uint8_t *cxdata, size_t size,
void *user_priv) {
vpx_codec_err_t res_dec;
InitOnce();
REGISTER_STATE_CHECK(
res_dec = vpx_codec_decode(&decoder_,
cxdata, static_cast<unsigned int>(size),
NULL, 0));
user_priv, 0));
return res_dec;
}
@@ -29,13 +43,37 @@ void DecoderTest::RunLoop(CompressedVideoSource *video) {
vpx_codec_dec_cfg_t dec_cfg = {0};
Decoder* const decoder = codec_->CreateDecoder(dec_cfg, 0);
ASSERT_TRUE(decoder != NULL);
const char *codec_name = decoder->GetDecoderName();
const bool is_vp8 = strncmp(kVP8Name, codec_name, sizeof(kVP8Name) - 1) == 0;
// Decode frames.
for (video->Begin(); video->cxdata(); video->Next()) {
for (video->Begin(); !::testing::Test::HasFailure() && video->cxdata();
video->Next()) {
PreDecodeFrameHook(*video, decoder);
vpx_codec_stream_info_t stream_info;
stream_info.sz = sizeof(stream_info);
const vpx_codec_err_t res_peek = decoder->PeekStream(video->cxdata(),
video->frame_size(),
&stream_info);
if (is_vp8) {
/* Vp8's implementation of PeekStream returns an error if the frame you
* pass it is not a keyframe, so we only expect VPX_CODEC_OK on the first
* frame, which must be a keyframe. */
if (video->frame_number() == 0)
ASSERT_EQ(VPX_CODEC_OK, res_peek) << "Peek return failed: "
<< vpx_codec_err_to_string(res_peek);
} else {
/* The Vp9 implementation of PeekStream returns an error only if the
* data passed to it isn't a valid Vp9 chunk. */
ASSERT_EQ(VPX_CODEC_OK, res_peek) << "Peek return failed: "
<< vpx_codec_err_to_string(res_peek);
}
vpx_codec_err_t res_dec = decoder->DecodeFrame(video->cxdata(),
video->frame_size());
ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError();
if (!HandleDecodeResult(res_dec, *video, decoder))
break;
DxDataIterator dec_iter = decoder->GetDxData();
const vpx_image_t *img = NULL;


@@ -49,8 +49,14 @@ class Decoder {
vpx_codec_destroy(&decoder_);
}
vpx_codec_err_t PeekStream(const uint8_t *cxdata, size_t size,
vpx_codec_stream_info_t *stream_info);
vpx_codec_err_t DecodeFrame(const uint8_t *cxdata, size_t size);
vpx_codec_err_t DecodeFrame(const uint8_t *cxdata, size_t size,
void *user_priv);
DxDataIterator GetDxData() {
return DxDataIterator(&decoder_);
}
@@ -85,6 +91,10 @@ class Decoder {
&decoder_, cb_get, cb_release, user_priv);
}
const char* GetDecoderName() {
return vpx_codec_iface_name(CodecInterface());
}
protected:
virtual vpx_codec_iface_t* CodecInterface() const = 0;
@@ -114,6 +124,14 @@ class DecoderTest {
virtual void PreDecodeFrameHook(const CompressedVideoSource& video,
Decoder *decoder) {}
// Hook to be called to handle decode result. Return true to continue.
virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec,
const CompressedVideoSource& /* video */,
Decoder *decoder) {
EXPECT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError();
return VPX_CODEC_OK == res_dec;
}
// Hook to be called on every decompressed frame.
virtual void DecompressedFrameHook(const vpx_image_t& img,
const unsigned int frame_number) {}


@@ -34,7 +34,10 @@ decode_to_md5() {
local expected_md5="$3"
local output_file="${VPX_TEST_OUTPUT_DIR}/decode_to_md5_${codec}"
[ -x "${decoder}" ] || return 1
if [ ! -x "${decoder}" ]; then
elog "${decoder} does not exist or is not executable."
return 1
fi
eval "${decoder}" "${input_file}" "${output_file}" ${devnull}


@@ -34,7 +34,10 @@ decode_with_drops() {
local output_file="${VPX_TEST_OUTPUT_DIR}/decode_with_drops_${codec}"
local drop_mode="$3"
[ -x "${decoder}" ] || return 1
if [ ! -x "${decoder}" ]; then
elog "${decoder} does not exist or is not executable."
return 1
fi
eval "${decoder}" "${input_file}" "${output_file}" "${drop_mode}" ${devnull}


@@ -376,4 +376,19 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 3)));
#endif
#if HAVE_AVX2
INSTANTIATE_TEST_CASE_P(
AVX2, Trans4x4DCT,
::testing::Values(
make_tuple(&vp9_fdct4x4_avx2,
&vp9_idct4x4_16_add_c, 0)));
INSTANTIATE_TEST_CASE_P(
AVX2, Trans4x4HT,
::testing::Values(
make_tuple(&vp9_fht4x4_avx2, &vp9_iht4x4_16_add_c, 0),
make_tuple(&vp9_fht4x4_avx2, &vp9_iht4x4_16_add_c, 1),
make_tuple(&vp9_fht4x4_avx2, &vp9_iht4x4_16_add_c, 2),
make_tuple(&vp9_fht4x4_avx2, &vp9_iht4x4_16_add_c, 3)));
#endif
} // namespace


@@ -367,4 +367,18 @@ INSTANTIATE_TEST_CASE_P(
::testing::Values(
make_tuple(&vp9_fdct8x8_ssse3, &vp9_idct8x8_64_add_ssse3, 0)));
#endif
#if HAVE_AVX2
INSTANTIATE_TEST_CASE_P(
AVX2, FwdTrans8x8DCT,
::testing::Values(
make_tuple(&vp9_fdct8x8_avx2, &vp9_idct8x8_64_add_c, 0)));
INSTANTIATE_TEST_CASE_P(
AVX2, FwdTrans8x8HT,
::testing::Values(
make_tuple(&vp9_fht8x8_avx2, &vp9_iht8x8_64_add_c, 0),
make_tuple(&vp9_fht8x8_avx2, &vp9_iht8x8_64_add_c, 1),
make_tuple(&vp9_fht8x8_avx2, &vp9_iht8x8_64_add_c, 2),
make_tuple(&vp9_fht8x8_avx2, &vp9_iht8x8_64_add_c, 3)));
#endif
} // namespace

test/invalid_file_test.cc (new file, 109 lines)

@@ -0,0 +1,109 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <cstdio>
#include <cstdlib>
#include <string>
#include <vector>
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "./vpx_config.h"
#include "test/codec_factory.h"
#include "test/decode_test_driver.h"
#include "test/ivf_video_source.h"
#include "test/util.h"
#if CONFIG_WEBM_IO
#include "test/webm_video_source.h"
#endif
#include "vpx_mem/vpx_mem.h"
namespace {
class InvalidFileTest
: public ::libvpx_test::DecoderTest,
public ::libvpx_test::CodecTestWithParam<const char*> {
protected:
InvalidFileTest() : DecoderTest(GET_PARAM(0)), res_file_(NULL) {}
virtual ~InvalidFileTest() {
if (res_file_ != NULL)
fclose(res_file_);
}
void OpenResFile(const std::string &res_file_name_) {
res_file_ = libvpx_test::OpenTestDataFile(res_file_name_);
ASSERT_TRUE(res_file_ != NULL) << "Result file open failed. Filename: "
<< res_file_name_;
}
virtual bool HandleDecodeResult(
const vpx_codec_err_t res_dec,
const libvpx_test::CompressedVideoSource &video,
libvpx_test::Decoder *decoder) {
EXPECT_TRUE(res_file_ != NULL);
int expected_res_dec;
// Read integer result.
const int res = fscanf(res_file_, "%d", &expected_res_dec);
EXPECT_NE(res, EOF) << "Read result data failed";
// Check results match.
EXPECT_EQ(expected_res_dec, res_dec)
<< "Results don't match: frame number = " << video.frame_number();
return !HasFailure();
}
private:
FILE *res_file_;
};
TEST_P(InvalidFileTest, ReturnCode) {
const std::string filename = GET_PARAM(1);
libvpx_test::CompressedVideoSource *video = NULL;
// Open compressed video file.
if (filename.substr(filename.length() - 3, 3) == "ivf") {
video = new libvpx_test::IVFVideoSource(filename);
} else if (filename.substr(filename.length() - 4, 4) == "webm") {
#if CONFIG_WEBM_IO
video = new libvpx_test::WebMVideoSource(filename);
#else
fprintf(stderr, "WebM IO is disabled, skipping test vector %s\n",
filename.c_str());
return;
#endif
}
video->Init();
// Construct result file name. The file holds a list of expected integer
// results, one for each decoded frame. Any result that doesn't match
// the files list will cause a test failure.
const std::string res_filename = filename + ".res";
OpenResFile(res_filename);
// Decode frame, and check the md5 matching.
ASSERT_NO_FATAL_FAILURE(RunLoop(video));
delete video;
}
const char *const kVP9InvalidFileTests[] = {
"invalid-vp90-01.webm",
"invalid-vp90-02.webm",
"invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.ivf",
};
#define NELEMENTS(x) static_cast<int>(sizeof(x) / sizeof(x[0]))
VP9_INSTANTIATE_TEST_CASE(InvalidFileTest,
::testing::ValuesIn(kVP9InvalidFileTests,
kVP9InvalidFileTests +
NELEMENTS(kVP9InvalidFileTests)));
} // namespace


@@ -32,7 +32,10 @@ postproc() {
local codec="$2"
local output_file="${VPX_TEST_OUTPUT_DIR}/postproc_${codec}.raw"
[ -x "${decoder}" ] || return 1
if [ ! -x "${decoder}" ]; then
elog "${decoder} does not exist or is not executable."
return 1
fi
eval "${decoder}" "${input_file}" "${output_file}" ${devnull}


@@ -33,7 +33,10 @@ resize_util() {
# resize_util is available only when CONFIG_SHARED is disabled.
if [ -z "$(vpx_config_option_enabled CONFIG_SHARED)" ]; then
[ -x "${resizer}" ] || return 1
if [ ! -x "${resizer}" ]; then
elog "${resizer} does not exist or is not executable."
return 1
fi
eval "${resizer}" "${YUV_RAW_INPUT}" \
"${YUV_RAW_INPUT_WIDTH}x${YUV_RAW_INPUT_HEIGHT}" \

View File

@@ -627,4 +627,24 @@ INSTANTIATE_TEST_CASE_P(SSE3, SADTest, ::testing::Values(
#endif // CONFIG_USE_X86INC
#endif // HAVE_SSSE3
#if HAVE_AVX2
#if CONFIG_VP9_ENCODER
// TODO(jzern): these prototypes can be removed after the avx2 versions are
// reenabled in vp9_rtcd_defs.pl.
extern "C" {
void vp9_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_ptr[], int ref_stride,
unsigned int *sad_array);
void vp9_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_ptr[], int ref_stride,
unsigned int *sad_array);
}
const sad_n_by_n_by_4_fn_t sad_64x64x4d_avx2 = vp9_sad64x64x4d_avx2;
const sad_n_by_n_by_4_fn_t sad_32x32x4d_avx2 = vp9_sad32x32x4d_avx2;
INSTANTIATE_TEST_CASE_P(DISABLED_AVX2, SADx4Test, ::testing::Values(
make_tuple(32, 32, sad_32x32x4d_avx2),
make_tuple(64, 64, sad_64x64x4d_avx2)));
#endif // CONFIG_VP9_ENCODER
#endif // HAVE_AVX2
} // namespace
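
Because the avx2 variants are commented out of vp9_rtcd_defs.pl, their prototypes are no longer emitted into the generated header — hence the local extern "C" declarations. Registering the cases under a DISABLED_ prefix keeps them compiled but skipped by default; GoogleTest only runs them when invoked with --gtest_also_run_disabled_tests, so the disabled AVX2 SAD code can still be exercised locally. The variance_test hunk later in this compare uses the same arrangement.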

View File

@@ -32,7 +32,10 @@ simple_decoder() {
local codec="$2"
local output_file="${VPX_TEST_OUTPUT_DIR}/simple_decoder_${codec}.raw"
[ -x "${decoder}" ] || return 1
if [ ! -x "${decoder}" ]; then
elog "${decoder} does not exist or is not executable."
return 1
fi
eval "${decoder}" "${input_file}" "${output_file}" ${devnull}

View File

@@ -29,7 +29,10 @@ simple_encoder() {
local codec="$1"
local output_file="${VPX_TEST_OUTPUT_DIR}/simple_encoder_${codec}.ivf"
[ -x "${encoder}" ] || return 1
if [ ! -x "${encoder}" ]; then
elog "${encoder} does not exist or is not executable."
return 1
fi
eval "${encoder}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \
"${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" 9999 \

View File

@@ -31,7 +31,6 @@ class SvcTest : public ::testing::Test {
SvcTest()
: codec_iface_(0),
test_file_name_("hantro_collage_w352h288.yuv"),
stats_file_name_("hantro_collage_w352h288.stat"),
codec_initialized_(false),
decoder_(0) {
memset(&svc_, 0, sizeof(svc_));
@@ -74,7 +73,6 @@ class SvcTest : public ::testing::Test {
struct vpx_codec_enc_cfg codec_enc_;
vpx_codec_iface_t *codec_iface_;
std::string test_file_name_;
std::string stats_file_name_;
bool codec_initialized_;
Decoder *decoder_;
};
@@ -267,9 +265,17 @@ TEST_F(SvcTest, FirstFrameHasLayers) {
video.duration(), VPX_DL_GOOD_QUALITY);
EXPECT_EQ(VPX_CODEC_OK, res);
if (vpx_svc_get_frame_size(&svc_) == 0) {
// Flush encoder
res = vpx_svc_encode(&svc_, &codec_, NULL, 0,
video.duration(), VPX_DL_GOOD_QUALITY);
EXPECT_EQ(VPX_CODEC_OK, res);
}
int frame_size = vpx_svc_get_frame_size(&svc_);
EXPECT_GT(frame_size, 0);
const vpx_codec_err_t res_dec = decoder_->DecodeFrame(
static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)),
vpx_svc_get_frame_size(&svc_));
static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
// this test fails with a decoder error
ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
@@ -279,6 +285,9 @@ TEST_F(SvcTest, EncodeThreeFrames) {
svc_.spatial_layers = 2;
vpx_svc_set_scale_factors(&svc_, "4/16,16/16");
vpx_svc_set_quantizers(&svc_, "40,30", 0);
int decoded_frames = 0;
vpx_codec_err_t res_dec;
int frame_size;
vpx_codec_err_t res =
vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
@@ -293,13 +302,14 @@ TEST_F(SvcTest, EncodeThreeFrames) {
// This frame is a keyframe.
res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
video.duration(), VPX_DL_GOOD_QUALITY);
ASSERT_EQ(VPX_CODEC_OK, res);
EXPECT_EQ(1, vpx_svc_is_keyframe(&svc_));
vpx_codec_err_t res_dec = decoder_->DecodeFrame(
static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)),
vpx_svc_get_frame_size(&svc_));
ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
if ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
res_dec = decoder_->DecodeFrame(
static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
++decoded_frames;
}
// FRAME 1
video.Next();
@@ -307,12 +317,14 @@ TEST_F(SvcTest, EncodeThreeFrames) {
res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
video.duration(), VPX_DL_GOOD_QUALITY);
ASSERT_EQ(VPX_CODEC_OK, res);
EXPECT_EQ(0, vpx_svc_is_keyframe(&svc_));
res_dec = decoder_->DecodeFrame(
static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)),
vpx_svc_get_frame_size(&svc_));
ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
if ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
res_dec = decoder_->DecodeFrame(
static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
++decoded_frames;
}
// FRAME 2
video.Next();
@@ -320,12 +332,29 @@ TEST_F(SvcTest, EncodeThreeFrames) {
res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
video.duration(), VPX_DL_GOOD_QUALITY);
ASSERT_EQ(VPX_CODEC_OK, res);
EXPECT_EQ(0, vpx_svc_is_keyframe(&svc_));
res_dec = decoder_->DecodeFrame(
static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)),
vpx_svc_get_frame_size(&svc_));
ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
if ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
res_dec = decoder_->DecodeFrame(
static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
++decoded_frames;
}
// Flush encoder
res = vpx_svc_encode(&svc_, &codec_, NULL, 0,
video.duration(), VPX_DL_GOOD_QUALITY);
EXPECT_EQ(VPX_CODEC_OK, res);
while ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
res_dec = decoder_->DecodeFrame(
static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
++decoded_frames;
}
EXPECT_EQ(decoded_frames, 3);
}
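
The pattern the three per-frame hunks converge on: encode, then decode only when vpx_svc_get_frame_size() reports buffered output; after the last input frame, call vpx_svc_encode() once more with a NULL image to flush, drain the remaining frames in a while loop, and finally assert on the total decoded-frame count. This stops the test from assuming one output frame per input frame, which no longer holds once the encoder is allowed to buffer frames.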
TEST_F(SvcTest, GetLayerResolution) {
@@ -364,7 +393,9 @@ TEST_F(SvcTest, GetLayerResolution) {
EXPECT_EQ(kHeight * 8 / 16, layer_height);
}
TEST_F(SvcTest, FirstPassEncode) {
TEST_F(SvcTest, TwoPassEncode) {
// First pass encode
std::string stats_buf;
svc_.spatial_layers = 2;
codec_enc_.g_pass = VPX_RC_FIRST_PASS;
vpx_svc_set_scale_factors(&svc_, "4/16,16/16");
@@ -383,62 +414,61 @@ TEST_F(SvcTest, FirstPassEncode) {
res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
video.duration(), VPX_DL_GOOD_QUALITY);
ASSERT_EQ(VPX_CODEC_OK, res);
EXPECT_GT(vpx_svc_get_rc_stats_buffer_size(&svc_), 0U);
size_t stats_size = vpx_svc_get_rc_stats_buffer_size(&svc_);
EXPECT_GT(stats_size, 0U);
const char *stats_data = vpx_svc_get_rc_stats_buffer(&svc_);
ASSERT_TRUE(stats_data != NULL);
stats_buf.append(stats_data, stats_size);
// FRAME 1
video.Next();
res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
video.duration(), VPX_DL_GOOD_QUALITY);
ASSERT_EQ(VPX_CODEC_OK, res);
EXPECT_GT(vpx_svc_get_rc_stats_buffer_size(&svc_), 0U);
stats_size = vpx_svc_get_rc_stats_buffer_size(&svc_);
EXPECT_GT(stats_size, 0U);
stats_data = vpx_svc_get_rc_stats_buffer(&svc_);
ASSERT_TRUE(stats_data != NULL);
stats_buf.append(stats_data, stats_size);
// Flush encoder and test EOS packet
res = vpx_svc_encode(&svc_, &codec_, NULL, video.pts(),
video.duration(), VPX_DL_GOOD_QUALITY);
ASSERT_EQ(VPX_CODEC_OK, res);
EXPECT_GT(vpx_svc_get_rc_stats_buffer_size(&svc_), 0U);
}
stats_size = vpx_svc_get_rc_stats_buffer_size(&svc_);
EXPECT_GT(stats_size, 0U);
stats_data = vpx_svc_get_rc_stats_buffer(&svc_);
ASSERT_TRUE(stats_data != NULL);
stats_buf.append(stats_data, stats_size);
TEST_F(SvcTest, SecondPassEncode) {
svc_.spatial_layers = 2;
// Tear down encoder
vpx_svc_release(&svc_);
vpx_codec_destroy(&codec_);
// Second pass encode
int decoded_frames = 0;
vpx_codec_err_t res_dec;
int frame_size;
codec_enc_.g_pass = VPX_RC_LAST_PASS;
codec_enc_.rc_twopass_stats_in.buf = &stats_buf[0];
codec_enc_.rc_twopass_stats_in.sz = stats_buf.size();
FILE *const stats_file = libvpx_test::OpenTestDataFile(stats_file_name_);
ASSERT_TRUE(stats_file != NULL) << "Stats file open failed. Filename: "
<< stats_file;
struct vpx_fixed_buf stats_buf;
fseek(stats_file, 0, SEEK_END);
stats_buf.sz = static_cast<size_t>(ftell(stats_file));
fseek(stats_file, 0, SEEK_SET);
stats_buf.buf = malloc(stats_buf.sz);
ASSERT_TRUE(stats_buf.buf != NULL);
const size_t bytes_read = fread(stats_buf.buf, 1, stats_buf.sz, stats_file);
ASSERT_EQ(bytes_read, stats_buf.sz);
fclose(stats_file);
codec_enc_.rc_twopass_stats_in = stats_buf;
vpx_codec_err_t res =
vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
ASSERT_EQ(VPX_CODEC_OK, res);
codec_initialized_ = true;
libvpx_test::I420VideoSource video(test_file_name_, kWidth, kHeight,
codec_enc_.g_timebase.den,
codec_enc_.g_timebase.num, 0, 30);
// FRAME 0
video.Begin();
// This frame is a keyframe.
res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
video.duration(), VPX_DL_GOOD_QUALITY);
ASSERT_EQ(VPX_CODEC_OK, res);
EXPECT_EQ(1, vpx_svc_is_keyframe(&svc_));
vpx_codec_err_t res_dec = decoder_->DecodeFrame(
static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)),
vpx_svc_get_frame_size(&svc_));
ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
if ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
res_dec = decoder_->DecodeFrame(
static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
++decoded_frames;
}
// FRAME 1
video.Next();
@@ -446,12 +476,14 @@ TEST_F(SvcTest, SecondPassEncode) {
res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
video.duration(), VPX_DL_GOOD_QUALITY);
ASSERT_EQ(VPX_CODEC_OK, res);
EXPECT_EQ(0, vpx_svc_is_keyframe(&svc_));
res_dec = decoder_->DecodeFrame(
static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)),
vpx_svc_get_frame_size(&svc_));
ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
if ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
res_dec = decoder_->DecodeFrame(
static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
++decoded_frames;
}
// FRAME 2
video.Next();
@@ -459,14 +491,29 @@ TEST_F(SvcTest, SecondPassEncode) {
res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
video.duration(), VPX_DL_GOOD_QUALITY);
ASSERT_EQ(VPX_CODEC_OK, res);
EXPECT_EQ(0, vpx_svc_is_keyframe(&svc_));
res_dec = decoder_->DecodeFrame(
static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)),
vpx_svc_get_frame_size(&svc_));
ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
if ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
res_dec = decoder_->DecodeFrame(
static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
++decoded_frames;
}
free(stats_buf.buf);
// Flush encoder
res = vpx_svc_encode(&svc_, &codec_, NULL, 0,
video.duration(), VPX_DL_GOOD_QUALITY);
EXPECT_EQ(VPX_CODEC_OK, res);
while ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
res_dec = decoder_->DecodeFrame(
static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
++decoded_frames;
}
EXPECT_EQ(decoded_frames, 3);
}
} // namespace
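
Net effect of the rework above: the former FirstPassEncode/SecondPassEncode pair, which round-tripped stats through the checked-in hantro_collage_w352h288.stat file, becomes one self-contained TwoPassEncode test that keeps the first-pass stats in memory. A minimal sketch of the flow, using only the calls visible in the hunks:

std::string stats_buf;                  // first-pass stats kept in memory
codec_enc_.g_pass = VPX_RC_FIRST_PASS;
// ... encode each frame, then append vpx_svc_get_rc_stats_buffer(&svc_)
// for vpx_svc_get_rc_stats_buffer_size(&svc_) bytes to stats_buf ...
codec_enc_.g_pass = VPX_RC_LAST_PASS;   // tear down, then re-init the codec
codec_enc_.rc_twopass_stats_in.buf = &stats_buf[0];
codec_enc_.rc_twopass_stats_in.sz = stats_buf.size();
// second pass: encode again, decoding output as in EncodeThreeFrames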

View File

@@ -1,6 +1,9 @@
d5dfb0151c9051f8c85999255645d7a23916d3c0 hantro_collage_w352h288.yuv
998cec53307c94aa5835aaf8d5731f6a3c7c2e5a hantro_collage_w352h288.stat
b87815bf86020c592ccc7a846ba2e28ec8043902 hantro_odd.yuv
fe346136b9b8c1e6f6084cc106485706915795e4 invalid-vp90-01.webm
25751f5d3b05ff03f0719ad42cd625348eb8961e invalid-vp90-01.webm.res
d78e2fceba5ac942246503ec8366f879c4775ca5 invalid-vp90-02.webm
2dadee5306245fa5eeb0f99652d0e17afbcba96d invalid-vp90-02.webm.res
b1f1c3ec79114b9a0651af24ce634afb44a9a419 rush_hour_444.y4m
5184c46ddca8b1fadd16742e8500115bc8f749da vp80-00-comprehensive-001.ivf
65bf1bbbced81b97bd030f376d1b7f61a224793f vp80-00-comprehensive-002.ivf
@@ -577,6 +580,8 @@ d48c5db1b0f8e60521a7c749696b8067886033a3 vp90-2-09-aq2.webm
54638c38009198c38c8f3b25c182b709b6c1fd2e vp90-2-09-lf_deltas.webm.md5
510d95f3beb3b51c572611fdaeeece12277dac30 vp90-2-10-show-existing-frame.webm
14d631096f4bfa2d71f7f739aec1448fb3c33bad vp90-2-10-show-existing-frame.webm.md5
d2feea7728e8d2c615981d0f47427a4a5a45d881 vp90-2-10-show-existing-frame2.webm
5f7c7811baa3e4f03be1dd78c33971b727846821 vp90-2-10-show-existing-frame2.webm.md5
b4318e75f73a6a08992c7326de2fb589c2a794c7 vp90-2-11-size-351x287.webm
b3c48382cf7d0454e83a02497c229d27720f9e20 vp90-2-11-size-351x287.webm.md5
8e0096475ea2535bac71d3e2fc09e0c451c444df vp90-2-11-size-351x288.webm
@@ -639,4 +644,5 @@ e615575ded499ea1d992f3b38e3baa434509cdcd vp90-2-15-segkey.webm
e3ab35d4316c5e81325c50f5236ceca4bc0d35df vp90-2-15-segkey.webm.md5
9b7ca2cac09d34c4a5d296c1900f93b1e2f69d0d vp90-2-15-segkey_adpq.webm
8f46ba5f785d0c2170591a153e0d0d146a7c8090 vp90-2-15-segkey_adpq.webm.md5
76024eb753cdac6a5e5703aaea189d35c3c30ac7 invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.ivf
d3964f9dad9f60363c81b688324d95b4ec7c8038 invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.ivf.res

View File

@@ -30,6 +30,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += cq_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += keyframe_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += external_frame_buffer_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += user_priv_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += active_map_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += borders_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += cpu_speed_test.cc
@@ -54,6 +55,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += ../webmdec.h
LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += webm_video_source.h
endif
LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += invalid_file_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += test_vector_test.cc
# Currently we only support decoder perf tests for vp9. Also they read from WebM
@@ -131,7 +133,6 @@ endif # CONFIG_SHARED
## TEST DATA
##
LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += hantro_collage_w352h288.yuv
LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += hantro_collage_w352h288.stat
LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += hantro_odd.yuv
LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rush_hour_444.y4m
@@ -691,6 +692,8 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-subpixel-00.ivf
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-subpixel-00.ivf.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-10-show-existing-frame.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-10-show-existing-frame.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-10-show-existing-frame2.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-10-show-existing-frame2.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-11-size-351x287.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-11-size-351x287.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-11-size-351x288.webm
@@ -756,6 +759,14 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey_adpq.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey_adpq.webm.md5
# Invalid files for testing libvpx error checking.
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01.webm.res
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-02.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-02.webm.res
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.ivf
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.ivf.res
ifeq ($(CONFIG_DECODE_PERF_TESTS),yes)
# BBB VP9 streams
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \

View File

@@ -161,6 +161,7 @@ const char *const kVP9TestVectors[] = {
"vp90-2-08-tile-4x1.webm", "vp90-2-09-subpixel-00.ivf",
"vp90-2-02-size-lf-1920x1080.webm", "vp90-2-09-aq2.webm",
"vp90-2-09-lf_deltas.webm", "vp90-2-10-show-existing-frame.webm",
"vp90-2-10-show-existing-frame2.webm",
"vp90-2-11-size-351x287.webm", "vp90-2-11-size-351x288.webm",
"vp90-2-11-size-352x287.webm", "vp90-2-12-droppable_1.ivf",
"vp90-2-12-droppable_2.ivf", "vp90-2-12-droppable_3.ivf",
@@ -178,7 +179,7 @@ const char *const kVP9TestVectors[] = {
"vp90-2-14-resize-fp-tiles-4-2.webm", "vp90-2-14-resize-fp-tiles-4-8.webm",
"vp90-2-14-resize-fp-tiles-8-16.webm", "vp90-2-14-resize-fp-tiles-8-1.webm",
"vp90-2-14-resize-fp-tiles-8-2.webm", "vp90-2-14-resize-fp-tiles-8-4.webm",
"vp90-2-15-segkey.webm", "vp90-2-15-segkey_adpq.webm"
"vp90-2-15-segkey.webm", "vp90-2-15-segkey_adpq.webm",
};
const int kNumVP9TestVectors = NELEMENTS(kVP9TestVectors);
#endif // CONFIG_VP9_DECODER

View File

@@ -17,6 +17,10 @@ VPX_TEST_TOOLS_COMMON_SH=included
set -e
devnull='> /dev/null 2>&1'
elog() {
echo "$@" 1>&2
}
vlog() {
if [ "${VPX_TEST_VERBOSE_OUTPUT}" = "yes" ]; then
echo "$@"
@@ -456,10 +460,19 @@ vlog "$(basename "${0%.*}") test configuration:
LIBVPX_BIN_PATH=${LIBVPX_BIN_PATH}
LIBVPX_CONFIG_PATH=${LIBVPX_CONFIG_PATH}
LIBVPX_TEST_DATA_PATH=${LIBVPX_TEST_DATA_PATH}
VPX_TEST_OUTPUT_DIR=${VPX_TEST_OUTPUT_DIR}
VPX_TEST_VERBOSE_OUTPUT=${VPX_TEST_VERBOSE_OUTPUT}
VP8_IVF_FILE=${VP8_IVF_FILE}
VP9_IVF_FILE=${VP9_IVF_FILE}
VP9_WEBM_FILE=${VP9_WEBM_FILE}
VPX_TEST_EXE_SUFFIX=${VPX_TEST_EXE_SUFFIX}
VPX_TEST_FILTER=${VPX_TEST_FILTER}
VPX_TEST_OUTPUT_DIR=${VPX_TEST_OUTPUT_DIR}
VPX_TEST_RAND=${VPX_TEST_RAND}
VPX_TEST_RUN_DISABLED_TESTS=${VPX_TEST_RUN_DISABLED_TESTS}
VPX_TEST_SHOW_PROGRAM_OUTPUT=${VPX_TEST_SHOW_PROGRAM_OUTPUT}"
VPX_TEST_SHOW_PROGRAM_OUTPUT=${VPX_TEST_SHOW_PROGRAM_OUTPUT}
VPX_TEST_TEMP_ROOT=${VPX_TEST_TEMP_ROOT}
VPX_TEST_VERBOSE_OUTPUT=${VPX_TEST_VERBOSE_OUTPUT}
YUV_RAW_INPUT=${YUV_RAW_INPUT}
YUV_RAW_INPUT_WIDTH=${YUV_RAW_INPUT_WIDTH}
YUV_RAW_INPUT_HEIGHT=${YUV_RAW_INPUT_HEIGHT}"
fi # End $VPX_TEST_TOOLS_COMMON_SH pseudo include guard.

View File

@@ -29,7 +29,10 @@ twopass_encoder() {
local codec="$1"
local output_file="${VPX_TEST_OUTPUT_DIR}/twopass_encoder_${codec}.ivf"
[ -x "${encoder}" ] || return 1
if [ ! -x "${encoder}" ]; then
elog "${encoder} does not exist or is not executable."
return 1
fi
eval "${encoder}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \
"${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" \

test/user_priv_test.cc Normal file
View File

@@ -0,0 +1,100 @@
/*
* Copyright (c) 2013 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <cstdio>
#include <cstdlib>
#include <string>
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "./vpx_config.h"
#include "test/acm_random.h"
#include "test/codec_factory.h"
#include "test/decode_test_driver.h"
#include "test/ivf_video_source.h"
#include "test/md5_helper.h"
#include "test/util.h"
#if CONFIG_WEBM_IO
#include "test/webm_video_source.h"
#endif
#include "vpx_mem/vpx_mem.h"
#include "vpx/vp8.h"
namespace {
using std::string;
using libvpx_test::ACMRandom;
#if CONFIG_WEBM_IO
void CheckUserPrivateData(void *user_priv, int *target) {
// The actual pointer value should be the same as the expected one.
EXPECT_EQ(reinterpret_cast<void *>(target), user_priv) <<
"user_priv pointer value does not match.";
}
// Decodes |filename|. Passes in user_priv data when calling DecodeFrame and
// compares the user_priv from the returned img with the original user_priv to
// see if they match. Both the pointer values and the values inside the
// addresses should match.
string DecodeFile(const string &filename) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
libvpx_test::WebMVideoSource video(filename);
video.Init();
vpx_codec_dec_cfg_t cfg = {0};
libvpx_test::VP9Decoder decoder(cfg, 0);
libvpx_test::MD5 md5;
int frame_num = 0;
for (video.Begin(); !::testing::Test::HasFailure() && video.cxdata();
video.Next()) {
void *user_priv = reinterpret_cast<void *>(&frame_num);
const vpx_codec_err_t res =
decoder.DecodeFrame(video.cxdata(), video.frame_size(),
(frame_num == 0) ? NULL : user_priv);
if (res != VPX_CODEC_OK) {
EXPECT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError();
break;
}
libvpx_test::DxDataIterator dec_iter = decoder.GetDxData();
const vpx_image_t *img = NULL;
// Get decompressed data.
while ((img = dec_iter.Next())) {
if (frame_num == 0) {
CheckUserPrivateData(img->user_priv, NULL);
} else {
CheckUserPrivateData(img->user_priv, &frame_num);
// Also test ctrl_get_reference api.
struct vp9_ref_frame ref;
// Randomly fetch a reference frame.
ref.idx = rnd.Rand8() % 3;
decoder.Control(VP9_GET_REFERENCE, &ref);
CheckUserPrivateData(ref.img.user_priv, &frame_num);
}
md5.Add(img);
}
frame_num++;
}
return string(md5.Get());
}
TEST(UserPrivTest, VideoDecode) {
// No tiles or frame parallelism; this exercises the decode path to test
// user_priv handling.
EXPECT_STREQ("b35a1b707b28e82be025d960aba039bc",
DecodeFile("vp90-2-03-size-226x226.webm").c_str());
}
#endif // CONFIG_WEBM_IO
} // namespace
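
For reference, the decoder plumbing the test harness wraps is the plain C API: vpx_codec_decode() accepts an opaque user_priv pointer and the decoder attaches it to the vpx_image_t it returns for that frame. A minimal sketch outside the test driver (setup, input buffers, and error handling omitted; frame_data/frame_size stand in for one compressed frame):

vpx_codec_ctx_t codec;   // assumed initialized via vpx_codec_dec_init()
int tag = 42;            // the pointer is round-tripped, never dereferenced
vpx_codec_decode(&codec, frame_data, frame_size, &tag, 0 /* deadline */);
vpx_codec_iter_t iter = NULL;
const vpx_image_t *img;
while ((img = vpx_codec_get_frame(&codec, &iter)) != NULL)
  assert(img->user_priv == &tag);  // same pointer comes back on the image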

View File

@@ -702,6 +702,57 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(6, 6, subpel_avg_variance64x64_ssse3)));
#endif
#endif
#if HAVE_AVX2
// TODO(jzern): these prototypes can be removed after the avx2 versions are
// reenabled in vp9_rtcd_defs.pl.
extern "C" {
unsigned int vp9_sub_pixel_variance32x32_avx2(
const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset,
const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int vp9_sub_pixel_variance64x64_avx2(
const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset,
const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int vp9_sub_pixel_avg_variance32x32_avx2(
const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset,
const uint8_t *ref_ptr, int ref_stride, unsigned int *sse,
const uint8_t *second_pred);
unsigned int vp9_sub_pixel_avg_variance64x64_avx2(
const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset,
const uint8_t *ref_ptr, int ref_stride, unsigned int *sse,
const uint8_t *second_pred);
}
const vp9_variance_fn_t variance16x16_avx2 = vp9_variance16x16_avx2;
const vp9_variance_fn_t variance32x16_avx2 = vp9_variance32x16_avx2;
const vp9_variance_fn_t variance32x32_avx2 = vp9_variance32x32_avx2;
const vp9_variance_fn_t variance64x32_avx2 = vp9_variance64x32_avx2;
const vp9_variance_fn_t variance64x64_avx2 = vp9_variance64x64_avx2;
INSTANTIATE_TEST_CASE_P(
AVX2, VP9VarianceTest,
::testing::Values(make_tuple(4, 4, variance16x16_avx2),
make_tuple(5, 4, variance32x16_avx2),
make_tuple(5, 5, variance32x32_avx2),
make_tuple(6, 5, variance64x32_avx2),
make_tuple(6, 6, variance64x64_avx2)));
const vp9_subpixvariance_fn_t subpel_variance32x32_avx2 =
vp9_sub_pixel_variance32x32_avx2;
const vp9_subpixvariance_fn_t subpel_variance64x64_avx2 =
vp9_sub_pixel_variance64x64_avx2;
INSTANTIATE_TEST_CASE_P(
DISABLED_AVX2, VP9SubpelVarianceTest,
::testing::Values(make_tuple(5, 5, subpel_variance32x32_avx2),
make_tuple(6, 6, subpel_variance64x64_avx2)));
const vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_avx2 =
vp9_sub_pixel_avg_variance32x32_avx2;
const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_avx2 =
vp9_sub_pixel_avg_variance64x64_avx2;
INSTANTIATE_TEST_CASE_P(
DISABLED_AVX2, VP9SubpelAvgVarianceTest,
::testing::Values(make_tuple(5, 5, subpel_avg_variance32x32_avx2),
make_tuple(6, 6, subpel_avg_variance64x64_avx2)));
#endif // HAVE_AVX2
#endif // CONFIG_VP9_ENCODER
} // namespace vp9

View File

@@ -34,7 +34,10 @@ vpx_set_ref() {
local output_file="${VPX_TEST_OUTPUT_DIR}/vp8cx_set_ref_${codec}.ivf"
local ref_frame_num=90
[ -x "${encoder}" ] || return 1
if [ ! -x "${encoder}" ]; then
elog "${encoder} does not exist or is not executable."
return 1
fi
eval "${encoder}" "${YUV_RAW_INPUT_WIDTH}" "${YUV_RAW_INPUT_HEIGHT}" \
"${YUV_RAW_INPUT}" "${output_file}" "${ref_frame_num}" \

View File

@@ -34,7 +34,10 @@ vp9_spatial_svc_encoder() {
shift
[ -x "${encoder}" ] || return 1
if [ ! -x "${encoder}" ]; then
elog "${encoder} does not exist or is not executable."
return 1
fi
eval "${encoder}" -w "${YUV_RAW_INPUT_WIDTH}" -h "${YUV_RAW_INPUT_HEIGHT}" \
-k "${max_kf}" -f "${frames_to_encode}" "$@" "${YUV_RAW_INPUT}" \

View File

@@ -39,7 +39,10 @@ vpx_tsvc_encoder() {
shift 2
[ -x "${encoder}" ] || return 1
if [ ! -x "${encoder}" ]; then
elog "${encoder} does not exist or is not executable."
return 1
fi
eval "${encoder}" "${YUV_RAW_INPUT}" "${output_file}" "${codec}" \
"${YUV_RAW_INPUT_WIDTH}" "${YUV_RAW_INPUT_HEIGHT}" \

third_party/libmkv/EbmlIDs.h vendored Normal file
View File

@@ -0,0 +1,231 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef MKV_DEFS_HPP
#define MKV_DEFS_HPP 1
/* Commenting out values not available in webm, but available in matroska */
enum mkv {
EBML = 0x1A45DFA3,
EBMLVersion = 0x4286,
EBMLReadVersion = 0x42F7,
EBMLMaxIDLength = 0x42F2,
EBMLMaxSizeLength = 0x42F3,
DocType = 0x4282,
DocTypeVersion = 0x4287,
DocTypeReadVersion = 0x4285,
/* CRC_32 = 0xBF, */
Void = 0xEC,
SignatureSlot = 0x1B538667,
SignatureAlgo = 0x7E8A,
SignatureHash = 0x7E9A,
SignaturePublicKey = 0x7EA5,
Signature = 0x7EB5,
SignatureElements = 0x7E5B,
SignatureElementList = 0x7E7B,
SignedElement = 0x6532,
/* segment */
Segment = 0x18538067,
/* Meta Seek Information */
SeekHead = 0x114D9B74,
Seek = 0x4DBB,
SeekID = 0x53AB,
SeekPosition = 0x53AC,
/* Segment Information */
Info = 0x1549A966,
/* SegmentUID = 0x73A4, */
/* SegmentFilename = 0x7384, */
/* PrevUID = 0x3CB923, */
/* PrevFilename = 0x3C83AB, */
/* NextUID = 0x3EB923, */
/* NextFilename = 0x3E83BB, */
/* SegmentFamily = 0x4444, */
/* ChapterTranslate = 0x6924, */
/* ChapterTranslateEditionUID = 0x69FC, */
/* ChapterTranslateCodec = 0x69BF, */
/* ChapterTranslateID = 0x69A5, */
TimecodeScale = 0x2AD7B1,
Segment_Duration = 0x4489,
DateUTC = 0x4461,
/* Title = 0x7BA9, */
MuxingApp = 0x4D80,
WritingApp = 0x5741,
/* Cluster */
Cluster = 0x1F43B675,
Timecode = 0xE7,
/* SilentTracks = 0x5854, */
/* SilentTrackNumber = 0x58D7, */
/* Position = 0xA7, */
PrevSize = 0xAB,
BlockGroup = 0xA0,
Block = 0xA1,
/* BlockVirtual = 0xA2, */
BlockAdditions = 0x75A1,
BlockMore = 0xA6,
BlockAddID = 0xEE,
BlockAdditional = 0xA5,
BlockDuration = 0x9B,
/* ReferencePriority = 0xFA, */
ReferenceBlock = 0xFB,
/* ReferenceVirtual = 0xFD, */
/* CodecState = 0xA4, */
/* Slices = 0x8E, */
/* TimeSlice = 0xE8, */
LaceNumber = 0xCC,
/* FrameNumber = 0xCD, */
/* BlockAdditionID = 0xCB, */
/* MkvDelay = 0xCE, */
/* Cluster_Duration = 0xCF, */
SimpleBlock = 0xA3,
/* EncryptedBlock = 0xAF, */
/* Track */
Tracks = 0x1654AE6B,
TrackEntry = 0xAE,
TrackNumber = 0xD7,
TrackUID = 0x73C5,
TrackType = 0x83,
FlagEnabled = 0xB9,
FlagDefault = 0x88,
FlagForced = 0x55AA,
FlagLacing = 0x9C,
/* MinCache = 0x6DE7, */
/* MaxCache = 0x6DF8, */
DefaultDuration = 0x23E383,
/* TrackTimecodeScale = 0x23314F, */
/* TrackOffset = 0x537F, */
MaxBlockAdditionID = 0x55EE,
Name = 0x536E,
Language = 0x22B59C,
CodecID = 0x86,
CodecPrivate = 0x63A2,
CodecName = 0x258688,
/* AttachmentLink = 0x7446, */
/* CodecSettings = 0x3A9697, */
/* CodecInfoURL = 0x3B4040, */
/* CodecDownloadURL = 0x26B240, */
/* CodecDecodeAll = 0xAA, */
/* TrackOverlay = 0x6FAB, */
/* TrackTranslate = 0x6624, */
/* TrackTranslateEditionUID = 0x66FC, */
/* TrackTranslateCodec = 0x66BF, */
/* TrackTranslateTrackID = 0x66A5, */
/* video */
Video = 0xE0,
FlagInterlaced = 0x9A,
StereoMode = 0x53B8,
AlphaMode = 0x53C0,
PixelWidth = 0xB0,
PixelHeight = 0xBA,
PixelCropBottom = 0x54AA,
PixelCropTop = 0x54BB,
PixelCropLeft = 0x54CC,
PixelCropRight = 0x54DD,
DisplayWidth = 0x54B0,
DisplayHeight = 0x54BA,
DisplayUnit = 0x54B2,
AspectRatioType = 0x54B3,
/* ColourSpace = 0x2EB524, */
/* GammaValue = 0x2FB523, */
FrameRate = 0x2383E3,
/* end video */
/* audio */
Audio = 0xE1,
SamplingFrequency = 0xB5,
OutputSamplingFrequency = 0x78B5,
Channels = 0x9F,
/* ChannelPositions = 0x7D7B, */
BitDepth = 0x6264,
/* end audio */
/* content encoding */
/* ContentEncodings = 0x6d80, */
/* ContentEncoding = 0x6240, */
/* ContentEncodingOrder = 0x5031, */
/* ContentEncodingScope = 0x5032, */
/* ContentEncodingType = 0x5033, */
/* ContentCompression = 0x5034, */
/* ContentCompAlgo = 0x4254, */
/* ContentCompSettings = 0x4255, */
/* ContentEncryption = 0x5035, */
/* ContentEncAlgo = 0x47e1, */
/* ContentEncKeyID = 0x47e2, */
/* ContentSignature = 0x47e3, */
/* ContentSigKeyID = 0x47e4, */
/* ContentSigAlgo = 0x47e5, */
/* ContentSigHashAlgo = 0x47e6, */
/* end content encoding */
/* Cueing Data */
Cues = 0x1C53BB6B,
CuePoint = 0xBB,
CueTime = 0xB3,
CueTrackPositions = 0xB7,
CueTrack = 0xF7,
CueClusterPosition = 0xF1,
CueBlockNumber = 0x5378
/* CueCodecState = 0xEA, */
/* CueReference = 0xDB, */
/* CueRefTime = 0x96, */
/* CueRefCluster = 0x97, */
/* CueRefNumber = 0x535F, */
/* CueRefCodecState = 0xEB, */
/* Attachment */
/* Attachments = 0x1941A469, */
/* AttachedFile = 0x61A7, */
/* FileDescription = 0x467E, */
/* FileName = 0x466E, */
/* FileMimeType = 0x4660, */
/* FileData = 0x465C, */
/* FileUID = 0x46AE, */
/* FileReferral = 0x4675, */
/* Chapters */
/* Chapters = 0x1043A770, */
/* EditionEntry = 0x45B9, */
/* EditionUID = 0x45BC, */
/* EditionFlagHidden = 0x45BD, */
/* EditionFlagDefault = 0x45DB, */
/* EditionFlagOrdered = 0x45DD, */
/* ChapterAtom = 0xB6, */
/* ChapterUID = 0x73C4, */
/* ChapterTimeStart = 0x91, */
/* ChapterTimeEnd = 0x92, */
/* ChapterFlagHidden = 0x98, */
/* ChapterFlagEnabled = 0x4598, */
/* ChapterSegmentUID = 0x6E67, */
/* ChapterSegmentEditionUID = 0x6EBC, */
/* ChapterPhysicalEquiv = 0x63C3, */
/* ChapterTrack = 0x8F, */
/* ChapterTrackNumber = 0x89, */
/* ChapterDisplay = 0x80, */
/* ChapString = 0x85, */
/* ChapLanguage = 0x437C, */
/* ChapCountry = 0x437E, */
/* ChapProcess = 0x6944, */
/* ChapProcessCodecID = 0x6955, */
/* ChapProcessPrivate = 0x450D, */
/* ChapProcessCommand = 0x6911, */
/* ChapProcessTime = 0x6922, */
/* ChapProcessData = 0x6933, */
/* Tagging */
/* Tags = 0x1254C367, */
/* Tag = 0x7373, */
/* Targets = 0x63C0, */
/* TargetTypeValue = 0x68CA, */
/* TargetType = 0x63CA, */
/* Tagging_TrackUID = 0x63C5, */
/* Tagging_EditionUID = 0x63C9, */
/* Tagging_ChapterUID = 0x63C4, */
/* AttachmentUID = 0x63C6, */
/* SimpleTag = 0x67C8, */
/* TagName = 0x45A3, */
/* TagLanguage = 0x447A, */
/* TagDefault = 0x4484, */
/* TagString = 0x4487, */
/* TagBinary = 0x4485, */
};
#endif

third_party/libmkv/EbmlWriter.c vendored Normal file
View File

@@ -0,0 +1,157 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "EbmlWriter.h"
#include <stdlib.h>
#include <wchar.h>
#include <string.h>
#include <limits.h>
#if defined(_MSC_VER)
#define LITERALU64(n) n
#else
#define LITERALU64(n) n##LLU
#endif
void Ebml_WriteLen(EbmlGlobal *glob, int64_t val) {
/* TODO check and make sure we are not > than 0x0100000000000000LLU */
unsigned char size = 8; /* size in bytes to output */
/* mask to compare for byte size */
int64_t minVal = 0xff;
for (size = 1; size < 8; size ++) {
if (val < minVal)
break;
minVal = (minVal << 7);
}
val |= (((uint64_t)0x80) << ((size - 1) * 7));
Ebml_Serialize(glob, (void *) &val, sizeof(val), size);
}
void Ebml_WriteString(EbmlGlobal *glob, const char *str) {
const size_t size_ = strlen(str);
const uint64_t size = size_;
Ebml_WriteLen(glob, size);
/* TODO: it's not clear from the spec whether the nul terminator
* should be serialized too. For now we omit the null terminator.
*/
Ebml_Write(glob, str, (unsigned long)size);
}
void Ebml_WriteUTF8(EbmlGlobal *glob, const wchar_t *wstr) {
const size_t strlen = wcslen(wstr);
/* TODO: it's not clear from the spec whether the nul terminator
* should be serialized too. For now we include it.
*/
const uint64_t size = strlen;
Ebml_WriteLen(glob, size);
Ebml_Write(glob, wstr, (unsigned long)size);
}
void Ebml_WriteID(EbmlGlobal *glob, unsigned long class_id) {
int len;
if (class_id >= 0x01000000)
len = 4;
else if (class_id >= 0x00010000)
len = 3;
else if (class_id >= 0x00000100)
len = 2;
else
len = 1;
Ebml_Serialize(glob, (void *)&class_id, sizeof(class_id), len);
}
void Ebml_SerializeUnsigned64(EbmlGlobal *glob, unsigned long class_id, uint64_t ui) {
unsigned char sizeSerialized = 8 | 0x80;
Ebml_WriteID(glob, class_id);
Ebml_Serialize(glob, &sizeSerialized, sizeof(sizeSerialized), 1);
Ebml_Serialize(glob, &ui, sizeof(ui), 8);
}
void Ebml_SerializeUnsigned(EbmlGlobal *glob, unsigned long class_id, unsigned long ui) {
unsigned char size = 8; /* size in bytes to output */
unsigned char sizeSerialized = 0;
unsigned long minVal;
Ebml_WriteID(glob, class_id);
minVal = 0x7fLU; /* mask to compare for byte size */
for (size = 1; size < 4; size ++) {
if (ui < minVal) {
break;
}
minVal <<= 7;
}
sizeSerialized = 0x80 | size;
Ebml_Serialize(glob, &sizeSerialized, sizeof(sizeSerialized), 1);
Ebml_Serialize(glob, &ui, sizeof(ui), size);
}
/* TODO: perhaps this is a poor name for this id serializer helper function */
void Ebml_SerializeBinary(EbmlGlobal *glob, unsigned long class_id, unsigned long bin) {
int size;
for (size = 4; size > 1; size--) {
if (bin & (unsigned int)0x000000ff << ((size - 1) * 8))
break;
}
Ebml_WriteID(glob, class_id);
Ebml_WriteLen(glob, size);
Ebml_WriteID(glob, bin);
}
void Ebml_SerializeFloat(EbmlGlobal *glob, unsigned long class_id, double d) {
unsigned char len = 0x88;
Ebml_WriteID(glob, class_id);
Ebml_Serialize(glob, &len, sizeof(len), 1);
Ebml_Serialize(glob, &d, sizeof(d), 8);
}
void Ebml_WriteSigned16(EbmlGlobal *glob, short val) {
signed long out = ((val & 0x003FFFFF) | 0x00200000) << 8;
Ebml_Serialize(glob, &out, sizeof(out), 3);
}
void Ebml_SerializeString(EbmlGlobal *glob, unsigned long class_id, const char *s) {
Ebml_WriteID(glob, class_id);
Ebml_WriteString(glob, s);
}
void Ebml_SerializeUTF8(EbmlGlobal *glob, unsigned long class_id, wchar_t *s) {
Ebml_WriteID(glob, class_id);
Ebml_WriteUTF8(glob, s);
}
void Ebml_SerializeData(EbmlGlobal *glob, unsigned long class_id, unsigned char *data, unsigned long data_length) {
Ebml_WriteID(glob, class_id);
Ebml_WriteLen(glob, data_length);
Ebml_Write(glob, data, data_length);
}
void Ebml_WriteVoid(EbmlGlobal *glob, unsigned long vSize) {
unsigned char tmp = 0;
unsigned long i = 0;
Ebml_WriteID(glob, 0xEC);
Ebml_WriteLen(glob, vSize);
for (i = 0; i < vSize; i++) {
Ebml_Write(glob, &tmp, 1);
}
}
/* TODO Serialize Date */
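
Ebml_WriteLen above implements EBML's variable-length size field: pick the smallest byte count that fits the value, then OR in a marker bit whose position encodes that count, and serialize big-endian. A standalone sketch of the size selection with one worked value (illustration only, mirroring the loop above):

#include <assert.h>
#include <stdint.h>

/* Mirror of Ebml_WriteLen's size selection (illustration only). */
static unsigned ebml_len_size(int64_t val) {
  int64_t min_val = 0xff;  /* threshold for the 1-byte form, as above */
  unsigned size;
  for (size = 1; size < 8; size++) {
    if (val < min_val) break;
    min_val <<= 7;  /* each extra byte buys 7 more payload bits */
  }
  return size;
}

int main(void) {
  /* 500 needs the 2-byte form; the marker bit is 0x80 << 7, so the
   * field serializes big-endian as 0x41 0xF4. */
  assert(ebml_len_size(500) == 2);
  assert((500 | ((int64_t)0x80 << 7)) == 0x41F4);
  return 0;
}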

third_party/libmkv/EbmlWriter.h vendored Normal file
View File

@@ -0,0 +1,42 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef EBMLWRITER_HPP
#define EBMLWRITER_HPP
#include <stddef.h>
#include "vpx/vpx_integer.h"
/* note: you must define write and serialize functions as well as your own
* EBML_GLOBAL
*
* These functions MUST be implemented
*/
typedef struct EbmlGlobal EbmlGlobal;
void Ebml_Serialize(EbmlGlobal *glob, const void *, int, unsigned long);
void Ebml_Write(EbmlGlobal *glob, const void *, unsigned long);
/*****/
void Ebml_WriteLen(EbmlGlobal *glob, int64_t val);
void Ebml_WriteString(EbmlGlobal *glob, const char *str);
void Ebml_WriteUTF8(EbmlGlobal *glob, const wchar_t *wstr);
void Ebml_WriteID(EbmlGlobal *glob, unsigned long class_id);
void Ebml_SerializeUnsigned64(EbmlGlobal *glob, unsigned long class_id, uint64_t ui);
void Ebml_SerializeUnsigned(EbmlGlobal *glob, unsigned long class_id, unsigned long ui);
void Ebml_SerializeBinary(EbmlGlobal *glob, unsigned long class_id, unsigned long ui);
void Ebml_SerializeFloat(EbmlGlobal *glob, unsigned long class_id, double d);
/* TODO make this more generic to signed */
void Ebml_WriteSigned16(EbmlGlobal *glob, short val);
void Ebml_SerializeString(EbmlGlobal *glob, unsigned long class_id, const char *s);
void Ebml_SerializeUTF8(EbmlGlobal *glob, unsigned long class_id, wchar_t *s);
void Ebml_SerializeData(EbmlGlobal *glob, unsigned long class_id, unsigned char *data, unsigned long data_length);
void Ebml_WriteVoid(EbmlGlobal *glob, unsigned long vSize);
/* TODO need date function */
#endif

View File

@@ -463,9 +463,7 @@ $vp8_short_walsh4x4_neon_asm=vp8_short_walsh4x4_neon;
# Quantizer
#
add_proto qw/void vp8_regular_quantize_b/, "struct block *, struct blockd *";
specialize qw/vp8_regular_quantize_b sse2/;
# TODO(johann) Update sse4 implementation and re-enable
#$vp8_regular_quantize_b_sse4_1=vp8_regular_quantize_b_sse4;
specialize qw/vp8_regular_quantize_b sse2 sse4_1/;
add_proto qw/void vp8_fast_quantize_b/, "struct block *, struct blockd *";
specialize qw/vp8_fast_quantize_b sse2 ssse3 media neon_asm/;
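
Re-listing sse4_1 on the specialize line is what makes the new intrinsics file reachable: the rtcd generator turns each specialize entry into a runtime-dispatched function pointer, roughly of this shape (illustrative only — the real code is emitted into the generated vp8_rtcd.h, with flags coming from x86 CPU feature detection):

vp8_regular_quantize_b = vp8_regular_quantize_b_c;
if (flags & HAS_SSE2) vp8_regular_quantize_b = vp8_regular_quantize_b_sse2;
if (flags & HAS_SSE4_1) vp8_regular_quantize_b = vp8_regular_quantize_b_sse4_1;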

View File

@@ -246,7 +246,6 @@ sym(vp8_mbpost_proc_down_mmx):
; unsigned char whiteclamp[16],
; unsigned char bothclamp[16],
; unsigned int Width, unsigned int Height, int Pitch)
extern sym(rand)
global sym(vp8_plane_add_noise_mmx) PRIVATE
sym(vp8_plane_add_noise_mmx):
push rbp
@@ -258,7 +257,7 @@ sym(vp8_plane_add_noise_mmx):
; end prolog
.addnoise_loop:
call sym(rand) WRT_PLT
call sym(LIBVPX_RAND) WRT_PLT
mov rcx, arg(1) ;noise
and rax, 0xff
add rcx, rax

View File

@@ -660,7 +660,6 @@ sym(vp8_mbpost_proc_across_ip_xmm):
; unsigned char whiteclamp[16],
; unsigned char bothclamp[16],
; unsigned int Width, unsigned int Height, int Pitch)
extern sym(rand)
global sym(vp8_plane_add_noise_wmt) PRIVATE
sym(vp8_plane_add_noise_wmt):
push rbp
@@ -672,7 +671,7 @@ sym(vp8_plane_add_noise_wmt):
; end prolog
.addnoise_loop:
call sym(rand) WRT_PLT
call sym(LIBVPX_RAND) WRT_PLT
mov rcx, arg(1) ;noise
and rax, 0xff
add rcx, rax

View File

@@ -1,24 +0,0 @@
/*
* Copyright (c) 2012 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
/* On Android NDK, rand is inlined function, but postproc needs rand symbol */
#if defined(__ANDROID__)
#define rand __rand
#include <stdlib.h>
#undef rand
extern int rand(void)
{
return __rand();
}
#else
/* ISO C forbids an empty translation unit. */
int vp8_unused;
#endif

View File

@@ -191,10 +191,12 @@ int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride,
return FILTER_BLOCK;
}
int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height)
int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height,
int num_mb_rows, int num_mb_cols)
{
int i;
assert(denoiser);
denoiser->num_mb_cols = num_mb_cols;
for (i = 0; i < MAX_REF_FRAMES; i++)
{
@@ -222,6 +224,10 @@ int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height)
vpx_memset(denoiser->yv12_mc_running_avg.buffer_alloc, 0,
denoiser->yv12_mc_running_avg.frame_size);
denoiser->denoise_state = vpx_calloc((num_mb_rows * num_mb_cols), 1);
vpx_memset(denoiser->denoise_state, 0, (num_mb_rows * num_mb_cols));
return 0;
}
@@ -243,13 +249,20 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
unsigned int best_sse,
unsigned int zero_mv_sse,
int recon_yoffset,
int recon_uvoffset)
int recon_uvoffset,
loop_filter_info_n *lfi_n,
int mb_row,
int mb_col,
int block_index)
{
int mv_row;
int mv_col;
unsigned int motion_magnitude2;
unsigned int sse_thresh;
int sse_diff_thresh = 0;
// Spatial loop filter: applied selectively based on the temporal filter
// state of the block relative to its top/left neighbors.
int apply_spatial_loop_filter = 1;
MV_REFERENCE_FRAME frame = x->best_reference_frame;
MV_REFERENCE_FRAME zero_frame = x->best_zeromv_reference_frame;
@@ -263,7 +276,11 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
MB_MODE_INFO saved_mbmi;
MACROBLOCKD *filter_xd = &x->e_mbd;
MB_MODE_INFO *mbmi = &filter_xd->mode_info_context->mbmi;
int sse_diff = zero_mv_sse - best_sse;
int sse_diff = 0;
// Bias on zero motion vector sse.
int zero_bias = 95;
zero_mv_sse = (unsigned int)((int64_t)zero_mv_sse * zero_bias / 100);
sse_diff = zero_mv_sse - best_sse;
saved_mbmi = *mbmi;
@@ -362,6 +379,8 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
running_avg_y, avg_y_stride,
x->thismb, 16, motion_magnitude2,
x->increase_denoising);
denoiser->denoise_state[block_index] = motion_magnitude2 > 0 ?
kFilterNonZeroMV : kFilterZeroMV;
}
if (decision == COPY_BLOCK)
{
@@ -372,5 +391,59 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
x->thismb, 16,
denoiser->yv12_running_avg[INTRA_FRAME].y_buffer + recon_yoffset,
denoiser->yv12_running_avg[INTRA_FRAME].y_stride);
denoiser->denoise_state[block_index] = kNoFilter;
}
// Option to selectively deblock the denoised signal.
if (apply_spatial_loop_filter) {
loop_filter_info lfi;
int apply_filter_col = 0;
int apply_filter_row = 0;
int apply_filter = 0;
int y_stride = denoiser->yv12_running_avg[INTRA_FRAME].y_stride;
int uv_stride = denoiser->yv12_running_avg[INTRA_FRAME].uv_stride;
// Fix filter level to some nominal value for now.
int filter_level = 32;
int hev_index = lfi_n->hev_thr_lut[INTER_FRAME][filter_level];
lfi.mblim = lfi_n->mblim[filter_level];
lfi.blim = lfi_n->blim[filter_level];
lfi.lim = lfi_n->lim[filter_level];
lfi.hev_thr = lfi_n->hev_thr[hev_index];
// Apply filter if there is a difference in the denoiser filter state
// between the current and left/top block, or if non-zero motion vector
// is used for the motion-compensated filtering.
if (mb_col > 0) {
apply_filter_col = !((denoiser->denoise_state[block_index] ==
denoiser->denoise_state[block_index - 1]) &&
denoiser->denoise_state[block_index] != kFilterNonZeroMV);
if (apply_filter_col) {
// Filter left vertical edge.
apply_filter = 1;
vp8_loop_filter_mbv(
denoiser->yv12_running_avg[INTRA_FRAME].y_buffer + recon_yoffset,
NULL, NULL, y_stride, uv_stride, &lfi);
}
}
if (mb_row > 0) {
apply_filter_row = !((denoiser->denoise_state[block_index] ==
denoiser->denoise_state[block_index - denoiser->num_mb_cols]) &&
denoiser->denoise_state[block_index] != kFilterNonZeroMV);
if (apply_filter_row) {
// Filter top horizontal edge.
apply_filter = 1;
vp8_loop_filter_mbh(
denoiser->yv12_running_avg[INTRA_FRAME].y_buffer + recon_yoffset,
NULL, NULL, y_stride, uv_stride, &lfi);
}
}
if (apply_filter) {
// Update the signal block |x|. Pixel changes are only to top and/or
// left boundary pixels: can we avoid a full block copy here?
vp8_copy_mem16x16(
denoiser->yv12_running_avg[INTRA_FRAME].y_buffer + recon_yoffset,
y_stride, x->thismb, 16);
}
}
}
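
The new deblocking decision reads compactly once inverted: an edge is filtered when the two adjacent blocks disagree on their denoiser state, or when the current block was filtered with a non-zero motion vector. A hypothetical standalone helper, equivalent by De Morgan to the two conditions above (using the vp8_denoiser_filter_state enum from denoising.h):

/* 1 if the shared edge between cur and its neighbor should be deblocked. */
static int should_filter_edge(unsigned char cur, unsigned char neighbor) {
  /* De Morgan of !((cur == neighbor) && cur != kFilterNonZeroMV) */
  return cur != neighbor || cur == kFilterNonZeroMV;
}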

View File

@@ -12,6 +12,7 @@
#define VP8_ENCODER_DENOISING_H_
#include "block.h"
#include "vp8/common/loopfilter.h"
#ifdef __cplusplus
extern "C" {
@@ -27,13 +28,22 @@ enum vp8_denoiser_decision
FILTER_BLOCK
};
enum vp8_denoiser_filter_state {
kNoFilter,
kFilterZeroMV,
kFilterNonZeroMV
};
typedef struct vp8_denoiser
{
YV12_BUFFER_CONFIG yv12_running_avg[MAX_REF_FRAMES];
YV12_BUFFER_CONFIG yv12_mc_running_avg;
unsigned char* denoise_state;
int num_mb_cols;
} VP8_DENOISER;
int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height);
int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height,
int num_mb_rows, int num_mb_cols);
void vp8_denoiser_free(VP8_DENOISER *denoiser);
@@ -42,7 +52,11 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
unsigned int best_sse,
unsigned int zero_mv_sse,
int recon_yoffset,
int recon_uvoffset);
int recon_uvoffset,
loop_filter_info_n *lfi_n,
int mb_row,
int mb_col,
int block_index);
#ifdef __cplusplus
} // extern "C"

View File

@@ -1246,7 +1246,7 @@ int vp8cx_encode_inter_macroblock
x->zbin_mode_boost_enabled = 0;
}
vp8_rd_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate,
&distortion, &intra_error);
&distortion, &intra_error, mb_row, mb_col);
/* switch back to the regular quantizer for the encode */
if (cpi->sf.improved_quant)

View File

@@ -98,6 +98,9 @@ extern double vp8_calc_ssimg
#ifdef OUTPUT_YUV_SRC
FILE *yuv_file;
#endif
#ifdef OUTPUT_YUV_DENOISED
FILE *yuv_denoised_file;
#endif
#if 0
FILE *framepsnr;
@@ -1748,7 +1751,8 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf)
{
int width = (cpi->oxcf.Width + 15) & ~15;
int height = (cpi->oxcf.Height + 15) & ~15;
vp8_denoiser_allocate(&cpi->denoiser, width, height);
vp8_denoiser_allocate(&cpi->denoiser, width, height,
cpi->common.mb_rows, cpi->common.mb_cols);
}
}
#endif
@@ -1961,6 +1965,9 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
#ifdef OUTPUT_YUV_SRC
yuv_file = fopen("bd.yuv", "ab");
#endif
#ifdef OUTPUT_YUV_DENOISED
yuv_denoised_file = fopen("denoised.yuv", "ab");
#endif
#if 0
framepsnr = fopen("framepsnr.stt", "a");
@@ -2410,6 +2417,9 @@ void vp8_remove_compressor(VP8_COMP **ptr)
#ifdef OUTPUT_YUV_SRC
fclose(yuv_file);
#endif
#ifdef OUTPUT_YUV_DENOISED
fclose(yuv_denoised_file);
#endif
#if 0
@@ -2610,7 +2620,7 @@ int vp8_update_entropy(VP8_COMP *cpi, int update)
}
#if OUTPUT_YUV_SRC
#if defined(OUTPUT_YUV_SRC) || defined(OUTPUT_YUV_DENOISED)
void vp8_write_yuv_frame(FILE *yuv_file, YV12_BUFFER_CONFIG *s)
{
unsigned char *src = s->y_buffer;
@@ -4430,6 +4440,11 @@ static void encode_frame_to_data_rate
update_reference_frames(cpi);
#ifdef OUTPUT_YUV_DENOISED
vp8_write_yuv_frame(yuv_denoised_file,
&cpi->denoiser.yv12_running_avg[INTRA_FRAME]);
#endif
#if !(CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
if (cpi->oxcf.error_resilient_mode)
{

View File

@@ -1168,6 +1168,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
#if CONFIG_TEMPORAL_DENOISING
if (cpi->oxcf.noise_sensitivity)
{
int block_index = mb_row * cpi->common.mb_cols + mb_col;
if (x->best_sse_inter_mode == DC_PRED)
{
/* No best MV found. */
@@ -1179,7 +1180,9 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
}
x->increase_denoising = 0;
vp8_denoiser_denoise_mb(&cpi->denoiser, x, best_sse, zero_mv_sse,
recon_yoffset, recon_uvoffset);
recon_yoffset, recon_uvoffset,
&cpi->common.lf_info, mb_row, mb_col,
block_index);
/* Reevaluate ZEROMV after denoising. */

View File

@@ -1935,7 +1935,8 @@ static void update_best_mode(BEST_MODE* best_mode, int this_rd,
void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
int recon_uvoffset, int *returnrate,
int *returndistortion, int *returnintra)
int *returndistortion, int *returnintra,
int mb_row, int mb_col)
{
BLOCK *b = &x->block[0];
BLOCKD *d = &x->e_mbd.block[0];
@@ -2510,6 +2511,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
#if CONFIG_TEMPORAL_DENOISING
if (cpi->oxcf.noise_sensitivity)
{
int block_index = mb_row * cpi->common.mb_cols + mb_col;
if (x->best_sse_inter_mode == DC_PRED)
{
/* No best MV found. */
@@ -2520,7 +2522,9 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
best_sse = best_rd_sse;
}
vp8_denoiser_denoise_mb(&cpi->denoiser, x, best_sse, zero_mv_sse,
recon_yoffset, recon_uvoffset);
recon_yoffset, recon_uvoffset,
&cpi->common.lf_info, mb_row, mb_col,
block_index);
/* Reevaluate ZEROMV after denoising. */

View File

@@ -70,7 +70,10 @@ static void insertsortsad(int arr[],int idx[], int len)
}
extern void vp8_initialize_rd_consts(VP8_COMP *cpi, MACROBLOCK *x, int Qvalue);
extern void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra);
extern void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x,
int recon_yoffset, int recon_uvoffset,
int *returnrate, int *returndistortion,
int *returnintra, int mb_row, int mb_col);
extern void vp8_rd_pick_intra_mode(MACROBLOCK *x, int *rate);

View File

@@ -26,11 +26,10 @@
int cmp = (x[z] < boost) | (y[z] == 0); \
zbin_boost_ptr++; \
if (cmp) \
goto select_eob_end_##i; \
break; \
qcoeff_ptr[z] = y[z]; \
eob = i; \
zbin_boost_ptr = b->zrun_zbin_boost; \
select_eob_end_##i:; \
} while (0)
void vp8_regular_quantize_b_sse2(BLOCK *b, BLOCKD *d)
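
The quantize_ssse3.c hunk above swaps a per-expansion goto label for a break out of the do { } while (0) wrapper, and the new SSE4.1 file below uses the same shape. The idiom in isolation (generic sketch; reject()/accept() are placeholders):

#define SELECT_ONE(i)                                             \
  do {                                                            \
    if (reject(i)) break;  /* was: goto select_end_##i */         \
    accept(i);                                                    \
  } while (0)  /* break lands here; no uniquely named label needed */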

View File

@@ -1,256 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license and patent
; grant that can be found in the LICENSE file in the root of the source
; tree. All contributing project authors may be found in the AUTHORS
; file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
%include "vp8_asm_enc_offsets.asm"
; void vp8_regular_quantize_b_sse4 | arg
; (BLOCK *b, | 0
; BLOCKD *d) | 1
global sym(vp8_regular_quantize_b_sse4) PRIVATE
sym(vp8_regular_quantize_b_sse4):
%if ABI_IS_32BIT
push rbp
mov rbp, rsp
GET_GOT rbx
push rdi
push rsi
ALIGN_STACK 16, rax
%define qcoeff 0 ; 32
%define stack_size 32
sub rsp, stack_size
%else
%if LIBVPX_YASM_WIN64
SAVE_XMM 8, u
push rdi
push rsi
%endif
%endif
; end prolog
%if ABI_IS_32BIT
mov rdi, arg(0) ; BLOCK *b
mov rsi, arg(1) ; BLOCKD *d
%else
%if LIBVPX_YASM_WIN64
mov rdi, rcx ; BLOCK *b
mov rsi, rdx ; BLOCKD *d
%else
;mov rdi, rdi ; BLOCK *b
;mov rsi, rsi ; BLOCKD *d
%endif
%endif
mov rax, [rdi + vp8_block_coeff]
mov rcx, [rdi + vp8_block_zbin]
mov rdx, [rdi + vp8_block_round]
movd xmm7, [rdi + vp8_block_zbin_extra]
; z
movdqa xmm0, [rax]
movdqa xmm1, [rax + 16]
; duplicate zbin_oq_value
pshuflw xmm7, xmm7, 0
punpcklwd xmm7, xmm7
movdqa xmm2, xmm0
movdqa xmm3, xmm1
; sz
psraw xmm0, 15
psraw xmm1, 15
; (z ^ sz)
pxor xmm2, xmm0
pxor xmm3, xmm1
; x = abs(z)
psubw xmm2, xmm0
psubw xmm3, xmm1
; zbin
movdqa xmm4, [rcx]
movdqa xmm5, [rcx + 16]
; *zbin_ptr + zbin_oq_value
paddw xmm4, xmm7
paddw xmm5, xmm7
movdqa xmm6, xmm2
movdqa xmm7, xmm3
; x - (*zbin_ptr + zbin_oq_value)
psubw xmm6, xmm4
psubw xmm7, xmm5
; round
movdqa xmm4, [rdx]
movdqa xmm5, [rdx + 16]
mov rax, [rdi + vp8_block_quant_shift]
mov rcx, [rdi + vp8_block_quant]
mov rdx, [rdi + vp8_block_zrun_zbin_boost]
; x + round
paddw xmm2, xmm4
paddw xmm3, xmm5
; quant
movdqa xmm4, [rcx]
movdqa xmm5, [rcx + 16]
; y = x * quant_ptr >> 16
pmulhw xmm4, xmm2
pmulhw xmm5, xmm3
; y += x
paddw xmm2, xmm4
paddw xmm3, xmm5
pxor xmm4, xmm4
%if ABI_IS_32BIT
movdqa [rsp + qcoeff], xmm4
movdqa [rsp + qcoeff + 16], xmm4
%else
pxor xmm8, xmm8
%endif
; quant_shift
movdqa xmm5, [rax]
; zrun_zbin_boost
mov rax, rdx
%macro ZIGZAG_LOOP 5
; x
pextrw ecx, %4, %2
; if (x >= zbin)
sub cx, WORD PTR[rdx] ; x - zbin
lea rdx, [rdx + 2] ; zbin_boost_ptr++
jl .rq_zigzag_loop_%1 ; x < zbin
pextrw edi, %3, %2 ; y
; downshift by quant_shift[rc]
pextrb ecx, xmm5, %1 ; quant_shift[rc]
sar edi, cl ; also sets Z bit
je .rq_zigzag_loop_%1 ; !y
%if ABI_IS_32BIT
mov WORD PTR[rsp + qcoeff + %1 *2], di
%else
pinsrw %5, edi, %2 ; qcoeff[rc]
%endif
mov rdx, rax ; reset to b->zrun_zbin_boost
.rq_zigzag_loop_%1:
%endmacro
; in vp8_default_zig_zag1d order: see vp8/common/entropy.c
ZIGZAG_LOOP 0, 0, xmm2, xmm6, xmm4
ZIGZAG_LOOP 1, 1, xmm2, xmm6, xmm4
ZIGZAG_LOOP 4, 4, xmm2, xmm6, xmm4
ZIGZAG_LOOP 8, 0, xmm3, xmm7, xmm8
ZIGZAG_LOOP 5, 5, xmm2, xmm6, xmm4
ZIGZAG_LOOP 2, 2, xmm2, xmm6, xmm4
ZIGZAG_LOOP 3, 3, xmm2, xmm6, xmm4
ZIGZAG_LOOP 6, 6, xmm2, xmm6, xmm4
ZIGZAG_LOOP 9, 1, xmm3, xmm7, xmm8
ZIGZAG_LOOP 12, 4, xmm3, xmm7, xmm8
ZIGZAG_LOOP 13, 5, xmm3, xmm7, xmm8
ZIGZAG_LOOP 10, 2, xmm3, xmm7, xmm8
ZIGZAG_LOOP 7, 7, xmm2, xmm6, xmm4
ZIGZAG_LOOP 11, 3, xmm3, xmm7, xmm8
ZIGZAG_LOOP 14, 6, xmm3, xmm7, xmm8
ZIGZAG_LOOP 15, 7, xmm3, xmm7, xmm8
mov rcx, [rsi + vp8_blockd_dequant]
mov rdi, [rsi + vp8_blockd_dqcoeff]
%if ABI_IS_32BIT
movdqa xmm4, [rsp + qcoeff]
movdqa xmm5, [rsp + qcoeff + 16]
%else
%define xmm5 xmm8
%endif
; y ^ sz
pxor xmm4, xmm0
pxor xmm5, xmm1
; x = (y ^ sz) - sz
psubw xmm4, xmm0
psubw xmm5, xmm1
; dequant
movdqa xmm0, [rcx]
movdqa xmm1, [rcx + 16]
mov rcx, [rsi + vp8_blockd_qcoeff]
pmullw xmm0, xmm4
pmullw xmm1, xmm5
; store qcoeff
movdqa [rcx], xmm4
movdqa [rcx + 16], xmm5
; store dqcoeff
movdqa [rdi], xmm0
movdqa [rdi + 16], xmm1
mov rcx, [rsi + vp8_blockd_eob]
; select the last value (in zig_zag order) for EOB
pxor xmm6, xmm6
pcmpeqw xmm4, xmm6
pcmpeqw xmm5, xmm6
packsswb xmm4, xmm5
pshufb xmm4, [GLOBAL(zig_zag1d)]
pmovmskb edx, xmm4
xor rdi, rdi
mov eax, -1
xor dx, ax
bsr eax, edx
sub edi, edx
sar edi, 31
add eax, 1
and eax, edi
mov BYTE PTR [rcx], al ; store eob
; begin epilog
%if ABI_IS_32BIT
add rsp, stack_size
pop rsp
pop rsi
pop rdi
RESTORE_GOT
pop rbp
%else
%undef xmm5
%if LIBVPX_YASM_WIN64
pop rsi
pop rdi
RESTORE_XMM
%endif
%endif
ret
SECTION_RODATA
align 16
; vp8/common/entropy.c: vp8_default_zig_zag1d
zig_zag1d:
db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15

View File

@@ -0,0 +1,128 @@
/*
* Copyright (c) 2012 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <smmintrin.h> /* SSE4.1 */
#include "./vp8_rtcd.h"
#include "vp8/encoder/block.h"
#include "vp8/common/entropy.h" /* vp8_default_inv_zig_zag */
#define SELECT_EOB(i, z, x, y, q) \
do { \
short boost = *zbin_boost_ptr; \
short x_z = _mm_extract_epi16(x, z); \
short y_z = _mm_extract_epi16(y, z); \
int cmp = (x_z < boost) | (y_z == 0); \
zbin_boost_ptr++; \
if (cmp) \
break; \
q = _mm_insert_epi16(q, y_z, z); \
eob = i; \
zbin_boost_ptr = b->zrun_zbin_boost; \
} while (0)
void vp8_regular_quantize_b_sse4_1(BLOCK *b, BLOCKD *d) {
char eob = 0;
short *zbin_boost_ptr = b->zrun_zbin_boost;
__m128i sz0, x0, sz1, x1, y0, y1, x_minus_zbin0, x_minus_zbin1,
dqcoeff0, dqcoeff1;
__m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift));
__m128i quant_shift1 = _mm_load_si128((__m128i *)(b->quant_shift + 8));
__m128i z0 = _mm_load_si128((__m128i *)(b->coeff));
__m128i z1 = _mm_load_si128((__m128i *)(b->coeff+8));
__m128i zbin_extra = _mm_cvtsi32_si128(b->zbin_extra);
__m128i zbin0 = _mm_load_si128((__m128i *)(b->zbin));
__m128i zbin1 = _mm_load_si128((__m128i *)(b->zbin + 8));
__m128i round0 = _mm_load_si128((__m128i *)(b->round));
__m128i round1 = _mm_load_si128((__m128i *)(b->round + 8));
__m128i quant0 = _mm_load_si128((__m128i *)(b->quant));
__m128i quant1 = _mm_load_si128((__m128i *)(b->quant + 8));
__m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));
__m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));
__m128i qcoeff0 = _mm_setzero_si128();
__m128i qcoeff1 = _mm_setzero_si128();
/* Duplicate to all lanes. */
zbin_extra = _mm_shufflelo_epi16(zbin_extra, 0);
zbin_extra = _mm_unpacklo_epi16(zbin_extra, zbin_extra);
/* Sign of z: z >> 15 */
sz0 = _mm_srai_epi16(z0, 15);
sz1 = _mm_srai_epi16(z1, 15);
/* x = abs(z): (z ^ sz) - sz */
x0 = _mm_xor_si128(z0, sz0);
x1 = _mm_xor_si128(z1, sz1);
x0 = _mm_sub_epi16(x0, sz0);
x1 = _mm_sub_epi16(x1, sz1);
/* zbin[] + zbin_extra */
zbin0 = _mm_add_epi16(zbin0, zbin_extra);
zbin1 = _mm_add_epi16(zbin1, zbin_extra);
/* In C x is compared to zbin where zbin = zbin[] + boost + extra. Rebalance
* the equation because boost is the only value which can change:
* x - (zbin[] + extra) >= boost */
x_minus_zbin0 = _mm_sub_epi16(x0, zbin0);
x_minus_zbin1 = _mm_sub_epi16(x1, zbin1);
/* All the remaining calculations are valid whether they are done now with
* simd or later inside the loop one at a time. */
x0 = _mm_add_epi16(x0, round0);
x1 = _mm_add_epi16(x1, round1);
y0 = _mm_mulhi_epi16(x0, quant0);
y1 = _mm_mulhi_epi16(x1, quant1);
y0 = _mm_add_epi16(y0, x0);
y1 = _mm_add_epi16(y1, x1);
/* Instead of shifting each value independently we convert the scaling
* factor with 1 << (16 - shift) so we can use multiply/return high half. */
y0 = _mm_mulhi_epi16(y0, quant_shift0);
y1 = _mm_mulhi_epi16(y1, quant_shift1);
/* Return the sign: (y ^ sz) - sz */
y0 = _mm_xor_si128(y0, sz0);
y1 = _mm_xor_si128(y1, sz1);
y0 = _mm_sub_epi16(y0, sz0);
y1 = _mm_sub_epi16(y1, sz1);
/* The loop gets unrolled anyway. Avoid the vp8_default_zig_zag1d lookup. */
SELECT_EOB(1, 0, x_minus_zbin0, y0, qcoeff0);
SELECT_EOB(2, 1, x_minus_zbin0, y0, qcoeff0);
SELECT_EOB(3, 4, x_minus_zbin0, y0, qcoeff0);
SELECT_EOB(4, 0, x_minus_zbin1, y1, qcoeff1);
SELECT_EOB(5, 5, x_minus_zbin0, y0, qcoeff0);
SELECT_EOB(6, 2, x_minus_zbin0, y0, qcoeff0);
SELECT_EOB(7, 3, x_minus_zbin0, y0, qcoeff0);
SELECT_EOB(8, 6, x_minus_zbin0, y0, qcoeff0);
SELECT_EOB(9, 1, x_minus_zbin1, y1, qcoeff1);
SELECT_EOB(10, 4, x_minus_zbin1, y1, qcoeff1);
SELECT_EOB(11, 5, x_minus_zbin1, y1, qcoeff1);
SELECT_EOB(12, 2, x_minus_zbin1, y1, qcoeff1);
SELECT_EOB(13, 7, x_minus_zbin0, y0, qcoeff0);
SELECT_EOB(14, 3, x_minus_zbin1, y1, qcoeff1);
SELECT_EOB(15, 6, x_minus_zbin1, y1, qcoeff1);
SELECT_EOB(16, 7, x_minus_zbin1, y1, qcoeff1);
_mm_store_si128((__m128i *)(d->qcoeff), qcoeff0);
_mm_store_si128((__m128i *)(d->qcoeff + 8), qcoeff1);
dqcoeff0 = _mm_mullo_epi16(qcoeff0, dequant0);
dqcoeff1 = _mm_mullo_epi16(qcoeff1, dequant1);
_mm_store_si128((__m128i *)(d->dqcoeff), dqcoeff0);
_mm_store_si128((__m128i *)(d->dqcoeff + 8), dqcoeff1);
*d->eob = eob;
}


@@ -107,7 +107,6 @@ VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/variance_impl_ssse3.asm
VP8_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/sad_sse4.asm
ifeq ($(CONFIG_POSTPROC),yes)
VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/postproc_x86.c
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/postproc_mmx.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/mfqe_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/postproc_sse2.asm


@@ -89,6 +89,7 @@ VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.c
VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.c
VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/quantize_sse4.c
ifeq ($(CONFIG_TEMPORAL_DENOISING),yes)
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/denoising_sse2.c
@@ -97,7 +98,6 @@ endif
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/subtract_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp8_enc_stubs_sse2.c
VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/quantize_sse4.asm
VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm
VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm
VP8_CX_SRCS-$(ARCH_X86_64) += encoder/x86/ssim_opt_x86_64.asm


@@ -25,12 +25,14 @@ void vp9_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride,
// Account for the vertical phase needing 3 lines prior and 4 lines post
int intermediate_height = h + 7;
if (x_step_q4 != 16 || y_step_q4 != 16)
return vp9_convolve8_c(src, src_stride,
dst, dst_stride,
filter_x, x_step_q4,
filter_y, y_step_q4,
w, h);
if (x_step_q4 != 16 || y_step_q4 != 16) {
vp9_convolve8_c(src, src_stride,
dst, dst_stride,
filter_x, x_step_q4,
filter_y, y_step_q4,
w, h);
return;
}
/* Filter starting 3 lines back. The neon implementation will ignore the
* given height and filter a multiple of 4 lines. Since this goes in to
@@ -57,12 +59,14 @@ void vp9_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
DECLARE_ALIGNED_ARRAY(8, uint8_t, temp, 64 * 72);
int intermediate_height = h + 7;
if (x_step_q4 != 16 || y_step_q4 != 16)
return vp9_convolve8_avg_c(src, src_stride,
dst, dst_stride,
filter_x, x_step_q4,
filter_y, y_step_q4,
w, h);
if (x_step_q4 != 16 || y_step_q4 != 16) {
vp9_convolve8_avg_c(src, src_stride,
dst, dst_stride,
filter_x, x_step_q4,
filter_y, y_step_q4,
w, h);
return;
}
/* This implementation has the same issues as above. In addition, we only want
* to average the values after both passes.


@@ -9,6 +9,7 @@
*/
#include "./vp9_rtcd.h"
#include "vpx/vpx_integer.h"
void vp9_lpf_horizontal_8_dual_neon(uint8_t *s, int p /* pitch */,
const uint8_t *blimit0,


@@ -109,7 +109,9 @@ void vp9_free_frame_buffers(VP9_COMMON *cm) {
}
vp9_free_frame_buffer(&cm->post_proc_buffer);
}
void vp9_free_context_buffers(VP9_COMMON *cm) {
free_mi(cm);
vpx_free(cm->last_frame_seg_map);
@@ -165,37 +167,55 @@ int vp9_resize_frame_buffers(VP9_COMMON *cm, int width, int height) {
fail:
vp9_free_frame_buffers(cm);
vp9_free_context_buffers(cm);
return 1;
}
static void init_frame_bufs(VP9_COMMON *cm) {
int i;
cm->new_fb_idx = FRAME_BUFFERS - 1;
cm->frame_bufs[cm->new_fb_idx].ref_count = 1;
for (i = 0; i < REF_FRAMES; ++i) {
cm->ref_frame_map[i] = i;
cm->frame_bufs[i].ref_count = 1;
}
}
int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) {
const int aligned_width = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2);
const int aligned_height = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2);
int i;
const int ss_x = cm->subsampling_x;
const int ss_y = cm->subsampling_y;
int i;
vp9_free_frame_buffers(cm);
for (i = 0; i < FRAME_BUFFERS; i++) {
for (i = 0; i < FRAME_BUFFERS; ++i) {
cm->frame_bufs[i].ref_count = 0;
if (vp9_alloc_frame_buffer(&cm->frame_bufs[i].buf, width, height,
ss_x, ss_y, VP9_ENC_BORDER_IN_PIXELS) < 0)
goto fail;
}
cm->new_fb_idx = FRAME_BUFFERS - 1;
cm->frame_bufs[cm->new_fb_idx].ref_count = 1;
for (i = 0; i < REF_FRAMES; i++) {
cm->ref_frame_map[i] = i;
cm->frame_bufs[i].ref_count = 1;
}
init_frame_bufs(cm);
if (vp9_alloc_frame_buffer(&cm->post_proc_buffer, width, height, ss_x, ss_y,
VP9_ENC_BORDER_IN_PIXELS) < 0)
goto fail;
return 0;
fail:
vp9_free_frame_buffers(cm);
return 1;
}
int vp9_alloc_context_buffers(VP9_COMMON *cm, int width, int height) {
const int aligned_width = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2);
const int aligned_height = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2);
vp9_free_context_buffers(cm);
set_mb_mi(cm, aligned_width, aligned_height);
if (alloc_mi(cm, cm->mi_stride * (cm->mi_rows + MI_BLOCK_SIZE)))
@@ -224,12 +244,13 @@ int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) {
return 0;
fail:
vp9_free_frame_buffers(cm);
vp9_free_context_buffers(cm);
return 1;
}
void vp9_remove_common(VP9_COMMON *cm) {
vp9_free_frame_buffers(cm);
vp9_free_context_buffers(cm);
vp9_free_internal_frame_buffers(&cm->int_frame_buffers);
}


@@ -23,8 +23,12 @@ void vp9_remove_common(struct VP9Common *cm);
int vp9_resize_frame_buffers(struct VP9Common *cm, int width, int height);
int vp9_alloc_frame_buffers(struct VP9Common *cm, int width, int height);
int vp9_alloc_state_buffers(struct VP9Common *cm, int width, int height);
int vp9_alloc_context_buffers(struct VP9Common *cm, int width, int height);
void vp9_free_frame_buffers(struct VP9Common *cm);
void vp9_free_state_buffers(struct VP9Common *cm);
void vp9_free_context_buffers(struct VP9Common *cm);
void vp9_update_frame_size(struct VP9Common *cm);


@@ -32,6 +32,9 @@ extern "C" {
#define BLOCK_SIZE_GROUPS 4
#define SKIP_CONTEXTS 3
#define INTER_MODE_CONTEXTS 7
#if CONFIG_COPY_CODING
#define COPY_MODE_CONTEXTS 5
#endif
/* Segment Feature Masks */
#define MAX_MV_REF_CANDIDATES 2
@@ -79,6 +82,16 @@ typedef enum {
MB_MODE_COUNT
} PREDICTION_MODE;
#if CONFIG_COPY_CODING
typedef enum {
NOREF,
REF0,
REF1,
REF2,
COPY_MODE_COUNT
} COPY_MODE;
#endif
static INLINE int is_inter_mode(PREDICTION_MODE mode) {
return mode >= NEARESTMV && mode <= NEWMV;
}
@@ -118,11 +131,86 @@ static INLINE int mi_width_log2(BLOCK_SIZE sb_type) {
return mi_width_log2_lookup[sb_type];
}
#if CONFIG_SUPERTX
static INLINE TX_SIZE bsize_to_tx_size(BLOCK_SIZE bsize) {
const TX_SIZE tx_size_lookup[BLOCK_SIZES] = {
TX_4X4, TX_4X4, TX_4X4,
TX_8X8, TX_8X8, TX_8X8,
TX_16X16, TX_16X16, TX_16X16,
TX_32X32, TX_32X32, TX_32X32, TX_32X32};
return tx_size_lookup[bsize];
}
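/* i.e. the largest square transform that fits in the block: 8x16 and
 * 16x8 map to TX_8X8, 32x64 and 64x32 to TX_32X32, and so on. */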
#endif
#if CONFIG_MASKED_INTERINTER
#define MASK_BITS_SML 3
#define MASK_BITS_MED 4
#define MASK_BITS_BIG 5
#define MASK_NONE -1
static INLINE int get_mask_bits(BLOCK_SIZE sb_type) {
if (sb_type < BLOCK_8X8)
return 0;
if (sb_type <= BLOCK_8X8)
return MASK_BITS_SML;
else if (sb_type <= BLOCK_32X32)
return MASK_BITS_MED;
else
return MASK_BITS_BIG;
}
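/* So sub-8x8 blocks carry no mask, 8x8 picks from 2^3 = 8 masks, blocks
 * up to 32x32 from 16, and larger blocks from 32 (see the mask_params
 * tables in vp9_reconinter.c below). */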
#endif
#if CONFIG_INTERINTRA
static INLINE TX_SIZE intra_size_log2_for_interintra(int bs) {
switch (bs) {
case 4:
return TX_4X4;
case 8:
return TX_8X8;
case 16:
return TX_16X16;
case 32:
return TX_32X32;
default:
return TX_32X32;
}
}
static INLINE int is_interintra_allowed(BLOCK_SIZE sb_type) {
return ((sb_type >= BLOCK_8X8) && (sb_type < BLOCK_64X64));
}
#if CONFIG_MASKED_INTERINTRA
#define MASK_BITS_SML_INTERINTRA 3
#define MASK_BITS_MED_INTERINTRA 4
#define MASK_BITS_BIG_INTERINTRA 5
#define MASK_NONE_INTERINTRA -1
static INLINE int get_mask_bits_interintra(BLOCK_SIZE sb_type) {
if (sb_type == BLOCK_4X4)
return 0;
if (sb_type <= BLOCK_8X8)
return MASK_BITS_SML_INTERINTRA;
else if (sb_type <= BLOCK_32X32)
return MASK_BITS_MED_INTERINTRA;
else
return MASK_BITS_BIG_INTERINTRA;
}
#endif
#endif
// This structure now relates to 8x8 block regions.
typedef struct {
// Common for both INTER and INTRA blocks
BLOCK_SIZE sb_type;
PREDICTION_MODE mode;
#if CONFIG_FILTERINTRA
int filterbit, uv_filterbit;
#endif
TX_SIZE tx_size;
uint8_t skip;
uint8_t segment_id;
@@ -137,10 +225,34 @@ typedef struct {
int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
uint8_t mode_context[MAX_REF_FRAMES];
INTERP_FILTER interp_filter;
#if CONFIG_EXT_TX
EXT_TX_TYPE ext_txfrm;
#endif
#if CONFIG_MASKED_INTERINTER
int use_masked_interinter;
int mask_index;
#endif
#if CONFIG_INTERINTRA
PREDICTION_MODE interintra_mode, interintra_uv_mode;
#if CONFIG_MASKED_INTERINTRA
int interintra_mask_index;
int interintra_uv_mask_index;
int use_masked_interintra;
#endif
#endif
#if CONFIG_COPY_CODING
COPY_MODE copy_mode;
int inter_ref_count;
#endif
} MB_MODE_INFO;
typedef struct {
MB_MODE_INFO mbmi;
#if CONFIG_FILTERINTRA
int b_filter_info[4];
#endif
b_mode_info bmi[4];
} MODE_INFO;
@@ -149,6 +261,16 @@ static INLINE PREDICTION_MODE get_y_mode(const MODE_INFO *mi, int block) {
: mi->mbmi.mode;
}
#if CONFIG_FILTERINTRA
static INLINE int is_filter_allowed(PREDICTION_MODE mode) {
return 1;
}
static INLINE int is_filter_enabled(TX_SIZE txsize) {
return (txsize <= TX_32X32);
}
#endif
static INLINE int is_inter_block(const MB_MODE_INFO *mbmi) {
return mbmi->ref_frame[0] > INTRA_FRAME;
}
@@ -240,6 +362,13 @@ typedef struct macroblockd {
PARTITION_CONTEXT left_seg_context[8];
} MACROBLOCKD;
#if CONFIG_SUPERTX
static INLINE int supertx_enabled(const MB_MODE_INFO *mbmi) {
return mbmi->tx_size >
MIN(b_width_log2(mbmi->sb_type), b_height_log2(mbmi->sb_type));
}
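/* Example: a BLOCK_8X8 normally caps out at TX_8X8 (tx_size 1, equal to
 * min(b_width_log2, b_height_log2) = 1), so seeing TX_16X16 there marks
 * the block as covered by a supertx transform. */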
#endif
static INLINE BLOCK_SIZE get_subsize(BLOCK_SIZE bsize,
PARTITION_TYPE partition) {
const BLOCK_SIZE subsize = subsize_lookup[partition][bsize];
@@ -253,8 +382,20 @@ static INLINE TX_TYPE get_tx_type(PLANE_TYPE plane_type,
const MACROBLOCKD *xd) {
const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
#if !CONFIG_EXT_TX
if (plane_type != PLANE_TYPE_Y || is_inter_block(mbmi))
return DCT_DCT;
#else
if (plane_type != PLANE_TYPE_Y)
return DCT_DCT;
if (is_inter_block(mbmi)) {
if (mbmi->ext_txfrm == NORM || mbmi->tx_size >= TX_32X32)
return DCT_DCT;
else
return ADST_ADST;
}
#endif
return intra_mode_to_tx_type_lookup[mbmi->mode];
}
@@ -262,8 +403,20 @@ static INLINE TX_TYPE get_tx_type_4x4(PLANE_TYPE plane_type,
const MACROBLOCKD *xd, int ib) {
const MODE_INFO *const mi = xd->mi[0];
#if !CONFIG_EXT_TX
if (plane_type != PLANE_TYPE_Y || xd->lossless || is_inter_block(&mi->mbmi))
return DCT_DCT;
#else
if (plane_type != PLANE_TYPE_Y || xd->lossless)
return DCT_DCT;
if (is_inter_block(&mi->mbmi)) {
if (mi->mbmi.ext_txfrm == NORM)
return DCT_DCT;
else
return ADST_ADST;
}
#endif
return intra_mode_to_tx_type_lookup[get_y_mode(mi, ib)];
}
@@ -281,7 +434,15 @@ static INLINE TX_SIZE get_uv_tx_size_impl(TX_SIZE y_tx_size, BLOCK_SIZE bsize) {
}
static INLINE TX_SIZE get_uv_tx_size(const MB_MODE_INFO *mbmi) {
#if CONFIG_SUPERTX
if (!supertx_enabled(mbmi)) {
#endif
return get_uv_tx_size_impl(mbmi->tx_size, mbmi->sb_type);
#if CONFIG_SUPERTX
} else {
return uvsupertx_size_lookup[mbmi->tx_size];
}
#endif
}
static INLINE BLOCK_SIZE get_plane_block_size(BLOCK_SIZE bsize,


@@ -133,6 +133,15 @@ const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2] = {
{{BLOCK_64X64, BLOCK_64X32}, {BLOCK_32X64, BLOCK_32X32}},
};
#if CONFIG_SUPERTX
const TX_SIZE uvsupertx_size_lookup[TX_SIZES] = {
TX_4X4,
TX_4X4,
TX_8X8,
TX_16X16
};
#endif
// Generates a 4 bit field in which each bit set to 1 represents
// a blocksize partition: 1111 means we split 64x64, 32x32, 16x16
// and 8x8; 1000 means we just split the 64x64 to 32x32.


@@ -31,6 +31,9 @@ extern const BLOCK_SIZE subsize_lookup[PARTITION_TYPES][BLOCK_SIZES];
extern const TX_SIZE max_txsize_lookup[BLOCK_SIZES];
extern const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES];
extern const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2];
#if CONFIG_SUPERTX
extern const TX_SIZE uvsupertx_size_lookup[TX_SIZES];
#endif
#ifdef __cplusplus
} // extern "C"


@@ -117,17 +117,25 @@ static void convolve(const uint8_t *src, ptrdiff_t src_stride,
const InterpKernel *const y_filters,
int y0_q4, int y_step_q4,
int w, int h) {
// Fixed size intermediate buffer places limits on parameters.
// Maximum intermediate_height is 324, for y_step_q4 == 80,
// h == 64, taps == 8.
// y_step_q4 of 80 allows for 1/10 scale for 5 layer svc
uint8_t temp[64 * 324];
// Note: Fixed size intermediate buffer, temp, places limits on parameters.
// 2d filtering proceeds in 2 steps:
// (1) Interpolate horizontally into an intermediate buffer, temp.
// (2) Interpolate temp vertically to derive the sub-pixel result.
// Deriving the maximum number of rows in the temp buffer (135):
// --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
// --Largest block size is 64x64 pixels.
// --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
// original frame (in 1/16th pixel units).
// --Must round-up because block may be located at sub-pixel position.
// --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
// --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
uint8_t temp[135 * 64];
int intermediate_height = (((h - 1) * y_step_q4 + 15) >> 4) + SUBPEL_TAPS;
assert(w <= 64);
assert(h <= 64);
assert(y_step_q4 <= 80);
assert(x_step_q4 <= 80);
assert(y_step_q4 <= 32);
assert(x_step_q4 <= 32);
if (intermediate_height < h)
intermediate_height = h;


@@ -13,6 +13,84 @@
#include "vp9/common/vp9_onyxc_int.h"
#include "vp9/common/vp9_seg_common.h"
#if CONFIG_MASKED_INTERINTER
static const vp9_prob default_masked_interinter_prob[BLOCK_SIZES] = {
192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192
};
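/* A vp9_prob is the probability of coding the 0 branch, scaled to 1..255;
 * 192/256 biases every block size toward leaving the masked mode off. */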
#endif
#if CONFIG_INTERINTRA
static const vp9_prob default_interintra_prob[BLOCK_SIZES] = {
192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192
};
#if CONFIG_MASKED_INTERINTRA
static const vp9_prob default_masked_interintra_prob[BLOCK_SIZES] = {
192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192
};
#endif
#endif
#if CONFIG_FILTERINTRA
static const vp9_prob default_filterintra_prob[TX_SIZES][INTRA_MODES] = {
// DC V H D45 D135 D117 D153 D207 D63 TM
{153, 171, 147, 150, 129, 101, 100, 153, 132, 111},
{171, 173, 185, 131, 70, 53, 70, 148, 127, 114},
{175, 203, 213, 86, 45, 71, 41, 150, 125, 154},
{235, 230, 154, 202, 154, 205, 37, 128, 0, 202}
};
#endif
#if CONFIG_EXT_TX
static const vp9_prob default_ext_tx_prob = 178;
#endif
#if CONFIG_SUPERTX
static const vp9_prob default_supertx_prob[TX_SIZES] = {
255, 160, 160, 160
};
static const vp9_prob default_supertxsplit_prob[TX_SIZES] = {
255, 200, 200, 200
};
#endif
#if CONFIG_COPY_CODING
static const vp9_prob default_copy_noref_prob[COPY_MODE_CONTEXTS]
[BLOCK_SIZES] = {
{255, 255, 255, 82, 148, 182, 65, 193, 158, 70, 138, 101, 23},
{255, 255, 255, 118, 153, 161, 123, 169, 157, 82, 101, 123, 88},
{255, 255, 255, 130, 178, 226, 194, 196, 174, 173, 135, 144, 141},
{255, 255, 255, 178, 218, 225, 197, 230, 222, 215, 220, 220, 220},
{255, 255, 255, 243, 248, 241, 233, 249, 249, 249, 249, 249, 249}
};
static const vp9_prob default_copy_mode_probs_l2[COPY_MODE_CONTEXTS][1] = {
{207},
{135},
{141},
{189},
{209}
};
const vp9_tree_index vp9_copy_mode_tree_l2[TREE_SIZE(2)] = {
-(REF0 - REF0), -(REF1 - REF0)
};
static const vp9_prob default_copy_mode_probs[COPY_MODE_CONTEXTS]
[COPY_MODE_COUNT - 2] = {
{130, 159},
{126, 176},
{120, 150},
{158, 183},
{149, 125}
};
const vp9_tree_index vp9_copy_mode_tree[TREE_SIZE(COPY_MODE_COUNT - 1)] = {
-(REF0 - REF0), 2,
-(REF1 - REF0), -(REF2 - REF0)
};
#endif
const vp9_prob vp9_kf_y_mode_prob[INTRA_MODES][INTRA_MODES][INTRA_MODES - 1] = {
{ // above = dc
{ 137, 30, 42, 148, 151, 207, 70, 52, 91 }, // left = dc
@@ -245,7 +323,11 @@ const vp9_tree_index vp9_partition_tree[TREE_SIZE(PARTITION_TYPES)] = {
};
static const vp9_prob default_intra_inter_p[INTRA_INTER_CONTEXTS] = {
#if !CONFIG_COPY_CODING
9, 102, 187, 225
#else
35, 112, 187, 225
#endif
};
static const vp9_prob default_comp_inter_p[COMP_INTER_CONTEXTS] = {
@@ -326,6 +408,30 @@ void vp9_init_mode_probs(FRAME_CONTEXT *fc) {
fc->tx_probs = default_tx_probs;
vp9_copy(fc->skip_probs, default_skip_probs);
vp9_copy(fc->inter_mode_probs, default_inter_mode_probs);
#if CONFIG_MASKED_INTERINTER
vp9_copy(fc->masked_interinter_prob, default_masked_interinter_prob);
#endif
#if CONFIG_INTERINTRA
vp9_copy(fc->interintra_prob, default_interintra_prob);
#if CONFIG_MASKED_INTERINTRA
vp9_copy(fc->masked_interintra_prob, default_masked_interintra_prob);
#endif
#endif
#if CONFIG_FILTERINTRA
vp9_copy(fc->filterintra_prob, default_filterintra_prob);
#endif
#if CONFIG_EXT_TX
fc->ext_tx_prob = default_ext_tx_prob;
#endif
#if CONFIG_SUPERTX
vp9_copy(fc->supertx_prob, default_supertx_prob);
vp9_copy(fc->supertxsplit_prob, default_supertxsplit_prob);
#endif
#if CONFIG_COPY_CODING
vp9_copy(fc->copy_noref_prob, default_copy_noref_prob);
vp9_copy(fc->copy_mode_probs_l2, default_copy_mode_probs_l2);
vp9_copy(fc->copy_mode_probs, default_copy_mode_probs);
#endif
}
const vp9_tree_index vp9_switchable_interp_tree
@@ -416,6 +522,73 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) {
for (i = 0; i < SKIP_CONTEXTS; ++i)
fc->skip_probs[i] = adapt_prob(pre_fc->skip_probs[i], counts->skip[i]);
#if CONFIG_MASKED_INTERINTER
if (cm->use_masked_interinter) {
for (i = 0; i < BLOCK_SIZES; ++i) {
if (get_mask_bits(i))
fc->masked_interinter_prob[i] = adapt_prob
(pre_fc->masked_interinter_prob[i],
counts->masked_interinter[i]);
}
}
#endif
#if CONFIG_INTERINTRA
if (cm->use_interintra) {
for (i = 0; i < BLOCK_SIZES; ++i) {
if (is_interintra_allowed(i))
fc->interintra_prob[i] = adapt_prob(pre_fc->interintra_prob[i],
counts->interintra[i]);
}
#if CONFIG_MASKED_INTERINTRA
if (cm->use_masked_interintra) {
for (i = 0; i < BLOCK_SIZES; ++i) {
if (is_interintra_allowed(i) && get_mask_bits_interintra(i))
fc->masked_interintra_prob[i] = adapt_prob(
pre_fc->masked_interintra_prob[i],
counts->masked_interintra[i]);
}
}
#endif
}
#endif
#if CONFIG_FILTERINTRA
for (i = 0; i < TX_SIZES; ++i)
for (j = 0; j < INTRA_MODES; ++j)
fc->filterintra_prob[i][j] = adapt_prob(pre_fc->filterintra_prob[i][j],
counts->filterintra[i][j]);
#endif
#if CONFIG_EXT_TX
fc->ext_tx_prob = adapt_prob(pre_fc->ext_tx_prob, counts->ext_tx);
#endif
#if CONFIG_SUPERTX
for (i = 1; i < TX_SIZES; ++i) {
fc->supertx_prob[i] = adapt_prob(pre_fc->supertx_prob[i],
counts->supertx[i]);
}
for (i = 1; i < TX_SIZES; ++i) {
fc->supertxsplit_prob[i] = adapt_prob(pre_fc->supertxsplit_prob[i],
counts->supertxsplit[i]);
}
#endif
#if CONFIG_COPY_CODING
for (i = 0; i < COPY_MODE_CONTEXTS; i++) {
for (j = BLOCK_8X8; j < BLOCK_SIZES; j++) {
fc->copy_noref_prob[i][j] =
adapt_prob(pre_fc->copy_noref_prob[i][j], counts->copy_noref[i][j]);
}
adapt_probs(vp9_copy_mode_tree_l2, pre_fc->copy_mode_probs_l2[i],
counts->copy_mode_l2[i], fc->copy_mode_probs_l2[i]);
adapt_probs(vp9_copy_mode_tree, pre_fc->copy_mode_probs[i],
counts->copy_mode[i], fc->copy_mode_probs[i]);
}
#endif
}
static void set_default_lf_deltas(struct loopfilter *lf) {


@@ -52,6 +52,30 @@ typedef struct frame_contexts {
struct tx_probs tx_probs;
vp9_prob skip_probs[SKIP_CONTEXTS];
nmv_context nmvc;
#if CONFIG_MASKED_INTERINTER
vp9_prob masked_interinter_prob[BLOCK_SIZES];
#endif
#if CONFIG_INTERINTRA
vp9_prob interintra_prob[BLOCK_SIZES];
#if CONFIG_MASKED_INTERINTRA
vp9_prob masked_interintra_prob[BLOCK_SIZES];
#endif
#endif
#if CONFIG_FILTERINTRA
vp9_prob filterintra_prob[TX_SIZES][INTRA_MODES];
#endif
#if CONFIG_EXT_TX
vp9_prob ext_tx_prob;
#endif
#if CONFIG_SUPERTX
vp9_prob supertx_prob[TX_SIZES];
vp9_prob supertxsplit_prob[TX_SIZES];
#endif
#if CONFIG_COPY_CODING
vp9_prob copy_noref_prob[COPY_MODE_CONTEXTS][BLOCK_SIZES];
vp9_prob copy_mode_probs_l2[COPY_MODE_CONTEXTS][1];
vp9_prob copy_mode_probs[COPY_MODE_CONTEXTS][COPY_MODE_COUNT - 2];
#endif
} FRAME_CONTEXT;
typedef struct {
@@ -71,6 +95,31 @@ typedef struct {
struct tx_counts tx;
unsigned int skip[SKIP_CONTEXTS][2];
nmv_context_counts mv;
#if CONFIG_MASKED_INTERINTER
unsigned int masked_interinter[BLOCK_SIZES][2];
#endif
#if CONFIG_INTERINTRA
unsigned int interintra[BLOCK_SIZES][2];
#if CONFIG_MASKED_INTERINTRA
unsigned int masked_interintra[BLOCK_SIZES][2];
#endif
#endif
#if CONFIG_FILTERINTRA
unsigned int filterintra[TX_SIZES][INTRA_MODES][2];
#endif
#if CONFIG_EXT_TX
unsigned int ext_tx[2];
#endif
#if CONFIG_SUPERTX
unsigned int supertx[TX_SIZES][2];
unsigned int supertxsplit[TX_SIZES][2];
unsigned int supertx_size[BLOCK_SIZES];
#endif
#if CONFIG_COPY_CODING
unsigned int copy_noref[COPY_MODE_CONTEXTS][BLOCK_SIZES][2];
unsigned int copy_mode_l2[COPY_MODE_CONTEXTS][2];
unsigned int copy_mode[COPY_MODE_CONTEXTS][COPY_MODE_COUNT - 1];
#endif
} FRAME_COUNTS;
extern const vp9_prob vp9_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
@@ -83,6 +132,10 @@ extern const vp9_tree_index vp9_inter_mode_tree[TREE_SIZE(INTER_MODES)];
extern const vp9_tree_index vp9_partition_tree[TREE_SIZE(PARTITION_TYPES)];
extern const vp9_tree_index vp9_switchable_interp_tree
[TREE_SIZE(SWITCHABLE_FILTERS)];
#if CONFIG_COPY_CODING
extern const vp9_tree_index vp9_copy_mode_tree_l2[TREE_SIZE(2)];
extern const vp9_tree_index vp9_copy_mode_tree[TREE_SIZE(COPY_MODE_COUNT - 1)];
#endif
void vp9_setup_past_independence(struct VP9Common *cm);


@@ -100,6 +100,14 @@ typedef enum {
TX_TYPES = 4
} TX_TYPE;
#if CONFIG_EXT_TX
typedef enum {
NORM = 0,
ALT = 1,
EXT_TX_TYPES = 2
} EXT_TX_TYPE;
#endif
typedef enum {
UNKNOWN = 0,
BT_601 = 1, // YUV


@@ -206,6 +206,13 @@ static const int mode_lf_lut[MB_MODE_COUNT] = {
1, 1, 0, 1 // INTER_MODES (ZEROMV == 0)
};
#if CONFIG_SUPERTX
static int supertx_enabled_lpf(const MB_MODE_INFO *mbmi) {
return mbmi->tx_size >
MIN(b_width_log2(mbmi->sb_type), b_height_log2(mbmi->sb_type));
}
#endif
static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) {
int lvl;
@@ -572,6 +579,85 @@ static void build_masks(const loop_filter_info_n *const lfi_n,
*int_4x4_uv |= (size_mask_uv[block_size] & 0xffff) << shift_uv;
}
#if CONFIG_SUPERTX
static void build_masks_supertx(const loop_filter_info_n *const lfi_n,
const MODE_INFO *mi, const int shift_y,
const int shift_uv,
LOOP_FILTER_MASK *lfm) {
const MB_MODE_INFO *mbmi = &mi->mbmi;
const TX_SIZE tx_size_y = mbmi->tx_size;
const TX_SIZE tx_size_uv = get_uv_tx_size(mbmi);
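/* 3 * tx_size maps each square transform to its matching square block
 * size (TX_8X8 -> BLOCK_8X8, TX_16X16 -> BLOCK_16X16, ...), since the
 * supertx transform spans the whole prediction region. */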
const BLOCK_SIZE block_size = 3 * (int)tx_size_y;
const int filter_level = get_filter_level(lfi_n, mbmi);
uint64_t *const left_y = &lfm->left_y[tx_size_y];
uint64_t *const above_y = &lfm->above_y[tx_size_y];
uint64_t *const int_4x4_y = &lfm->int_4x4_y;
uint16_t *const left_uv = &lfm->left_uv[tx_size_uv];
uint16_t *const above_uv = &lfm->above_uv[tx_size_uv];
uint16_t *const int_4x4_uv = &lfm->int_4x4_uv;
int i;
// If filter level is 0 we don't loop filter.
if (!filter_level) {
return;
} else {
const int w = num_8x8_blocks_wide_lookup[block_size];
const int h = num_8x8_blocks_high_lookup[block_size];
int index = shift_y;
for (i = 0; i < h; i++) {
vpx_memset(&lfm->lfl_y[index], filter_level, w);
index += 8;
}
}
// These set 1 in the current block size for the block size edges.
// For instance if the block size is 32x16, we'll set :
// above = 1111
// 0000
// and
// left = 1000
// = 1000
// NOTE: in this example the low bit is leftmost, so ( 1000 ) is stored
// as 1, not 8...
//
// U and V set things on a 16 bit scale.
//
*above_y |= above_prediction_mask[block_size] << shift_y;
*above_uv |= above_prediction_mask_uv[block_size] << shift_uv;
*left_y |= left_prediction_mask[block_size] << shift_y;
*left_uv |= left_prediction_mask_uv[block_size] << shift_uv;
// If the block has no coefficients and is not intra we skip applying
// the loop filter on block edges.
if (mbmi->skip && is_inter_block(mbmi))
return;
// Here we are adding a mask for the transform size. The transform
// size mask is set to be correct for a 64x64 prediction block size. We
// mask to match the size of the block we are working on and then shift it
// into place.
*above_y |= (size_mask[block_size] &
above_64x64_txform_mask[tx_size_y]) << shift_y;
*above_uv |= (size_mask_uv[block_size] &
above_64x64_txform_mask_uv[tx_size_uv]) << shift_uv;
*left_y |= (size_mask[block_size] &
left_64x64_txform_mask[tx_size_y]) << shift_y;
*left_uv |= (size_mask_uv[block_size] &
left_64x64_txform_mask_uv[tx_size_uv]) << shift_uv;
// Here we are trying to determine what to do with the internal 4x4 block
// boundaries. These differ from the 4x4 boundaries on the outside edge of
// an 8x8 in that the internal ones can be skipped and don't depend on
// the prediction block size.
if (tx_size_y == TX_4X4)
*int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffff) << shift_y;
if (tx_size_uv == TX_4X4)
*int_4x4_uv |= (size_mask_uv[block_size] & 0xffff) << shift_uv;
}
#endif
// This function does the same thing as the one above with the exception that
// it only affects the y masks. It exists because for blocks < 16x16 in size,
// we only update u and v masks on the first block.
@@ -615,6 +701,48 @@ static void build_y_mask(const loop_filter_info_n *const lfi_n,
*int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffff) << shift_y;
}
#if CONFIG_SUPERTX
static void build_y_mask_supertx(const loop_filter_info_n *const lfi_n,
const MODE_INFO *mi, const int shift_y,
LOOP_FILTER_MASK *lfm) {
const MB_MODE_INFO *mbmi = &mi->mbmi;
const TX_SIZE tx_size_y = mbmi->tx_size;
const BLOCK_SIZE block_size = 3 * (int)tx_size_y;
const int filter_level = get_filter_level(lfi_n, mbmi);
uint64_t *const left_y = &lfm->left_y[tx_size_y];
uint64_t *const above_y = &lfm->above_y[tx_size_y];
uint64_t *const int_4x4_y = &lfm->int_4x4_y;
int i;
if (!filter_level) {
return;
} else {
const int w = num_8x8_blocks_wide_lookup[block_size];
const int h = num_8x8_blocks_high_lookup[block_size];
int index = shift_y;
for (i = 0; i < h; i++) {
vpx_memset(&lfm->lfl_y[index], filter_level, w);
index += 8;
}
}
*above_y |= above_prediction_mask[block_size] << shift_y;
*left_y |= left_prediction_mask[block_size] << shift_y;
if (mbmi->skip && is_inter_block(mbmi))
return;
*above_y |= (size_mask[block_size] &
above_64x64_txform_mask[tx_size_y]) << shift_y;
*left_y |= (size_mask[block_size] &
left_64x64_txform_mask[tx_size_y]) << shift_y;
if (tx_size_y == TX_4X4)
*int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffff) << shift_y;
}
#endif
// This function sets up the bit masks for the entire 64x64 region represented
// by mi_row, mi_col.
// TODO(JBB): This function only works for yv12.
@@ -650,6 +778,9 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
cm->mi_rows - mi_row : MI_BLOCK_SIZE);
const int max_cols = (mi_col + MI_BLOCK_SIZE > cm->mi_cols ?
cm->mi_cols - mi_col : MI_BLOCK_SIZE);
#if CONFIG_SUPERTX
int supertx;
#endif
vp9_zero(*lfm);
@@ -687,20 +818,43 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
break;
case BLOCK_32X16:
#if CONFIG_SUPERTX
supertx = supertx_enabled_lpf(&mip[0]->mbmi);
if (!supertx) {
#endif
build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
if (mi_32_row_offset + 2 >= max_rows)
continue;
mip2 = mip + mode_info_stride * 2;
build_masks(lfi_n, mip2[0], shift_y + 16, shift_uv + 4, lfm);
#if CONFIG_SUPERTX
} else {
build_masks_supertx(lfi_n, mip[0], shift_y, shift_uv, lfm);
}
#endif
break;
case BLOCK_16X32:
#if CONFIG_SUPERTX
supertx = supertx_enabled_lpf(&mip[0]->mbmi);
if (!supertx) {
#endif
build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
if (mi_32_col_offset + 2 >= max_cols)
continue;
mip2 = mip + 2;
build_masks(lfi_n, mip2[0], shift_y + 2, shift_uv + 1, lfm);
#if CONFIG_SUPERTX
} else {
build_masks_supertx(lfi_n, mip[0], shift_y, shift_uv, lfm);
}
#endif
break;
default:
#if CONFIG_SUPERTX
if (mip[0]->mbmi.tx_size == TX_32X32) {
build_masks_supertx(lfi_n, mip[0], shift_y, shift_uv, lfm);
} else {
#endif
for (idx_16 = 0; idx_16 < 4; mip += offset_16[idx_16], ++idx_16) {
const int shift_y = shift_32_y[idx_32] + shift_16_y[idx_16];
const int shift_uv = shift_32_uv[idx_32] + shift_16_uv[idx_16];
@@ -717,24 +871,56 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
break;
case BLOCK_16X8:
#if CONFIG_SUPERTX
supertx = supertx_enabled_lpf(&mip[0]->mbmi);
if (!supertx) {
#endif
build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
if (mi_16_row_offset + 1 >= max_rows)
continue;
mip2 = mip + mode_info_stride;
build_y_mask(lfi_n, mip2[0], shift_y + 8, lfm);
#if CONFIG_SUPERTX
} else {
build_masks_supertx(lfi_n, mip[0], shift_y, shift_uv, lfm);
}
#endif
break;
case BLOCK_8X16:
#if CONFIG_SUPERTX
supertx = supertx_enabled_lpf(&mip[0]->mbmi);
if (!supertx) {
#endif
build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
if (mi_16_col_offset + 1 >= max_cols)
continue;
mip2 = mip + 1;
build_y_mask(lfi_n, mip2[0], shift_y + 1, lfm);
#if CONFIG_SUPERTX
} else {
build_masks_supertx(lfi_n, mip[0], shift_y, shift_uv, lfm);
}
#endif
break;
default: {
#if CONFIG_SUPERTX
if (mip[0]->mbmi.tx_size == TX_16X16) {
build_masks_supertx(lfi_n, mip[0], shift_y, shift_uv, lfm);
} else {
#endif
const int shift_y = shift_32_y[idx_32] +
shift_16_y[idx_16] +
shift_8_y[0];
#if CONFIG_SUPERTX
supertx = supertx_enabled_lpf(&mip[0]->mbmi);
if (!supertx) {
#endif
build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
#if CONFIG_SUPERTX
} else {
build_masks_supertx(lfi_n, mip[0], shift_y, shift_uv, lfm);
}
#endif
mip += offset[0];
for (idx_8 = 1; idx_8 < 4; mip += offset[idx_8], ++idx_8) {
const int shift_y = shift_32_y[idx_32] +
@@ -748,12 +934,26 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
if (mi_8_col_offset >= max_cols ||
mi_8_row_offset >= max_rows)
continue;
#if CONFIG_SUPERTX
supertx = supertx_enabled_lpf(&mip[0]->mbmi);
if (!supertx)
#endif
build_y_mask(lfi_n, mip[0], shift_y, lfm);
#if CONFIG_SUPERTX
else
build_y_mask_supertx(lfi_n, mip[0], shift_y, lfm);
#endif
}
#if CONFIG_SUPERTX
}
#endif
break;
}
}
}
#if CONFIG_SUPERTX
}
#endif
break;
}
}


@@ -11,181 +11,6 @@
#include "vp9/common/vp9_mvref_common.h"
#define MVREF_NEIGHBOURS 8
typedef struct position {
int row;
int col;
} POSITION;
typedef enum {
BOTH_ZERO = 0,
ZERO_PLUS_PREDICTED = 1,
BOTH_PREDICTED = 2,
NEW_PLUS_NON_INTRA = 3,
BOTH_NEW = 4,
INTRA_PLUS_NON_INTRA = 5,
BOTH_INTRA = 6,
INVALID_CASE = 9
} motion_vector_context;
// This is used to figure out a context for the ref blocks. The code flattens
// an array that would have 3 possible counts (0, 1 & 2) for 3 choices by
// adding 9 for each intra block, 3 for each zero mv and 1 for each new
// motion vector. This single number is then converted into a context
// with a single lookup ( counter_to_context ).
static const int mode_2_counter[MB_MODE_COUNT] = {
9, // DC_PRED
9, // V_PRED
9, // H_PRED
9, // D45_PRED
9, // D135_PRED
9, // D117_PRED
9, // D153_PRED
9, // D207_PRED
9, // D63_PRED
9, // TM_PRED
0, // NEARESTMV
0, // NEARMV
3, // ZEROMV
1, // NEWMV
};
// There are 3^3 different combinations of 3 counts that can be either 0,1 or
// 2. However the actual count can never be greater than 2 so the highest
// counter we need is 18. 9 is an invalid counter that's never used.
static const int counter_to_context[19] = {
BOTH_PREDICTED, // 0
NEW_PLUS_NON_INTRA, // 1
BOTH_NEW, // 2
ZERO_PLUS_PREDICTED, // 3
NEW_PLUS_NON_INTRA, // 4
INVALID_CASE, // 5
BOTH_ZERO, // 6
INVALID_CASE, // 7
INVALID_CASE, // 8
INTRA_PLUS_NON_INTRA, // 9
INTRA_PLUS_NON_INTRA, // 10
INVALID_CASE, // 11
INTRA_PLUS_NON_INTRA, // 12
INVALID_CASE, // 13
INVALID_CASE, // 14
INVALID_CASE, // 15
INVALID_CASE, // 16
INVALID_CASE, // 17
BOTH_INTRA // 18
};
static const POSITION mv_ref_blocks[BLOCK_SIZES][MVREF_NEIGHBOURS] = {
// 4X4
{{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}},
// 4X8
{{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}},
// 8X4
{{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}},
// 8X8
{{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}},
// 8X16
{{0, -1}, {-1, 0}, {1, -1}, {-1, -1}, {0, -2}, {-2, 0}, {-2, -1}, {-1, -2}},
// 16X8
{{-1, 0}, {0, -1}, {-1, 1}, {-1, -1}, {-2, 0}, {0, -2}, {-1, -2}, {-2, -1}},
// 16X16
{{-1, 0}, {0, -1}, {-1, 1}, {1, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-3, -3}},
// 16X32
{{0, -1}, {-1, 0}, {2, -1}, {-1, -1}, {-1, 1}, {0, -3}, {-3, 0}, {-3, -3}},
// 32X16
{{-1, 0}, {0, -1}, {-1, 2}, {-1, -1}, {1, -1}, {-3, 0}, {0, -3}, {-3, -3}},
// 32X32
{{-1, 1}, {1, -1}, {-1, 2}, {2, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-3, -3}},
// 32X64
{{0, -1}, {-1, 0}, {4, -1}, {-1, 2}, {-1, -1}, {0, -3}, {-3, 0}, {2, -1}},
// 64X32
{{-1, 0}, {0, -1}, {-1, 4}, {2, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-1, 2}},
// 64X64
{{-1, 3}, {3, -1}, {-1, 4}, {4, -1}, {-1, -1}, {-1, 0}, {0, -1}, {-1, 6}}
};
static const int idx_n_column_to_subblock[4][2] = {
{1, 2},
{1, 3},
{3, 2},
{3, 3}
};
// clamp_mv_ref
#define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units
static void clamp_mv_ref(MV *mv, const MACROBLOCKD *xd) {
clamp_mv(mv, xd->mb_to_left_edge - MV_BORDER,
xd->mb_to_right_edge + MV_BORDER,
xd->mb_to_top_edge - MV_BORDER,
xd->mb_to_bottom_edge + MV_BORDER);
}
// This function returns either the appropriate sub block or block's mv
// on whether the block_size < 8x8 and we have check_sub_blocks set.
static INLINE int_mv get_sub_block_mv(const MODE_INFO *candidate, int which_mv,
int search_col, int block_idx) {
return block_idx >= 0 && candidate->mbmi.sb_type < BLOCK_8X8
? candidate->bmi[idx_n_column_to_subblock[block_idx][search_col == 0]]
.as_mv[which_mv]
: candidate->mbmi.mv[which_mv];
}
// Performs mv sign inversion if indicated by the reference frame combination.
static INLINE int_mv scale_mv(const MB_MODE_INFO *mbmi, int ref,
const MV_REFERENCE_FRAME this_ref_frame,
const int *ref_sign_bias) {
int_mv mv = mbmi->mv[ref];
if (ref_sign_bias[mbmi->ref_frame[ref]] != ref_sign_bias[this_ref_frame]) {
mv.as_mv.row *= -1;
mv.as_mv.col *= -1;
}
return mv;
}
// This macro is used to add a motion vector mv_ref list if it isn't
// already in the list. If it's the second motion vector it will also
// skip all additional processing and jump to done!
#define ADD_MV_REF_LIST(mv) \
do { \
if (refmv_count) { \
if ((mv).as_int != mv_ref_list[0].as_int) { \
mv_ref_list[refmv_count] = (mv); \
goto Done; \
} \
} else { \
mv_ref_list[refmv_count++] = (mv); \
} \
} while (0)
// If either reference frame is different, not INTRA, and they
// are different from each other scale and add the mv to our list.
#define IF_DIFF_REF_FRAME_ADD_MV(mbmi) \
do { \
if (is_inter_block(mbmi)) { \
if ((mbmi)->ref_frame[0] != ref_frame) \
ADD_MV_REF_LIST(scale_mv((mbmi), 0, ref_frame, ref_sign_bias)); \
if (has_second_ref(mbmi) && \
(mbmi)->ref_frame[1] != ref_frame && \
(mbmi)->mv[1].as_int != (mbmi)->mv[0].as_int) \
ADD_MV_REF_LIST(scale_mv((mbmi), 1, ref_frame, ref_sign_bias)); \
} \
} while (0)
// Checks that the given mi_row, mi_col and search point
// are inside the borders of the tile.
static INLINE int is_inside(const TileInfo *const tile,
int mi_col, int mi_row, int mi_rows,
const POSITION *mi_pos) {
return !(mi_row + mi_pos->row < 0 ||
mi_col + mi_pos->col < tile->mi_col_start ||
mi_row + mi_pos->row >= mi_rows ||
mi_col + mi_pos->col >= tile->mi_col_end);
}
// This function searches the neighbourhood of a given MB/SB
// to try and find candidate reference vectors.
static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
@@ -363,3 +188,176 @@ void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
assert("Invalid block index.");
}
}
#if CONFIG_COPY_CODING
static int compare_interinfo(MB_MODE_INFO *mbmi, MB_MODE_INFO *ref_mbmi) {
if (mbmi == ref_mbmi) {
return 1;
} else {
int is_same;
#if CONFIG_INTERINTRA
MV_REFERENCE_FRAME mbmi_ref1_backup = mbmi->ref_frame[1];
MV_REFERENCE_FRAME refmbmi_ref1_backup = ref_mbmi->ref_frame[1];
if (mbmi->ref_frame[1] == INTRA_FRAME)
mbmi->ref_frame[1] = NONE;
if (ref_mbmi->ref_frame[1] == INTRA_FRAME)
ref_mbmi->ref_frame[1] = NONE;
#endif
if (mbmi->ref_frame[0] == ref_mbmi->ref_frame[0] &&
mbmi->ref_frame[1] == ref_mbmi->ref_frame[1]) {
if (mbmi->ref_frame[1] > INTRA_FRAME)
is_same = mbmi->mv[0].as_int == ref_mbmi->mv[0].as_int &&
mbmi->mv[1].as_int == ref_mbmi->mv[1].as_int &&
mbmi->interp_filter == ref_mbmi->interp_filter;
else
is_same = mbmi->mv[0].as_int == ref_mbmi->mv[0].as_int &&
mbmi->interp_filter == ref_mbmi->interp_filter;
} else {
is_same = 0;
}
#if CONFIG_INTERINTRA
mbmi->ref_frame[1] = mbmi_ref1_backup;
ref_mbmi->ref_frame[1] = refmbmi_ref1_backup;
#endif
return is_same;
}
}
static int check_inside(VP9_COMMON *cm, int mi_row, int mi_col) {
return mi_row >= 0 && mi_col >= 0 &&
mi_row < cm->mi_rows && mi_col < cm->mi_cols;
}
static int is_right_available(BLOCK_SIZE bsize, int mi_row, int mi_col) {
int depth, max_depth = 4 - MIN(b_width_log2(bsize), b_height_log2(bsize));
int block[4] = {0};
if (bsize == BLOCK_64X64)
return 1;
mi_row = mi_row % 8;
mi_col = mi_col % 8;
for (depth = 1; depth <= max_depth; depth++) {
block[depth] = (mi_row >> (3 - depth)) * 2 + (mi_col >> (3 - depth));
mi_row = mi_row % (8 >> depth);
mi_col = mi_col % (8 >> depth);
}
if (b_width_log2(bsize) < b_height_log2(bsize)) {
if (block[max_depth] == 0)
return 1;
} else if (b_width_log2(bsize) > b_height_log2(bsize)) {
if (block[max_depth] > 0)
return 0;
} else {
if (block[max_depth] == 0 || block[max_depth] == 2)
return 1;
else if (block[max_depth] == 3)
return 0;
}
for (depth = max_depth - 1; depth > 0; depth--) {
if (block[depth] == 0 || block[depth] == 2)
return 1;
else if (block[depth] == 3)
return 0;
}
return 1;
}
static int is_second_rec(int mi_row, int mi_col, BLOCK_SIZE bsize) {
int bw = 4 << b_width_log2(bsize);
int bh = 4 << b_height_log2(bsize);
if (bw < bh)
return (mi_col << 3) % (bw << 1) == 0 ? 0 : 1;
else if (bh < bw)
return (mi_row << 3) % (bh << 1) == 0 ? 0 : 2;
else
return 0;
}
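/* Returns 0 for the first rectangle of a split square, 1 when this is
 * the right half of a vertical split, 2 when it is the bottom half of a
 * horizontal split; the caller uses this to drop neighbour offsets that
 * would land inside the sibling rectangle of the same split. */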
int vp9_construct_ref_inter_list(VP9_COMMON *cm, MACROBLOCKD *xd,
BLOCK_SIZE bsize, int mi_row, int mi_col,
MB_MODE_INFO *ref_list[18]) {
int bw = 4 << b_width_log2(bsize);
int bh = 4 << b_height_log2(bsize);
int row_offset, col_offset;
int mi_offset;
MB_MODE_INFO *ref_mbmi;
int ref_index, ref_num = 0;
int row_offset_cand[18], col_offset_cand[18];
int offset_num = 0, i, switchflag;
int is_sec_rec = is_second_rec(mi_row, mi_col, bsize);
if (is_sec_rec != 2) {
row_offset_cand[offset_num] = -1; col_offset_cand[offset_num] = 0;
offset_num++;
}
if (is_sec_rec != 1) {
row_offset_cand[offset_num] = bh / 16; col_offset_cand[offset_num] = -1;
offset_num++;
}
row_offset = bh / 8 - 1;
col_offset = 1;
if (is_sec_rec < 2)
switchflag = 1;
else
switchflag = 0;
while ((is_sec_rec == 0 && ((row_offset >= 0) || col_offset < (bw / 8 + 1))) ||
(is_sec_rec == 1 && col_offset < (bw / 8 + 1)) ||
(is_sec_rec == 2 && row_offset >= 0)) {
switch (switchflag) {
case 0:
if (row_offset >= 0) {
if (row_offset != bh / 16) {
row_offset_cand[offset_num] = row_offset;
col_offset_cand[offset_num] = -1;
offset_num++;
}
row_offset--;
}
break;
case 1:
if (col_offset < (bw / 8 + 1)) {
row_offset_cand[offset_num] = -1;
col_offset_cand[offset_num] = col_offset;
offset_num++;
col_offset++;
}
break;
default:
assert(0);
}
if (is_sec_rec == 0)
switchflag = 1 - switchflag;
}
row_offset_cand[offset_num] = -1;
col_offset_cand[offset_num] = -1;
offset_num++;
for (i = 0; i < offset_num; i++) {
row_offset = row_offset_cand[i];
col_offset = col_offset_cand[i];
if ((col_offset < (bw / 8) ||
(col_offset == (bw / 8) && is_right_available(bsize, mi_row, mi_col)))
&& check_inside(cm, mi_row + row_offset, mi_col + col_offset)) {
mi_offset = row_offset * cm->mi_stride + col_offset;
ref_mbmi = &xd->mi[mi_offset]->mbmi;
if (is_inter_block(ref_mbmi)) {
for (ref_index = 0; ref_index < ref_num; ref_index++) {
if (compare_interinfo(ref_mbmi, ref_list[ref_index]))
break;
}
if (ref_index == ref_num) {
ref_list[ref_num] = ref_mbmi;
ref_num++;
}
}
}
}
return ref_num;
}
#endif


@@ -21,6 +21,181 @@ extern "C" {
#define RIGHT_BOTTOM_MARGIN ((VP9_ENC_BORDER_IN_PIXELS -\
VP9_INTERP_EXTEND) << 3)
#define MVREF_NEIGHBOURS 8
typedef struct position {
int row;
int col;
} POSITION;
typedef enum {
BOTH_ZERO = 0,
ZERO_PLUS_PREDICTED = 1,
BOTH_PREDICTED = 2,
NEW_PLUS_NON_INTRA = 3,
BOTH_NEW = 4,
INTRA_PLUS_NON_INTRA = 5,
BOTH_INTRA = 6,
INVALID_CASE = 9
} motion_vector_context;
// This is used to figure out a context for the ref blocks. The code flattens
// an array that would have 3 possible counts (0, 1 & 2) for 3 choices by
// adding 9 for each intra block, 3 for each zero mv and 1 for each new
// motion vector. This single number is then converted into a context
// with a single lookup ( counter_to_context ).
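// Worked example: left = ZEROMV (counts 3) and above = NEWMV (counts 1)
// sum to 4, and counter_to_context[4] below is NEW_PLUS_NON_INTRA.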
static const int mode_2_counter[MB_MODE_COUNT] = {
9, // DC_PRED
9, // V_PRED
9, // H_PRED
9, // D45_PRED
9, // D135_PRED
9, // D117_PRED
9, // D153_PRED
9, // D207_PRED
9, // D63_PRED
9, // TM_PRED
0, // NEARESTMV
0, // NEARMV
3, // ZEROMV
1, // NEWMV
};
// There are 3^3 different combinations of 3 counts that can be either 0,1 or
// 2. However the actual count can never be greater than 2 so the highest
// counter we need is 18. 9 is an invalid counter that's never used.
static const int counter_to_context[19] = {
BOTH_PREDICTED, // 0
NEW_PLUS_NON_INTRA, // 1
BOTH_NEW, // 2
ZERO_PLUS_PREDICTED, // 3
NEW_PLUS_NON_INTRA, // 4
INVALID_CASE, // 5
BOTH_ZERO, // 6
INVALID_CASE, // 7
INVALID_CASE, // 8
INTRA_PLUS_NON_INTRA, // 9
INTRA_PLUS_NON_INTRA, // 10
INVALID_CASE, // 11
INTRA_PLUS_NON_INTRA, // 12
INVALID_CASE, // 13
INVALID_CASE, // 14
INVALID_CASE, // 15
INVALID_CASE, // 16
INVALID_CASE, // 17
BOTH_INTRA // 18
};
static const POSITION mv_ref_blocks[BLOCK_SIZES][MVREF_NEIGHBOURS] = {
// 4X4
{{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}},
// 4X8
{{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}},
// 8X4
{{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}},
// 8X8
{{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}},
// 8X16
{{0, -1}, {-1, 0}, {1, -1}, {-1, -1}, {0, -2}, {-2, 0}, {-2, -1}, {-1, -2}},
// 16X8
{{-1, 0}, {0, -1}, {-1, 1}, {-1, -1}, {-2, 0}, {0, -2}, {-1, -2}, {-2, -1}},
// 16X16
{{-1, 0}, {0, -1}, {-1, 1}, {1, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-3, -3}},
// 16X32
{{0, -1}, {-1, 0}, {2, -1}, {-1, -1}, {-1, 1}, {0, -3}, {-3, 0}, {-3, -3}},
// 32X16
{{-1, 0}, {0, -1}, {-1, 2}, {-1, -1}, {1, -1}, {-3, 0}, {0, -3}, {-3, -3}},
// 32X32
{{-1, 1}, {1, -1}, {-1, 2}, {2, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-3, -3}},
// 32X64
{{0, -1}, {-1, 0}, {4, -1}, {-1, 2}, {-1, -1}, {0, -3}, {-3, 0}, {2, -1}},
// 64X32
{{-1, 0}, {0, -1}, {-1, 4}, {2, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-1, 2}},
// 64X64
{{-1, 3}, {3, -1}, {-1, 4}, {4, -1}, {-1, -1}, {-1, 0}, {0, -1}, {-1, 6}}
};
static const int idx_n_column_to_subblock[4][2] = {
{1, 2},
{1, 3},
{3, 2},
{3, 3}
};
// clamp_mv_ref
#define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units
static void clamp_mv_ref(MV *mv, const MACROBLOCKD *xd) {
clamp_mv(mv, xd->mb_to_left_edge - MV_BORDER,
xd->mb_to_right_edge + MV_BORDER,
xd->mb_to_top_edge - MV_BORDER,
xd->mb_to_bottom_edge + MV_BORDER);
}
// This function returns either the appropriate sub block or block's mv
// on whether the block_size < 8x8 and we have check_sub_blocks set.
static INLINE int_mv get_sub_block_mv(const MODE_INFO *candidate, int which_mv,
int search_col, int block_idx) {
return block_idx >= 0 && candidate->mbmi.sb_type < BLOCK_8X8
? candidate->bmi[idx_n_column_to_subblock[block_idx][search_col == 0]]
.as_mv[which_mv]
: candidate->mbmi.mv[which_mv];
}
// Performs mv sign inversion if indicated by the reference frame combination.
static INLINE int_mv scale_mv(const MB_MODE_INFO *mbmi, int ref,
const MV_REFERENCE_FRAME this_ref_frame,
const int *ref_sign_bias) {
int_mv mv = mbmi->mv[ref];
if (ref_sign_bias[mbmi->ref_frame[ref]] != ref_sign_bias[this_ref_frame]) {
mv.as_mv.row *= -1;
mv.as_mv.col *= -1;
}
return mv;
}
// This macro is used to add a motion vector mv_ref list if it isn't
// already in the list. If it's the second motion vector it will also
// skip all additional processing and jump to done!
#define ADD_MV_REF_LIST(mv) \
do { \
if (refmv_count) { \
if ((mv).as_int != mv_ref_list[0].as_int) { \
mv_ref_list[refmv_count] = (mv); \
goto Done; \
} \
} else { \
mv_ref_list[refmv_count++] = (mv); \
} \
} while (0)
// If either reference frame is different, not INTRA, and they
// are different from each other scale and add the mv to our list.
#define IF_DIFF_REF_FRAME_ADD_MV(mbmi) \
do { \
if (is_inter_block(mbmi)) { \
if ((mbmi)->ref_frame[0] != ref_frame) \
ADD_MV_REF_LIST(scale_mv((mbmi), 0, ref_frame, ref_sign_bias)); \
if (has_second_ref(mbmi) && \
(mbmi)->ref_frame[1] != ref_frame && \
(mbmi)->mv[1].as_int != (mbmi)->mv[0].as_int) \
ADD_MV_REF_LIST(scale_mv((mbmi), 1, ref_frame, ref_sign_bias)); \
} \
} while (0)
// Checks that the given mi_row, mi_col and search point
// are inside the borders of the tile.
static INLINE int is_inside(const TileInfo *const tile,
int mi_col, int mi_row, int mi_rows,
const POSITION *mi_pos) {
return !(mi_row + mi_pos->row < 0 ||
mi_col + mi_pos->col < tile->mi_col_start ||
mi_row + mi_pos->row >= mi_rows ||
mi_col + mi_pos->col >= tile->mi_col_end);
}
// TODO(jingning): this mv clamping function should be block size dependent.
static INLINE void clamp_mv2(MV *mv, const MACROBLOCKD *xd) {
clamp_mv(mv, xd->mb_to_left_edge - LEFT_TOP_MARGIN,
@@ -45,6 +220,12 @@ void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
int block, int ref, int mi_row, int mi_col,
int_mv *nearest, int_mv *near);
#if CONFIG_COPY_CODING
int vp9_construct_ref_inter_list(VP9_COMMON *cm, MACROBLOCKD *xd,
BLOCK_SIZE bsize, int mi_row, int mi_col,
MB_MODE_INFO *ref_list[18]);
#endif
#ifdef __cplusplus
} // extern "C"
#endif


@@ -213,8 +213,14 @@ typedef struct VP9Common {
PARTITION_CONTEXT *above_seg_context;
ENTROPY_CONTEXT *above_context;
#if CONFIG_TRANSCODE
FILE *mi_array_pf;
#endif
#if CONFIG_MASKED_INTERINTER
int use_masked_interinter;
#endif
#if CONFIG_INTERINTRA
int use_interintra;
#if CONFIG_MASKED_INTERINTRA
int use_masked_interintra;
#endif
#endif
} VP9_COMMON;


@@ -383,3 +383,47 @@ int vp9_get_segment_id(VP9_COMMON *cm, const uint8_t *segment_ids,
assert(segment_id >= 0 && segment_id < MAX_SEGMENTS);
return segment_id;
}
#if CONFIG_COPY_CODING
int vp9_get_copy_mode_context(const MACROBLOCKD *xd) {
const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd));
const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd));
const int has_above = above_mbmi != NULL;
const int has_left = left_mbmi != NULL;
if (has_above && has_left) {
const int above_intra = !is_inter_block(above_mbmi);
const int left_intra = !is_inter_block(left_mbmi);
if (above_intra && left_intra) {
return 4;
} else if (above_intra || left_intra) {
return 3;
} else {
const int above_predict = above_mbmi->copy_mode != NOREF;
const int left_predict = left_mbmi->copy_mode != NOREF;
if (above_predict && left_predict)
return 0;
else if (above_predict || left_predict)
return 1;
else
return 2;
}
} else if (has_above || has_left) {
const MB_MODE_INFO *const ref_mbmi = has_above ? above_mbmi : left_mbmi;
const int ref_intra = !is_inter_block(ref_mbmi);
if (ref_intra) {
return 3;
} else {
const int ref_predict = ref_mbmi->copy_mode != NOREF;
if (ref_predict)
return 0;
else
return 1;
}
} else {
return 0;
}
}
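/* Two inter neighbours: both copying -> 0, one -> 1, neither -> 2. One
 * intra neighbour -> 3, both intra -> 4. A single inter neighbour gives
 * 0 if it copies and 1 otherwise; no neighbours at all gives 0. */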
#endif


@@ -134,6 +134,10 @@ static INLINE unsigned int *get_tx_counts(TX_SIZE max_tx_size, int ctx,
}
}
#if CONFIG_COPY_CODING
int vp9_get_copy_mode_context(const MACROBLOCKD *xd);
#endif
#ifdef __cplusplus
} // extern "C"
#endif


@@ -12,7 +12,6 @@
#include "vp9/common/vp9_quant_common.h"
#include "vp9/common/vp9_seg_common.h"
#if 1
static const int16_t dc_qlookup[QINDEX_RANGE] = {
4, 8, 8, 9, 10, 11, 12, 12,
13, 14, 15, 16, 17, 18, 19, 19,
@@ -83,44 +82,6 @@ static const int16_t ac_qlookup[QINDEX_RANGE] = {
1597, 1628, 1660, 1692, 1725, 1759, 1793, 1828,
};
void vp9_init_quant_tables(void) { }
#else
static int16_t dc_qlookup[QINDEX_RANGE];
static int16_t ac_qlookup[QINDEX_RANGE];
#define ACDC_MIN 8
// TODO(dkovalev) move to common and reuse
static double poly3(double a, double b, double c, double d, double x) {
return a*x*x*x + b*x*x + c*x + d;
}
void vp9_init_quant_tables() {
int i, val = 4;
// A "real" q of 1.0 forces lossless mode.
// In practice non lossless Q's between 1.0 and 2.0 (represented here by
// integer values from 5-7 give poor rd results (lower psnr and often
// larger size than the lossless encode. To block out those "not very useful"
// values we increment the ac and dc q lookup values by 4 after position 0.
ac_qlookup[0] = val;
dc_qlookup[0] = val;
val += 4;
for (i = 1; i < QINDEX_RANGE; i++) {
const int ac_val = val;
val = (int)(val * 1.01975);
if (val == ac_val)
++val;
ac_qlookup[i] = (int16_t)ac_val;
dc_qlookup[i] = (int16_t)MAX(ACDC_MIN, poly3(0.000000305, -0.00065, 0.9,
0.5, ac_val));
}
}
#endif
int16_t vp9_dc_quant(int qindex, int delta) {
return dc_qlookup[clamp(qindex + delta, 0, MAXQ)];
}


@@ -22,8 +22,6 @@ extern "C" {
#define QINDEX_RANGE (MAXQ - MINQ + 1)
#define QINDEX_BITS 8
void vp9_init_quant_tables();
int16_t vp9_dc_quant(int qindex, int delta);
int16_t vp9_ac_quant(int qindex, int delta);


@@ -139,9 +139,349 @@ MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, const MV *src_mv,
return clamped_mv;
}
#if CONFIG_MASKED_INTERINTER
#define MASK_WEIGHT_BITS 6
static int get_masked_weight(int m) {
#define SMOOTHER_LEN 32
static const uint8_t smoothfn[2 * SMOOTHER_LEN + 1] = {
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1,
1, 1, 2, 2, 3, 4, 5, 6,
8, 9, 12, 14, 17, 21, 24, 28,
32,
36, 40, 43, 47, 50, 52, 55, 56,
58, 59, 60, 61, 62, 62, 63, 63,
63, 63, 63, 64, 64, 64, 64, 64,
64, 64, 64, 64, 64, 64, 64, 64,
};
if (m < -SMOOTHER_LEN)
return 0;
else if (m > SMOOTHER_LEN)
return (1 << MASK_WEIGHT_BITS);
else
return smoothfn[m + SMOOTHER_LEN];
}
static int get_hard_mask(int m) {
return (1 << MASK_WEIGHT_BITS) * (m > 0);
}
// Equation of line: f(x, y) = a[0]*(x - a[2]*w/4) + a[1]*(y - a[3]*h/4) = 0
// The soft mask is obtained by computing f(x, y) and then calling
// get_masked_weight(f(x, y)).
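// Worked example: mask_params_sml[0] = {-1, 2, 2, 2} on an 8x8 block
// gives f(x, y) = -(x - 4) + 2 * (y - 4); get_masked_weight() maps
// f <= -32 to weight 0, f >= 32 to 64, and ramps smoothly in between.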
static const int mask_params_sml[1 << MASK_BITS_SML][4] = {
{-1, 2, 2, 2},
{ 1, -2, 2, 2},
{-2, 1, 2, 2},
{ 2, -1, 2, 2},
{ 2, 1, 2, 2},
{-2, -1, 2, 2},
{ 1, 2, 2, 2},
{-1, -2, 2, 2},
};
static const int mask_params_med_hgtw[1 << MASK_BITS_MED][4] = {
{-1, 2, 2, 2},
{ 1, -2, 2, 2},
{-2, 1, 2, 2},
{ 2, -1, 2, 2},
{ 2, 1, 2, 2},
{-2, -1, 2, 2},
{ 1, 2, 2, 2},
{-1, -2, 2, 2},
{-1, 2, 2, 1},
{ 1, -2, 2, 1},
{-1, 2, 2, 3},
{ 1, -2, 2, 3},
{ 1, 2, 2, 1},
{-1, -2, 2, 1},
{ 1, 2, 2, 3},
{-1, -2, 2, 3},
};
static const int mask_params_med_hltw[1 << MASK_BITS_MED][4] = {
{-1, 2, 2, 2},
{ 1, -2, 2, 2},
{-2, 1, 2, 2},
{ 2, -1, 2, 2},
{ 2, 1, 2, 2},
{-2, -1, 2, 2},
{ 1, 2, 2, 2},
{-1, -2, 2, 2},
{-2, 1, 1, 2},
{ 2, -1, 1, 2},
{-2, 1, 3, 2},
{ 2, -1, 3, 2},
{ 2, 1, 1, 2},
{-2, -1, 1, 2},
{ 2, 1, 3, 2},
{-2, -1, 3, 2},
};
static const int mask_params_med_heqw[1 << MASK_BITS_MED][4] = {
{-1, 2, 2, 2},
{ 1, -2, 2, 2},
{-2, 1, 2, 2},
{ 2, -1, 2, 2},
{ 2, 1, 2, 2},
{-2, -1, 2, 2},
{ 1, 2, 2, 2},
{-1, -2, 2, 2},
{ 0, 2, 0, 1},
{ 0, -2, 0, 1},
{ 0, 2, 0, 3},
{ 0, -2, 0, 3},
{ 2, 0, 1, 0},
{-2, 0, 1, 0},
{ 2, 0, 3, 0},
{-2, 0, 3, 0},
};
static const int mask_params_big_hgtw[1 << MASK_BITS_BIG][4] = {
{-1, 2, 2, 2},
{ 1, -2, 2, 2},
{-2, 1, 2, 2},
{ 2, -1, 2, 2},
{ 2, 1, 2, 2},
{-2, -1, 2, 2},
{ 1, 2, 2, 2},
{-1, -2, 2, 2},
{-1, 2, 2, 1},
{ 1, -2, 2, 1},
{-1, 2, 2, 3},
{ 1, -2, 2, 3},
{ 1, 2, 2, 1},
{-1, -2, 2, 1},
{ 1, 2, 2, 3},
{-1, -2, 2, 3},
{-2, 1, 1, 2},
{ 2, -1, 1, 2},
{-2, 1, 3, 2},
{ 2, -1, 3, 2},
{ 2, 1, 1, 2},
{-2, -1, 1, 2},
{ 2, 1, 3, 2},
{-2, -1, 3, 2},
{ 0, 2, 0, 1},
{ 0, -2, 0, 1},
{ 0, 2, 0, 2},
{ 0, -2, 0, 2},
{ 0, 2, 0, 3},
{ 0, -2, 0, 3},
{ 2, 0, 2, 0},
{-2, 0, 2, 0},
};
static const int mask_params_big_hltw[1 << MASK_BITS_BIG][4] = {
{-1, 2, 2, 2},
{ 1, -2, 2, 2},
{-2, 1, 2, 2},
{ 2, -1, 2, 2},
{ 2, 1, 2, 2},
{-2, -1, 2, 2},
{ 1, 2, 2, 2},
{-1, -2, 2, 2},
{-1, 2, 2, 1},
{ 1, -2, 2, 1},
{-1, 2, 2, 3},
{ 1, -2, 2, 3},
{ 1, 2, 2, 1},
{-1, -2, 2, 1},
{ 1, 2, 2, 3},
{-1, -2, 2, 3},
{-2, 1, 1, 2},
{ 2, -1, 1, 2},
{-2, 1, 3, 2},
{ 2, -1, 3, 2},
{ 2, 1, 1, 2},
{-2, -1, 1, 2},
{ 2, 1, 3, 2},
{-2, -1, 3, 2},
{ 0, 2, 0, 2},
{ 0, -2, 0, 2},
{ 2, 0, 1, 0},
{-2, 0, 1, 0},
{ 2, 0, 2, 0},
{-2, 0, 2, 0},
{ 2, 0, 3, 0},
{-2, 0, 3, 0},
};
static const int mask_params_big_heqw[1 << MASK_BITS_BIG][4] = {
{-1, 2, 2, 2},
{ 1, -2, 2, 2},
{-2, 1, 2, 2},
{ 2, -1, 2, 2},
{ 2, 1, 2, 2},
{-2, -1, 2, 2},
{ 1, 2, 2, 2},
{-1, -2, 2, 2},
{-1, 2, 2, 1},
{ 1, -2, 2, 1},
{-1, 2, 2, 3},
{ 1, -2, 2, 3},
{ 1, 2, 2, 1},
{-1, -2, 2, 1},
{ 1, 2, 2, 3},
{-1, -2, 2, 3},
{-2, 1, 1, 2},
{ 2, -1, 1, 2},
{-2, 1, 3, 2},
{ 2, -1, 3, 2},
{ 2, 1, 1, 2},
{-2, -1, 1, 2},
{ 2, 1, 3, 2},
{-2, -1, 3, 2},
{ 0, 2, 0, 1},
{ 0, -2, 0, 1},
{ 0, 2, 0, 3},
{ 0, -2, 0, 3},
{ 2, 0, 1, 0},
{-2, 0, 1, 0},
{ 2, 0, 3, 0},
{-2, 0, 3, 0},
};
static const int *get_mask_params(int mask_index,
BLOCK_SIZE sb_type,
int h, int w) {
const int *a;
const int mask_bits = get_mask_bits(sb_type);
if (mask_index == MASK_NONE)
return NULL;
if (mask_bits == MASK_BITS_SML) {
a = mask_params_sml[mask_index];
} else if (mask_bits == MASK_BITS_MED) {
if (h > w)
a = mask_params_med_hgtw[mask_index];
else if (h < w)
a = mask_params_med_hltw[mask_index];
else
a = mask_params_med_heqw[mask_index];
} else if (mask_bits == MASK_BITS_BIG) {
if (h > w)
a = mask_params_big_hgtw[mask_index];
else if (h < w)
a = mask_params_big_hltw[mask_index];
else
a = mask_params_big_heqw[mask_index];
} else {
assert(0);
}
return a;
}
void vp9_generate_masked_weight(int mask_index,
BLOCK_SIZE sb_type,
int h, int w,
uint8_t *mask, int stride) {
int i, j;
const int *a = get_mask_params(mask_index, sb_type, h, w);
if (!a) return;
for (i = 0; i < h; ++i)
for (j = 0; j < w; ++j) {
int x = (j - (a[2] * w) / 4);
int y = (i - (a[3] * h) / 4);
int m = a[0] * x + a[1] * y;
mask[i * stride + j] = get_masked_weight(m);
}
}
void vp9_generate_hard_mask(int mask_index, BLOCK_SIZE sb_type,
int h, int w, uint8_t *mask, int stride) {
int i, j;
const int *a = get_mask_params(mask_index, sb_type, h, w);
if (!a) return;
for (i = 0; i < h; ++i)
for (j = 0; j < w; ++j) {
int x = (j - (a[2] * w) / 4);
int y = (i - (a[3] * h) / 4);
int m = a[0] * x + a[1] * y;
mask[i * stride + j] = get_hard_mask(m);
}
}
static void build_masked_compound(uint8_t *dst, int dst_stride,
uint8_t *dst2, int dst2_stride,
int mask_index, BLOCK_SIZE sb_type,
int h, int w) {
int i, j;
uint8_t mask[4096];
vp9_generate_masked_weight(mask_index, sb_type, h, w, mask, 64);
for (i = 0; i < h; ++i)
for (j = 0; j < w; ++j) {
int m = mask[i * 64 + j];
dst[i * dst_stride + j] = (dst[i * dst_stride + j] * m +
dst2[i * dst2_stride + j] *
((1 << MASK_WEIGHT_BITS) - m) +
(1 << (MASK_WEIGHT_BITS - 1))) >>
MASK_WEIGHT_BITS;
}
}
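/* Each output pixel is the rounded convex blend
 * (dst * m + dst2 * (64 - m) + 32) >> 6, with the per-pixel weight m
 * taken from the generated soft mask. */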
#if CONFIG_SUPERTX
void generate_masked_weight_extend(int mask_index, int plane,
BLOCK_SIZE sb_type, int h, int w,
int mask_offset_x, int mask_offset_y,
uint8_t *mask, int stride) {
int i, j;
int subh = (plane ? 2 : 4) << b_height_log2(sb_type);
int subw = (plane ? 2 : 4) << b_width_log2(sb_type);
const int *a = get_mask_params(mask_index, sb_type, subh, subw);
if (!a) return;
for (i = 0; i < h; ++i)
for (j = 0; j < w; ++j) {
int x = (j - (a[2] * subw) / 4 - mask_offset_x);
int y = (i - (a[3] * subh) / 4 - mask_offset_y);
int m = a[0] * x + a[1] * y;
mask[i * stride + j] = get_masked_weight(m);
}
}
static void build_masked_compound_extend(uint8_t *dst, int dst_stride,
uint8_t *dst2, int dst2_stride,
int plane,
int mask_index, BLOCK_SIZE sb_type,
int mask_offset_x, int mask_offset_y,
int h, int w) {
int i, j;
uint8_t mask[4096];
generate_masked_weight_extend(mask_index, plane, sb_type, h, w,
mask_offset_x, mask_offset_y, mask, 64);
for (i = 0; i < h; ++i)
for (j = 0; j < w; ++j) {
int m = mask[i * 64 + j];
dst[i * dst_stride + j] = (dst[i * dst_stride + j] * m +
dst2[i * dst2_stride + j] *
((1 << MASK_WEIGHT_BITS) - m) +
(1 << (MASK_WEIGHT_BITS - 1))) >>
MASK_WEIGHT_BITS;
}
}
#endif
#endif
static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
int bw, int bh,
int x, int y, int w, int h,
#if CONFIG_SUPERTX && CONFIG_MASKED_INTERINTER
int mask_offset_x, int mask_offset_y,
#endif
int mi_x, int mi_y) {
struct macroblockd_plane *const pd = &xd->plane[plane];
const MODE_INFO *mi = xd->mi[0];
@@ -193,8 +533,27 @@ static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
pre += (scaled_mv.row >> SUBPEL_BITS) * pre_buf->stride
+ (scaled_mv.col >> SUBPEL_BITS);
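// For a masked compound, the second reference's prediction is written to
// a temporary buffer and then blended into dst, which already holds the
// first reference's prediction.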
#if CONFIG_MASKED_INTERINTER
if (ref && get_mask_bits(mi->mbmi.sb_type)
&& mi->mbmi.use_masked_interinter) {
uint8_t tmp_dst[4096];
inter_predictor(pre, pre_buf->stride, tmp_dst, 64,
subpel_x, subpel_y, sf, w, h, 0, kernel, xs, ys);
#if CONFIG_SUPERTX
build_masked_compound_extend(dst, dst_buf->stride, tmp_dst, 64, plane,
mi->mbmi.mask_index, mi->mbmi.sb_type,
mask_offset_x, mask_offset_y, h, w);
#else
build_masked_compound(dst, dst_buf->stride, tmp_dst, 64,
mi->mbmi.mask_index, mi->mbmi.sb_type, h, w);
#endif
} else {
#endif
inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
subpel_x, subpel_y, sf, w, h, ref, kernel, xs, ys);
#if CONFIG_MASKED_INTERINTER
}
#endif
}
}
@@ -218,10 +577,18 @@ static void build_inter_predictors_for_planes(MACROBLOCKD *xd, BLOCK_SIZE bsize,
for (y = 0; y < num_4x4_h; ++y)
for (x = 0; x < num_4x4_w; ++x)
build_inter_predictors(xd, plane, i++, bw, bh,
4 * x, 4 * y, 4, 4,
#if CONFIG_SUPERTX && CONFIG_MASKED_INTERINTER
0, 0,
#endif
mi_x, mi_y);
} else {
build_inter_predictors(xd, plane, 0, bw, bh,
0, 0, bw, bh,
#if CONFIG_SUPERTX && CONFIG_MASKED_INTERINTER
0, 0,
#endif
mi_x, mi_y);
}
}
}
@@ -229,23 +596,323 @@ static void build_inter_predictors_for_planes(MACROBLOCKD *xd, BLOCK_SIZE bsize,
void vp9_build_inter_predictors_sby(MACROBLOCKD *xd, int mi_row, int mi_col,
BLOCK_SIZE bsize) {
build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 0, 0);
#if CONFIG_INTERINTRA
if (xd->mi[0]->mbmi.ref_frame[1] == INTRA_FRAME &&
is_interintra_allowed(xd->mi[0]->mbmi.sb_type))
vp9_build_interintra_predictors_sby(xd, xd->plane[0].dst.buf,
xd->plane[0].dst.stride, bsize);
#endif
}
void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd, int mi_row, int mi_col,
BLOCK_SIZE bsize) {
build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 1,
MAX_MB_PLANE - 1);
#if CONFIG_INTERINTRA
if (xd->mi[0]->mbmi.ref_frame[1] == INTRA_FRAME &&
is_interintra_allowed(xd->mi[0]->mbmi.sb_type))
vp9_build_interintra_predictors_sbuv(xd, xd->plane[1].dst.buf,
xd->plane[2].dst.buf,
xd->plane[1].dst.stride,
xd->plane[2].dst.stride, bsize);
#endif
}
void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
BLOCK_SIZE bsize) {
build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 0,
MAX_MB_PLANE - 1);
#if CONFIG_INTERINTRA
if (xd->mi[0]->mbmi.ref_frame[1] == INTRA_FRAME &&
is_interintra_allowed(xd->mi[0]->mbmi.sb_type))
vp9_build_interintra_predictors(xd, xd->plane[0].dst.buf,
xd->plane[1].dst.buf, xd->plane[2].dst.buf,
xd->plane[0].dst.stride,
xd->plane[1].dst.stride,
xd->plane[2].dst.stride, bsize);
#endif
}
#if CONFIG_SUPERTX
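// SUPERTX boundary smoothing: when one transform spans several prediction
// blocks, the per-block predictions are cross-faded along the internal
// partition edges using the 1-D masks below before the large transform is
// applied.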
static int get_masked_weight_supertx(int m) {
#define SMOOTHER_LEN 32
static const uint8_t smoothfn[2 * SMOOTHER_LEN + 1] = {
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1,
1, 1, 2, 2, 3, 4, 5, 6,
8, 9, 12, 14, 17, 21, 24, 28,
32,
36, 40, 43, 47, 50, 52, 55, 56,
58, 59, 60, 61, 62, 62, 63, 63,
63, 63, 63, 64, 64, 64, 64, 64,
64, 64, 64, 64, 64, 64, 64, 64,
};
if (m < -SMOOTHER_LEN)
return 0;
else if (m > SMOOTHER_LEN)
return 64;
else
return smoothfn[m + SMOOTHER_LEN];
}
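// Precomputed 1-D ramps from full weight (take dst) down to 0 (take dst2).
// Note that generate_1dmask() below overwrites the table values for
// lengths above 16 with a ramp recomputed from the smoother over the full
// length.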
static const uint8_t mask_8[8] = {
64, 64, 62, 52, 12, 2, 0, 0
};
static const uint8_t mask_16[16] = {
63, 62, 60, 58, 55, 50, 43, 36, 28, 21, 14, 9, 6, 4, 2, 1
};
static const uint8_t mask_32[32] = {
64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 63, 61, 57, 52, 45, 36,
28, 19, 12, 7, 3, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
static void generate_1dmask(int length, uint8_t *mask) {
int i;
switch (length) {
case 8:
vpx_memcpy(mask, mask_8, length);
break;
case 16:
vpx_memcpy(mask, mask_16, length);
break;
case 32:
vpx_memcpy(mask, mask_32, length);
break;
default:
assert(0);
}
if (length > 16) {
for (i = 0; i < length; ++i)
mask[i] = get_masked_weight_supertx(-1 * (2 * i - length + 1));
}
}
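// Cross-fades dst with dst2 along a horizontal or vertical partition
// boundary inside a supertx block. The 1-D mask spans the top block: full
// weight (64) before the sub-block, a smooth ramp across it, and 0 after
// it; rows/columns where m is exactly 64 or 0 skip the blend arithmetic.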
void vp9_build_masked_inter_predictor_complex(uint8_t *dst, int dst_stride,
uint8_t *dst2, int dst2_stride,
int plane,
int mi_row, int mi_col,
int mi_row_ori, int mi_col_ori,
BLOCK_SIZE bsize,
BLOCK_SIZE top_bsize,
PARTITION_TYPE partition) {
int i, j;
uint8_t mask[32];
int top_w = 4 << b_width_log2(top_bsize),
top_h = 4 << b_height_log2(top_bsize);
int w = 4 << b_width_log2(bsize), h = 4 << b_height_log2(bsize);
int w_offset = (mi_col - mi_col_ori) << 3,
h_offset = (mi_row - mi_row_ori) << 3;
int m;
if (plane > 0) {
top_w = top_w >> 1; top_h = top_h >> 1;
w = w >> 1; h = h >> 1;
w_offset = w_offset >> 1; h_offset = h_offset >> 1;
}
switch (partition) {
case PARTITION_HORZ:
generate_1dmask(h, mask + h_offset);
vpx_memset(mask, 64, h_offset);
vpx_memset(mask + h_offset + h, 0, top_h - h_offset - h);
break;
case PARTITION_VERT:
generate_1dmask(w, mask + w_offset);
vpx_memset(mask, 64, w_offset);
vpx_memset(mask + w_offset + w, 0, top_w - w_offset - w);
break;
default:
assert(0);
}
for (i = 0; i < top_h; ++i)
for (j = 0; j < top_w; ++j) {
m = partition == PARTITION_HORZ ? mask[i] : mask[j];
if (m == 64)
continue;
if (m == 0)
dst[i * dst_stride + j] = dst2[i * dst2_stride + j];
else
dst[i * dst_stride + j] = (dst[i * dst_stride + j] * m +
dst2[i * dst2_stride + j] *
(64 - m) + 32) >> 6;
}
}
#if CONFIG_MASKED_INTERINTER
void vp9_build_inter_predictors_sb_extend(MACROBLOCKD *xd,
int mi_row, int mi_col,
int mi_row_ori, int mi_col_ori,
BLOCK_SIZE bsize) {
int plane;
const int mi_x = mi_col_ori * MI_SIZE;
const int mi_y = mi_row_ori * MI_SIZE;
const int mask_offset_x = (mi_col - mi_col_ori) * MI_SIZE;
const int mask_offset_y = (mi_row - mi_row_ori) * MI_SIZE;
for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize,
&xd->plane[plane]);
const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
const int bw = 4 * num_4x4_w;
const int bh = 4 * num_4x4_h;
if (xd->mi[0]->mbmi.sb_type < BLOCK_8X8) {
int i = 0, x, y;
assert(bsize == BLOCK_8X8);
for (y = 0; y < num_4x4_h; ++y)
for (x = 0; x < num_4x4_w; ++x)
build_inter_predictors(xd, plane, i++, bw, bh, 4 * x, 4 * y, 4, 4,
mask_offset_x, mask_offset_y, mi_x, mi_y);
} else {
build_inter_predictors(xd, plane, 0, bw, bh, 0, 0, bw, bh,
mask_offset_x, mask_offset_y, mi_x, mi_y);
}
}
}
#endif
void vp9_build_inter_predictors_sby_sub8x8_extend(MACROBLOCKD *xd,
int mi_row, int mi_col,
int mi_row_ori,
int mi_col_ori,
BLOCK_SIZE top_bsize,
PARTITION_TYPE partition) {
const int mi_x = mi_col_ori * MI_SIZE;
const int mi_y = mi_row_ori * MI_SIZE;
#if CONFIG_MASKED_INTERINTER
const int mask_offset_x = (mi_col - mi_col_ori) * MI_SIZE;
const int mask_offset_y = (mi_row - mi_row_ori) * MI_SIZE;
#endif
uint8_t *orig_dst;
int orig_dst_stride;
int bw = 4 << b_width_log2(top_bsize);
int bh = 4 << b_height_log2(top_bsize);
DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf, 32 * 32);
DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf1, 32 * 32);
DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf2, 32 * 32);
orig_dst = xd->plane[0].dst.buf;
orig_dst_stride = xd->plane[0].dst.stride;
build_inter_predictors(xd, 0, 0, bw, bh, 0, 0, bw, bh,
#if CONFIG_MASKED_INTERINTER
mask_offset_x, mask_offset_y,
#endif
mi_x, mi_y);
xd->plane[0].dst.buf = tmp_buf;
xd->plane[0].dst.stride = 32;
switch (partition) {
case PARTITION_HORZ:
build_inter_predictors(xd, 0, 2, bw, bh, 0, 0, bw, bh,
#if CONFIG_MASKED_INTERINTER
mask_offset_x, mask_offset_y,
#endif
mi_x, mi_y);
break;
case PARTITION_VERT:
build_inter_predictors(xd, 0, 1, bw, bh, 0, 0, bw, bh,
#if CONFIG_MASKED_INTERINTER
mask_offset_x, mask_offset_y,
#endif
mi_x, mi_y);
break;
case PARTITION_SPLIT:
build_inter_predictors(xd, 0, 1, bw, bh, 0, 0, bw, bh,
#if CONFIG_MASKED_INTERINTER
mask_offset_x, mask_offset_y,
#endif
mi_x, mi_y);
xd->plane[0].dst.buf = tmp_buf1;
xd->plane[0].dst.stride = 32;
build_inter_predictors(xd, 0, 2, bw, bh, 0, 0, bw, bh,
#if CONFIG_MASKED_INTERINTER
mask_offset_x, mask_offset_y,
#endif
mi_x, mi_y);
xd->plane[0].dst.buf = tmp_buf2;
xd->plane[0].dst.stride = 32;
build_inter_predictors(xd, 0, 3, bw, bh, 0, 0, bw, bh,
#if CONFIG_MASKED_INTERINTER
mask_offset_x, mask_offset_y,
#endif
mi_x, mi_y);
break;
default:
assert(0);
}
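// Recombination: HORZ and VERT need a single masked blend between orig_dst
// and tmp_buf. SPLIT first blends within each row (orig_dst with tmp_buf,
// tmp_buf1 with tmp_buf2, both PARTITION_VERT), then blends the top row
// with the bottom row (PARTITION_HORZ).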
if (partition != PARTITION_SPLIT) {
vp9_build_masked_inter_predictor_complex(orig_dst, orig_dst_stride,
tmp_buf, 32,
0, mi_row, mi_col,
mi_row_ori, mi_col_ori,
BLOCK_8X8, top_bsize,
partition);
xd->plane[0].dst.buf = orig_dst;
xd->plane[0].dst.stride = orig_dst_stride;
} else {
vp9_build_masked_inter_predictor_complex(orig_dst, orig_dst_stride,
tmp_buf, 32,
0, mi_row, mi_col,
mi_row_ori, mi_col_ori,
BLOCK_8X8, top_bsize,
PARTITION_VERT);
vp9_build_masked_inter_predictor_complex(tmp_buf1, 32,
tmp_buf2, 32,
0, mi_row, mi_col,
mi_row_ori, mi_col_ori,
BLOCK_8X8, top_bsize,
PARTITION_VERT);
vp9_build_masked_inter_predictor_complex(orig_dst, orig_dst_stride,
tmp_buf1, 32,
0, mi_row, mi_col,
mi_row_ori, mi_col_ori,
BLOCK_8X8, top_bsize,
PARTITION_HORZ);
xd->plane[0].dst.buf = orig_dst;
xd->plane[0].dst.stride = orig_dst_stride;
}
}
void vp9_build_inter_predictors_sbuv_sub8x8_extend(MACROBLOCKD *xd,
#if CONFIG_MASKED_INTERINTER
int mi_row, int mi_col,
#endif
int mi_row_ori,
int mi_col_ori,
BLOCK_SIZE top_bsize) {
int plane;
const int mi_x = mi_col_ori * MI_SIZE;
const int mi_y = mi_row_ori * MI_SIZE;
#if CONFIG_MASKED_INTERINTER
const int mask_offset_x = (mi_col - mi_col_ori) * MI_SIZE;
const int mask_offset_y = (mi_row - mi_row_ori) * MI_SIZE;
#endif
for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
const BLOCK_SIZE plane_bsize = get_plane_block_size(top_bsize,
&xd->plane[plane]);
const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
const int bw = 4 * num_4x4_w;
const int bh = 4 * num_4x4_h;
build_inter_predictors(xd, plane, 0, bw, bh, 0, 0, bw, bh,
#if CONFIG_MASKED_INTERINTER
mask_offset_x, mask_offset_y,
#endif
mi_x, mi_y);
}
}
#endif
// TODO(jingning): This function serves as a placeholder for decoder prediction
// using on demand border extension. It should be moved to /decoder/ directory.
static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
int bw, int bh,
int x, int y, int w, int h,
#if CONFIG_SUPERTX && CONFIG_MASKED_INTERINTER
int mask_offset_x, int mask_offset_y,
#endif
int mi_x, int mi_y) {
struct macroblockd_plane *const pd = &xd->plane[plane];
const MODE_INFO *mi = xd->mi[0];
@@ -377,8 +1044,27 @@ static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
}
}
#if CONFIG_MASKED_INTERINTER
if (ref && get_mask_bits(mi->mbmi.sb_type)
&& mi->mbmi.use_masked_interinter) {
uint8_t tmp_dst[4096];
inter_predictor(buf_ptr, buf_stride, tmp_dst, 64,
subpel_x, subpel_y, sf, w, h, 0, kernel, xs, ys);
#if CONFIG_SUPERTX
build_masked_compound_extend(dst, dst_buf->stride, tmp_dst, 64, plane,
mi->mbmi.mask_index, mi->mbmi.sb_type,
mask_offset_x, mask_offset_y, h, w);
#else
build_masked_compound(dst, dst_buf->stride, tmp_dst, 64,
mi->mbmi.mask_index, mi->mbmi.sb_type, h, w);
#endif
} else {
#endif
inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
subpel_y, sf, w, h, ref, kernel, xs, ys);
#if CONFIG_MASKED_INTERINTER
}
#endif
}
}
@@ -401,13 +1087,198 @@ void vp9_dec_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
for (y = 0; y < num_4x4_h; ++y)
for (x = 0; x < num_4x4_w; ++x)
dec_build_inter_predictors(xd, plane, i++, bw, bh,
4 * x, 4 * y, 4, 4,
#if CONFIG_SUPERTX && CONFIG_MASKED_INTERINTER
0, 0,
#endif
mi_x, mi_y);
} else {
dec_build_inter_predictors(xd, plane, 0, bw, bh,
0, 0, bw, bh,
#if CONFIG_SUPERTX && CONFIG_MASKED_INTERINTER
0, 0,
#endif
mi_x, mi_y);
}
}
#if CONFIG_INTERINTRA
if (xd->mi[0]->mbmi.ref_frame[1] == INTRA_FRAME &&
is_interintra_allowed(xd->mi[0]->mbmi.sb_type))
vp9_build_interintra_predictors(xd, xd->plane[0].dst.buf,
xd->plane[1].dst.buf, xd->plane[2].dst.buf,
xd->plane[0].dst.stride,
xd->plane[1].dst.stride,
xd->plane[2].dst.stride, bsize);
#endif
}
#if CONFIG_SUPERTX
#if CONFIG_MASKED_INTERINTER
void vp9_dec_build_inter_predictors_sb_extend(MACROBLOCKD *xd,
int mi_row, int mi_col,
int mi_row_ori, int mi_col_ori,
BLOCK_SIZE bsize) {
int plane;
const int mi_x = mi_col_ori * MI_SIZE;
const int mi_y = mi_row_ori * MI_SIZE;
const int mask_offset_x = (mi_col - mi_col_ori) * MI_SIZE;
const int mask_offset_y = (mi_row - mi_row_ori) * MI_SIZE;
for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize,
&xd->plane[plane]);
const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
const int bw = 4 * num_4x4_w;
const int bh = 4 * num_4x4_h;
if (xd->mi[0]->mbmi.sb_type < BLOCK_8X8) {
int i = 0, x, y;
assert(bsize == BLOCK_8X8);
for (y = 0; y < num_4x4_h; ++y)
for (x = 0; x < num_4x4_w; ++x)
dec_build_inter_predictors(xd, plane, i++, bw, bh, 4 * x, 4 * y, 4, 4,
mask_offset_x, mask_offset_y, mi_x, mi_y);
} else {
dec_build_inter_predictors(xd, plane, 0, bw, bh, 0, 0, bw, bh,
mask_offset_x, mask_offset_y, mi_x, mi_y);
}
}
}
#endif
void vp9_dec_build_inter_predictors_sby_sub8x8_extend(MACROBLOCKD *xd,
int mi_row, int mi_col,
int mi_row_ori,
int mi_col_ori,
BLOCK_SIZE top_bsize,
PARTITION_TYPE partition) {
const int mi_x = mi_col_ori * MI_SIZE;
const int mi_y = mi_row_ori * MI_SIZE;
#if CONFIG_MASKED_INTERINTER
const int mask_offset_x = (mi_col - mi_col_ori) * MI_SIZE;
const int mask_offset_y = (mi_row - mi_row_ori) * MI_SIZE;
#endif
uint8_t *orig_dst;
int orig_dst_stride;
int bw = 4 << b_width_log2(top_bsize);
int bh = 4 << b_height_log2(top_bsize);
DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf, 32 * 32);
DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf1, 32 * 32);
DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf2, 32 * 32);
orig_dst = xd->plane[0].dst.buf;
orig_dst_stride = xd->plane[0].dst.stride;
dec_build_inter_predictors(xd, 0, 0, bw, bh, 0, 0, bw, bh,
#if CONFIG_MASKED_INTERINTER
mask_offset_x, mask_offset_y,
#endif
mi_x, mi_y);
xd->plane[0].dst.buf = tmp_buf;
xd->plane[0].dst.stride = 32;
switch (partition) {
case PARTITION_HORZ:
dec_build_inter_predictors(xd, 0, 2, bw, bh, 0, 0, bw, bh,
#if CONFIG_MASKED_INTERINTER
mask_offset_x, mask_offset_y,
#endif
mi_x, mi_y);
break;
case PARTITION_VERT:
dec_build_inter_predictors(xd, 0, 1, bw, bh, 0, 0, bw, bh,
#if CONFIG_MASKED_INTERINTER
mask_offset_x, mask_offset_y,
#endif
mi_x, mi_y);
break;
case PARTITION_SPLIT:
dec_build_inter_predictors(xd, 0, 1, bw, bh, 0, 0, bw, bh,
#if CONFIG_MASKED_INTERINTER
mask_offset_x, mask_offset_y,
#endif
mi_x, mi_y);
xd->plane[0].dst.buf = tmp_buf1;
xd->plane[0].dst.stride = 32;
dec_build_inter_predictors(xd, 0, 2, bw, bh, 0, 0, bw, bh,
#if CONFIG_MASKED_INTERINTER
mask_offset_x, mask_offset_y,
#endif
mi_x, mi_y);
xd->plane[0].dst.buf = tmp_buf2;
xd->plane[0].dst.stride = 32;
dec_build_inter_predictors(xd, 0, 3, bw, bh, 0, 0, bw, bh,
#if CONFIG_MASKED_INTERINTER
mask_offset_x, mask_offset_y,
#endif
mi_x, mi_y);
break;
default:
assert(0);
}
if (partition != PARTITION_SPLIT) {
vp9_build_masked_inter_predictor_complex(orig_dst, orig_dst_stride,
tmp_buf, 32,
0, mi_row, mi_col,
mi_row_ori, mi_col_ori,
BLOCK_8X8, top_bsize,
partition);
xd->plane[0].dst.buf = orig_dst;
xd->plane[0].dst.stride = orig_dst_stride;
} else {
vp9_build_masked_inter_predictor_complex(orig_dst, orig_dst_stride,
tmp_buf, 32,
0, mi_row, mi_col,
mi_row_ori, mi_col_ori,
BLOCK_8X8, top_bsize,
PARTITION_VERT);
vp9_build_masked_inter_predictor_complex(tmp_buf1, 32,
tmp_buf2, 32,
0, mi_row, mi_col,
mi_row_ori, mi_col_ori,
BLOCK_8X8, top_bsize,
PARTITION_VERT);
vp9_build_masked_inter_predictor_complex(orig_dst, orig_dst_stride,
tmp_buf1, 32,
0, mi_row, mi_col,
mi_row_ori, mi_col_ori,
BLOCK_8X8, top_bsize,
PARTITION_HORZ);
xd->plane[0].dst.buf = orig_dst;
xd->plane[0].dst.stride = orig_dst_stride;
}
}
void vp9_dec_build_inter_predictors_sbuv_sub8x8_extend(MACROBLOCKD *xd,
#if CONFIG_MASKED_INTERINTER
int mi_row, int mi_col,
#endif
int mi_row_ori,
int mi_col_ori,
BLOCK_SIZE top_bsize) {
int plane;
const int mi_x = mi_col_ori * MI_SIZE;
const int mi_y = mi_row_ori * MI_SIZE;
#if CONFIG_MASKED_INTERINTER
const int mask_offset_x = (mi_col - mi_col_ori) * MI_SIZE;
const int mask_offset_y = (mi_row - mi_row_ori) * MI_SIZE;
#endif
for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
const BLOCK_SIZE plane_bsize = get_plane_block_size(top_bsize,
&xd->plane[plane]);
const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
const int bw = 4 * num_4x4_w;
const int bh = 4 * num_4x4_h;
dec_build_inter_predictors(xd, plane, 0, bw, bh, 0, 0, bw, bh,
#if CONFIG_MASKED_INTERINTER
mask_offset_x, mask_offset_y,
#endif
mi_x, mi_y);
}
}
#endif
void vp9_setup_dst_planes(struct macroblockd_plane planes[MAX_MB_PLANE],
const YV12_BUFFER_CONFIG *src,


@@ -65,6 +65,60 @@ void vp9_setup_pre_planes(MACROBLOCKD *xd, int idx,
const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
const struct scale_factors *sf);
#if CONFIG_MASKED_INTERINTER
void vp9_generate_masked_weight(int mask_index, BLOCK_SIZE sb_type,
int h, int w, uint8_t *mask, int stride);
void vp9_generate_hard_mask(int mask_index, BLOCK_SIZE sb_type,
int h, int w, uint8_t *mask, int stride);
#endif
#if CONFIG_SUPERTX
void vp9_build_inter_predictors_sby_sub8x8_extend(MACROBLOCKD *xd,
int mi_row, int mi_col,
int mi_row_ori,
int mi_col_ori,
BLOCK_SIZE top_bsize,
PARTITION_TYPE partition);
void vp9_build_inter_predictors_sbuv_sub8x8_extend(MACROBLOCKD *xd,
#if CONFIG_MASKED_INTERINTER
int mi_row, int mi_col,
#endif
int mi_row_ori,
int mi_col_ori,
BLOCK_SIZE top_bsize);
void vp9_build_masked_inter_predictor_complex(uint8_t *dst, int dst_stride,
uint8_t *dst2, int dst2_stride,
int plane,
int mi_row, int mi_col,
int mi_row_ori, int mi_col_ori,
BLOCK_SIZE bsize,
BLOCK_SIZE top_bsize,
PARTITION_TYPE partition);
void vp9_dec_build_inter_predictors_sby_sub8x8_extend(MACROBLOCKD *xd,
int mi_row, int mi_col,
int mi_row_ori,
int mi_col_ori,
BLOCK_SIZE top_bsize,
PARTITION_TYPE p);
void vp9_dec_build_inter_predictors_sbuv_sub8x8_extend(MACROBLOCKD *xd,
#if CONFIG_MASKED_INTERINTER
int mi_row, int mi_col,
#endif
int mi_row_ori,
int mi_col_ori,
BLOCK_SIZE top_bsize);
#if CONFIG_MASKED_INTERINTER
void vp9_build_inter_predictors_sb_extend(MACROBLOCKD *xd,
int mi_row, int mi_col,
int mi_row_ori, int mi_col_ori,
BLOCK_SIZE bsize);
void vp9_dec_build_inter_predictors_sb_extend(MACROBLOCKD *xd,
int mi_row, int mi_col,
int mi_row_ori, int mi_col_ori,
BLOCK_SIZE bsize);
#endif
#endif
#ifdef __cplusplus
} // extern "C"
#endif


@@ -444,8 +444,227 @@ static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref,
}
}
#if CONFIG_FILTERINTRA
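// Recursive 4-tap filtered intra prediction: the boundary mean is removed,
// then each sample is predicted from its top, left, top-left and top-right
// neighbours as
//   pred[r][c] = round(c1 * pred[r-1][c] + c2 * pred[r][c-1] +
//                      c3 * pred[r-1][c-1] + c4 * pred[r-1][c+1]),
// at 10 fractional bits, with the tap set (c1..c4) chosen by prediction
// mode and block size, and the mean added back at the end.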
static void filter_intra_predictors_4tap(uint8_t *ypred_ptr, int y_stride,
int bs,
const uint8_t *yabove_row,
const uint8_t *yleft_col,
int mode) {
static const int prec_bits = 10;
static const int round_val = 511;
int k, r, c;
int pred[33][65];  /* 2 * bs + 1 columns are written for bs up to 32 */
int mean, ipred;
static const int taps4_4[10][4] = {
{735, 881, -537, -54},
{1005, 519, -488, -11},
{383, 990, -343, -6},
{442, 805, -542, 319},
{658, 616, -133, -116},
{875, 442, -141, -151},
{386, 741, -23, -80},
{390, 1027, -446, 51},
{679, 606, -523, 262},
{903, 922, -778, -23}
};
static const int taps4_8[10][4] = {
{648, 803, -444, 16},
{972, 620, -576, 7},
{561, 967, -499, -5},
{585, 762, -468, 144},
{596, 619, -182, -9},
{895, 459, -176, -153},
{557, 722, -126, -129},
{601, 839, -523, 105},
{562, 709, -499, 251},
{803, 872, -695, 43}
};
static const int taps4_16[10][4] = {
{423, 728, -347, 111},
{963, 685, -665, 23},
{281, 1024, -480, 216},
{640, 596, -437, 78},
{429, 669, -259, 99},
{740, 646, -415, 23},
{568, 771, -346, 40},
{404, 833, -486, 209},
{398, 712, -423, 307},
{939, 935, -887, 17}
};
static const int taps4_32[10][4] = {
{477, 737, -393, 150},
{881, 630, -546, 67},
{506, 984, -443, -20},
{114, 459, -270, 528},
{433, 528, 14, 3},
{837, 470, -301, -30},
{181, 777, 89, -107},
{-29, 716, -232, 259},
{589, 646, -495, 255},
{740, 884, -728, 77}
};
const int c1 = (bs >= 32) ? taps4_32[mode][0] : ((bs >= 16) ?
taps4_16[mode][0] : ((bs >= 8) ? taps4_8[mode][0] : taps4_4[mode][0]));
const int c2 = (bs >= 32) ? taps4_32[mode][1] : ((bs >= 16) ?
taps4_16[mode][1] : ((bs >= 8) ? taps4_8[mode][1] : taps4_4[mode][1]));
const int c3 = (bs >= 32) ? taps4_32[mode][2] : ((bs >= 16) ?
taps4_16[mode][2] : ((bs >= 8) ? taps4_8[mode][2] : taps4_4[mode][2]));
const int c4 = (bs >= 32) ? taps4_32[mode][3] : ((bs >= 16) ?
taps4_16[mode][3] : ((bs >= 8) ? taps4_8[mode][3] : taps4_4[mode][3]));
k = 0;
mean = 0;
while (k < bs) {
mean = mean + (int)yleft_col[k];
mean = mean + (int)yabove_row[k];
k++;
}
mean = (mean + bs) / (2 * bs);
for (r = 0; r < bs; r++)
pred[r + 1][0] = (int)yleft_col[r] - mean;
for (c = 0; c < 2 * bs + 1; c++)
pred[0][c] = (int)yabove_row[c - 1] - mean;
for (r = 1; r < bs + 1; r++)
for (c = 1; c < 2 * bs + 1 - r; c++) {
ipred = c1 * pred[r - 1][c] + c2 * pred[r][c - 1]
+ c3 * pred[r - 1][c - 1] + c4 * pred[r - 1][c + 1];
pred[r][c] = ipred < 0 ? -((-ipred + round_val) >> prec_bits) :
((ipred + round_val) >> prec_bits);
}
for (r = 0; r < bs; r++) {
for (c = 0; c < bs; c++) {
ipred = pred[r + 1][c + 1] + mean;
ypred_ptr[c] = clip_pixel(ipred);
}
ypred_ptr += y_stride;
}
}
static void build_filter_intra_predictors(const MACROBLOCKD *xd,
const uint8_t *ref, int ref_stride,
uint8_t *dst, int dst_stride,
PREDICTION_MODE mode, TX_SIZE tx_size,
int up_available, int left_available,
int right_available, int x, int y,
int plane) {
int i;
DECLARE_ALIGNED_ARRAY(16, uint8_t, left_col, 64);
DECLARE_ALIGNED_ARRAY(16, uint8_t, above_data, 128 + 16);
uint8_t *above_row = above_data + 16;
const uint8_t *const_above_row = above_row;
const int bs = 4 << tx_size;
int frame_width, frame_height;
int x0, y0;
const struct macroblockd_plane *const pd = &xd->plane[plane];
// Get current frame pointer, width and height.
if (plane == 0) {
frame_width = xd->cur_buf->y_width;
frame_height = xd->cur_buf->y_height;
} else {
frame_width = xd->cur_buf->uv_width;
frame_height = xd->cur_buf->uv_height;
}
// Get block position in current frame.
x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x;
y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y;
vpx_memset(left_col, 129, 64);
// left
if (left_available) {
if (xd->mb_to_bottom_edge < 0) {
/* slower path if the block needs border extension */
if (y0 + bs <= frame_height) {
for (i = 0; i < bs; ++i)
left_col[i] = ref[i * ref_stride - 1];
} else {
const int extend_bottom = frame_height - y0;
for (i = 0; i < extend_bottom; ++i)
left_col[i] = ref[i * ref_stride - 1];
for (; i < bs; ++i)
left_col[i] = ref[(extend_bottom - 1) * ref_stride - 1];
}
} else {
/* faster path if the block does not need extension */
for (i = 0; i < bs; ++i)
left_col[i] = ref[i * ref_stride - 1];
}
}
// TODO(hkuang) do not extend 2*bs pixels for all modes.
// above
if (up_available) {
const uint8_t *above_ref = ref - ref_stride;
if (xd->mb_to_right_edge < 0) {
/* slower path if the block needs border extension */
if (x0 + 2 * bs <= frame_width) {
if (right_available && bs == 4) {
vpx_memcpy(above_row, above_ref, 2 * bs);
} else {
vpx_memcpy(above_row, above_ref, bs);
vpx_memset(above_row + bs, above_row[bs - 1], bs);
}
} else if (x0 + bs <= frame_width) {
const int r = frame_width - x0;
if (right_available && bs == 4) {
vpx_memcpy(above_row, above_ref, r);
vpx_memset(above_row + r, above_row[r - 1],
x0 + 2 * bs - frame_width);
} else {
vpx_memcpy(above_row, above_ref, bs);
vpx_memset(above_row + bs, above_row[bs - 1], bs);
}
} else if (x0 <= frame_width) {
const int r = frame_width - x0;
if (right_available && bs == 4) {
vpx_memcpy(above_row, above_ref, r);
vpx_memset(above_row + r, above_row[r - 1],
x0 + 2 * bs - frame_width);
} else {
vpx_memcpy(above_row, above_ref, r);
vpx_memset(above_row + r, above_row[r - 1],
x0 + 2 * bs - frame_width);
}
}
above_row[-1] = left_available ? above_ref[-1] : 129;
} else {
/* faster path if the block does not need extension */
if (bs == 4 && right_available && left_available) {
const_above_row = above_ref;
} else {
vpx_memcpy(above_row, above_ref, bs);
if (bs == 4 && right_available)
vpx_memcpy(above_row + bs, above_ref + bs, bs);
else
vpx_memset(above_row + bs, above_row[bs - 1], bs);
above_row[-1] = left_available ? above_ref[-1] : 129;
}
}
} else {
vpx_memset(above_row, 127, bs * 2);
above_row[-1] = 127;
}
// predict
filter_intra_predictors_4tap(dst, dst_stride, bs, const_above_row, left_col,
mode);
}
#endif
void vp9_predict_intra_block(const MACROBLOCKD *xd, int block_idx, int bwl_in,
TX_SIZE tx_size, PREDICTION_MODE mode,
#if CONFIG_FILTERINTRA
int filterbit,
#endif
const uint8_t *ref, int ref_stride,
uint8_t *dst, int dst_stride,
int aoff, int loff, int plane) {
@@ -456,8 +675,708 @@ void vp9_predict_intra_block(const MACROBLOCKD *xd, int block_idx, int bwl_in,
const int have_right = ((block_idx & wmask) != wmask);
const int x = aoff * 4;
const int y = loff * 4;
#if CONFIG_FILTERINTRA
const int filterflag = is_filter_allowed(mode) && is_filter_enabled(tx_size)
&& filterbit;
#endif
assert(bwl >= 0);
#if CONFIG_FILTERINTRA
if (!filterflag) {
#endif
build_intra_predictors(xd, ref, ref_stride, dst, dst_stride, mode, tx_size,
have_top, have_left, have_right, x, y, plane);
#if CONFIG_FILTERINTRA
} else {
build_filter_intra_predictors(xd, ref, ref_stride, dst, dst_stride, mode,
tx_size, have_top, have_left, have_right, x, y, plane);
}
#endif
}
#if CONFIG_INTERINTRA
#if CONFIG_MASKED_INTERINTRA
#define MASK_WEIGHT_BITS_INTERINTRA 6
static int get_masked_weight_interintra(int m) {
#define SMOOTHER_LEN_INTERINTRA 32
static const uint8_t smoothfn[2 * SMOOTHER_LEN_INTERINTRA + 1] = {
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1,
1, 1, 2, 2, 3, 4, 5, 6,
8, 9, 12, 14, 17, 21, 24, 28,
32,
36, 40, 43, 47, 50, 52, 55, 56,
58, 59, 60, 61, 62, 62, 63, 63,
63, 63, 63, 64, 64, 64, 64, 64,
64, 64, 64, 64, 64, 64, 64, 64,
};
if (m < -SMOOTHER_LEN_INTERINTRA)
return 0;
else if (m > SMOOTHER_LEN_INTERINTRA)
return (1 << MASK_WEIGHT_BITS_INTERINTRA);
else
return smoothfn[m + SMOOTHER_LEN_INTERINTRA];
}
static int get_hard_mask_interintra(int m) {
return m > 0;
}
// Equation of line: f(x, y) = a[0]*(x - a[2]*w/4) + a[1]*(y - a[3]*h/4) = 0
// The soft mask is obtained by computing f(x, y) and then calling
// get_masked_weight(f(x, y)).
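// Example: parameters {-1, 2, 2, 2} on a w x h block give the boundary
//   -(x - w/2) + 2 * (y - h/2) = 0,
// a line through the block centre; pixels well below the line map to
// weights near the maximum and pixels well above it to weights near 0.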
static const int mask_params_sml_interintra[1 << MASK_BITS_SML_INTERINTRA]
[4] = {
{-1, 2, 2, 2},
{ 1, -2, 2, 2},
{-2, 1, 2, 2},
{ 2, -1, 2, 2},
{ 2, 1, 2, 2},
{-2, -1, 2, 2},
{ 1, 2, 2, 2},
{-1, -2, 2, 2},
};
static const int mask_params_med_hgtw_interintra[1 << MASK_BITS_MED_INTERINTRA]
[4] = {
{-1, 2, 2, 2},
{ 1, -2, 2, 2},
{-2, 1, 2, 2},
{ 2, -1, 2, 2},
{ 2, 1, 2, 2},
{-2, -1, 2, 2},
{ 1, 2, 2, 2},
{-1, -2, 2, 2},
{-1, 2, 2, 1},
{ 1, -2, 2, 1},
{-1, 2, 2, 3},
{ 1, -2, 2, 3},
{ 1, 2, 2, 1},
{-1, -2, 2, 1},
{ 1, 2, 2, 3},
{-1, -2, 2, 3},
};
static const int mask_params_med_hltw_interintra[1 << MASK_BITS_MED_INTERINTRA]
[4] = {
{-1, 2, 2, 2},
{ 1, -2, 2, 2},
{-2, 1, 2, 2},
{ 2, -1, 2, 2},
{ 2, 1, 2, 2},
{-2, -1, 2, 2},
{ 1, 2, 2, 2},
{-1, -2, 2, 2},
{-2, 1, 1, 2},
{ 2, -1, 1, 2},
{-2, 1, 3, 2},
{ 2, -1, 3, 2},
{ 2, 1, 1, 2},
{-2, -1, 1, 2},
{ 2, 1, 3, 2},
{-2, -1, 3, 2},
};
static const int mask_params_med_heqw_interintra[1 << MASK_BITS_MED_INTERINTRA]
[4] = {
{-1, 2, 2, 2},
{ 1, -2, 2, 2},
{-2, 1, 2, 2},
{ 2, -1, 2, 2},
{ 2, 1, 2, 2},
{-2, -1, 2, 2},
{ 1, 2, 2, 2},
{-1, -2, 2, 2},
{ 0, 2, 0, 1},
{ 0, -2, 0, 1},
{ 0, 2, 0, 3},
{ 0, -2, 0, 3},
{ 2, 0, 1, 0},
{-2, 0, 1, 0},
{ 2, 0, 3, 0},
{-2, 0, 3, 0},
};
static const int mask_params_big_hgtw_interintra[1 << MASK_BITS_BIG_INTERINTRA]
[4] = {
{-1, 2, 2, 2},
{ 1, -2, 2, 2},
{-2, 1, 2, 2},
{ 2, -1, 2, 2},
{ 2, 1, 2, 2},
{-2, -1, 2, 2},
{ 1, 2, 2, 2},
{-1, -2, 2, 2},
{-1, 2, 2, 1},
{ 1, -2, 2, 1},
{-1, 2, 2, 3},
{ 1, -2, 2, 3},
{ 1, 2, 2, 1},
{-1, -2, 2, 1},
{ 1, 2, 2, 3},
{-1, -2, 2, 3},
{-2, 1, 1, 2},
{ 2, -1, 1, 2},
{-2, 1, 3, 2},
{ 2, -1, 3, 2},
{ 2, 1, 1, 2},
{-2, -1, 1, 2},
{ 2, 1, 3, 2},
{-2, -1, 3, 2},
{ 0, 2, 0, 1},
{ 0, -2, 0, 1},
{ 0, 2, 0, 2},
{ 0, -2, 0, 2},
{ 0, 2, 0, 3},
{ 0, -2, 0, 3},
{ 2, 0, 2, 0},
{-2, 0, 2, 0},
};
static const int mask_params_big_hltw_interintra[1 << MASK_BITS_BIG_INTERINTRA]
[4] = {
{-1, 2, 2, 2},
{ 1, -2, 2, 2},
{-2, 1, 2, 2},
{ 2, -1, 2, 2},
{ 2, 1, 2, 2},
{-2, -1, 2, 2},
{ 1, 2, 2, 2},
{-1, -2, 2, 2},
{-1, 2, 2, 1},
{ 1, -2, 2, 1},
{-1, 2, 2, 3},
{ 1, -2, 2, 3},
{ 1, 2, 2, 1},
{-1, -2, 2, 1},
{ 1, 2, 2, 3},
{-1, -2, 2, 3},
{-2, 1, 1, 2},
{ 2, -1, 1, 2},
{-2, 1, 3, 2},
{ 2, -1, 3, 2},
{ 2, 1, 1, 2},
{-2, -1, 1, 2},
{ 2, 1, 3, 2},
{-2, -1, 3, 2},
{ 0, 2, 0, 2},
{ 0, -2, 0, 2},
{ 2, 0, 1, 0},
{-2, 0, 1, 0},
{ 2, 0, 2, 0},
{-2, 0, 2, 0},
{ 2, 0, 3, 0},
{-2, 0, 3, 0},
};
static const int mask_params_big_heqw_interintra[1 << MASK_BITS_BIG_INTERINTRA]
[4] = {
{-1, 2, 2, 2},
{ 1, -2, 2, 2},
{-2, 1, 2, 2},
{ 2, -1, 2, 2},
{ 2, 1, 2, 2},
{-2, -1, 2, 2},
{ 1, 2, 2, 2},
{-1, -2, 2, 2},
{-1, 2, 2, 1},
{ 1, -2, 2, 1},
{-1, 2, 2, 3},
{ 1, -2, 2, 3},
{ 1, 2, 2, 1},
{-1, -2, 2, 1},
{ 1, 2, 2, 3},
{-1, -2, 2, 3},
{-2, 1, 1, 2},
{ 2, -1, 1, 2},
{-2, 1, 3, 2},
{ 2, -1, 3, 2},
{ 2, 1, 1, 2},
{-2, -1, 1, 2},
{ 2, 1, 3, 2},
{-2, -1, 3, 2},
{ 0, 2, 0, 1},
{ 0, -2, 0, 1},
{ 0, 2, 0, 3},
{ 0, -2, 0, 3},
{ 2, 0, 1, 0},
{-2, 0, 1, 0},
{ 2, 0, 3, 0},
{-2, 0, 3, 0},
};
static const int *get_mask_params_interintra(int mask_index,
BLOCK_SIZE sb_type,
int h, int w) {
const int *a;
const int mask_bits = get_mask_bits_interintra(sb_type);
if (mask_index == MASK_NONE_INTERINTRA)
return NULL;
if (mask_bits == MASK_BITS_SML_INTERINTRA) {
a = mask_params_sml_interintra[mask_index];
} else if (mask_bits == MASK_BITS_MED_INTERINTRA) {
if (h > w)
a = mask_params_med_hgtw_interintra[mask_index];
else if (h < w)
a = mask_params_med_hltw_interintra[mask_index];
else
a = mask_params_med_heqw_interintra[mask_index];
} else if (mask_bits == MASK_BITS_BIG_INTERINTRA) {
if (h > w)
a = mask_params_big_hgtw_interintra[mask_index];
else if (h < w)
a = mask_params_big_hltw_interintra[mask_index];
else
a = mask_params_big_heqw_interintra[mask_index];
} else {
assert(0);
}
return a;
}
void vp9_generate_masked_weight_interintra(int mask_index,
BLOCK_SIZE sb_type,
int h, int w,
uint8_t *mask, int stride) {
int i, j;
const int *a = get_mask_params_interintra(mask_index, sb_type, h, w);
if (!a) return;
for (i = 0; i < h; ++i)
for (j = 0; j < w; ++j) {
int x = (j - (a[2] * w) / 4);
int y = (i - (a[3] * h) / 4);
int m = a[0] * x + a[1] * y;
mask[i * stride + j] = get_masked_weight_interintra(m);
}
}
void vp9_generate_hard_mask_interintra(int mask_index, BLOCK_SIZE sb_type,
int h, int w, uint8_t *mask, int stride) {
int i, j;
const int *a = get_mask_params_interintra(mask_index, sb_type, h, w);
if (!a) return;
for (i = 0; i < h; ++i)
for (j = 0; j < w; ++j) {
int x = (j - (a[2] * w) / 4);
int y = (i - (a[3] * h) / 4);
int m = a[0] * x + a[1] * y;
mask[i * stride + j] = get_hard_mask_interintra(m);
}
}
#endif
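// Combines the inter and intra predictors. With a mask, the per-pixel soft
// weight favours the intra predictor where the mask is large. Otherwise a
// directional table (weights1d, decaying from 128 to 67 out of 256) gives
// the intra predictor at most half weight at the boundary it was predicted
// from, fading towards the inter predictor with distance.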
static void combine_interintra(PREDICTION_MODE mode,
#if CONFIG_MASKED_INTERINTRA
int use_masked_interintra,
int mask_index,
BLOCK_SIZE bsize,
#endif
uint8_t *comppred,
int compstride,
uint8_t *interpred,
int interstride,
uint8_t *intrapred,
int intrastride,
int bw, int bh) {
static const int scale_bits = 8;
static const int scale_max = 256;
static const int scale_round = 127;
static const int weights1d[64] = {
128, 125, 122, 119, 116, 114, 111, 109,
107, 105, 103, 101, 99, 97, 96, 94,
93, 91, 90, 89, 88, 86, 85, 84,
83, 82, 81, 81, 80, 79, 78, 78,
77, 76, 76, 75, 75, 74, 74, 73,
73, 72, 72, 71, 71, 71, 70, 70,
70, 70, 69, 69, 69, 69, 68, 68,
68, 68, 68, 67, 67, 67, 67, 67,
};
int size = MAX(bw, bh);
int size_scale = (size >= 64 ? 1 :
size == 32 ? 2 :
size == 16 ? 4 :
size == 8 ? 8 : 16);
int i, j;
#if CONFIG_MASKED_INTERINTRA
if (use_masked_interintra && get_mask_bits_interintra(bsize)) {
uint8_t mask[4096];
vp9_generate_masked_weight_interintra(mask_index, bsize, bh, bw, mask, bw);
for (i = 0; i < bh; ++i) {
for (j = 0; j < bw; ++j) {
int m = mask[i * bw + j];
comppred[i * compstride + j] =
(intrapred[i * intrastride + j] * m +
interpred[i * interstride + j] *
((1 << MASK_WEIGHT_BITS_INTERINTRA) - m) +
(1 << (MASK_WEIGHT_BITS_INTERINTRA - 1))) >>
MASK_WEIGHT_BITS_INTERINTRA;
}
}
return;
}
#endif
switch (mode) {
case V_PRED:
for (i = 0; i < bh; ++i) {
for (j = 0; j < bw; ++j) {
int scale = weights1d[i * size_scale];
comppred[i * compstride + j] =
((scale_max - scale) * interpred[i * interstride + j] +
scale * intrapred[i * intrastride + j] + scale_round)
>> scale_bits;
}
}
break;
case H_PRED:
for (i = 0; i < bh; ++i) {
for (j = 0; j < bw; ++j) {
int scale = weights1d[j * size_scale];
comppred[i * compstride + j] =
((scale_max - scale) * interpred[i * interstride + j] +
scale * intrapred[i * intrastride + j] + scale_round)
>> scale_bits;
}
}
break;
case D63_PRED:
case D117_PRED:
for (i = 0; i < bh; ++i) {
for (j = 0; j < bw; ++j) {
int scale = (weights1d[i * size_scale] * 3 +
weights1d[j * size_scale]) >> 2;
comppred[i * compstride + j] =
((scale_max - scale) * interpred[i * interstride + j] +
scale * intrapred[i * intrastride + j] + scale_round)
>> scale_bits;
}
}
break;
case D207_PRED:
case D153_PRED:
for (i = 0; i < bh; ++i) {
for (j = 0; j < bw; ++j) {
int scale = (weights1d[j * size_scale] * 3 +
weights1d[i * size_scale]) >> 2;
comppred[i * compstride + j] =
((scale_max - scale) * interpred[i * interstride + j] +
scale * intrapred[i * intrastride + j] + scale_round)
>> scale_bits;
}
}
break;
case D135_PRED:
for (i = 0; i < bh; ++i) {
for (j = 0; j < bw; ++j) {
int scale = weights1d[(i < j ? i : j) * size_scale];
comppred[i * compstride + j] =
((scale_max - scale) * interpred[i * interstride + j] +
scale * intrapred[i * intrastride + j] + scale_round)
>> scale_bits;
}
}
break;
case D45_PRED:
for (i = 0; i < bh; ++i) {
for (j = 0; j < bw; ++j) {
int scale = (weights1d[i * size_scale] +
weights1d[j * size_scale]) >> 1;
comppred[i * compstride + j] =
((scale_max - scale) * interpred[i * interstride + j] +
scale * intrapred[i * intrastride + j] + scale_round)
>> scale_bits;
}
}
break;
case TM_PRED:
case DC_PRED:
default:
for (i = 0; i < bh; ++i) {
for (j = 0; j < bw; ++j) {
comppred[i * compstride + j] = (interpred[i * interstride + j] +
intrapred[i * intrastride + j]) >> 1;
}
}
break;
}
}
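// Predicts the second square of a rectangular interintra block. bwltbh
// indicates bw < bh, i.e. the second square sits below the first: its
// above row then comes from the first square's prediction (dst) and its
// left column from reconstructed pixels (ref); when the second square sits
// to the right, the roles are swapped.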
static void build_intra_predictors_for_2nd_block_interintra
(const MACROBLOCKD *xd, const uint8_t *ref,
int ref_stride, uint8_t *dst, int dst_stride,
PREDICTION_MODE mode, TX_SIZE tx_size,
int up_available, int left_available,
int right_available, int bwltbh,
int x, int y, int plane) {
int i;
DECLARE_ALIGNED_ARRAY(16, uint8_t, left_col, 64);
DECLARE_ALIGNED_ARRAY(16, uint8_t, above_data, 128 + 16);
uint8_t *above_row = above_data + 16;
const uint8_t *const_above_row = above_row;
const int bs = 4 << tx_size;
int frame_width, frame_height;
int x0, y0;
const struct macroblockd_plane *const pd = &xd->plane[plane];
const uint8_t *ref_fi;
int ref_stride_fi;
// 127 127 127 .. 127 127 127 127 127 127
// 129 A B .. Y Z
// 129 C D .. W X
// 129 E F .. U V
// 129 G H .. S T T T T T
// ..
once(init_intra_pred_fn_ptrs);
// Get current frame pointer, width and height.
if (plane == 0) {
frame_width = xd->cur_buf->y_width;
frame_height = xd->cur_buf->y_height;
} else {
frame_width = xd->cur_buf->uv_width;
frame_height = xd->cur_buf->uv_height;
}
// Get block position in current frame.
x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x;
y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y;
vpx_memset(left_col, 129, 64);
// left
if (left_available) {
if (bwltbh) {
ref_fi = ref;
ref_stride_fi = ref_stride;
} else {
ref_fi = dst;
ref_stride_fi = dst_stride;
}
if (xd->mb_to_bottom_edge < 0) {
/* slower path if the block needs border extension */
if (y0 + bs <= frame_height) {
for (i = 0; i < bs; ++i)
left_col[i] = ref_fi[i * ref_stride_fi - 1];
} else {
const int extend_bottom = frame_height - y0;
assert(extend_bottom >= 0);
for (i = 0; i < extend_bottom; ++i)
left_col[i] = ref_fi[i * ref_stride_fi - 1];
for (; i < bs; ++i)
left_col[i] = ref_fi[(extend_bottom - 1) * ref_stride_fi - 1];
}
} else {
/* faster path if the block does not need extension */
for (i = 0; i < bs; ++i)
left_col[i] = ref_fi[i * ref_stride_fi - 1];
}
}
// TODO(hkuang) do not extend 2*bs pixels for all modes.
// above
if (up_available) {
const uint8_t *above_ref;
if (bwltbh) {
ref_fi = dst;
ref_stride_fi = dst_stride;
above_row[-1] = left_available ? ref[-ref_stride-1] : 129;
} else {
ref_fi = ref;
ref_stride_fi = ref_stride;
above_row[-1] = ref[-ref_stride-1];
}
above_ref = ref_fi - ref_stride_fi;
if (xd->mb_to_right_edge < 0) {
/* slower path if the block needs border extension */
if (x0 + 2 * bs <= frame_width) {
if (right_available && bs == 4) {
vpx_memcpy(above_row, above_ref, 2 * bs);
} else {
vpx_memcpy(above_row, above_ref, bs);
vpx_memset(above_row + bs, above_row[bs - 1], bs);
}
} else if (x0 + bs <= frame_width) {
const int r = frame_width - x0;
if (right_available && bs == 4) {
vpx_memcpy(above_row, above_ref, r);
vpx_memset(above_row + r, above_row[r - 1],
x0 + 2 * bs - frame_width);
} else {
vpx_memcpy(above_row, above_ref, bs);
vpx_memset(above_row + bs, above_row[bs - 1], bs);
}
} else if (x0 <= frame_width) {
const int r = frame_width - x0;
assert(r >= 0);
if (right_available && bs == 4) {
vpx_memcpy(above_row, above_ref, r);
vpx_memset(above_row + r, above_row[r - 1],
x0 + 2 * bs - frame_width);
} else {
vpx_memcpy(above_row, above_ref, r);
vpx_memset(above_row + r, above_row[r - 1],
x0 + 2 * bs - frame_width);
}
}
} else {
/* faster path if the block does not need extension */
if (bs == 4 && right_available && left_available) {
const_above_row = above_ref;
} else {
vpx_memcpy(above_row, above_ref, bs);
if (bs == 4 && right_available)
vpx_memcpy(above_row + bs, above_ref + bs, bs);
else
vpx_memset(above_row + bs, above_row[bs - 1], bs);
}
}
} else {
vpx_memset(above_row, 127, bs * 2);
above_row[-1] = 127;
}
// predict
if (mode == DC_PRED) {
dc_pred[left_available][up_available][tx_size](dst, dst_stride,
const_above_row, left_col);
} else {
pred[mode][tx_size](dst, dst_stride, const_above_row, left_col);
}
}
// Break down rectangular intra prediction for joint spatio-temporal prediction
// into two square intra predictions.
static void build_intra_predictors_for_interintra(MACROBLOCKD *xd,
uint8_t *src, int src_stride,
uint8_t *pred_ptr, int stride,
PREDICTION_MODE mode,
int bw, int bh,
int up_available, int left_available,
int right_available, int plane) {
if (bw == bh) {
build_intra_predictors(xd, src, src_stride, pred_ptr, stride,
mode, intra_size_log2_for_interintra(bw),
up_available, left_available, right_available,
0, 0, plane);
} else if (bw < bh) {
uint8_t *src_bottom = src + bw * src_stride;
uint8_t *pred_ptr_bottom = pred_ptr + bw * stride;
build_intra_predictors(xd, src, src_stride, pred_ptr, stride,
mode, intra_size_log2_for_interintra(bw),
up_available, left_available, right_available,
0, 0, plane);
build_intra_predictors_for_2nd_block_interintra(xd, src_bottom, src_stride,
pred_ptr_bottom, stride,
mode, intra_size_log2_for_interintra(bw),
up_available, left_available, 0, 1,
0, bw, plane);
} else {
uint8_t *src_right = src + bh;
uint8_t *pred_ptr_right = pred_ptr + bh;
build_intra_predictors(xd, src, src_stride, pred_ptr, stride,
mode, intra_size_log2_for_interintra(bh),
up_available, left_available, 1,
0, 0, plane);
build_intra_predictors_for_2nd_block_interintra(xd, src_right, src_stride,
pred_ptr_right, stride,
mode, intra_size_log2_for_interintra(bh),
up_available, left_available, right_available, 0,
bh, 0, plane);
}
}
void vp9_build_interintra_predictors_sby(MACROBLOCKD *xd,
uint8_t *ypred,
int ystride,
BLOCK_SIZE bsize) {
int bw = 4 << b_width_log2(bsize);
int bh = 4 << b_height_log2(bsize);
uint8_t intrapredictor[4096];
build_intra_predictors_for_interintra(
xd, xd->plane[0].dst.buf, xd->plane[0].dst.stride,
intrapredictor, bw,
xd->mi[0]->mbmi.interintra_mode, bw, bh,
xd->up_available, xd->left_available, 0, 0);
combine_interintra(xd->mi[0]->mbmi.interintra_mode,
#if CONFIG_MASKED_INTERINTRA
xd->mi[0]->mbmi.use_masked_interintra,
xd->mi[0]->mbmi.interintra_mask_index,
bsize,
#endif
xd->plane[0].dst.buf, xd->plane[0].dst.stride,
ypred, ystride, intrapredictor, bw, bw, bh);
}
void vp9_build_interintra_predictors_sbuv(MACROBLOCKD *xd,
uint8_t *upred,
uint8_t *vpred,
int ustride, int vstride,
BLOCK_SIZE bsize) {
int bwl = b_width_log2(bsize), bw = 2 << bwl;
int bhl = b_height_log2(bsize), bh = 2 << bhl;
uint8_t uintrapredictor[1024];
uint8_t vintrapredictor[1024];
build_intra_predictors_for_interintra(
xd, xd->plane[1].dst.buf, xd->plane[1].dst.stride,
uintrapredictor, bw,
xd->mi[0]->mbmi.interintra_uv_mode, bw, bh,
xd->up_available, xd->left_available, 0, 1);
build_intra_predictors_for_interintra(
xd, xd->plane[2].dst.buf, xd->plane[2].dst.stride,
vintrapredictor, bw,
xd->mi[0]->mbmi.interintra_uv_mode, bw, bh,
xd->up_available, xd->left_available, 0, 2);
combine_interintra(xd->mi[0]->mbmi.interintra_uv_mode,
#if CONFIG_MASKED_INTERINTRA
xd->mi[0]->mbmi.use_masked_interintra,
xd->mi[0]->mbmi.interintra_uv_mask_index,
bsize,
#endif
xd->plane[1].dst.buf, xd->plane[1].dst.stride,
upred, ustride, uintrapredictor, bw, bw, bh);
combine_interintra(xd->mi[0]->mbmi.interintra_uv_mode,
#if CONFIG_MASKED_INTERINTRA
xd->mi[0]->mbmi.use_masked_interintra,
xd->mi[0]->mbmi.interintra_uv_mask_index,
bsize,
#endif
xd->plane[2].dst.buf, xd->plane[2].dst.stride,
vpred, vstride, vintrapredictor, bw, bw, bh);
}
void vp9_build_interintra_predictors(MACROBLOCKD *xd,
uint8_t *ypred,
uint8_t *upred,
uint8_t *vpred,
int ystride, int ustride, int vstride,
BLOCK_SIZE bsize) {
vp9_build_interintra_predictors_sby(xd, ypred, ystride, bsize);
vp9_build_interintra_predictors_sbuv(xd, upred, vpred,
ustride, vstride, bsize);
}
#endif


@@ -20,9 +20,37 @@ extern "C" {
void vp9_predict_intra_block(const MACROBLOCKD *xd, int block_idx, int bwl_in,
TX_SIZE tx_size, PREDICTION_MODE mode,
#if CONFIG_FILTERINTRA
int filterbit,
#endif
const uint8_t *ref, int ref_stride,
uint8_t *dst, int dst_stride,
int aoff, int loff, int plane);
#if CONFIG_INTERINTRA
void vp9_build_interintra_predictors(MACROBLOCKD *xd,
uint8_t *ypred,
uint8_t *upred,
uint8_t *vpred,
int ystride,
int ustride,
int vstride,
BLOCK_SIZE bsize);
void vp9_build_interintra_predictors_sby(MACROBLOCKD *xd,
uint8_t *ypred,
int ystride,
BLOCK_SIZE bsize);
void vp9_build_interintra_predictors_sbuv(MACROBLOCKD *xd,
uint8_t *upred,
uint8_t *vpred,
int ustride, int vstride,
BLOCK_SIZE bsize);
#if CONFIG_MASKED_INTERINTRA
void vp9_generate_masked_weight_interintra(int mask_index,
BLOCK_SIZE sb_type,
int h, int w,
uint8_t *mask, int stride);
#endif
#endif
#ifdef __cplusplus
} // extern "C"
#endif


@@ -305,15 +305,15 @@ specialize qw/vp9_convolve_avg neon_asm dspr2/, "$sse2_x86inc";
$vp9_convolve_avg_neon_asm=vp9_convolve_avg_neon;
add_proto qw/void vp9_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
specialize qw/vp9_convolve8 sse2 ssse3 neon_asm dspr2/;
$vp9_convolve8_neon_asm=vp9_convolve8_neon;
add_proto qw/void vp9_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
specialize qw/vp9_convolve8_horiz sse2 ssse3 neon_asm dspr2/;
$vp9_convolve8_horiz_neon_asm=vp9_convolve8_horiz_neon;
add_proto qw/void vp9_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
specialize qw/vp9_convolve8_vert sse2 ssse3 neon_asm dspr2/;
$vp9_convolve8_vert_neon_asm=vp9_convolve8_vert_neon;
add_proto qw/void vp9_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
@@ -402,25 +402,25 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
# variance
add_proto qw/unsigned int vp9_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_variance32x16 avx2/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_variance64x32 avx2/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_variance32x32 avx2/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_variance64x64 avx2/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_variance16x16 mmx avx2/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_variance16x8 mmx/, "$sse2_x86inc";
@@ -447,10 +447,10 @@ add_proto qw/unsigned int vp9_variance4x4/, "const uint8_t *src_ptr, int source_
specialize qw/vp9_variance4x4 mmx/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_sub_pixel_variance64x64/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_sub_pixel_avg_variance64x64/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_sub_pixel_variance32x64/, "$sse2_x86inc", "$ssse3_x86inc";
@@ -477,10 +477,10 @@ add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x32/, "const uint8_t *src_
specialize qw/vp9_sub_pixel_avg_variance16x32/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_sub_pixel_variance32x32/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_sub_pixel_avg_variance32x32/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_sub_pixel_variance16x16/, "$sse2_x86inc", "$ssse3_x86inc";
@@ -506,6 +506,125 @@ specialize qw/vp9_sub_pixel_variance8x8/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_sub_pixel_avg_variance8x8/, "$sse2_x86inc", "$ssse3_x86inc";
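# The masked SAD/variance prototypes below currently have C implementations
# only; no SIMD flavours are listed in their specialize lines.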
if ((vpx_config("CONFIG_MASKED_INTERINTER") eq "yes") || ((vpx_config("CONFIG_INTERINTRA") eq "yes") && (vpx_config("CONFIG_MASKED_INTERINTRA") eq "yes"))) {
add_proto qw/unsigned int vp9_masked_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_variance32x16/;
add_proto qw/unsigned int vp9_masked_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_variance16x32/;
add_proto qw/unsigned int vp9_masked_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_variance64x32/;
add_proto qw/unsigned int vp9_masked_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_variance32x64/;
add_proto qw/unsigned int vp9_masked_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_variance32x32/;
add_proto qw/unsigned int vp9_masked_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_variance64x64/;
add_proto qw/unsigned int vp9_masked_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_variance16x16/;
add_proto qw/unsigned int vp9_masked_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_variance16x8/;
add_proto qw/unsigned int vp9_masked_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_variance8x16/;
add_proto qw/unsigned int vp9_masked_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_variance8x8/;
add_proto qw/unsigned int vp9_masked_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_variance8x4/;
add_proto qw/unsigned int vp9_masked_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_variance4x8/;
add_proto qw/unsigned int vp9_masked_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_variance4x4/;
add_proto qw/unsigned int vp9_masked_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_sub_pixel_variance64x64/;
add_proto qw/unsigned int vp9_masked_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_sub_pixel_variance32x64/;
add_proto qw/unsigned int vp9_masked_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_sub_pixel_variance64x32/;
add_proto qw/unsigned int vp9_masked_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_sub_pixel_variance32x16/;
add_proto qw/unsigned int vp9_masked_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_sub_pixel_variance16x32/;
add_proto qw/unsigned int vp9_masked_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_sub_pixel_variance32x32/;
add_proto qw/unsigned int vp9_masked_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_sub_pixel_variance16x16/;
add_proto qw/unsigned int vp9_masked_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_sub_pixel_variance8x16/;
add_proto qw/unsigned int vp9_masked_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_sub_pixel_variance16x8/;
add_proto qw/unsigned int vp9_masked_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_sub_pixel_variance8x8/;
add_proto qw/unsigned int vp9_masked_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_sub_pixel_variance8x4/;
add_proto qw/unsigned int vp9_masked_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_sub_pixel_variance4x8/;
add_proto qw/unsigned int vp9_masked_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_sub_pixel_variance4x4/;
add_proto qw/unsigned int vp9_masked_sad64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
specialize qw/vp9_masked_sad64x64/;
add_proto qw/unsigned int vp9_masked_sad32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
specialize qw/vp9_masked_sad32x64/;
add_proto qw/unsigned int vp9_masked_sad64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
specialize qw/vp9_masked_sad64x32/;
add_proto qw/unsigned int vp9_masked_sad32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
specialize qw/vp9_masked_sad32x16/;
add_proto qw/unsigned int vp9_masked_sad16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
specialize qw/vp9_masked_sad16x32/;
add_proto qw/unsigned int vp9_masked_sad32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
specialize qw/vp9_masked_sad32x32/;
add_proto qw/unsigned int vp9_masked_sad16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
specialize qw/vp9_masked_sad16x16/;
add_proto qw/unsigned int vp9_masked_sad16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
specialize qw/vp9_masked_sad16x8/;
add_proto qw/unsigned int vp9_masked_sad8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
specialize qw/vp9_masked_sad8x16/;
add_proto qw/unsigned int vp9_masked_sad8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
specialize qw/vp9_masked_sad8x8/;
add_proto qw/unsigned int vp9_masked_sad8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
specialize qw/vp9_masked_sad8x4/;
add_proto qw/unsigned int vp9_masked_sad4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
specialize qw/vp9_masked_sad4x8/;
add_proto qw/unsigned int vp9_masked_sad4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
specialize qw/vp9_masked_sad4x4/;
}
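# The masked SAD/variance prototypes above extend the usual src/ref pairs
# with a per-pixel weight plane (mask, mask_stride). As a hedged C reference
# for the SAD flavour (the weight range and rounding below are assumptions,
# not taken from this diff):
#
#   #include <stdlib.h>   /* abs() */
#   static unsigned int masked_sad(const uint8_t *src, int src_stride,
#                                  const uint8_t *ref, int ref_stride,
#                                  const uint8_t *mask, int mask_stride,
#                                  int width, int height) {
#     int r, c;
#     unsigned int sad = 0;
#     for (r = 0; r < height; ++r) {
#       for (c = 0; c < width; ++c)
#         sad += mask[c] * abs(src[c] - ref[c]);  /* weight each |diff| */
#       src += src_stride; ref += ref_stride; mask += mask_stride;
#     }
#     return (sad + 31) >> 6;  /* normalize, assuming weights in [0, 64] */
#   }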
# TODO(jingning): need to convert 8x4/4x8 functions into mmx/sse form
add_proto qw/unsigned int vp9_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_sub_pixel_variance8x4/, "$sse2_x86inc", "$ssse3_x86inc";
@@ -653,7 +772,7 @@ add_proto qw/void vp9_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const
specialize qw/vp9_sad4x4x8 sse4/;
add_proto qw/void vp9_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_sad64x64x4d sse2 avx2/;
specialize qw/vp9_sad64x64x4d sse2/;
add_proto qw/void vp9_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_sad32x64x4d sse2/;
@@ -668,7 +787,7 @@ add_proto qw/void vp9_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, co
specialize qw/vp9_sad16x32x4d sse2/;
add_proto qw/void vp9_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_sad32x32x4d sse2 avx2/;
specialize qw/vp9_sad32x32x4d sse2/;
add_proto qw/void vp9_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_sad16x16x4d sse2/;
@@ -693,7 +812,7 @@ add_proto qw/void vp9_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, cons
specialize qw/vp9_sad4x4x4d sse/;
add_proto qw/unsigned int vp9_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_mse16x16 mmx/, "$sse2_x86inc", "$avx2_x86inc";
specialize qw/vp9_mse16x16 mmx avx2/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_mse8x16/;
@@ -739,19 +858,31 @@ add_proto qw/void vp9_fht8x8/, "const int16_t *input, int16_t *output, int strid
specialize qw/vp9_fht8x8 sse2 avx2/;
add_proto qw/void vp9_fht16x16/, "const int16_t *input, int16_t *output, int stride, int tx_type";
specialize qw/vp9_fht16x16 sse2 avx2/;
specialize qw/vp9_fht16x16 sse2/;
add_proto qw/void vp9_fwht4x4/, "const int16_t *input, int16_t *output, int stride";
specialize qw/vp9_fwht4x4/, "$mmx_x86inc";
add_proto qw/void vp9_fdct4x4_1/, "const int16_t *input, int16_t *output, int stride";
specialize qw/vp9_fdct4x4_1 sse2/;
add_proto qw/void vp9_fdct4x4/, "const int16_t *input, int16_t *output, int stride";
specialize qw/vp9_fdct4x4 sse2 avx2/;
add_proto qw/void vp9_fdct8x8_1/, "const int16_t *input, int16_t *output, int stride";
specialize qw/vp9_fdct8x8_1 sse2/;
add_proto qw/void vp9_fdct8x8/, "const int16_t *input, int16_t *output, int stride";
specialize qw/vp9_fdct8x8 sse2 avx2/, "$ssse3_x86_64";
add_proto qw/void vp9_fdct16x16_1/, "const int16_t *input, int16_t *output, int stride";
specialize qw/vp9_fdct16x16_1 sse2/;
add_proto qw/void vp9_fdct16x16/, "const int16_t *input, int16_t *output, int stride";
specialize qw/vp9_fdct16x16 sse2 avx2/;
specialize qw/vp9_fdct16x16 sse2/;
add_proto qw/void vp9_fdct32x32_1/, "const int16_t *input, int16_t *output, int stride";
specialize qw/vp9_fdct32x32_1 sse2/;
add_proto qw/void vp9_fdct32x32/, "const int16_t *input, int16_t *output, int stride";
specialize qw/vp9_fdct32x32 sse2 avx2/;


@@ -46,8 +46,8 @@ static INLINE int vp9_is_valid_scale(const struct scale_factors *sf) {
}
static INLINE int vp9_is_scaled(const struct scale_factors *sf) {
return sf->x_scale_fp != REF_NO_SCALE ||
sf->y_scale_fp != REF_NO_SCALE;
return vp9_is_valid_scale(sf) &&
(sf->x_scale_fp != REF_NO_SCALE || sf->y_scale_fp != REF_NO_SCALE);
}
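/* Note: the rewritten predicate checks vp9_is_valid_scale() first, so a
   reference whose scale factors are uninitialized or marked invalid is no
   longer misreported as "scaled". */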
#ifdef __cplusplus


@@ -464,7 +464,6 @@ sym(vp9_mbpost_proc_down_mmx):
; unsigned char whiteclamp[16],
; unsigned char bothclamp[16],
; unsigned int width, unsigned int height, int pitch)
extern sym(rand)
global sym(vp9_plane_add_noise_mmx) PRIVATE
sym(vp9_plane_add_noise_mmx):
push rbp
@@ -476,7 +475,7 @@ sym(vp9_plane_add_noise_mmx):
; end prolog
.addnoise_loop:
call sym(rand) WRT_PLT
call sym(LIBVPX_RAND) WRT_PLT
mov rcx, arg(1) ;noise
and rax, 0xff
add rcx, rax
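; Editor's note: LIBVPX_RAND is presumably a build-time macro that resolves
; to the C library rand() (or a project-supplied replacement); routing both
; noise loops through one name lets the duplicated "extern sym(rand)"
; declarations above be dropped. (Interpretation of this diff only.)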


@@ -629,7 +629,6 @@ sym(vp9_mbpost_proc_across_ip_xmm):
; unsigned char whiteclamp[16],
; unsigned char bothclamp[16],
; unsigned int width, unsigned int height, int pitch)
extern sym(rand)
global sym(vp9_plane_add_noise_wmt) PRIVATE
sym(vp9_plane_add_noise_wmt):
push rbp
@@ -641,7 +640,7 @@ sym(vp9_plane_add_noise_wmt):
; end prolog
.addnoise_loop:
call sym(rand) WRT_PLT
call sym(LIBVPX_RAND) WRT_PLT
mov rcx, arg(1) ;noise
and rax, 0xff
add rcx, rax


@@ -254,11 +254,24 @@ static void predict_and_reconstruct_intra_block(int plane, int block,
: mi->mbmi.uv_mode;
int x, y;
uint8_t *dst;
#if CONFIG_FILTERINTRA
int fbit;
if (plane == 0)
if (mi->mbmi.sb_type < BLOCK_8X8)
fbit = mi->b_filter_info[block];
else
fbit = is_filter_enabled(tx_size) ? mi->mbmi.filterbit : 0;
else
fbit = is_filter_enabled(tx_size) ? mi->mbmi.uv_filterbit : 0;
#endif
txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y);
dst = &pd->dst.buf[4 * y * pd->dst.stride + 4 * x];
vp9_predict_intra_block(xd, block >> (tx_size << 1),
b_width_log2(plane_bsize), tx_size, mode,
#if CONFIG_FILTERINTRA
fbit,
#endif
dst, pd->dst.stride, dst, pd->dst.stride,
x, y, plane);
@@ -312,13 +325,6 @@ static MB_MODE_INFO *set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd,
for (x = !y; x < x_mis; ++x)
xd->mi[y * cm->mi_stride + x] = xd->mi[0];
#if CONFIG_TRANSCODE && WRITE_MI_ARRAY
for (y = 0; y < y_mis; ++y)
for (x = !y; x < x_mis; ++x)
vpx_memcpy(&cm->mi[offset + y * cm->mi_stride + x],
&cm->mi[offset], sizeof(MODE_INFO));
#endif
set_skip_context(xd, mi_row, mi_col);
// Distance of Mb to the various image edges. These are specified to 8th pel
@@ -329,6 +335,84 @@ static MB_MODE_INFO *set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd,
return &xd->mi[0]->mbmi;
}
#if CONFIG_SUPERTX
static void set_offsets_extend(VP9_COMMON *const cm, MACROBLOCKD *const xd,
const TileInfo *const tile,
BLOCK_SIZE top_bsize,
int mi_row, int mi_col,
int mi_row_ori, int mi_col_ori) {
const int bw = num_8x8_blocks_wide_lookup[top_bsize];
const int bh = num_8x8_blocks_high_lookup[top_bsize];
const int offset = mi_row * cm->mi_stride + mi_col;
xd->mi = cm->mi_grid_visible + offset;
xd->mi[0] = &cm->mi[offset];
set_mi_row_col(xd, tile, mi_row_ori, bh, mi_col_ori, bw,
cm->mi_rows, cm->mi_cols);
}
static void set_mb_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd,
const TileInfo *const tile,
BLOCK_SIZE bsize, int mi_row, int mi_col) {
const int bw = num_8x8_blocks_wide_lookup[bsize];
const int bh = num_8x8_blocks_high_lookup[bsize];
const int x_mis = MIN(bw, cm->mi_cols - mi_col);
const int y_mis = MIN(bh, cm->mi_rows - mi_row);
const int offset = mi_row * cm->mi_stride + mi_col;
int x, y;
xd->mi = cm->mi_grid_visible + offset;
xd->mi[0] = &cm->mi[offset];
xd->mi[0]->mbmi.sb_type = bsize;
for (y = 0; y < y_mis; ++y)
for (x = !y; x < x_mis; ++x)
xd->mi[y * cm->mi_stride + x] = xd->mi[0];
set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
}
static void set_offsets_topblock(VP9_COMMON *const cm, MACROBLOCKD *const xd,
const TileInfo *const tile,
BLOCK_SIZE bsize, int mi_row, int mi_col) {
const int bw = num_8x8_blocks_wide_lookup[bsize];
const int bh = num_8x8_blocks_high_lookup[bsize];
const int offset = mi_row * cm->mi_stride + mi_col;
xd->mi = cm->mi_grid_visible + offset;
xd->mi[0] = &cm->mi[offset];
set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
vp9_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col);
}
static void set_param_topblock(VP9_COMMON *const cm, MACROBLOCKD *const xd,
BLOCK_SIZE bsize, int mi_row, int mi_col,
#if CONFIG_EXT_TX
int txfm,
#endif
int skip) {
const int bw = num_8x8_blocks_wide_lookup[bsize];
const int bh = num_8x8_blocks_high_lookup[bsize];
const int x_mis = MIN(bw, cm->mi_cols - mi_col);
const int y_mis = MIN(bh, cm->mi_rows - mi_row);
const int offset = mi_row * cm->mi_stride + mi_col;
int x, y;
xd->mi = cm->mi_grid_visible + offset;
xd->mi[0] = &cm->mi[offset];
for (y = 0; y < y_mis; ++y)
for (x = 0; x < x_mis; ++x) {
xd->mi[y * cm->mi_stride + x]->mbmi.skip = skip;
#if CONFIG_EXT_TX
xd->mi[y * cm->mi_stride + x]->mbmi.ext_txfrm = txfm;
#endif
}
}
#endif
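/* A reading of the SUPERTX offset helpers above:
   - set_offsets_extend() points xd->mi at the current sub-block but clamps
     the row/col extents at the original (mi_row_ori, mi_col_ori) position,
     so prediction extension is computed relative to the supertx block.
   - set_mb_offsets() fills the mi grid for a block without reading any
     per-block parameters, since mode info is shared under supertx.
   - set_offsets_topblock() does the same for the top block and also rebinds
     the destination planes for reconstruction.
   - set_param_topblock() propagates the shared skip flag (and ext-tx type)
     to every mi unit the supertx block covers. */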
static void set_ref(VP9_COMMON *const cm, MACROBLOCKD *const xd,
int idx, int mi_row, int mi_col) {
MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
@@ -342,14 +426,246 @@ static void set_ref(VP9_COMMON *const cm, MACROBLOCKD *const xd,
xd->corrupted |= ref_buffer->buf->corrupted;
}
#if CONFIG_SUPERTX
static void dec_predict_b_extend(VP9_COMMON *const cm, MACROBLOCKD *const xd,
const TileInfo *const tile,
int mi_row, int mi_col,
int mi_row_ori, int mi_col_ori,
BLOCK_SIZE top_bsize) {
set_offsets_extend(cm, xd, tile, top_bsize, mi_row, mi_col,
mi_row_ori, mi_col_ori);
set_ref(cm, xd, 0, mi_row_ori, mi_col_ori);
if (has_second_ref(&xd->mi[0]->mbmi))
set_ref(cm, xd, 1, mi_row_ori, mi_col_ori);
xd->mi[0]->mbmi.tx_size = b_width_log2(top_bsize);
#if !CONFIG_MASKED_INTERINTER
vp9_dec_build_inter_predictors_sb(xd, mi_row_ori, mi_col_ori, top_bsize);
#else
vp9_dec_build_inter_predictors_sb_extend(xd, mi_row, mi_col,
mi_row_ori, mi_col_ori, top_bsize);
#endif
}
static void dec_predict_b_sub8x8_extend(VP9_COMMON *const cm,
MACROBLOCKD *const xd,
const TileInfo *const tile,
int mi_row, int mi_col,
int mi_row_ori, int mi_col_ori,
BLOCK_SIZE top_bsize,
PARTITION_TYPE partition) {
set_offsets_extend(cm, xd, tile, top_bsize, mi_row, mi_col,
mi_row_ori, mi_col_ori);
set_ref(cm, xd, 0, mi_row_ori, mi_col_ori);
if (has_second_ref(&xd->mi[0]->mbmi))
set_ref(cm, xd, 1, mi_row_ori, mi_col_ori);
xd->mi[0]->mbmi.tx_size = b_width_log2(top_bsize);
vp9_dec_build_inter_predictors_sby_sub8x8_extend(xd, mi_row, mi_col,
mi_row_ori, mi_col_ori,
top_bsize, partition);
vp9_dec_build_inter_predictors_sbuv_sub8x8_extend(xd,
#if CONFIG_MASKED_INTERINTER
mi_row, mi_col,
#endif
mi_row_ori, mi_col_ori,
top_bsize);
}
static void dec_predict_sb_complex(VP9_COMMON *const cm, MACROBLOCKD *const xd,
const TileInfo *const tile,
int mi_row, int mi_col,
int mi_row_ori, int mi_col_ori,
BLOCK_SIZE bsize, BLOCK_SIZE top_bsize,
uint8_t *dst_buf[3], int dst_stride[3]) {
const int bsl = b_width_log2(bsize), hbs = (1 << bsl) / 4;
PARTITION_TYPE partition;
BLOCK_SIZE subsize;
MB_MODE_INFO *mbmi;
int i, offset = mi_row * cm->mi_stride + mi_col;
DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf1, MAX_MB_PLANE * 32 * 32);
DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf2, MAX_MB_PLANE * 32 * 32);
DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf3, MAX_MB_PLANE * 32 * 32);
uint8_t *dst_buf1[3] = {tmp_buf1, tmp_buf1 + 32 * 32, tmp_buf1 + 2 * 32 * 32};
uint8_t *dst_buf2[3] = {tmp_buf2, tmp_buf2 + 32 * 32, tmp_buf2 + 2 * 32 * 32};
uint8_t *dst_buf3[3] = {tmp_buf3, tmp_buf3 + 32 * 32, tmp_buf3 + 2 * 32 * 32};
int dst_stride1[3] = {32, 32, 32};
int dst_stride2[3] = {32, 32, 32};
int dst_stride3[3] = {32, 32, 32};
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
xd->mi = cm->mi_grid_visible + offset;
xd->mi[0] = &cm->mi[offset];
mbmi = &xd->mi[0]->mbmi;
partition = partition_lookup[bsl][mbmi->sb_type];
subsize = get_subsize(bsize, partition);
for (i = 0; i < MAX_MB_PLANE; i++) {
xd->plane[i].dst.buf = dst_buf[i];
xd->plane[i].dst.stride = dst_stride[i];
}
switch (partition) {
case PARTITION_NONE:
assert(bsize < top_bsize);
dec_predict_b_extend(cm, xd, tile, mi_row, mi_col, mi_row_ori, mi_col_ori,
top_bsize);
break;
case PARTITION_HORZ:
if (bsize > BLOCK_8X8) {
dec_predict_b_extend(cm, xd, tile, mi_row, mi_col, mi_row_ori,
mi_col_ori, top_bsize);
} else {
dec_predict_b_sub8x8_extend(cm, xd, tile, mi_row, mi_col,
mi_row_ori, mi_col_ori,
top_bsize, partition);
}
if (mi_row + hbs < cm->mi_rows && bsize > BLOCK_8X8) {
for (i = 0; i < MAX_MB_PLANE; i++) {
xd->plane[i].dst.buf = tmp_buf1 + i * 32 * 32;
xd->plane[i].dst.stride = 32;
}
dec_predict_b_extend(cm, xd, tile, mi_row + hbs, mi_col,
mi_row_ori, mi_col_ori, top_bsize);
for (i = 0; i < MAX_MB_PLANE; i++) {
xd->plane[i].dst.buf = dst_buf[i];
xd->plane[i].dst.stride = dst_stride[i];
vp9_build_masked_inter_predictor_complex(dst_buf[i], dst_stride[i],
dst_buf1[i], dst_stride1[i],
i,
mi_row, mi_col,
mi_row_ori, mi_col_ori,
bsize, top_bsize,
PARTITION_HORZ);
}
}
break;
case PARTITION_VERT:
if (bsize > BLOCK_8X8) {
dec_predict_b_extend(cm, xd, tile, mi_row, mi_col, mi_row_ori,
mi_col_ori, top_bsize);
} else {
dec_predict_b_sub8x8_extend(cm, xd, tile, mi_row, mi_col,
mi_row_ori, mi_col_ori,
top_bsize, partition);
}
if (mi_col + hbs < cm->mi_cols && bsize > BLOCK_8X8) {
for (i = 0; i < MAX_MB_PLANE; i++) {
xd->plane[i].dst.buf = tmp_buf1 + i * 32 * 32;
xd->plane[i].dst.stride = 32;
}
dec_predict_b_extend(cm, xd, tile, mi_row, mi_col + hbs, mi_row_ori,
mi_col_ori, top_bsize);
for (i = 0; i < MAX_MB_PLANE; i++) {
xd->plane[i].dst.buf = dst_buf[i];
xd->plane[i].dst.stride = dst_stride[i];
vp9_build_masked_inter_predictor_complex(dst_buf[i], dst_stride[i],
dst_buf1[i], dst_stride1[i],
i,
mi_row, mi_col,
mi_row_ori, mi_col_ori,
bsize, top_bsize,
PARTITION_VERT);
}
}
break;
case PARTITION_SPLIT:
if (bsize == BLOCK_8X8) {
dec_predict_b_sub8x8_extend(cm, xd, tile, mi_row, mi_col,
mi_row_ori, mi_col_ori,
top_bsize, partition);
} else {
dec_predict_sb_complex(cm, xd, tile, mi_row, mi_col,
mi_row_ori, mi_col_ori, subsize, top_bsize,
dst_buf, dst_stride);
if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols)
dec_predict_sb_complex(cm, xd, tile, mi_row, mi_col + hbs,
mi_row_ori, mi_col_ori, subsize, top_bsize,
dst_buf1, dst_stride1);
if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols)
dec_predict_sb_complex(cm, xd, tile, mi_row + hbs, mi_col,
mi_row_ori, mi_col_ori, subsize, top_bsize,
dst_buf2, dst_stride2);
if (mi_row + hbs < cm->mi_rows && mi_col + hbs < cm->mi_cols)
dec_predict_sb_complex(cm, xd, tile, mi_row + hbs, mi_col + hbs,
mi_row_ori, mi_col_ori, subsize, top_bsize,
dst_buf3, dst_stride3);
for (i = 0; i < MAX_MB_PLANE; i++) {
if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols) {
vp9_build_masked_inter_predictor_complex(dst_buf[i], dst_stride[i],
dst_buf1[i],
dst_stride1[i],
i, mi_row, mi_col,
mi_row_ori, mi_col_ori,
bsize, top_bsize,
PARTITION_VERT);
if (mi_row + hbs < cm->mi_rows) {
vp9_build_masked_inter_predictor_complex(dst_buf2[i],
dst_stride2[i],
dst_buf3[i],
dst_stride3[i],
i, mi_row, mi_col,
mi_row_ori, mi_col_ori,
bsize, top_bsize,
PARTITION_VERT);
vp9_build_masked_inter_predictor_complex(dst_buf[i],
dst_stride[i],
dst_buf2[i],
dst_stride2[i],
i, mi_row, mi_col,
mi_row_ori, mi_col_ori,
bsize, top_bsize,
PARTITION_HORZ);
}
} else if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols) {
vp9_build_masked_inter_predictor_complex(dst_buf[i],
dst_stride[i],
dst_buf2[i],
dst_stride2[i],
i, mi_row, mi_col,
mi_row_ori, mi_col_ori,
bsize, top_bsize,
PARTITION_HORZ);
}
}
}
break;
default:
assert(0);
}
}
#endif
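/* dec_predict_sb_complex() predicts each sub-partition into the 32x32
   temporary buffers and then feathers neighbouring predictions together
   along the partition edge via vp9_build_masked_inter_predictor_complex().
   A minimal sketch of such a feathered blend across a horizontal edge,
   assuming 6-bit weights (the actual mask shape is not shown in this
   diff): */
#include <stdint.h>
static void blend_horz_edge(uint8_t *top, int top_stride,
                            const uint8_t *bottom, int bottom_stride,
                            int width, int overlap) {
  int r, c;
  for (r = 0; r < overlap; ++r) {
    const int w = ((r + 1) * 64) / (overlap + 1);  /* bottom-block weight */
    for (c = 0; c < width; ++c)
      top[r * top_stride + c] = (uint8_t)(
          (top[r * top_stride + c] * (64 - w) +
           bottom[r * bottom_stride + c] * w + 32) >> 6);
  }
}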
static void decode_block(VP9_COMMON *const cm, MACROBLOCKD *const xd,
const TileInfo *const tile,
#if CONFIG_SUPERTX
int supertx_enabled,
#endif
int mi_row, int mi_col,
vp9_reader *r, BLOCK_SIZE bsize) {
const int less8x8 = bsize < BLOCK_8X8;
#if !CONFIG_SUPERTX
MB_MODE_INFO *mbmi = set_offsets(cm, xd, tile, bsize, mi_row, mi_col);
vp9_read_mode_info(cm, xd, tile, mi_row, mi_col, r);
#else
MB_MODE_INFO *mbmi;
if (!supertx_enabled) {
mbmi = set_offsets(cm, xd, tile, bsize, mi_row, mi_col);
} else {
set_mb_offsets(cm, xd, tile, bsize, mi_row, mi_col);
}
#endif
vp9_read_mode_info(cm, xd, tile,
#if CONFIG_SUPERTX
supertx_enabled,
#endif
mi_row, mi_col, r);
#if CONFIG_SUPERTX
if (!supertx_enabled) {
#endif
if (less8x8)
bsize = BLOCK_8X8;
@@ -383,6 +699,9 @@ static void decode_block(VP9_COMMON *const cm, MACROBLOCKD *const xd,
mbmi->skip = 1; // skip loopfilter
}
}
#if CONFIG_SUPERTX
}
#endif
xd->corrupted |= vp9_reader_has_error(r);
}
@@ -413,38 +732,19 @@ static PARTITION_TYPE read_partition(VP9_COMMON *cm, MACROBLOCKD *xd, int hbs,
static void decode_partition(VP9_COMMON *const cm, MACROBLOCKD *const xd,
const TileInfo *const tile,
#if CONFIG_SUPERTX
int read_token, int supertx_enabled,
#endif
int mi_row, int mi_col,
vp9_reader* r, BLOCK_SIZE bsize) {
const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
PARTITION_TYPE partition;
BLOCK_SIZE subsize;
#if CONFIG_TRANSCODE && READ_MI_ARRAY
// This is for test purposes only. It verifies that the external file
// contains the right mode_info array.
if (bsize == BLOCK_64X64) {
MODE_INFO mi_array[64];
FILE *pf = cm->mi_array_pf;
if (pf) {
int i, j;
for (j = 0; j < MI_BLOCK_SIZE; ++j)
for (i = 0; i < MI_BLOCK_SIZE; ++i)
fread(&mi_array[j * 8 + i], 1, sizeof(MODE_INFO), pf);
}
if (pf && mi_row == 0 && mi_col == 8) {
int i, j;
for (j = 0; j < MI_BLOCK_SIZE; ++j) {
for (i = 0; i < MI_BLOCK_SIZE; ++i) {
MB_MODE_INFO *mbmi = &mi_array[j * 8 + i].mbmi;
b_mode_info *bmi = mi_array[j * 8 + i].bmi;
fprintf(stderr, "pos (%d, %d), bsize %d, mode %d\n",
mi_row + j , mi_col + i, mbmi->sb_type, bmi[0].as_mode);
}
}
fprintf(stderr, "\n");
}
}
#if CONFIG_SUPERTX
int skip = 0;
#if CONFIG_EXT_TX
int txfm = 0;
#endif
#endif
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
@@ -452,54 +752,145 @@ static void decode_partition(VP9_COMMON *const cm, MACROBLOCKD *const xd,
partition = read_partition(cm, xd, hbs, mi_row, mi_col, bsize, r);
subsize = get_subsize(bsize, partition);
#if CONFIG_SUPERTX
if (cm->frame_type != KEY_FRAME &&
partition != PARTITION_NONE &&
bsize <= BLOCK_32X32 &&
!supertx_enabled) {
TX_SIZE supertx_size = b_width_log2(bsize);
if (partition == PARTITION_SPLIT) {
supertx_enabled = vp9_read(r, cm->fc.supertxsplit_prob[supertx_size]);
cm->counts.supertxsplit[supertx_size][supertx_enabled]++;
} else {
supertx_enabled = vp9_read(r, cm->fc.supertx_prob[supertx_size]);
cm->counts.supertx[supertx_size][supertx_enabled]++;
}
}
if (supertx_enabled && read_token) {
int offset = mi_row * cm->mi_stride + mi_col;
xd->mi = cm->mi_grid_visible + offset;
xd->mi[0] = &cm->mi[offset];
set_mi_row_col(xd, tile, mi_row, num_8x8_blocks_high_lookup[bsize],
mi_col, num_8x8_blocks_wide_lookup[bsize],
cm->mi_rows, cm->mi_cols);
set_skip_context(xd, mi_row, mi_col);
// Here we assume mbmi->segment_id = 0
skip = read_skip(cm, xd, 0, r);
if (skip)
reset_skip_context(xd, bsize);
#if CONFIG_EXT_TX
if (bsize <= BLOCK_16X16 && !skip) {
txfm = vp9_read(r, cm->fc.ext_tx_prob);
if (!cm->frame_parallel_decoding_mode)
++cm->counts.ext_tx[txfm];
}
#endif
}
#endif
if (subsize < BLOCK_8X8) {
decode_block(cm, xd, tile, mi_row, mi_col, r, subsize);
decode_block(cm, xd, tile,
#if CONFIG_SUPERTX
supertx_enabled,
#endif
mi_row, mi_col, r, subsize);
} else {
switch (partition) {
case PARTITION_NONE:
decode_block(cm, xd, tile, mi_row, mi_col, r, subsize);
decode_block(cm, xd, tile,
#if CONFIG_SUPERTX
supertx_enabled,
#endif
mi_row, mi_col, r, subsize);
break;
case PARTITION_HORZ:
decode_block(cm, xd, tile, mi_row, mi_col, r, subsize);
decode_block(cm, xd, tile,
#if CONFIG_SUPERTX
supertx_enabled,
#endif
mi_row, mi_col, r, subsize);
if (mi_row + hbs < cm->mi_rows)
decode_block(cm, xd, tile, mi_row + hbs, mi_col, r, subsize);
decode_block(cm, xd, tile,
#if CONFIG_SUPERTX
supertx_enabled,
#endif
mi_row + hbs, mi_col, r, subsize);
break;
case PARTITION_VERT:
decode_block(cm, xd, tile, mi_row, mi_col, r, subsize);
decode_block(cm, xd, tile,
#if CONFIG_SUPERTX
supertx_enabled,
#endif
mi_row, mi_col, r, subsize);
if (mi_col + hbs < cm->mi_cols)
decode_block(cm, xd, tile, mi_row, mi_col + hbs, r, subsize);
decode_block(cm, xd, tile,
#if CONFIG_SUPERTX
supertx_enabled,
#endif
mi_row, mi_col + hbs, r, subsize);
break;
case PARTITION_SPLIT:
decode_partition(cm, xd, tile, mi_row, mi_col, r, subsize);
decode_partition(cm, xd, tile, mi_row, mi_col + hbs, r, subsize);
decode_partition(cm, xd, tile, mi_row + hbs, mi_col, r, subsize);
decode_partition(cm, xd, tile, mi_row + hbs, mi_col + hbs, r, subsize);
decode_partition(cm, xd, tile,
#if CONFIG_SUPERTX
!supertx_enabled, supertx_enabled,
#endif
mi_row, mi_col, r, subsize);
decode_partition(cm, xd, tile,
#if CONFIG_SUPERTX
!supertx_enabled, supertx_enabled,
#endif
mi_row, mi_col + hbs, r, subsize);
decode_partition(cm, xd, tile,
#if CONFIG_SUPERTX
!supertx_enabled, supertx_enabled,
#endif
mi_row + hbs, mi_col, r, subsize);
decode_partition(cm, xd, tile,
#if CONFIG_SUPERTX
!supertx_enabled, supertx_enabled,
#endif
mi_row + hbs, mi_col + hbs, r, subsize);
break;
default:
assert(0 && "Invalid partition type");
}
}
#if CONFIG_SUPERTX
if (supertx_enabled && read_token) {
uint8_t *dst_buf[3];
int dst_stride[3], i;
vp9_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col);
for (i = 0; i < MAX_MB_PLANE; i++) {
dst_buf[i] = xd->plane[i].dst.buf;
dst_stride[i] = xd->plane[i].dst.stride;
}
dec_predict_sb_complex(cm, xd, tile, mi_row, mi_col, mi_row, mi_col,
bsize, bsize, dst_buf, dst_stride);
if (!skip) {
int eobtotal = 0;
struct inter_args arg = { cm, xd, r, &eobtotal };
set_offsets_topblock(cm, xd, tile, bsize, mi_row, mi_col);
#if CONFIG_EXT_TX
xd->mi[0]->mbmi.ext_txfrm = txfm;
#endif
vp9_foreach_transformed_block(xd, bsize, reconstruct_inter_block, &arg);
if (!(subsize < BLOCK_8X8) && eobtotal == 0)
skip = 1;
}
set_param_topblock(cm, xd, bsize, mi_row, mi_col,
#if CONFIG_EXT_TX
txfm,
#endif
skip);
}
#endif
// update partition context
if (bsize >= BLOCK_8X8 &&
(bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
update_partition_context(xd, mi_row, mi_col, subsize, bsize);
#if CONFIG_TRANSCODE && WRITE_MI_ARRAY
if (bsize == BLOCK_64X64) {
FILE *pf = cm->mi_array_pf;
if (pf) {
int i, j;
int offset = mi_row * cm->mi_stride + mi_col;
for (j = 0; j < MI_BLOCK_SIZE; ++j)
for (i = 0; i < MI_BLOCK_SIZE; ++i)
fwrite(&cm->mi[offset + j * cm->mi_stride + i],
1, sizeof(MODE_INFO), pf);
} else {
assert(0);
}
}
#endif
}
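/* How the two new decode_partition() arguments thread through the recursion
   (a reading of this diff): the tile loops start every 64x64 block with
   read_token = 1 and supertx_enabled = 0; PARTITION_SPLIT recurses with
   (!supertx_enabled, supertx_enabled), so read_token is 1 exactly at the
   level where supertx was first signaled. That level reads the shared skip
   flag (and ext-tx type), runs dec_predict_sb_complex() over the whole
   block, and reconstructs a single joint residual with
   vp9_foreach_transformed_block(); deeper recursive calls only parse
   per-block mode and motion information. */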
static void setup_token_decoder(const uint8_t *data,
@@ -736,6 +1127,10 @@ static void setup_tile_info(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
while (max_ones-- && vp9_rb_read_bit(rb))
cm->log2_tile_cols++;
if (cm->log2_tile_cols > 6)
vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
"Invalid number of tile columns");
// rows
cm->log2_tile_rows = vp9_rb_read_bit(rb);
if (cm->log2_tile_rows)
@@ -889,7 +1284,11 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
vp9_zero(tile_data->xd.left_seg_context);
for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end;
mi_col += MI_BLOCK_SIZE) {
decode_partition(tile_data->cm, &tile_data->xd, &tile, mi_row, mi_col,
decode_partition(tile_data->cm, &tile_data->xd, &tile,
#if CONFIG_SUPERTX
1, 0,
#endif
mi_row, mi_col,
&tile_data->bit_reader, BLOCK_64X64);
}
}
@@ -943,6 +1342,9 @@ static int tile_worker_hook(void *arg1, void *arg2) {
for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
mi_col += MI_BLOCK_SIZE) {
decode_partition(tile_data->cm, &tile_data->xd, tile,
#if CONFIG_SUPERTX
1, 0,
#endif
mi_row, mi_col, &tile_data->bit_reader, BLOCK_64X64);
}
}
@@ -1128,7 +1530,7 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
// Show an existing frame directly.
const int frame_to_show = cm->ref_frame_map[vp9_rb_read_literal(rb, 3)];
if (cm->frame_bufs[frame_to_show].ref_count < 1)
if (frame_to_show < 0 || cm->frame_bufs[frame_to_show].ref_count < 1)
vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
"Buffer %d does not contain a decoded frame",
frame_to_show);
@@ -1293,6 +1695,62 @@ static int read_compressed_header(VP9Decoder *pbi, const uint8_t *data,
vp9_diff_update_prob(&r, &fc->partition_prob[j][i]);
read_mv_probs(nmvc, cm->allow_high_precision_mv, &r);
#if CONFIG_EXT_TX
vp9_diff_update_prob(&r, &fc->ext_tx_prob);
#endif
#if CONFIG_MASKED_INTERINTER
if (cm->reference_mode != SINGLE_REFERENCE) {
cm->use_masked_interinter = vp9_read_bit(&r);
if (cm->use_masked_interinter) {
for (i = 0; i < BLOCK_SIZES; i++) {
if (get_mask_bits(i))
vp9_diff_update_prob(&r, &fc->masked_interinter_prob[i]);
}
}
} else {
cm->use_masked_interinter = 0;
}
#endif
#if CONFIG_INTERINTRA
if (cm->reference_mode != COMPOUND_REFERENCE) {
cm->use_interintra = vp9_read_bit(&r);
if (cm->use_interintra) {
for (i = 0; i < BLOCK_SIZES; i++) {
if (is_interintra_allowed(i)) {
vp9_diff_update_prob(&r, &fc->interintra_prob[i]);
}
}
#if CONFIG_MASKED_INTERINTRA
cm->use_masked_interintra = vp9_read_bit(&r);
if (cm->use_masked_interintra) {
for (i = 0; i < BLOCK_SIZES; i++) {
if (is_interintra_allowed(i) && get_mask_bits_interintra(i))
vp9_diff_update_prob(&r, &fc->masked_interintra_prob[i]);
}
}
} else {
cm->use_masked_interintra = 0;
#endif
}
} else {
cm->use_interintra = 0;
#if CONFIG_MASKED_INTERINTRA
cm->use_masked_interintra = 0;
#endif
}
#endif
#if CONFIG_COPY_CODING
for (j = 0; j < COPY_MODE_CONTEXTS; j++) {
for (i = 0; i < 1; i++)
vp9_diff_update_prob(&r, &fc->copy_mode_probs_l2[j][i]);
for (i = 0; i < 2; i++)
vp9_diff_update_prob(&r, &fc->copy_mode_probs[j][i]);
}
#endif
}
return vp9_reader_has_error(&r);
@@ -1344,6 +1802,10 @@ static void debug_check_frame_counts(const VP9_COMMON *const cm) {
assert(!memcmp(&cm->counts.tx, &zero_counts.tx, sizeof(cm->counts.tx)));
assert(!memcmp(cm->counts.skip, zero_counts.skip, sizeof(cm->counts.skip)));
assert(!memcmp(&cm->counts.mv, &zero_counts.mv, sizeof(cm->counts.mv)));
#if CONFIG_EXT_TX
assert(!memcmp(cm->counts.ext_tx, zero_counts.ext_tx,
sizeof(cm->counts.ext_tx)));
#endif
}
#endif // NDEBUG


@@ -54,6 +54,36 @@ static PREDICTION_MODE read_inter_mode(VP9_COMMON *cm, vp9_reader *r, int ctx) {
return NEARESTMV + mode;
}
#if CONFIG_COPY_CODING
static COPY_MODE read_copy_mode(VP9_COMMON *cm, vp9_reader *r,
int num_candidate, int ctx) {
COPY_MODE mode;
switch (num_candidate) {
case 0:
assert(0);
break;
case 1:
mode = REF0;
break;
case 2:
mode = REF0 + vp9_read_tree(r, vp9_copy_mode_tree_l2,
cm->fc.copy_mode_probs_l2[ctx]);
if (!cm->frame_parallel_decoding_mode)
++cm->counts.copy_mode_l2[ctx][mode - REF0];
break;
default:
mode = REF0 + vp9_read_tree(r, vp9_copy_mode_tree,
cm->fc.copy_mode_probs[ctx]);
if (!cm->frame_parallel_decoding_mode)
++cm->counts.copy_mode[ctx][mode - REF0];
break;
}
return mode;
}
#endif
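/* read_copy_mode() spends bits only where there is a real choice: a single
   candidate makes REF0 implicit (case 1 reads nothing), two candidates use
   the cheaper binary tree (copy_mode_probs_l2), and three or more use the
   full tree. The encoder-side write_copy_mode() later in this change
   mirrors the same three cases. */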
static int read_segment_id(vp9_reader *r, const struct segmentation *seg) {
return vp9_read_tree(r, vp9_segment_tree, seg->tree_probs);
}
@@ -144,7 +174,11 @@ static int read_inter_segment_id(VP9_COMMON *const cm, MACROBLOCKD *const xd,
return segment_id;
}
#if CONFIG_SUPERTX
int read_skip(VP9_COMMON *cm, const MACROBLOCKD *xd,
#else
static int read_skip(VP9_COMMON *cm, const MACROBLOCKD *xd,
#endif
int segment_id, vp9_reader *r) {
if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
return 1;
@@ -175,29 +209,85 @@ static void read_intra_frame_mode_info(VP9_COMMON *const cm,
switch (bsize) {
case BLOCK_4X4:
#if !CONFIG_FILTERINTRA
for (i = 0; i < 4; ++i)
#else
for (i = 0; i < 4; ++i) {
#endif
mi->bmi[i].as_mode =
read_intra_mode(r, get_y_mode_probs(mi, above_mi, left_mi, i));
#if CONFIG_FILTERINTRA
if (is_filter_allowed(mi->bmi[i].as_mode))
mi->b_filter_info[i] =
vp9_read(r, cm->fc.filterintra_prob[0][mi->bmi[i].as_mode]);
else
mi->b_filter_info[i] = 0;
}
mbmi->filterbit = mi->b_filter_info[3];
#endif
mbmi->mode = mi->bmi[3].as_mode;
break;
case BLOCK_4X8:
mi->bmi[0].as_mode = mi->bmi[2].as_mode =
read_intra_mode(r, get_y_mode_probs(mi, above_mi, left_mi, 0));
#if CONFIG_FILTERINTRA
if (is_filter_allowed(mi->bmi[0].as_mode))
mi->b_filter_info[0] = mi->b_filter_info[2] =
vp9_read(r, cm->fc.filterintra_prob[0][mi->bmi[0].as_mode]);
else
mi->b_filter_info[0] = mi->b_filter_info[2] = 0;
#endif
mi->bmi[1].as_mode = mi->bmi[3].as_mode = mbmi->mode =
read_intra_mode(r, get_y_mode_probs(mi, above_mi, left_mi, 1));
#if CONFIG_FILTERINTRA
if (is_filter_allowed(mi->bmi[1].as_mode))
mi->b_filter_info[1] = mi->b_filter_info[3] = mbmi->filterbit =
vp9_read(r, cm->fc.filterintra_prob[0][mi->bmi[1].as_mode]);
else
mi->b_filter_info[1] = mi->b_filter_info[3] = mbmi->filterbit = 0;
#endif
break;
case BLOCK_8X4:
mi->bmi[0].as_mode = mi->bmi[1].as_mode =
read_intra_mode(r, get_y_mode_probs(mi, above_mi, left_mi, 0));
#if CONFIG_FILTERINTRA
if (is_filter_allowed(mi->bmi[0].as_mode))
mi->b_filter_info[0] = mi->b_filter_info[1] =
vp9_read(r, cm->fc.filterintra_prob[0][mi->bmi[0].as_mode]);
else
mi->b_filter_info[0] = mi->b_filter_info[1] = 0;
#endif
mi->bmi[2].as_mode = mi->bmi[3].as_mode = mbmi->mode =
read_intra_mode(r, get_y_mode_probs(mi, above_mi, left_mi, 2));
#if CONFIG_FILTERINTRA
if (is_filter_allowed(mi->bmi[2].as_mode))
mi->b_filter_info[2] = mi->b_filter_info[3] = mbmi->filterbit =
vp9_read(r, cm->fc.filterintra_prob[0][mi->bmi[2].as_mode]);
else
mi->b_filter_info[2] = mi->b_filter_info[3] = mbmi->filterbit = 0;
#endif
break;
default:
mbmi->mode = read_intra_mode(r,
get_y_mode_probs(mi, above_mi, left_mi, 0));
#if CONFIG_FILTERINTRA
if (is_filter_enabled(mbmi->tx_size) && is_filter_allowed(mbmi->mode))
mbmi->filterbit = vp9_read(r,
cm->fc.filterintra_prob[mbmi->tx_size][mbmi->mode]);
else
mbmi->filterbit = 0;
#endif
}
mbmi->uv_mode = read_intra_mode(r, vp9_kf_uv_mode_prob[mbmi->mode]);
#if CONFIG_FILTERINTRA
if (is_filter_enabled(get_uv_tx_size(mbmi)) &&
is_filter_allowed(mbmi->uv_mode))
mbmi->uv_filterbit = vp9_read(r,
cm->fc.filterintra_prob[get_uv_tx_size(mbmi)][mbmi->uv_mode]);
else
mbmi->uv_filterbit = 0;
#endif
}
static int read_mv_component(vp9_reader *r,
@@ -335,25 +425,97 @@ static void read_intra_block_mode_info(VP9_COMMON *const cm, MODE_INFO *mi,
switch (bsize) {
case BLOCK_4X4:
#if !CONFIG_FILTERINTRA
for (i = 0; i < 4; ++i)
#else
for (i = 0; i < 4; ++i) {
#endif
mi->bmi[i].as_mode = read_intra_mode_y(cm, r, 0);
#if CONFIG_FILTERINTRA
if (is_filter_allowed(mi->bmi[i].as_mode)) {
mi->b_filter_info[i] =
vp9_read(r, cm->fc.filterintra_prob[0][mi->bmi[i].as_mode]);
cm->counts.filterintra[0][mi->bmi[i].as_mode]
[mi->b_filter_info[i]]++;
} else {
mi->b_filter_info[i] = 0;
}
}
mbmi->filterbit = mi->b_filter_info[3];
#endif
mbmi->mode = mi->bmi[3].as_mode;
break;
case BLOCK_4X8:
mi->bmi[0].as_mode = mi->bmi[2].as_mode = read_intra_mode_y(cm, r, 0);
#if CONFIG_FILTERINTRA
if (is_filter_allowed(mi->bmi[0].as_mode)) {
mi->b_filter_info[0] = mi->b_filter_info[2] =
vp9_read(r, cm->fc.filterintra_prob[0][mi->bmi[0].as_mode]);
cm->counts.filterintra[0][mi->bmi[0].as_mode][mi->b_filter_info[0]]++;
} else {
mi->b_filter_info[0] = mi->b_filter_info[2] = 0;
}
#endif
mi->bmi[1].as_mode = mi->bmi[3].as_mode = mbmi->mode =
read_intra_mode_y(cm, r, 0);
#if CONFIG_FILTERINTRA
if (is_filter_allowed(mi->bmi[1].as_mode)) {
mi->b_filter_info[1] = mi->b_filter_info[3] = mbmi->filterbit =
vp9_read(r, cm->fc.filterintra_prob[0][mi->bmi[1].as_mode]);
cm->counts.filterintra[0][mi->bmi[1].as_mode][mi->b_filter_info[1]]++;
} else {
mi->b_filter_info[1] = mi->b_filter_info[3] = mbmi->filterbit = 0;
}
#endif
break;
case BLOCK_8X4:
mi->bmi[0].as_mode = mi->bmi[1].as_mode = read_intra_mode_y(cm, r, 0);
#if CONFIG_FILTERINTRA
if (is_filter_allowed(mi->bmi[0].as_mode)) {
mi->b_filter_info[0] = mi->b_filter_info[1] =
vp9_read(r, cm->fc.filterintra_prob[0][mi->bmi[0].as_mode]);
cm->counts.filterintra[0][mi->bmi[0].as_mode][mi->b_filter_info[0]]++;
} else {
mi->b_filter_info[0] = mi->b_filter_info[1] = 0;
}
#endif
mi->bmi[2].as_mode = mi->bmi[3].as_mode = mbmi->mode =
read_intra_mode_y(cm, r, 0);
#if CONFIG_FILTERINTRA
if (is_filter_allowed(mi->bmi[2].as_mode)) {
mi->b_filter_info[2] = mi->b_filter_info[3] = mbmi->filterbit =
vp9_read(r, cm->fc.filterintra_prob[0][mi->bmi[2].as_mode]);
cm->counts.filterintra[0][mi->bmi[2].as_mode][mi->b_filter_info[2]]++;
} else {
mi->b_filter_info[2] = mi->b_filter_info[3] = mbmi->filterbit = 0;
}
#endif
break;
default:
mbmi->mode = read_intra_mode_y(cm, r, size_group_lookup[bsize]);
#if CONFIG_FILTERINTRA
if (is_filter_allowed(mbmi->mode) && is_filter_enabled(mbmi->tx_size)) {
mbmi->filterbit = vp9_read(r,
cm->fc.filterintra_prob[mbmi->tx_size][mbmi->mode]);
cm->counts.filterintra[mbmi->tx_size][mbmi->mode][mbmi->filterbit]++;
} else {
mbmi->filterbit = 0;
}
#endif
}
mbmi->uv_mode = read_intra_mode_uv(cm, r, mbmi->mode);
#if CONFIG_FILTERINTRA
if (is_filter_allowed(mbmi->uv_mode) &&
is_filter_enabled(get_uv_tx_size(mbmi))) {
mbmi->uv_filterbit = vp9_read(r,
cm->fc.filterintra_prob[get_uv_tx_size(mbmi)][mbmi->uv_mode]);
cm->counts.filterintra[get_uv_tx_size(mbmi)]
[mbmi->uv_mode][mbmi->uv_filterbit]++;
} else {
mbmi->uv_filterbit = 0;
}
#endif
}
static INLINE int is_mv_valid(const MV *mv) {
@@ -422,6 +584,9 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm,
MACROBLOCKD *const xd,
const TileInfo *const tile,
MODE_INFO *const mi,
#if CONFIG_SUPERTX && CONFIG_EXT_TX
int supertx_enabled,
#endif
int mi_row, int mi_col, vp9_reader *r) {
MB_MODE_INFO *const mbmi = &mi->mbmi;
const BLOCK_SIZE bsize = mbmi->sb_type;
@@ -464,6 +629,37 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm,
? read_switchable_interp_filter(cm, xd, r)
: cm->interp_filter;
#if CONFIG_INTERINTRA
if ((cm->use_interintra) &&
is_interintra_allowed(bsize) &&
is_inter_mode(mbmi->mode) &&
(mbmi->ref_frame[1] <= INTRA_FRAME)) {
mbmi->ref_frame[1] = vp9_read(r, cm->fc.interintra_prob[bsize]) ?
INTRA_FRAME : NONE;
cm->counts.interintra[bsize][mbmi->ref_frame[1] == INTRA_FRAME]++;
#if CONFIG_MASKED_INTERINTRA
mbmi->use_masked_interintra = 0;
#endif
if (mbmi->ref_frame[1] == INTRA_FRAME) {
mbmi->interintra_mode =
read_intra_mode_y(cm, r, size_group_lookup[bsize]);
mbmi->interintra_uv_mode = mbmi->interintra_mode;
#if CONFIG_MASKED_INTERINTRA
if (cm->use_masked_interintra && get_mask_bits_interintra(bsize)) {
mbmi->use_masked_interintra = vp9_read(r,
cm->fc.masked_interintra_prob[bsize]);
cm->counts.masked_interintra[bsize][mbmi->use_masked_interintra]++;
if (mbmi->use_masked_interintra) {
mbmi->interintra_mask_index = vp9_read_literal(r,
get_mask_bits_interintra(bsize));
mbmi->interintra_uv_mask_index = mbmi->interintra_mask_index;
}
}
#endif
}
}
#endif
if (bsize < BLOCK_8X8) {
const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; // 1 or 2
const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; // 1 or 2
@@ -497,10 +693,6 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm,
mi->bmi[j + 2] = mi->bmi[j];
if (num_4x4_w == 2)
mi->bmi[j + 1] = mi->bmi[j];
#if CONFIG_TRANSCODE
mi->bmi[j].as_mode = b_mode;
#endif
}
}
@@ -512,35 +704,160 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm,
xd->corrupted |= !assign_mv(cm, mbmi->mode, mbmi->mv, nearestmv,
nearestmv, nearmv, is_compound, allow_hp, r);
}
#if CONFIG_MASKED_INTERINTER
mbmi->use_masked_interinter = 0;
if (cm->use_masked_interinter &&
cm->reference_mode != SINGLE_REFERENCE &&
is_inter_mode(mbmi->mode) &&
get_mask_bits(bsize) &&
mbmi->ref_frame[1] > INTRA_FRAME) {
mbmi->use_masked_interinter =
vp9_read(r, cm->fc.masked_interinter_prob[bsize]);
cm->counts.masked_interinter[bsize][mbmi->use_masked_interinter]++;
if (mbmi->use_masked_interinter) {
mbmi->mask_index = vp9_read_literal(r, get_mask_bits(bsize));
}
}
#endif
}
static void read_inter_frame_mode_info(VP9_COMMON *const cm,
MACROBLOCKD *const xd,
const TileInfo *const tile,
#if CONFIG_SUPERTX
int supertx_enabled,
#endif
int mi_row, int mi_col, vp9_reader *r) {
MODE_INFO *const mi = xd->mi[0];
MB_MODE_INFO *const mbmi = &mi->mbmi;
int inter_block;
#if CONFIG_COPY_CODING
int num_candidate = 0;
MB_MODE_INFO *inter_ref_list[18] = {NULL};
#endif
mbmi->mv[0].as_int = 0;
mbmi->mv[1].as_int = 0;
#if CONFIG_COPY_CODING
if (mbmi->sb_type >= BLOCK_8X8)
num_candidate = vp9_construct_ref_inter_list(cm, xd, mbmi->sb_type,
mi_row, mi_col, inter_ref_list);
if (mbmi->sb_type >= BLOCK_8X8 && num_candidate > 0) {
int ctx = vp9_get_copy_mode_context(xd);
int is_copy = vp9_read(r, cm->fc.copy_noref_prob[ctx][mbmi->sb_type]);
++cm->counts.copy_noref[ctx][mbmi->sb_type][is_copy];
if (!is_copy) {
mbmi->copy_mode = NOREF;
} else {
mbmi->copy_mode = read_copy_mode(cm, r, num_candidate, ctx);
}
} else {
mbmi->copy_mode = NOREF;
}
if (mbmi->copy_mode != NOREF) {
BLOCK_SIZE bsize_backup = mbmi->sb_type;
int skip_backup = mbmi->skip;
COPY_MODE copy_mode_backup = mbmi->copy_mode;
#if CONFIG_SUPERTX
TX_SIZE tx_size_backup = mbmi->tx_size;
#endif
#if CONFIG_EXT_TX
EXT_TX_TYPE ext_txfrm_backup = mbmi->ext_txfrm;
#endif
inter_block = 1;
*mbmi = *inter_ref_list[mbmi->copy_mode - REF0];
#if CONFIG_MASKED_INTERINTER
mbmi->use_masked_interinter = 0;
#endif
#if CONFIG_INTERINTRA
if (mbmi->ref_frame[1] == INTRA_FRAME)
mbmi->ref_frame[1] = NONE;
#endif
#if CONFIG_SUPERTX
mbmi->tx_size = tx_size_backup;
#endif
#if CONFIG_EXT_TX
mbmi->ext_txfrm = ext_txfrm_backup;
#endif
mbmi->sb_type = bsize_backup;
mbmi->mode = NEARESTMV;
mbmi->skip = skip_backup;
mbmi->copy_mode = copy_mode_backup;
}
#endif
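/* Note the borrow-and-restore pattern above: the whole-struct copy
   (*mbmi = *inter_ref_list[copy_mode - REF0]) inherits the neighbour's
   motion vectors, reference frames and interpolation filter, while sb_type,
   skip, tx_size, ext_txfrm and copy_mode are saved beforehand and restored
   afterwards, so only the motion parameters are actually borrowed. */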
#if CONFIG_SUPERTX
if (!supertx_enabled) {
#endif
mbmi->segment_id = read_inter_segment_id(cm, xd, mi_row, mi_col, r);
mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r);
#if CONFIG_COPY_CODING
if (mbmi->copy_mode == NOREF)
#endif
inter_block = read_is_inter_block(cm, xd, mbmi->segment_id, r);
mbmi->tx_size = read_tx_size(cm, xd, cm->tx_mode, mbmi->sb_type,
!mbmi->skip || !inter_block, r);
#if CONFIG_EXT_TX
if (inter_block &&
mbmi->tx_size <= TX_16X16 &&
mbmi->sb_type >= BLOCK_8X8 &&
#if CONFIG_SUPERTX
!supertx_enabled &&
#endif
!vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP) &&
!mbmi->skip) {
mbmi->ext_txfrm = vp9_read(r, cm->fc.ext_tx_prob);
if (!cm->frame_parallel_decoding_mode)
++cm->counts.ext_tx[mbmi->ext_txfrm];
} else {
mbmi->ext_txfrm = NORM;
}
#endif
#if CONFIG_SUPERTX
} else {
const int ctx = vp9_get_intra_inter_context(xd);
mbmi->segment_id = 0;
inter_block = 1;
if (!cm->frame_parallel_decoding_mode)
#if CONFIG_COPY_CODING
if (mbmi->copy_mode == NOREF)
#endif
++cm->counts.intra_inter[ctx][1];
}
#endif
#if CONFIG_COPY_CODING
if (mbmi->copy_mode == NOREF) {
#endif
if (inter_block)
read_inter_block_mode_info(cm, xd, tile, mi, mi_row, mi_col, r);
read_inter_block_mode_info(cm, xd, tile, mi,
#if CONFIG_SUPERTX && CONFIG_EXT_TX
supertx_enabled,
#endif
mi_row, mi_col, r);
else
read_intra_block_mode_info(cm, mi, r);
#if CONFIG_COPY_CODING
}
#endif
}
void vp9_read_mode_info(VP9_COMMON *cm, MACROBLOCKD *xd,
const TileInfo *const tile,
#if CONFIG_SUPERTX
int supertx_enabled,
#endif
int mi_row, int mi_col, vp9_reader *r) {
if (frame_is_intra_only(cm))
read_intra_frame_mode_info(cm, xd, mi_row, mi_col, r);
else
read_inter_frame_mode_info(cm, xd, tile, mi_row, mi_col, r);
read_inter_frame_mode_info(cm, xd, tile,
#if CONFIG_SUPERTX
supertx_enabled,
#endif
mi_row, mi_col, r);
}


@@ -21,8 +21,16 @@ struct TileInfo;
void vp9_read_mode_info(VP9_COMMON *cm, MACROBLOCKD *xd,
const struct TileInfo *const tile,
#if CONFIG_SUPERTX
int supertx_enabled,
#endif
int mi_row, int mi_col, vp9_reader *r);
#if CONFIG_SUPERTX
int read_skip(VP9_COMMON *cm, const MACROBLOCKD *xd,
int segment_id, vp9_reader *r);
#endif
#ifdef __cplusplus
} // extern "C"
#endif


@@ -32,14 +32,11 @@
#include "vp9/decoder/vp9_detokenize.h"
#include "vp9/decoder/vp9_dthread.h"
#include <stdio.h>
static void initialize_dec() {
static int init_done = 0;
if (!init_done) {
vp9_init_neighbors();
vp9_init_quant_tables();
init_done = 1;
}
}
@@ -81,9 +78,6 @@ VP9Decoder *vp9_decoder_create() {
vp9_worker_init(&pbi->lf_worker);
#if CONFIG_TRANSCODE && WRITE_MI_ARRAY
cm->mi_array_pf = fopen("mode_info_array_2.bin", "rb");
#endif
return pbi;
}
@@ -91,10 +85,6 @@ void vp9_decoder_remove(VP9Decoder *pbi) {
VP9_COMMON *const cm = &pbi->common;
int i;
#if CONFIG_TRANSCODE && WRITE_MI_ARRAY
fclose(cm->mi_array_pf);
#endif
vp9_remove_common(cm);
vp9_worker_end(&pbi->lf_worker);
vpx_free(pbi->lf_worker.data1);
@@ -220,7 +210,10 @@ static void swap_frame_buffers(VP9Decoder *pbi) {
}
cm->frame_to_show = get_frame_new_buffer(cm);
cm->frame_bufs[cm->new_fb_idx].ref_count--;
if (!pbi->frame_parallel_decode || !cm->show_frame) {
--cm->frame_bufs[cm->new_fb_idx].ref_count;
}
// Invalidate these references until the next frame starts.
for (ref_index = 0; ref_index < 3; ref_index++)
@@ -249,7 +242,9 @@ int vp9_receive_compressed_data(VP9Decoder *pbi,
}
// Check if the previous frame was a frame without any references to it.
if (cm->new_fb_idx >= 0 && cm->frame_bufs[cm->new_fb_idx].ref_count == 0)
// Release frame buffer if not decoding in frame parallel mode.
if (!pbi->frame_parallel_decode && cm->new_fb_idx >= 0
&& cm->frame_bufs[cm->new_fb_idx].ref_count == 0)
cm->release_fb_cb(cm->cb_priv,
&cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer);
cm->new_fb_idx = get_free_fb(cm);
@@ -264,10 +259,10 @@ int vp9_receive_compressed_data(VP9Decoder *pbi,
// TODO(jkoleszar): Error concealment is undefined and non-normative
// at this point, but if it becomes so, [0] may not always be the correct
// thing to do here.
if (cm->frame_refs[0].idx != INT_MAX)
if (cm->frame_refs[0].idx != INT_MAX && cm->frame_refs[0].buf != NULL)
cm->frame_refs[0].buf->corrupted = 1;
if (cm->frame_bufs[cm->new_fb_idx].ref_count > 0)
if (cm->new_fb_idx > 0 && cm->frame_bufs[cm->new_fb_idx].ref_count > 0)
cm->frame_bufs[cm->new_fb_idx].ref_count--;
return -1;


@@ -27,11 +27,6 @@
extern "C" {
#endif
#if CONFIG_TRANSCODE
#define WRITE_MI_ARRAY 0
#define READ_MI_ARRAY 0
#endif
// TODO(hkuang): combine this with TileWorkerData.
typedef struct TileData {
VP9_COMMON *cm;
@@ -48,6 +43,8 @@ typedef struct VP9Decoder {
int refresh_frame_flags;
int frame_parallel_decode; // frame-based threading.
VP9Worker lf_worker;
VP9Worker *tile_workers;
int num_tile_workers;


@@ -40,6 +40,23 @@ typedef struct VP9LfSyncData {
int sync_range;
} VP9LfSync;
// WorkerData for the FrameWorker thread. It contains all of the worker's
// state and the decode structures needed for decoding a frame.
typedef struct FrameWorkerData {
struct VP9Decoder *pbi;
const uint8_t *data;
const uint8_t *data_end;
size_t data_size;
void *user_priv;
int result;
int worker_id;
// scratch_buffer is used in frame-parallel mode only, to hold a copy of
// the compressed data.
uint8_t *scratch_buffer;
size_t scratch_buffer_size;
} FrameWorkerData;
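/* A hedged sketch of how a caller could stage compressed data through
   scratch_buffer in frame-parallel mode (submit_frame and its error
   handling are hypothetical, not part of this diff): */
#include <stdlib.h>
#include <string.h>
static int submit_frame(FrameWorkerData *fwd, const uint8_t *data,
                        size_t size) {
  if (size > fwd->scratch_buffer_size) {
    uint8_t *buf = (uint8_t *)realloc(fwd->scratch_buffer, size);
    if (buf == NULL) return -1;          /* keep the old buffer on failure */
    fwd->scratch_buffer = buf;
    fwd->scratch_buffer_size = size;
  }
  memcpy(fwd->scratch_buffer, data, size);  /* decouple from caller memory */
  fwd->data = fwd->scratch_buffer;
  fwd->data_size = size;
  return 0;
}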
// Allocate memory for loopfilter row synchronization.
void vp9_loop_filter_alloc(struct VP9Common *cm, VP9LfSync *lf_sync,
int rows, int width);


@@ -10,7 +10,7 @@
#include "vp9/decoder/vp9_read_bit_buffer.h"
size_t vp9_rb_bytes_read(struct vp9_read_bit_buffer *rb) {
return rb->bit_offset / CHAR_BIT + (rb->bit_offset % CHAR_BIT > 0);
return (rb->bit_offset + CHAR_BIT - 1) / CHAR_BIT;
}
int vp9_rb_read_bit(struct vp9_read_bit_buffer *rb) {
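/* Both the old and new return expressions compute ceil(bit_offset /
   CHAR_BIT); the new one simply avoids the modulo test. A quick equivalence
   check (hypothetical harness, not part of the diff): */
#include <assert.h>
#include <limits.h>
#include <stddef.h>
static void check_bytes_read_equivalence(void) {
  size_t off;
  for (off = 0; off <= 4096; ++off)
    assert(off / CHAR_BIT + (off % CHAR_BIT > 0) ==
           (off + CHAR_BIT - 1) / CHAR_BIT);  /* both round up to bytes */
}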


@@ -38,12 +38,32 @@ static struct vp9_token intra_mode_encodings[INTRA_MODES];
static struct vp9_token switchable_interp_encodings[SWITCHABLE_FILTERS];
static struct vp9_token partition_encodings[PARTITION_TYPES];
static struct vp9_token inter_mode_encodings[INTER_MODES];
#if CONFIG_COPY_CODING
static struct vp9_token copy_mode_encodings_l2[2];
static struct vp9_token copy_mode_encodings[COPY_MODE_COUNT - 1];
#endif
#if CONFIG_SUPERTX
static int vp9_check_supertx(VP9_COMMON *cm, int mi_row, int mi_col,
BLOCK_SIZE bsize) {
MODE_INFO **mi;
mi = cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col);
return mi[0]->mbmi.tx_size == bsize_to_tx_size(bsize) &&
mi[0]->mbmi.sb_type < bsize;
}
#endif
void vp9_entropy_mode_init() {
vp9_tokens_from_tree(intra_mode_encodings, vp9_intra_mode_tree);
vp9_tokens_from_tree(switchable_interp_encodings, vp9_switchable_interp_tree);
vp9_tokens_from_tree(partition_encodings, vp9_partition_tree);
vp9_tokens_from_tree(inter_mode_encodings, vp9_inter_mode_tree);
#if CONFIG_COPY_CODING
vp9_tokens_from_tree(copy_mode_encodings_l2, vp9_copy_mode_tree_l2);
vp9_tokens_from_tree(copy_mode_encodings, vp9_copy_mode_tree);
#endif
}
static void write_intra_mode(vp9_writer *w, PREDICTION_MODE mode,
@@ -58,6 +78,21 @@ static void write_inter_mode(vp9_writer *w, PREDICTION_MODE mode,
&inter_mode_encodings[INTER_OFFSET(mode)]);
}
#if CONFIG_COPY_CODING
static void write_copy_mode(VP9_COMMON *cm, vp9_writer *w, COPY_MODE mode,
int inter_ref_count, int copy_mode_context) {
if (inter_ref_count == 2) {
vp9_write_token(w, vp9_copy_mode_tree_l2,
cm->fc.copy_mode_probs_l2[copy_mode_context],
&copy_mode_encodings_l2[mode - REF0]);
} else if (inter_ref_count > 2) {
vp9_write_token(w, vp9_copy_mode_tree,
cm->fc.copy_mode_probs[copy_mode_context],
&copy_mode_encodings[mode - REF0]);
}
}
#endif
static void encode_unsigned_max(struct vp9_write_bit_buffer *wb,
int data, int max) {
vp9_wb_write_literal(wb, data, get_unsigned_bits(max));
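/* Worked example: assuming get_unsigned_bits(max) returns the minimal width
   for values in [0, max] (msb(max) + 1, as elsewhere in libvpx),
   encode_unsigned_max(wb, 5, 5) writes the 3-bit literal 0b101. */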
@@ -225,6 +260,9 @@ static void write_ref_frames(const VP9_COMP *cpi, vp9_writer *w) {
}
static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
#if CONFIG_SUPERTX
int supertx_enabled,
#endif
vp9_writer *w) {
VP9_COMMON *const cm = &cpi->common;
const nmv_context *nmvc = &cm->fc.nmvc;
@@ -239,7 +277,19 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
const int is_inter = is_inter_block(mbmi);
const int is_compound = has_second_ref(mbmi);
int skip, ref;
#if CONFIG_COPY_CODING
int copy_mode_context = vp9_get_copy_mode_context(xd);
#endif
#if CONFIG_COPY_CODING
if (bsize >= BLOCK_8X8 && mbmi->inter_ref_count > 0) {
vp9_write(w, mbmi->copy_mode != NOREF,
cm->fc.copy_noref_prob[copy_mode_context][bsize]);
if (mbmi->copy_mode != NOREF)
write_copy_mode(cm, w, mbmi->copy_mode, mbmi->inter_ref_count,
copy_mode_context);
}
#endif
if (seg->update_map) {
if (seg->temporal_update) {
const int pred_flag = mbmi->seg_id_predicted;
@@ -252,20 +302,57 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
}
}
#if CONFIG_SUPERTX
if (!supertx_enabled)
#endif
skip = write_skip(cpi, segment_id, mi, w);
#if CONFIG_SUPERTX
else
skip = mbmi->skip;
#endif
#if CONFIG_SUPERTX
if (!supertx_enabled) {
#endif
#if CONFIG_COPY_CODING
if (mbmi->copy_mode == NOREF)
#endif
if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
vp9_write(w, is_inter, vp9_get_intra_inter_prob(cm, xd));
#if CONFIG_SUPERTX
}
#endif
if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT &&
#if CONFIG_SUPERTX
!supertx_enabled &&
#endif
!(is_inter &&
(skip || vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)))) {
write_selected_tx_size(cpi, mbmi->tx_size, bsize, w);
}
#if CONFIG_EXT_TX
if (is_inter &&
mbmi->tx_size <= TX_16X16 &&
bsize >= BLOCK_8X8 &&
#if CONFIG_SUPERTX
!supertx_enabled &&
#endif
!mbmi->skip &&
!vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
vp9_write(w, mbmi->ext_txfrm, cm->fc.ext_tx_prob);
}
#endif
if (!is_inter) {
if (bsize >= BLOCK_8X8) {
write_intra_mode(w, mode, cm->fc.y_mode_prob[size_group_lookup[bsize]]);
#if CONFIG_FILTERINTRA
if (is_filter_allowed(mode) && is_filter_enabled(mbmi->tx_size)) {
vp9_write(w, mbmi->filterbit,
cm->fc.filterintra_prob[mbmi->tx_size][mode]);
}
#endif
} else {
int idx, idy;
const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
@@ -274,11 +361,28 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
for (idx = 0; idx < 2; idx += num_4x4_w) {
const PREDICTION_MODE b_mode = mi->bmi[idy * 2 + idx].as_mode;
write_intra_mode(w, b_mode, cm->fc.y_mode_prob[0]);
#if CONFIG_FILTERINTRA
if (is_filter_allowed(b_mode)) {
vp9_write(w, mi->b_filter_info[idy * 2 + idx],
cm->fc.filterintra_prob[0][b_mode]);
}
#endif
}
}
}
write_intra_mode(w, mbmi->uv_mode, cm->fc.uv_mode_prob[mode]);
#if CONFIG_FILTERINTRA
if (is_filter_allowed(mbmi->uv_mode) &&
is_filter_enabled(get_uv_tx_size(mbmi))) {
vp9_write(w, mbmi->uv_filterbit,
cm->fc.filterintra_prob[get_uv_tx_size(mbmi)][mbmi->uv_mode]);
}
#endif
#if !CONFIG_COPY_CODING
} else {
#else
} else if (mbmi->copy_mode == NOREF) {
#endif
const int mode_ctx = mbmi->mode_context[mbmi->ref_frame[0]];
const vp9_prob *const inter_probs = cm->fc.inter_mode_probs[mode_ctx];
write_ref_frames(cpi, w);
@@ -300,6 +404,32 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
assert(mbmi->interp_filter == cm->interp_filter);
}
#if CONFIG_INTERINTRA
if ((cm->use_interintra) &&
cpi->common.reference_mode != COMPOUND_REFERENCE &&
is_interintra_allowed(bsize) &&
is_inter_mode(mode) &&
(mbmi->ref_frame[1] <= INTRA_FRAME)) {
vp9_write(w, mbmi->ref_frame[1] == INTRA_FRAME,
cm->fc.interintra_prob[bsize]);
if (mbmi->ref_frame[1] == INTRA_FRAME) {
write_intra_mode(w, mbmi->interintra_mode,
cm->fc.y_mode_prob[size_group_lookup[bsize]]);
#if CONFIG_MASKED_INTERINTRA
if (get_mask_bits_interintra(bsize) &&
cm->use_masked_interintra) {
vp9_write(w, mbmi->use_masked_interintra,
cm->fc.masked_interintra_prob[bsize]);
if (mbmi->use_masked_interintra) {
vp9_write_literal(w, mbmi->interintra_mask_index,
get_mask_bits_interintra(bsize));
}
}
#endif
}
}
#endif
if (bsize < BLOCK_8X8) {
const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
@@ -326,6 +456,18 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
allow_hp);
}
}
#if CONFIG_MASKED_INTERINTER
if (cm->use_masked_interinter &&
cm->reference_mode != SINGLE_REFERENCE &&
is_inter_mode(mode) &&
get_mask_bits(mbmi->sb_type) &&
mbmi->ref_frame[1] > INTRA_FRAME) {
vp9_write(w, mbmi->use_masked_interinter,
cm->fc.masked_interinter_prob[bsize]);
if (mbmi->use_masked_interinter)
vp9_write_literal(w, mbmi->mask_index, get_mask_bits(mbmi->sb_type));
}
#endif
}
}
@@ -350,6 +492,11 @@ static void write_mb_modes_kf(const VP9_COMP *cpi, MODE_INFO **mi_8x8,
if (bsize >= BLOCK_8X8) {
write_intra_mode(w, mbmi->mode, get_y_mode_probs(mi, above_mi, left_mi, 0));
#if CONFIG_FILTERINTRA
if (is_filter_allowed(mbmi->mode) && is_filter_enabled(mbmi->tx_size))
vp9_write(w, mbmi->filterbit,
cm->fc.filterintra_prob[mbmi->tx_size][mbmi->mode]);
#endif
} else {
const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
@@ -360,15 +507,29 @@ static void write_mb_modes_kf(const VP9_COMP *cpi, MODE_INFO **mi_8x8,
const int block = idy * 2 + idx;
write_intra_mode(w, mi->bmi[block].as_mode,
get_y_mode_probs(mi, above_mi, left_mi, block));
#if CONFIG_FILTERINTRA
if (is_filter_allowed(mi->bmi[block].as_mode))
vp9_write(w, mi->b_filter_info[block],
cm->fc.filterintra_prob[0][mi->bmi[block].as_mode]);
#endif
}
}
}
write_intra_mode(w, mbmi->uv_mode, vp9_kf_uv_mode_prob[mbmi->mode]);
#if CONFIG_FILTERINTRA
if (is_filter_allowed(mbmi->uv_mode) &&
is_filter_enabled(get_uv_tx_size(mbmi)))
vp9_write(w, mbmi->uv_filterbit,
cm->fc.filterintra_prob[get_uv_tx_size(mbmi)][mbmi->uv_mode]);
#endif
}
static void write_modes_b(VP9_COMP *cpi, const TileInfo *const tile,
vp9_writer *w, TOKENEXTRA **tok, TOKENEXTRA *tok_end,
#if CONFIG_SUPERTX
int supertx_enabled,
#endif
int mi_row, int mi_col) {
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &cpi->mb.e_mbd;
@@ -384,11 +545,21 @@ static void write_modes_b(VP9_COMP *cpi, const TileInfo *const tile,
if (frame_is_intra_only(cm)) {
write_mb_modes_kf(cpi, xd->mi, w);
} else {
#if CONFIG_SUPERTX
pack_inter_mode_mvs(cpi, m, supertx_enabled, w);
#else
pack_inter_mode_mvs(cpi, m, w);
#endif
}
#if CONFIG_SUPERTX
if (!supertx_enabled) {
#endif
assert(*tok < tok_end);
pack_mb_tokens(w, tok, tok_end);
#if CONFIG_SUPERTX
}
#endif
}
static void write_partition(VP9_COMMON *cm, MACROBLOCKD *xd,
@@ -415,6 +586,9 @@ static void write_partition(VP9_COMMON *cm, MACROBLOCKD *xd,
static void write_modes_sb(VP9_COMP *cpi,
const TileInfo *const tile,
vp9_writer *w, TOKENEXTRA **tok, TOKENEXTRA *tok_end,
#if CONFIG_SUPERTX
int pack_token, int supertx_enabled,
#endif
int mi_row, int mi_col, BLOCK_SIZE bsize) {
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &cpi->mb.e_mbd;
@@ -431,36 +605,105 @@ static void write_modes_sb(VP9_COMP *cpi,
partition = partition_lookup[bsl][m->mbmi.sb_type];
write_partition(cm, xd, bs, mi_row, mi_col, partition, bsize, w);
subsize = get_subsize(bsize, partition);
#if CONFIG_SUPERTX
xd->mi = cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col);
set_mi_row_col(xd, tile,
mi_row, num_8x8_blocks_high_lookup[bsize],
mi_col, num_8x8_blocks_wide_lookup[bsize],
cm->mi_rows, cm->mi_cols);
if (!supertx_enabled && cm->frame_type != KEY_FRAME &&
partition != PARTITION_NONE && bsize <= BLOCK_32X32) {
TX_SIZE supertx_size = bsize_to_tx_size(bsize); // b_width_log2(bsize);
vp9_prob prob = partition == PARTITION_SPLIT ?
cm->fc.supertxsplit_prob[supertx_size] :
cm->fc.supertx_prob[supertx_size];
supertx_enabled = (xd->mi[0]->mbmi.tx_size == supertx_size);
vp9_write(w, supertx_enabled, prob);
if (supertx_enabled) {
vp9_write(w, xd->mi[0]->mbmi.skip, vp9_get_skip_prob(cm, xd));
#if CONFIG_EXT_TX
if (supertx_size <= TX_16X16 && !xd->mi[0]->mbmi.skip)
vp9_write(w, xd->mi[0]->mbmi.ext_txfrm, cm->fc.ext_tx_prob);
#endif
}
}
#endif
if (subsize < BLOCK_8X8) {
write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
write_modes_b(cpi, tile, w, tok, tok_end,
#if CONFIG_SUPERTX
supertx_enabled,
#endif
mi_row, mi_col);
} else {
switch (partition) {
case PARTITION_NONE:
write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
write_modes_b(cpi, tile, w, tok, tok_end,
#if CONFIG_SUPERTX
supertx_enabled,
#endif
mi_row, mi_col);
break;
case PARTITION_HORZ:
write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
write_modes_b(cpi, tile, w, tok, tok_end,
#if CONFIG_SUPERTX
supertx_enabled,
#endif
mi_row, mi_col);
if (mi_row + bs < cm->mi_rows)
write_modes_b(cpi, tile, w, tok, tok_end, mi_row + bs, mi_col);
write_modes_b(cpi, tile, w, tok, tok_end,
#if CONFIG_SUPERTX
supertx_enabled,
#endif
mi_row + bs, mi_col);
break;
case PARTITION_VERT:
write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
write_modes_b(cpi, tile, w, tok, tok_end,
#if CONFIG_SUPERTX
supertx_enabled,
#endif
mi_row, mi_col);
if (mi_col + bs < cm->mi_cols)
write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + bs);
write_modes_b(cpi, tile, w, tok, tok_end,
#if CONFIG_SUPERTX
supertx_enabled,
#endif
mi_row, mi_col + bs);
break;
case PARTITION_SPLIT:
write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, subsize);
write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col + bs,
write_modes_sb(cpi, tile, w, tok, tok_end,
#if CONFIG_SUPERTX
!supertx_enabled, supertx_enabled,
#endif
mi_row, mi_col, subsize);
write_modes_sb(cpi, tile, w, tok, tok_end,
#if CONFIG_SUPERTX
!supertx_enabled, supertx_enabled,
#endif
mi_row, mi_col + bs,
subsize);
write_modes_sb(cpi, tile, w, tok, tok_end, mi_row + bs, mi_col,
write_modes_sb(cpi, tile, w, tok, tok_end,
#if CONFIG_SUPERTX
!supertx_enabled, supertx_enabled,
#endif
mi_row + bs, mi_col,
subsize);
write_modes_sb(cpi, tile, w, tok, tok_end, mi_row + bs, mi_col + bs,
write_modes_sb(cpi, tile, w, tok, tok_end,
#if CONFIG_SUPERTX
!supertx_enabled, supertx_enabled,
#endif
mi_row + bs, mi_col + bs,
subsize);
break;
default:
assert(0);
}
}
#if CONFIG_SUPERTX
if (partition != PARTITION_NONE && supertx_enabled && pack_token) {
assert(*tok < tok_end);
pack_mb_tokens(w, tok, tok_end);
}
#endif
// update partition context
if (bsize >= BLOCK_8X8 &&
@@ -478,7 +721,11 @@ static void write_modes(VP9_COMP *cpi,
vp9_zero(cpi->mb.e_mbd.left_seg_context);
for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
mi_col += MI_BLOCK_SIZE)
write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col,
write_modes_sb(cpi, tile, w, tok, tok_end,
#if CONFIG_SUPERTX
1, 0,
#endif
mi_row, mi_col,
BLOCK_64X64);
}
}
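The supertx signaling above reduces to a single predicate: a flag is coded only at a partitioned inter-frame block of at most 32x32, and only while no enclosing supertx flag is already set. A minimal sketch of that predicate (an illustration read off write_modes_sb(), not a function in this diff):

static int supertx_flag_is_coded(int supertx_enabled, FRAME_TYPE frame_type,
                                 PARTITION_TYPE partition, BLOCK_SIZE bsize) {
  return !supertx_enabled && frame_type != KEY_FRAME &&
         partition != PARTITION_NONE && bsize <= BLOCK_32X32;
}

When the flag is set, write_modes_b() suppresses per-block token packing and the tokens for the whole region are packed once at the supertx root, via the pack_token branch at the end of write_modes_sb().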
@@ -890,14 +1137,8 @@ static void write_tile_info(VP9_COMMON *cm, struct vp9_write_bit_buffer *wb) {
}
static int get_refresh_mask(VP9_COMP *cpi) {
// Should the GF or ARF be updated using the transmitted frame or buffer
#if CONFIG_MULTIPLE_ARF
if (!cpi->multi_arf_enabled && cpi->refresh_golden_frame &&
!cpi->refresh_alt_ref_frame) {
#else
if (cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame &&
!cpi->use_svc) {
#endif
if (!cpi->multi_arf_allowed && cpi->refresh_golden_frame &&
cpi->rc.is_src_frame_alt_ref && !cpi->use_svc) {
// Preserve the previously existing golden frame and update the frame in
// the alt ref slot instead. This is highly specific to the use of
// alt-ref as a forward reference, and this needs to be generalized as
@@ -910,15 +1151,10 @@ static int get_refresh_mask(VP9_COMP *cpi) {
(cpi->refresh_golden_frame << cpi->alt_fb_idx);
} else {
int arf_idx = cpi->alt_fb_idx;
#if CONFIG_MULTIPLE_ARF
// Determine which ARF buffer to use to encode this ARF frame.
if (cpi->multi_arf_enabled) {
int sn = cpi->sequence_number;
arf_idx = (cpi->frame_coding_order[sn] < 0) ?
cpi->arf_buffer_idx[sn + 1] :
cpi->arf_buffer_idx[sn];
if ((cpi->pass == 2) && cpi->multi_arf_allowed) {
const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
arf_idx = gf_group->arf_update_idx[gf_group->index];
}
#endif
return (cpi->refresh_last_frame << cpi->lst_fb_idx) |
(cpi->refresh_golden_frame << cpi->gld_fb_idx) |
(cpi->refresh_alt_ref_frame << arf_idx);
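The return value of get_refresh_mask() is a bitmask over the eight reference buffer slots; bit i set tells the decoder to overwrite slot i with the current reconstructed frame. A hedged worked example (the slot indices are assumed defaults, not values from this diff):

/* Assume lst_fb_idx = 0, gld_fb_idx = 1, alt_fb_idx = 2. */
int mask = (1 << 0)    /* refresh_last_frame    */
         | (0 << 1)    /* refresh_golden_frame  */
         | (1 << 2);   /* refresh_alt_ref_frame */
/* mask == 5: reference slots 0 and 2 are refreshed, slot 1 is kept. */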
@@ -1187,6 +1423,104 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
cm->counts.partition[i], PARTITION_TYPES, &header_bc);
vp9_write_nmv_probs(cm, cm->allow_high_precision_mv, &header_bc);
#if CONFIG_EXT_TX
vp9_cond_prob_diff_update(&header_bc, &fc->ext_tx_prob, cm->counts.ext_tx);
#endif
#if CONFIG_MASKED_INTERINTER
if (cm->reference_mode != SINGLE_REFERENCE) {
if (!cpi->dummy_packing && cm->use_masked_interinter) {
cm->use_masked_interinter = 0;
for (i = 0; i < BLOCK_SIZES; i++)
if (get_mask_bits(i) && (cm->counts.masked_interinter[i][1] > 0)) {
cm->use_masked_interinter = 1;
break;
}
}
vp9_write_bit(&header_bc, cm->use_masked_interinter);
if (cm->use_masked_interinter) {
for (i = 0; i < BLOCK_SIZES; i++)
if (get_mask_bits(i))
vp9_cond_prob_diff_update(&header_bc,
&fc->masked_interinter_prob[i],
cm->counts.masked_interinter[i]);
} else {
vp9_zero(cm->counts.masked_interinter);
}
} else {
if (!cpi->dummy_packing)
cm->use_masked_interinter = 0;
vp9_zero(cm->counts.masked_interinter);
}
#endif
#if CONFIG_INTERINTRA
if (cm->reference_mode != COMPOUND_REFERENCE) {
if (!cpi->dummy_packing && cm->use_interintra) {
cm->use_interintra = 0;
for (i = 0; i < BLOCK_SIZES; i++) {
if (is_interintra_allowed(i) && (cm->counts.interintra[i][1] > 0)) {
cm->use_interintra = 1;
break;
}
}
}
vp9_write_bit(&header_bc, cm->use_interintra);
if (cm->use_interintra) {
for (i = 0; i < BLOCK_SIZES; i++) {
if (is_interintra_allowed(i)) {
vp9_cond_prob_diff_update(&header_bc,
&fc->interintra_prob[i],
cm->counts.interintra[i]);
}
}
#if CONFIG_MASKED_INTERINTRA
if (!cpi->dummy_packing && cm->use_masked_interintra) {
cm->use_masked_interintra = 0;
for (i = 0; i < BLOCK_SIZES; i++) {
if (is_interintra_allowed(i) && get_mask_bits_interintra(i) &&
(cm->counts.masked_interintra[i][1] > 0)) {
cm->use_masked_interintra = 1;
break;
}
}
}
vp9_write_bit(&header_bc, cm->use_masked_interintra);
if (cm->use_masked_interintra) {
for (i = 0; i < BLOCK_SIZES; i++) {
if (is_interintra_allowed(i) && get_mask_bits_interintra(i))
vp9_cond_prob_diff_update(&header_bc,
&fc->masked_interintra_prob[i],
cm->counts.masked_interintra[i]);
}
} else {
vp9_zero(cm->counts.masked_interintra);
}
#endif
} else {
vp9_zero(cm->counts.interintra);
}
} else {
if (!cpi->dummy_packing)
cm->use_interintra = 0;
vp9_zero(cm->counts.interintra);
#if CONFIG_MASKED_INTERINTRA
if (!cpi->dummy_packing)
cm->use_masked_interintra = 0;
vp9_zero(cm->counts.masked_interintra);
#endif
}
#endif
#if CONFIG_COPY_CODING
for (i = 0; i < COPY_MODE_CONTEXTS; i++) {
prob_diff_update(vp9_copy_mode_tree_l2, cm->fc.copy_mode_probs_l2[i],
cm->counts.copy_mode_l2[i], 2, &header_bc);
prob_diff_update(vp9_copy_mode_tree, cm->fc.copy_mode_probs[i],
cm->counts.copy_mode[i], 3, &header_bc);
}
#endif
}
vp9_stop_encode(&header_bc);


@@ -28,6 +28,7 @@ struct macroblock_plane {
struct buf_2d src;
// Quantizer settings
int16_t *quant_fp;
int16_t *quant;
int16_t *quant_shift;
int16_t *zbin;
@@ -48,7 +49,7 @@ struct macroblock {
MACROBLOCKD e_mbd;
int skip_block;
int select_txfm_size;
int select_tx_size;
int skip_recode;
int skip_optimize;
int q_index;
@@ -92,8 +93,6 @@ struct macroblock {
int encode_breakout;
int in_active_map;
// note that token_costs is the cost when eob node is skipped
vp9_coeff_cost token_costs[TX_SIZES];
@@ -105,6 +104,9 @@ struct macroblock {
int use_lp32x32fdct;
int skip_encode;
// skip forward transform and quantization
int skip_txfm;
// Used to store sub partition's choices.
MV pred_mv[MAX_REF_FRAMES];


@@ -100,15 +100,10 @@ void vp9_setup_pc_tree(VP9_COMMON *cm, VP9_COMP *cpi) {
vpx_free(cpi->leaf_tree);
CHECK_MEM_ERROR(cm, cpi->leaf_tree, vpx_calloc(leaf_nodes,
sizeof(*cpi->leaf_tree)));
#if CONFIG_TRANSCODE
vpx_memset(cpi->leaf_tree, 0, sizeof(*cpi->leaf_tree));
#endif
vpx_free(cpi->pc_tree);
CHECK_MEM_ERROR(cm, cpi->pc_tree, vpx_calloc(tree_nodes,
sizeof(*cpi->pc_tree)));
#if CONFIG_TRANSCODE
vpx_memset(cpi->pc_tree, 0, sizeof(*cpi->pc_tree));
#endif
this_pc = &cpi->pc_tree[0];
this_leaf = &cpi->leaf_tree[0];


@@ -33,6 +33,7 @@ typedef struct {
int is_coded;
int num_4x4_blk;
int skip;
int skip_txfm;
int best_mode_index;
int hybrid_pred_diff;
int comp_pred_diff;


@@ -43,6 +43,17 @@ static void fdct4(const int16_t *input, int16_t *output) {
output[3] = fdct_round_shift(temp2);
}
void vp9_fdct4x4_1_c(const int16_t *input, int16_t *output, int stride) {
int r, c;
int16_t sum = 0;
for (r = 0; r < 4; ++r)
for (c = 0; c < 4; ++c)
sum += input[r * stride + c];
output[0] = sum << 1;
output[1] = 0;
}
void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) {
// The 2D transform is done with two passes which are actually pretty
// similar. In the first one, we transform the columns and transpose
@@ -240,6 +251,17 @@ static void fdct8(const int16_t *input, int16_t *output) {
output[7] = fdct_round_shift(t3);
}
void vp9_fdct8x8_1_c(const int16_t *input, int16_t *output, int stride) {
int r, c;
int16_t sum = 0;
for (r = 0; r < 8; ++r)
for (c = 0; c < 8; ++c)
sum += input[r * stride + c];
output[0] = sum;
output[1] = 0;
}
void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) {
int i, j;
int16_t intermediate[64];
@@ -311,6 +333,17 @@ void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) {
}
}
void vp9_fdct16x16_1_c(const int16_t *input, int16_t *output, int stride) {
int r, c;
int16_t sum = 0;
for (r = 0; r < 16; ++r)
for (c = 0; c < 16; ++c)
sum += input[r * stride + c];
output[0] = sum >> 1;
output[1] = 0;
}
void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) {
// The 2D transform is done with two passes which are actually pretty
// similar. In the first one, we transform the columns and transpose
@@ -1329,6 +1362,17 @@ static void fdct32(const int *input, int *output, int round) {
output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);
}
void vp9_fdct32x32_1_c(const int16_t *input, int16_t *output, int stride) {
int r, c;
int16_t sum = 0;
for (r = 0; r < 32; ++r)
for (c = 0; c < 32; ++c)
sum += input[r * stride + c];
output[0] = sum >> 3;
output[1] = 0;
}
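Each of the four vp9_fdct*_1_c variants added above keeps only the DC term, with a per-size shift that mirrors the DC scaling of the corresponding full transform so the regular quantizer tables still apply. A compact restatement of the rule (a sketch derived from the code above, not a library function):

static int16_t dc_only_coeff(const int16_t *input, int stride, int size) {
  int r, c, sum = 0;
  for (r = 0; r < size; ++r)
    for (c = 0; c < size; ++c)
      sum += input[r * stride + c];
  switch (size) {
    case 4:  return (int16_t)(sum << 1);  /* vp9_fdct4x4_1_c   */
    case 8:  return (int16_t)sum;         /* vp9_fdct8x8_1_c   */
    case 16: return (int16_t)(sum >> 1);  /* vp9_fdct16x16_1_c */
    default: return (int16_t)(sum >> 3);  /* vp9_fdct32x32_1_c */
  }
}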
void vp9_fdct32x32_c(const int16_t *input, int16_t *out, int stride) {
int i, j;
int output[32 * 32];

vp9/encoder/vp9_denoiser.c (new file)

@@ -0,0 +1,199 @@
/*
* Copyright (c) 2012 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <assert.h>
#include "vpx_scale/yv12config.h"
#include "vpx/vpx_integer.h"
#include "vp9/encoder/vp9_denoiser.h"
static const int widths[] = {4, 4, 8, 8, 8, 16, 16, 16, 32, 32, 32, 64, 64};
static const int heights[] = {4, 8, 4, 8, 16, 8, 16, 32, 16, 32, 64, 32, 64};
int vp9_denoiser_filter() {
return 0;
}
static int update_running_avg(const uint8_t *mc_avg, int mc_avg_stride,
uint8_t *avg, int avg_stride,
const uint8_t *sig, int sig_stride,
int increase_denoising, BLOCK_SIZE bs) {
int r, c;
int diff, adj, absdiff;
int shift_inc1 = 0, shift_inc2 = 1;
int adj_val[] = {3, 4, 6};
int total_adj = 0;
if (increase_denoising) {
shift_inc1 = 1;
shift_inc2 = 2;
}
for (r = 0; r < heights[bs]; ++r) {
for (c = 0; c < widths[bs]; ++c) {
diff = mc_avg[c] - sig[c];
absdiff = abs(diff);
if (absdiff <= 3 + shift_inc1) {
avg[c] = mc_avg[c];
total_adj += diff;
} else {
switch (absdiff) {
case 4: case 5: case 6: case 7:
adj = adj_val[0];
break;
case 8: case 9: case 10: case 11:
case 12: case 13: case 14: case 15:
adj = adj_val[1];
break;
default:
adj = adj_val[2];
}
if (diff > 0) {
avg[c] = MIN(UINT8_MAX, sig[c] + adj);
total_adj += adj;
} else {
avg[c] = MAX(0, sig[c] - adj);
total_adj -= adj;
}
}
}
sig += sig_stride;
avg += avg_stride;
mc_avg += mc_avg_stride;
}
return total_adj;
}
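update_running_avg() applies a clamped, magnitude-dependent correction per pixel and returns the net adjustment so the caller can decide between filtering and copying. Ignoring the shift_inc tweaks for increase_denoising, the schedule collapses to the following (a restatement of the switch above, not a function in this diff):

static int denoise_adjustment(int absdiff) {
  if (absdiff <= 3)  return 0;  /* take the motion-compensated pixel as-is */
  if (absdiff <= 7)  return 3;  /* adj_val[0] */
  if (absdiff <= 15) return 4;  /* adj_val[1] */
  return 6;                     /* adj_val[2] */
}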
static uint8_t *block_start(uint8_t *framebuf, int stride,
int mi_row, int mi_col) {
return framebuf + (stride * mi_row * 8) + (mi_col * 8);
}
void copy_block(uint8_t *dest, int dest_stride,
uint8_t *src, int src_stride, BLOCK_SIZE bs) {
int r, c;
for (r = 0; r < heights[bs]; ++r) {
for (c = 0; c < widths[bs]; ++c) {
dest[c] = src[c];
}
dest += dest_stride;
src += src_stride;
}
}
void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb,
int mi_row, int mi_col, BLOCK_SIZE bs) {
int decision = COPY_BLOCK;
YV12_BUFFER_CONFIG avg = denoiser->running_avg_y[INTRA_FRAME];
YV12_BUFFER_CONFIG mc_avg = denoiser->mc_running_avg_y;
uint8_t *avg_start = block_start(avg.y_buffer, avg.y_stride, mi_row, mi_col);
uint8_t *mc_avg_start = block_start(mc_avg.y_buffer, mc_avg.y_stride,
mi_row, mi_col);
struct buf_2d src = mb->plane[0].src;
update_running_avg(mc_avg_start, mc_avg.y_stride, avg_start, avg.y_stride,
mb->plane[0].src.buf, mb->plane[0].src.stride, 0, bs);
if (decision == FILTER_BLOCK) {
// TODO(tkopp)
}
if (decision == COPY_BLOCK) {
copy_block(avg_start, avg.y_stride, src.buf, src.stride, bs);
}
}
static void copy_frame(YV12_BUFFER_CONFIG dest, const YV12_BUFFER_CONFIG src) {
int r, c;
const uint8_t *srcbuf = src.y_buffer;
uint8_t *destbuf = dest.y_buffer;
assert(dest.y_width == src.y_width);
assert(dest.y_height == src.y_height);
for (r = 0; r < dest.y_height; ++r) {
for (c = 0; c < dest.y_width; ++c) {
destbuf[c] = srcbuf[c];
}
destbuf += dest.y_stride;
srcbuf += src.y_stride;
}
}
void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser,
YV12_BUFFER_CONFIG src,
FRAME_TYPE frame_type,
int refresh_alt_ref_frame,
int refresh_golden_frame,
int refresh_last_frame) {
if (frame_type == KEY_FRAME) {
int i;
copy_frame(denoiser->running_avg_y[LAST_FRAME], src);
for (i = 2; i < MAX_REF_FRAMES - 1; i++) {
copy_frame(denoiser->running_avg_y[i],
denoiser->running_avg_y[LAST_FRAME]);
}
} else { /* For non key frames */
if (refresh_alt_ref_frame) {
copy_frame(denoiser->running_avg_y[ALTREF_FRAME],
denoiser->running_avg_y[INTRA_FRAME]);
}
if (refresh_golden_frame) {
copy_frame(denoiser->running_avg_y[GOLDEN_FRAME],
denoiser->running_avg_y[INTRA_FRAME]);
}
if (refresh_last_frame) {
copy_frame(denoiser->running_avg_y[LAST_FRAME],
denoiser->running_avg_y[INTRA_FRAME]);
}
}
}
void vp9_denoiser_update_frame_stats() {
}
int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height,
int ssx, int ssy, int border) {
int i, fail;
assert(denoiser != NULL);
for (i = 0; i < MAX_REF_FRAMES; ++i) {
fail = vp9_alloc_frame_buffer(&denoiser->running_avg_y[i], width, height,
ssx, ssy, border);
if (fail) {
vp9_denoiser_free(denoiser);
return 1;
}
}
fail = vp9_alloc_frame_buffer(&denoiser->mc_running_avg_y, width, height,
ssx, ssy, border);
if (fail) {
vp9_denoiser_free(denoiser);
return 1;
}
return 0;
}
void vp9_denoiser_free(VP9_DENOISER *denoiser) {
int i;
if (denoiser == NULL) {
return;
}
/* The address of a struct member is never NULL, so the buffers are
freed unconditionally. */
for (i = 0; i < MAX_REF_FRAMES; ++i) {
vp9_free_frame_buffer(&denoiser->running_avg_y[i]);
}
vp9_free_frame_buffer(&denoiser->mc_running_avg_y);
}
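A hedged usage sketch for the allocation API above; the frame size, the 4:2:0 subsampling flags, and the border of 32 are placeholders, not constants from this diff:

VP9_DENOISER dn;
if (vp9_denoiser_alloc(&dn, 1920, 1080, 1, 1, 32)) {
  /* Allocation failed; vp9_denoiser_alloc() has already released any
   * partially allocated running-average buffers. */
} else {
  /* ... call vp9_denoiser_denoise() per block while encoding ... */
  vp9_denoiser_free(&dn);
}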

vp9/encoder/vp9_denoiser.h (new file)

@@ -0,0 +1,52 @@
/*
* Copyright (c) 2012 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP9_ENCODER_DENOISER_H_
#define VP9_ENCODER_DENOISER_H_
#include "vp9/encoder/vp9_block.h"
#include "vpx_scale/yv12config.h"
#ifdef __cplusplus
extern "C" {
#endif
enum vp9_denoiser_decision {
COPY_BLOCK,
FILTER_BLOCK
};
typedef struct vp9_denoiser {
YV12_BUFFER_CONFIG running_avg_y[MAX_REF_FRAMES];
YV12_BUFFER_CONFIG mc_running_avg_y;
} VP9_DENOISER;
void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser,
YV12_BUFFER_CONFIG src,
FRAME_TYPE frame_type,
int refresh_alt_ref_frame,
int refresh_golden_frame,
int refresh_last_frame);
void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb,
int mi_row, int mi_col, BLOCK_SIZE bs);
void vp9_denoiser_update_frame_stats();
int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height,
int ssx, int ssy, int border);
void vp9_denoiser_free(VP9_DENOISER *denoiser);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP9_ENCODER_DENOISER_H_

(File diff suppressed because it is too large.)


@@ -301,6 +301,52 @@ static INLINE void fdct32x32(int rd_transform,
vp9_fdct32x32(src, dst, src_stride);
}
void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,
BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
MACROBLOCKD *const xd = &x->e_mbd;
const struct macroblock_plane *const p = &x->plane[plane];
const struct macroblockd_plane *const pd = &xd->plane[plane];
int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);
int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
uint16_t *const eob = &p->eobs[block];
const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
int i, j;
const int16_t *src_diff;
txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
src_diff = &p->src_diff[4 * (j * diff_stride + i)];
switch (tx_size) {
case TX_32X32:
vp9_fdct32x32_1(src_diff, coeff, diff_stride);
vp9_quantize_dc_32x32(coeff, x->skip_block, p->round,
p->quant_fp[0], qcoeff, dqcoeff,
pd->dequant[0], eob);
break;
case TX_16X16:
vp9_fdct16x16_1(src_diff, coeff, diff_stride);
vp9_quantize_dc(coeff, x->skip_block, p->round,
p->quant_fp[0], qcoeff, dqcoeff,
pd->dequant[0], eob);
break;
case TX_8X8:
vp9_fdct8x8_1(src_diff, coeff, diff_stride);
vp9_quantize_dc(coeff, x->skip_block, p->round,
p->quant_fp[0], qcoeff, dqcoeff,
pd->dequant[0], eob);
break;
case TX_4X4:
x->fwd_txm4x4(src_diff, coeff, diff_stride);
vp9_quantize_dc(coeff, x->skip_block, p->round,
p->quant_fp[0], qcoeff, dqcoeff,
pd->dequant[0], eob);
break;
default:
assert(0);
}
}
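Note that vp9_xform_quant_fp() is the fast-path counterpart of vp9_xform_quant() below: it computes only the DC coefficient, using the vp9_fdct*_1 transforms added in vp9_dct.c, and quantizes it with vp9_quantize_dc() or vp9_quantize_dc_32x32(); all AC coefficients are implicitly zero.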
void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
MACROBLOCKD *const xd = &x->e_mbd;
@@ -314,6 +360,9 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
int i, j;
const int16_t *src_diff;
#if CONFIG_EXT_TX
MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
#endif
txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
src_diff = &p->src_diff[4 * (j * diff_stride + i)];
@@ -326,21 +375,45 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
scan_order->iscan);
break;
case TX_16X16:
#if CONFIG_EXT_TX
if (plane != 0 || mbmi->ext_txfrm == NORM) {
#endif
vp9_fdct16x16(src_diff, coeff, diff_stride);
#if CONFIG_EXT_TX
} else {
vp9_fht16x16(src_diff, coeff, diff_stride, ADST_ADST);
}
#endif
vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
pd->dequant, p->zbin_extra, eob,
scan_order->scan, scan_order->iscan);
break;
case TX_8X8:
#if CONFIG_EXT_TX
if (plane != 0 || mbmi->ext_txfrm == NORM) {
#endif
vp9_fdct8x8(src_diff, coeff, diff_stride);
#if CONFIG_EXT_TX
} else {
vp9_fht8x8(src_diff, coeff, diff_stride, ADST_ADST);
}
#endif
vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
pd->dequant, p->zbin_extra, eob,
scan_order->scan, scan_order->iscan);
break;
case TX_4X4:
#if CONFIG_EXT_TX
if (plane != 0 || mbmi->ext_txfrm == NORM) {
#endif
x->fwd_txm4x4(src_diff, coeff, diff_stride);
#if CONFIG_EXT_TX
} else {
vp9_fht4x4(src_diff, coeff, diff_stride, ADST_ADST);
}
#endif
vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
pd->dequant, p->zbin_extra, eob,
@@ -363,6 +436,9 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
int i, j;
uint8_t *dst;
ENTROPY_CONTEXT *a, *l;
#if CONFIG_EXT_TX
MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
#endif
txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
dst = &pd->dst.buf[4 * j * pd->dst.stride + 4 * i];
a = &ctx->ta[plane][i];
@@ -376,8 +452,19 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
return;
}
if (!x->skip_recode)
vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
if (x->skip_txfm == 0) {
// full forward transform and quantization
if (!x->skip_recode)
vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
} else if (x->skip_txfm == 2) {
// fast path forward transform and quantization
vp9_xform_quant_fp(x, plane, block, plane_bsize, tx_size);
} else {
// skip forward transform
p->eobs[block] = 0;
*a = *l = 0;
return;
}
if (x->optimize && (!x->skip_recode || !x->skip_optimize)) {
const int ctx = combine_entropy_contexts(*a, *l);
@@ -397,16 +484,43 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
vp9_idct32x32_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
break;
case TX_16X16:
#if CONFIG_EXT_TX
if (plane != 0 || mbmi->ext_txfrm == NORM) {
#endif
vp9_idct16x16_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
#if CONFIG_EXT_TX
} else {
vp9_iht16x16_add(ADST_ADST, dqcoeff, dst, pd->dst.stride,
p->eobs[block]);
}
#endif
break;
case TX_8X8:
#if CONFIG_EXT_TX
if (plane != 0 || mbmi->ext_txfrm == NORM) {
#endif
vp9_idct8x8_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
#if CONFIG_EXT_TX
} else {
vp9_iht8x8_add(ADST_ADST, dqcoeff, dst, pd->dst.stride,
p->eobs[block]);
}
#endif
break;
case TX_4X4:
#if CONFIG_EXT_TX
if (plane != 0 || mbmi->ext_txfrm == NORM) {
#endif
// this is like vp9_short_idct4x4 but has a special case around eob<=1
// which is significant (not just an optimization) for the lossless
// case.
x->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
#if CONFIG_EXT_TX
} else {
vp9_iht4x4_add(ADST_ADST, dqcoeff, dst, pd->dst.stride,
p->eobs[block]);
}
#endif
break;
default:
assert(0 && "Invalid transform size");
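The dispatch near the top of encode_block() treats x->skip_txfm as a tri-state. The diff uses bare integers; a hypothetical naming, for illustration only:

enum {
  SKIP_TXFM_NONE = 0,    /* full forward transform and quantization    */
  SKIP_TXFM_ALL = 1,     /* skip entirely: eob and contexts are zeroed */
  SKIP_TXFM_DC_ONLY = 2  /* fast path through vp9_xform_quant_fp()     */
};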
@@ -422,6 +536,10 @@ static void encode_block_pass1(int plane, int block, BLOCK_SIZE plane_bsize,
int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
int i, j;
uint8_t *dst;
#if CONFIG_EXT_TX
MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
mbmi->ext_txfrm = NORM;
#endif
txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
dst = &pd->dst.buf[4 * j * pd->dst.stride + 4 * i];
@@ -460,6 +578,26 @@ void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) {
}
}
#if CONFIG_SUPERTX
void vp9_encode_sb_supertx(MACROBLOCK *x, BLOCK_SIZE bsize) {
MACROBLOCKD *const xd = &x->e_mbd;
struct optimize_ctx ctx;
MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
struct encode_b_args arg = {x, &ctx, &mbmi->skip};
int plane;
for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
BLOCK_SIZE plane_size = bsize - 3 * (plane > 0);
const struct macroblockd_plane* const pd = &xd->plane[plane];
const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi) : mbmi->tx_size;
vp9_subtract_plane(x, bsize, plane);
vp9_get_entropy_contexts(bsize, tx_size, pd,
ctx.ta[plane], ctx.tl[plane]);
encode_block(plane, 0, plane_size, bsize_to_tx_size(plane_size), &arg);
}
}
#endif
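The plane-size arithmetic in vp9_encode_sb_supertx() leans on the BLOCK_SIZE enum layout: for square sizes, stepping back three entries halves both dimensions, which for plane > 0 yields the 4:2:0 chroma block; bsize_to_tx_size() then selects a single transform covering the whole plane block. An illustrative check (an assumption about the enum ordering, not code from this diff):

assert(BLOCK_64X64 - 3 == BLOCK_32X32);
assert(BLOCK_32X32 - 3 == BLOCK_16X16);
assert(BLOCK_16X16 - 3 == BLOCK_8X8);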
static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
TX_SIZE tx_size, void *arg) {
struct encode_b_args* const args = arg;
@@ -474,6 +612,9 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
const scan_order *scan_order;
TX_TYPE tx_type;
PREDICTION_MODE mode;
#if CONFIG_FILTERINTRA
int fbit = 0;
#endif
const int bwl = b_width_log2(plane_bsize);
const int diff_stride = 4 * (1 << bwl);
uint8_t *src, *dst;
@@ -487,11 +628,20 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
src = &p->src.buf[4 * (j * src_stride + i)];
src_diff = &p->src_diff[4 * (j * diff_stride + i)];
#if CONFIG_FILTERINTRA
if (mbmi->sb_type < BLOCK_8X8 && plane == 0)
fbit = xd->mi[0]->b_filter_info[block];
else
fbit = plane == 0 ? mbmi->filterbit : mbmi->uv_filterbit;
#endif
switch (tx_size) {
case TX_32X32:
scan_order = &vp9_default_scan_orders[TX_32X32];
mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
vp9_predict_intra_block(xd, block >> 6, bwl, TX_32X32, mode,
#if CONFIG_FILTERINTRA
fbit,
#endif
x->skip_encode ? src : dst,
x->skip_encode ? src_stride : dst_stride,
dst, dst_stride, i, j, plane);
@@ -512,6 +662,9 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
scan_order = &vp9_scan_orders[TX_16X16][tx_type];
mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
vp9_predict_intra_block(xd, block >> 4, bwl, TX_16X16, mode,
#if CONFIG_FILTERINTRA
fbit,
#endif
x->skip_encode ? src : dst,
x->skip_encode ? src_stride : dst_stride,
dst, dst_stride, i, j, plane);
@@ -532,6 +685,9 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
scan_order = &vp9_scan_orders[TX_8X8][tx_type];
mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
vp9_predict_intra_block(xd, block >> 2, bwl, TX_8X8, mode,
#if CONFIG_FILTERINTRA
fbit,
#endif
x->skip_encode ? src : dst,
x->skip_encode ? src_stride : dst_stride,
dst, dst_stride, i, j, plane);
@@ -552,6 +708,9 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
scan_order = &vp9_scan_orders[TX_4X4][tx_type];
mode = plane == 0 ? get_y_mode(xd->mi[0], block) : mbmi->uv_mode;
vp9_predict_intra_block(xd, block, bwl, TX_4X4, mode,
#if CONFIG_FILTERINTRA
fbit,
#endif
x->skip_encode ? src : dst,
x->skip_encode ? src_stride : dst_stride,
dst, dst_stride, i, j, plane);


@@ -21,8 +21,12 @@ extern "C" {
#endif
void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize);
#if CONFIG_SUPERTX
void vp9_encode_sb_supertx(MACROBLOCK *x, BLOCK_SIZE bsize);
#endif
void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize);
void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,
BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
BLOCK_SIZE plane_bsize, TX_SIZE tx_size);


@@ -216,7 +216,7 @@ void vp9_encode_mv(VP9_COMP* cpi, vp9_writer* w,
// If auto_mv_step_size is enabled then keep track of the largest
// motion vector component used.
if (!cpi->dummy_packing && cpi->sf.auto_mv_step_size) {
if (!cpi->dummy_packing && cpi->sf.mv.auto_mv_step_size) {
unsigned int maxv = MAX(abs(mv->row), abs(mv->col)) >> 3;
cpi->max_mv_magnitude = MAX(maxv, cpi->max_mv_magnitude);
}

(File diff suppressed because it is too large.)


@@ -32,10 +32,14 @@
#include "vp9/encoder/vp9_mcomp.h"
#include "vp9/encoder/vp9_quantize.h"
#include "vp9/encoder/vp9_ratectrl.h"
#include "vp9/encoder/vp9_rdopt.h"
#include "vp9/encoder/vp9_speed_features.h"
#include "vp9/encoder/vp9_svc_layercontext.h"
#include "vp9/encoder/vp9_tokenize.h"
#include "vp9/encoder/vp9_variance.h"
#if CONFIG_DENOISING
#include "vp9/encoder/vp9_denoiser.h"
#endif
#ifdef __cplusplus
extern "C" {
@@ -43,9 +47,6 @@ extern "C" {
#define DEFAULT_GF_INTERVAL 10
#define MAX_MODES 30
#define MAX_REFS 6
typedef struct {
int nmvjointcost[MV_JOINTS];
int nmvcosts[2][MV_VALS];
@@ -63,57 +64,6 @@ typedef struct {
FRAME_CONTEXT fc;
} CODING_CONTEXT;
// This enumerator type needs to be kept aligned with the mode order in
// const MODE_DEFINITION vp9_mode_order[MAX_MODES] used in the rd code.
typedef enum {
THR_NEARESTMV,
THR_NEARESTA,
THR_NEARESTG,
THR_DC,
THR_NEWMV,
THR_NEWA,
THR_NEWG,
THR_NEARMV,
THR_NEARA,
THR_COMP_NEARESTLA,
THR_COMP_NEARESTGA,
THR_TM,
THR_COMP_NEARLA,
THR_COMP_NEWLA,
THR_NEARG,
THR_COMP_NEARGA,
THR_COMP_NEWGA,
THR_ZEROMV,
THR_ZEROG,
THR_ZEROA,
THR_COMP_ZEROLA,
THR_COMP_ZEROGA,
THR_H_PRED,
THR_V_PRED,
THR_D135_PRED,
THR_D207_PRED,
THR_D153_PRED,
THR_D63_PRED,
THR_D117_PRED,
THR_D45_PRED,
} THR_MODES;
typedef enum {
THR_LAST,
THR_GOLD,
THR_ALTR,
THR_COMP_LA,
THR_COMP_GA,
THR_INTRA,
} THR_MODES_SUB8X8;
typedef enum {
// encode_breakout is disabled.
ENCODE_BREAKOUT_DISABLED = 0,
@@ -130,13 +80,6 @@ typedef enum {
ONETWO = 3
} VPX_SCALING;
typedef enum {
RC_MODE_VBR = 0,
RC_MODE_CBR = 1,
RC_MODE_CONSTRAINED_QUALITY = 2,
RC_MODE_CONSTANT_QUALITY = 3,
} RC_MODE;
typedef enum {
// Good Quality Fast Encoding. The encoder balances quality with the
// amount of time it takes to encode the output. (speed setting
@@ -209,7 +152,8 @@ typedef struct VP9EncoderConfig {
// ----------------------------------------------------------------
// DATARATE CONTROL OPTIONS
RC_MODE rc_mode; // vbr, cbr, constrained quality or constant quality
// vbr, cbr, constrained quality or constant quality
enum vpx_rc_mode rc_mode;
// buffer targeting aggressiveness
int under_shoot_pct;
@@ -238,8 +182,6 @@ typedef struct VP9EncoderConfig {
// Enable feature to reduce the frame quantization every x frames.
int frame_periodic_boost;
int kf_extern_coding;
// two pass datarate control
int two_pass_vbrbias; // two pass datarate control tweaks
int two_pass_vbrmin_section;
@@ -286,6 +228,10 @@ typedef struct VP9EncoderConfig {
vp8e_tuning tuning;
} VP9EncoderConfig;
static INLINE int is_altref_enabled(const VP9EncoderConfig *cfg) {
return cfg->mode != REALTIME && cfg->play_alternate && cfg->lag_in_frames > 0;
}
static INLINE int is_lossless_requested(const VP9EncoderConfig *cfg) {
return cfg->best_allowed_q == 0 && cfg->worst_allowed_q == 0;
}
@@ -294,32 +240,6 @@ static INLINE int is_best_mode(MODE mode) {
return mode == ONE_PASS_BEST || mode == TWO_PASS_SECOND_BEST;
}
typedef struct RD_OPT {
// Thresh_mult is used to set a threshold for the rd score. A higher value
// means that we will accept the best mode so far more often. This number
// is used in combination with the current block size, and thresh_freq_fact
// to pick a threshold.
int thresh_mult[MAX_MODES];
int thresh_mult_sub8x8[MAX_REFS];
int threshes[MAX_SEGMENTS][BLOCK_SIZES][MAX_MODES];
int thresh_freq_fact[BLOCK_SIZES][MAX_MODES];
int64_t comp_pred_diff[REFERENCE_MODES];
int64_t prediction_type_threshes[MAX_REF_FRAMES][REFERENCE_MODES];
int64_t tx_select_diff[TX_MODES];
// FIXME(rbultje) can this overflow?
int tx_select_threshes[MAX_REF_FRAMES][TX_MODES];
int64_t filter_diff[SWITCHABLE_FILTER_CONTEXTS];
int64_t filter_threshes[MAX_REF_FRAMES][SWITCHABLE_FILTER_CONTEXTS];
int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS];
int64_t mask_filter;
int RDMULT;
int RDDIV;
} RD_OPT;
typedef struct VP9_COMP {
QUANTS quants;
MACROBLOCK mb;
@@ -327,11 +247,7 @@ typedef struct VP9_COMP {
VP9EncoderConfig oxcf;
struct lookahead_ctx *lookahead;
struct lookahead_entry *source;
#if CONFIG_MULTIPLE_ARF
struct lookahead_entry *alt_ref_source[REF_FRAMES];
#else
struct lookahead_entry *alt_ref_source;
#endif
struct lookahead_entry *last_source;
YV12_BUFFER_CONFIG *Source;
@@ -350,9 +266,6 @@ typedef struct VP9_COMP {
int gld_fb_idx;
int alt_fb_idx;
#if CONFIG_MULTIPLE_ARF
int alt_ref_fb_idx[REF_FRAMES - 3];
#endif
int refresh_last_frame;
int refresh_golden_frame;
int refresh_alt_ref_frame;
@@ -370,13 +283,6 @@ typedef struct VP9_COMP {
TOKENEXTRA *tok;
unsigned int tok_count[4][1 << 6];
#if CONFIG_MULTIPLE_ARF
// Position within a frame coding order (including any additional ARF frames).
unsigned int sequence_number;
// Next frame in naturally occurring order that has not yet been coded.
int next_frame_in_order;
#endif
// Ambient reconstruction err target for force key frames
int ambient_err;
@@ -426,9 +332,6 @@ typedef struct VP9_COMP {
unsigned char *complexity_map;
unsigned char *active_map;
unsigned int active_map_enabled;
CYCLIC_REFRESH *cyclic_refresh;
fractional_mv_step_fp *find_fractional_mv_step;
@@ -506,23 +409,31 @@ typedef struct VP9_COMP {
int intra_uv_mode_cost[FRAME_TYPES][INTRA_MODES];
int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES];
int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS];
#if CONFIG_COPY_CODING
int copy_mode_cost_l2[COPY_MODE_CONTEXTS][2];
int copy_mode_cost[COPY_MODE_CONTEXTS][COPY_MODE_COUNT - 1];
#endif
PICK_MODE_CONTEXT *leaf_tree;
PC_TREE *pc_tree;
PC_TREE *pc_root;
int partition_cost[PARTITION_CONTEXTS][PARTITION_TYPES];
#if CONFIG_MULTIPLE_ARF
// ARF tracking variables.
int multi_arf_allowed;
int multi_arf_enabled;
unsigned int frame_coding_order_period;
unsigned int new_frame_coding_order_period;
int frame_coding_order[MAX_LAG_BUFFERS * 2];
int arf_buffer_idx[MAX_LAG_BUFFERS * 3 / 2];
int arf_weight[MAX_LAG_BUFFERS];
int arf_buffered;
int this_frame_weight;
int max_arf_level;
#if CONFIG_DENOISING
VP9_DENOISER denoiser;
#endif
#if CONFIG_MASKED_INTERINTER
unsigned int masked_interinter_select_counts[2];
#endif
#if CONFIG_INTERINTRA
unsigned int interintra_select_count[2];
#if CONFIG_MASKED_INTERINTRA
unsigned int masked_interintra_select_count[2];
#endif
#endif
} VP9_COMP;
@@ -619,10 +530,14 @@ void vp9_update_reference_frames(VP9_COMP *cpi);
int64_t vp9_rescale(int64_t val, int64_t num, int denom);
void vp9_set_high_precision_mv(VP9_COMP *cpi, int allow_high_precision_mv);
YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm,
YV12_BUFFER_CONFIG *unscaled,
YV12_BUFFER_CONFIG *scaled);
void vp9_apply_encoding_flags(VP9_COMP *cpi, vpx_enc_frame_flags_t flags);
static INLINE void set_ref_ptrs(VP9_COMMON *cm, MACROBLOCKD *xd,
MV_REFERENCE_FRAME ref0,
MV_REFERENCE_FRAME ref1) {


@@ -33,7 +33,6 @@
#include "vp9/encoder/vp9_firstpass.h"
#include "vp9/encoder/vp9_mcomp.h"
#include "vp9/encoder/vp9_quantize.h"
#include "vp9/encoder/vp9_ratectrl.h"
#include "vp9/encoder/vp9_rdopt.h"
#include "vp9/encoder/vp9_variance.h"
@@ -56,14 +55,7 @@
#define DOUBLE_DIVIDE_CHECK(x) ((x) < 0 ? (x) - 0.000001 : (x) + 0.000001)
#define MIN_KF_BOOST 300
#if CONFIG_MULTIPLE_ARF
// Set MIN_GF_INTERVAL to 1 for the full decomposition.
#define MIN_GF_INTERVAL 2
#else
#define MIN_GF_INTERVAL 4
#endif
#define MIN_GF_INTERVAL 4
#define LONG_TERM_VBR_CORRECTION
static void swap_yv12(YV12_BUFFER_CONFIG *a, YV12_BUFFER_CONFIG *b) {
@@ -137,14 +129,13 @@ static void output_stats(FIRSTPASS_STATS *stats,
FILE *fpfile;
fpfile = fopen("firstpass.stt", "a");
fprintf(fpfile, "%12.0f %12.0f %12.0f %12.0f %12.0f %12.4f %12.4f"
fprintf(fpfile, "%12.0f %12.0f %12.0f %12.0f %12.4f %12.4f"
"%12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f"
"%12.0f %12.0f %12.4f %12.0f %12.0f %12.4f\n",
stats->frame,
stats->intra_error,
stats->coded_error,
stats->sr_coded_error,
stats->ssim_weighted_pred_err,
stats->pcnt_inter,
stats->pcnt_motion,
stats->pcnt_second_ref,
@@ -498,6 +489,8 @@ void vp9_first_pass(VP9_COMP *cpi) {
&cpi->scaled_source);
}
vp9_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
vp9_setup_src_planes(x, cpi->Source, 0, 0);
vp9_setup_pre_planes(xd, 0, first_ref_buf, 0, 0, NULL);
vp9_setup_dst_planes(xd->plane, new_yv12, 0, 0);
@@ -505,8 +498,6 @@ void vp9_first_pass(VP9_COMP *cpi) {
xd->mi = cm->mi_grid_visible;
xd->mi[0] = cm->mi;
vp9_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
vp9_frame_init_quantizer(cpi);
for (i = 0; i < MAX_MB_PLANE; ++i) {
@@ -595,8 +586,9 @@ void vp9_first_pass(VP9_COMP *cpi) {
// Other than for the first frame do a motion search.
if (cm->current_video_frame > 0) {
int tmp_err, motion_error;
int tmp_err, motion_error, raw_motion_error;
int_mv mv, tmp_mv;
struct buf_2d unscaled_last_source_buf_2d;
xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
motion_error = get_prediction_error(bsize, &x->plane[0].src,
@@ -604,67 +596,83 @@ void vp9_first_pass(VP9_COMP *cpi) {
// Assume 0,0 motion with no mv overhead.
mv.as_int = tmp_mv.as_int = 0;
// Test last reference frame using the previous best mv as the
// starting point (best reference) for the search.
first_pass_motion_search(cpi, x, &best_ref_mv.as_mv, &mv.as_mv,
&motion_error);
if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
vp9_clear_system_state();
motion_error = (int)(motion_error * error_weight);
}
// Compute the motion error of the 0,0 motion using the last source
// frame as the reference. Skip the further motion search on
// reconstructed frame if this error is small.
unscaled_last_source_buf_2d.buf =
cpi->unscaled_last_source->y_buffer + recon_yoffset;
unscaled_last_source_buf_2d.stride =
cpi->unscaled_last_source->y_stride;
raw_motion_error = get_prediction_error(bsize, &x->plane[0].src,
&unscaled_last_source_buf_2d);
// If the current best reference mv is not centered on 0,0 then do a 0,0
// based search as well.
if (best_ref_mv.as_int) {
tmp_err = INT_MAX;
first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv.as_mv,
&tmp_err);
// TODO(pengchong): Replace the hard-coded threshold
if (raw_motion_error > 25 ||
(cpi->use_svc && cpi->svc.number_temporal_layers == 1)) {
// Test last reference frame using the previous best mv as the
// starting point (best reference) for the search.
first_pass_motion_search(cpi, x, &best_ref_mv.as_mv, &mv.as_mv,
&motion_error);
if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
vp9_clear_system_state();
tmp_err = (int)(tmp_err * error_weight);
motion_error = (int)(motion_error * error_weight);
}
if (tmp_err < motion_error) {
motion_error = tmp_err;
mv.as_int = tmp_mv.as_int;
}
}
// If the current best reference mv is not centered on 0,0 then do a
// 0,0 based search as well.
if (best_ref_mv.as_int) {
tmp_err = INT_MAX;
first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv.as_mv, &tmp_err);
if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
vp9_clear_system_state();
tmp_err = (int)(tmp_err * error_weight);
}
// Search in an older reference frame.
if (cm->current_video_frame > 1 && gld_yv12 != NULL) {
// Assume 0,0 motion with no mv overhead.
int gf_motion_error;
xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset;
gf_motion_error = get_prediction_error(bsize, &x->plane[0].src,
&xd->plane[0].pre[0]);
first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv.as_mv,
&gf_motion_error);
if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
vp9_clear_system_state();
gf_motion_error = (int)(gf_motion_error * error_weight);
if (tmp_err < motion_error) {
motion_error = tmp_err;
mv.as_int = tmp_mv.as_int;
}
}
if (gf_motion_error < motion_error && gf_motion_error < this_error)
++second_ref_count;
// Search in an older reference frame.
if (cm->current_video_frame > 1 && gld_yv12 != NULL) {
// Assume 0,0 motion with no mv overhead.
int gf_motion_error;
// Reset to last frame as reference buffer.
xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
xd->plane[1].pre[0].buf = first_ref_buf->u_buffer + recon_uvoffset;
xd->plane[2].pre[0].buf = first_ref_buf->v_buffer + recon_uvoffset;
xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset;
gf_motion_error = get_prediction_error(bsize, &x->plane[0].src,
&xd->plane[0].pre[0]);
// In accumulating a score for the older reference frame take the
// best of the motion predicted score and the intra coded error
// (just as is done when accumulating "coded_error" for the last frame).
if (gf_motion_error < this_error)
sr_coded_error += gf_motion_error;
else
sr_coded_error += this_error;
first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv.as_mv,
&gf_motion_error);
if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
vp9_clear_system_state();
gf_motion_error = (int)(gf_motion_error * error_weight);
}
if (gf_motion_error < motion_error && gf_motion_error < this_error)
++second_ref_count;
// Reset to last frame as reference buffer.
xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
xd->plane[1].pre[0].buf = first_ref_buf->u_buffer + recon_uvoffset;
xd->plane[2].pre[0].buf = first_ref_buf->v_buffer + recon_uvoffset;
// In accumulating a score for the older reference frame take the
// best of the motion predicted score and the intra coded error
// (just as is done when accumulating "coded_error" for the last frame).
if (gf_motion_error < this_error)
sr_coded_error += gf_motion_error;
else
sr_coded_error += this_error;
} else {
sr_coded_error += motion_error;
}
} else {
sr_coded_error += motion_error;
}
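In the reorganized first pass above, raw_motion_error acts as a gate: the zero-mv prediction error against the unscaled last source is measured first, and the motion searches on the reconstructed reference run only when it exceeds the hard-coded threshold of 25 (or when single-temporal-layer SVC forces them); otherwise the previously computed 0,0 error is accumulated straight into sr_coded_error.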
// Start by assuming that intra mode is best.
best_ref_mv.as_int = 0;
@@ -905,7 +913,7 @@ static int get_twopass_worst_quality(const VP9_COMP *cpi,
}
// Restriction on active max q for constrained quality mode.
if (cpi->oxcf.rc_mode == RC_MODE_CONSTRAINED_QUALITY)
if (cpi->oxcf.rc_mode == VPX_CQ)
q = MAX(q, oxcf->cq_level);
return q;
}
@@ -1066,38 +1074,30 @@ static int detect_flash(const TWO_PASS *twopass, int offset) {
}
// Update the motion related elements to the GF arf boost calculation.
static void accumulate_frame_motion_stats(
FIRSTPASS_STATS *this_frame,
double *this_frame_mv_in_out,
double *mv_in_out_accumulator,
double *abs_mv_in_out_accumulator,
double *mv_ratio_accumulator) {
double motion_pct;
// Accumulate motion stats.
motion_pct = this_frame->pcnt_motion;
static void accumulate_frame_motion_stats(const FIRSTPASS_STATS *stats,
double *mv_in_out,
double *mv_in_out_accumulator,
double *abs_mv_in_out_accumulator,
double *mv_ratio_accumulator) {
const double pct = stats->pcnt_motion;
// Accumulate Motion In/Out of frame stats.
*this_frame_mv_in_out = this_frame->mv_in_out_count * motion_pct;
*mv_in_out_accumulator += this_frame->mv_in_out_count * motion_pct;
*abs_mv_in_out_accumulator += fabs(this_frame->mv_in_out_count * motion_pct);
*mv_in_out = stats->mv_in_out_count * pct;
*mv_in_out_accumulator += *mv_in_out;
*abs_mv_in_out_accumulator += fabs(*mv_in_out);
// Accumulate a measure of how uniform (or conversely how random)
// the motion field is (a ratio of absmv / mv).
if (motion_pct > 0.05) {
const double this_frame_mvr_ratio = fabs(this_frame->mvr_abs) /
DOUBLE_DIVIDE_CHECK(fabs(this_frame->MVr));
// Accumulate a measure of how uniform (or conversely how random) the motion
// field is (a ratio of abs(mv) / mv).
if (pct > 0.05) {
const double mvr_ratio = fabs(stats->mvr_abs) /
DOUBLE_DIVIDE_CHECK(fabs(stats->MVr));
const double mvc_ratio = fabs(stats->mvc_abs) /
DOUBLE_DIVIDE_CHECK(fabs(stats->MVc));
const double this_frame_mvc_ratio = fabs(this_frame->mvc_abs) /
DOUBLE_DIVIDE_CHECK(fabs(this_frame->MVc));
*mv_ratio_accumulator += (this_frame_mvr_ratio < this_frame->mvr_abs)
? (this_frame_mvr_ratio * motion_pct)
: this_frame->mvr_abs * motion_pct;
*mv_ratio_accumulator += (this_frame_mvc_ratio < this_frame->mvc_abs)
? (this_frame_mvc_ratio * motion_pct)
: this_frame->mvc_abs * motion_pct;
*mv_ratio_accumulator += pct * (mvr_ratio < stats->mvr_abs ?
mvr_ratio : stats->mvr_abs);
*mv_ratio_accumulator += pct * (mvc_ratio < stats->mvc_abs ?
mvc_ratio : stats->mvc_abs);
}
}
@@ -1214,144 +1214,6 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset,
return arf_boost;
}
#if CONFIG_MULTIPLE_ARF
// Work out the frame coding order for a GF or an ARF group.
// The current implementation codes frames in their natural order for a
// GF group, and inserts additional ARFs into an ARF group using a
// binary split approach.
// NOTE: this function is currently implemented recursively.
static void schedule_frames(VP9_COMP *cpi, const int start, const int end,
const int arf_idx, const int gf_or_arf_group,
const int level) {
int i, abs_end, half_range;
int *cfo = cpi->frame_coding_order;
int idx = cpi->new_frame_coding_order_period;
// If (end < 0) an ARF should be coded at position (-end).
assert(start >= 0);
// printf("start:%d end:%d\n", start, end);
// GF Group: code frames in logical order.
if (gf_or_arf_group == 0) {
assert(end >= start);
for (i = start; i <= end; ++i) {
cfo[idx] = i;
cpi->arf_buffer_idx[idx] = arf_idx;
cpi->arf_weight[idx] = -1;
++idx;
}
cpi->new_frame_coding_order_period = idx;
return;
}
// ARF Group: Work out the ARF schedule and mark ARF frames as negative.
if (end < 0) {
// printf("start:%d end:%d\n", -end, -end);
// ARF frame is at the end of the range.
cfo[idx] = end;
// What ARF buffer does this ARF use as predictor.
cpi->arf_buffer_idx[idx] = (arf_idx > 2) ? (arf_idx - 1) : 2;
cpi->arf_weight[idx] = level;
++idx;
abs_end = -end;
} else {
abs_end = end;
}
half_range = (abs_end - start) >> 1;
// ARFs may not be adjacent, they must be separated by at least
// MIN_GF_INTERVAL non-ARF frames.
if ((start + MIN_GF_INTERVAL) >= (abs_end - MIN_GF_INTERVAL)) {
// printf("start:%d end:%d\n", start, abs_end);
// Update the coding order and active ARF.
for (i = start; i <= abs_end; ++i) {
cfo[idx] = i;
cpi->arf_buffer_idx[idx] = arf_idx;
cpi->arf_weight[idx] = -1;
++idx;
}
cpi->new_frame_coding_order_period = idx;
} else {
// Place a new ARF at the mid-point of the range.
cpi->new_frame_coding_order_period = idx;
schedule_frames(cpi, start, -(start + half_range), arf_idx + 1,
gf_or_arf_group, level + 1);
schedule_frames(cpi, start + half_range + 1, abs_end, arf_idx,
gf_or_arf_group, level + 1);
}
}
#define FIXED_ARF_GROUP_SIZE 16
void define_fixed_arf_period(VP9_COMP *cpi) {
int i;
int max_level = INT_MIN;
assert(cpi->multi_arf_enabled);
assert(cpi->oxcf.lag_in_frames >= FIXED_ARF_GROUP_SIZE);
// Save the weight of the last frame in the sequence before next
// sequence pattern overwrites it.
cpi->this_frame_weight = cpi->arf_weight[cpi->sequence_number];
assert(cpi->this_frame_weight >= 0);
cpi->twopass.gf_zeromotion_pct = 0;
// Initialize frame coding order variables.
cpi->new_frame_coding_order_period = 0;
cpi->next_frame_in_order = 0;
cpi->arf_buffered = 0;
vp9_zero(cpi->frame_coding_order);
vp9_zero(cpi->arf_buffer_idx);
vpx_memset(cpi->arf_weight, -1, sizeof(cpi->arf_weight));
if (cpi->rc.frames_to_key <= (FIXED_ARF_GROUP_SIZE + 8)) {
// Setup a GF group close to the keyframe.
cpi->rc.source_alt_ref_pending = 0;
cpi->rc.baseline_gf_interval = cpi->rc.frames_to_key;
schedule_frames(cpi, 0, (cpi->rc.baseline_gf_interval - 1), 2, 0, 0);
} else {
// Setup a fixed period ARF group.
cpi->rc.source_alt_ref_pending = 1;
cpi->rc.baseline_gf_interval = FIXED_ARF_GROUP_SIZE;
schedule_frames(cpi, 0, -(cpi->rc.baseline_gf_interval - 1), 2, 1, 0);
}
// Replace level indicator of -1 with correct level.
for (i = 0; i < cpi->new_frame_coding_order_period; ++i) {
if (cpi->arf_weight[i] > max_level) {
max_level = cpi->arf_weight[i];
}
}
++max_level;
for (i = 0; i < cpi->new_frame_coding_order_period; ++i) {
if (cpi->arf_weight[i] == -1) {
cpi->arf_weight[i] = max_level;
}
}
cpi->max_arf_level = max_level;
#if 0
printf("\nSchedule: ");
for (i = 0; i < cpi->new_frame_coding_order_period; ++i) {
printf("%4d ", cpi->frame_coding_order[i]);
}
printf("\n");
printf("ARFref: ");
for (i = 0; i < cpi->new_frame_coding_order_period; ++i) {
printf("%4d ", cpi->arf_buffer_idx[i]);
}
printf("\n");
printf("Weight: ");
for (i = 0; i < cpi->new_frame_coding_order_period; ++i) {
printf("%4d ", cpi->arf_weight[i]);
}
printf("\n");
#endif
}
#endif
// Calculate a section intra ratio used in setting max loop filter.
static int calculate_section_intra_ratio(const FIRSTPASS_STATS *begin,
const FIRSTPASS_STATS *end,
@@ -1421,6 +1283,18 @@ static int calculate_boost_bits(int frame_count,
return MAX((int)(((int64_t)boost * total_group_bits) / allocation_chunks), 0);
}
// Current limit on maximum number of active arfs in a GF/ARF group.
#define MAX_ACTIVE_ARFS 2
#define ARF_SLOT1 2
#define ARF_SLOT2 3
// This function indirects the choice of buffers for arfs.
// At the moment the values are fixed but this may change as part of
// the integration process with other codec features that swap buffers around.
static void get_arf_buffer_indices(unsigned char *arf_buffer_indices) {
arf_buffer_indices[0] = ARF_SLOT1;
arf_buffer_indices[1] = ARF_SLOT2;
}
static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits,
double group_error, int gf_arf_bits) {
RATE_CONTROL *const rc = &cpi->rc;
@@ -1428,42 +1302,85 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits,
TWO_PASS *twopass = &cpi->twopass;
FIRSTPASS_STATS frame_stats;
int i;
int group_frame_index = 1;
int frame_index = 1;
int target_frame_size;
int key_frame;
const int max_bits = frame_max_bits(&cpi->rc, &cpi->oxcf);
int64_t total_group_bits = gf_group_bits;
double modified_err = 0.0;
double err_fraction;
int mid_boost_bits = 0;
int mid_frame_idx;
unsigned char arf_buffer_indices[MAX_ACTIVE_ARFS];
key_frame = cpi->common.frame_type == KEY_FRAME ||
vp9_is_upper_layer_key_frame(cpi);
get_arf_buffer_indices(arf_buffer_indices);
// For key frames the frame target rate is already set and it
// is also the golden frame.
// NOTE: We don't bother to check for the special case of ARF overlay
// frames here, as there is clamping code for this in the function
// vp9_rc_clamp_pframe_target_size(), which applies to one and two pass
// encodes.
if (!key_frame) {
twopass->gf_group_bit_allocation[0] = gf_arf_bits;
if (rc->source_alt_ref_active) {
twopass->gf_group.update_type[0] = OVERLAY_UPDATE;
twopass->gf_group.rf_level[0] = INTER_NORMAL;
twopass->gf_group.bit_allocation[0] = 0;
twopass->gf_group.arf_update_idx[0] = arf_buffer_indices[0];
twopass->gf_group.arf_ref_idx[0] = arf_buffer_indices[0];
} else {
twopass->gf_group.update_type[0] = GF_UPDATE;
twopass->gf_group.rf_level[0] = GF_ARF_STD;
twopass->gf_group.bit_allocation[0] = gf_arf_bits;
twopass->gf_group.arf_update_idx[0] = arf_buffer_indices[0];
twopass->gf_group.arf_ref_idx[0] = arf_buffer_indices[0];
}
// Step over the golden frame / overlay frame
if (EOF == input_stats(twopass, &frame_stats))
return;
}
// Store the bits to spend on the ARF if there is one.
if (rc->source_alt_ref_pending) {
twopass->gf_group_bit_allocation[group_frame_index++] = gf_arf_bits;
}
// Deduct the boost bits for arf or gf if it is not a key frame.
// Deduct the boost bits for arf (or gf if it is not a key frame)
// from the group total.
if (rc->source_alt_ref_pending || !key_frame)
total_group_bits -= gf_arf_bits;
// Store the bits to spend on the ARF if there is one.
if (rc->source_alt_ref_pending) {
if (cpi->multi_arf_enabled) {
// A portion of the gf / arf extra bits is set aside for lower level
// boosted frames in the middle of the group.
mid_boost_bits += gf_arf_bits >> 5;
gf_arf_bits -= (gf_arf_bits >> 5);
}
twopass->gf_group.update_type[frame_index] = ARF_UPDATE;
twopass->gf_group.rf_level[frame_index] = GF_ARF_STD;
twopass->gf_group.bit_allocation[frame_index] = gf_arf_bits;
twopass->gf_group.arf_src_offset[frame_index] =
(unsigned char)(rc->baseline_gf_interval - 1);
twopass->gf_group.arf_update_idx[frame_index] = arf_buffer_indices[0];
twopass->gf_group.arf_ref_idx[frame_index] = arf_buffer_indices[0];
++frame_index;
if (cpi->multi_arf_enabled) {
// Set aside a slot for a level 1 arf.
twopass->gf_group.update_type[frame_index] = ARF_UPDATE;
twopass->gf_group.rf_level[frame_index] = GF_ARF_LOW;
twopass->gf_group.arf_src_offset[frame_index] =
(unsigned char)((rc->baseline_gf_interval >> 1) - 1);
twopass->gf_group.arf_update_idx[frame_index] = arf_buffer_indices[1];
twopass->gf_group.arf_ref_idx[frame_index] = arf_buffer_indices[0];
++frame_index;
}
}
// Define middle frame
mid_frame_idx = frame_index + (rc->baseline_gf_interval >> 1) - 1;
// Allocate bits to the other frames in the group.
for (i = 0; i < rc->baseline_gf_interval - 1; ++i) {
int arf_idx = 0;
if (EOF == input_stats(twopass, &frame_stats))
break;
@@ -1475,10 +1392,48 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits,
err_fraction = 0.0;
target_frame_size = (int)((double)total_group_bits * err_fraction);
if (rc->source_alt_ref_pending && cpi->multi_arf_enabled) {
mid_boost_bits += (target_frame_size >> 4);
target_frame_size -= (target_frame_size >> 4);
if (frame_index <= mid_frame_idx)
arf_idx = 1;
}
twopass->gf_group.arf_update_idx[frame_index] = arf_buffer_indices[arf_idx];
twopass->gf_group.arf_ref_idx[frame_index] = arf_buffer_indices[arf_idx];
target_frame_size = clamp(target_frame_size, 0,
MIN(max_bits, (int)total_group_bits));
twopass->gf_group_bit_allocation[group_frame_index++] = target_frame_size;
twopass->gf_group.update_type[frame_index] = LF_UPDATE;
twopass->gf_group.rf_level[frame_index] = INTER_NORMAL;
twopass->gf_group.bit_allocation[frame_index] = target_frame_size;
++frame_index;
}
// Note:
// We need to configure the frame at the end of the sequence + 1 that will be
// the start frame for the next group. Otherwise prior to the call to
// vp9_rc_get_second_pass_params() the data will be undefined.
twopass->gf_group.arf_update_idx[frame_index] = arf_buffer_indices[0];
twopass->gf_group.arf_ref_idx[frame_index] = arf_buffer_indices[0];
if (rc->source_alt_ref_pending) {
twopass->gf_group.update_type[frame_index] = OVERLAY_UPDATE;
twopass->gf_group.rf_level[frame_index] = INTER_NORMAL;
// Final setup for second arf and its overlay.
if (cpi->multi_arf_enabled) {
twopass->gf_group.bit_allocation[2] =
twopass->gf_group.bit_allocation[mid_frame_idx] + mid_boost_bits;
twopass->gf_group.update_type[mid_frame_idx] = OVERLAY_UPDATE;
twopass->gf_group.bit_allocation[mid_frame_idx] = 0;
}
} else {
twopass->gf_group.update_type[frame_index] = GF_UPDATE;
twopass->gf_group.rf_level[frame_index] = GF_ARF_STD;
}
}
@@ -1508,7 +1463,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
double mv_in_out_accumulator = 0.0;
double abs_mv_in_out_accumulator = 0.0;
double mv_ratio_accumulator_thresh;
unsigned int allow_alt_ref = oxcf->play_alternate && oxcf->lag_in_frames;
unsigned int allow_alt_ref = is_altref_enabled(oxcf);
int f_boost = 0;
int b_boost = 0;
@@ -1521,8 +1476,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// Reset the GF group data structures unless this is a key
// frame in which case it will already have been done.
if (cpi->common.frame_type != KEY_FRAME) {
twopass->gf_group_index = 0;
vp9_zero(twopass->gf_group_bit_allocation);
vp9_zero(twopass->gf_group);
}
vp9_clear_system_state();
@@ -1644,24 +1598,14 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
}
}
#if CONFIG_MULTIPLE_ARF
if (cpi->multi_arf_enabled) {
// Initialize frame coding order variables.
cpi->new_frame_coding_order_period = 0;
cpi->next_frame_in_order = 0;
cpi->arf_buffered = 0;
vp9_zero(cpi->frame_coding_order);
vp9_zero(cpi->arf_buffer_idx);
vpx_memset(cpi->arf_weight, -1, sizeof(cpi->arf_weight));
}
#endif
// Set the interval until the next gf.
if (cpi->common.frame_type == KEY_FRAME || rc->source_alt_ref_active)
rc->baseline_gf_interval = i - 1;
else
rc->baseline_gf_interval = i;
rc->frames_till_gf_update_due = rc->baseline_gf_interval;
// Should we use the alternate reference frame.
if (allow_alt_ref &&
(i < cpi->oxcf.lag_in_frames) &&
@@ -1674,62 +1618,11 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
&b_boost);
rc->source_alt_ref_pending = 1;
#if CONFIG_MULTIPLE_ARF
// Set the ARF schedule.
if (cpi->multi_arf_enabled) {
schedule_frames(cpi, 0, -(rc->baseline_gf_interval - 1), 2, 1, 0);
}
#endif
} else {
rc->gfu_boost = (int)boost_score;
rc->source_alt_ref_pending = 0;
#if CONFIG_MULTIPLE_ARF
// Set the GF schedule.
if (cpi->multi_arf_enabled) {
schedule_frames(cpi, 0, rc->baseline_gf_interval - 1, 2, 0, 0);
assert(cpi->new_frame_coding_order_period ==
rc->baseline_gf_interval);
}
#endif
}
#if CONFIG_MULTIPLE_ARF
if (cpi->multi_arf_enabled && (cpi->common.frame_type != KEY_FRAME)) {
int max_level = INT_MIN;
// Replace level indicator of -1 with correct level.
for (i = 0; i < cpi->frame_coding_order_period; ++i) {
if (cpi->arf_weight[i] > max_level) {
max_level = cpi->arf_weight[i];
}
}
++max_level;
for (i = 0; i < cpi->frame_coding_order_period; ++i) {
if (cpi->arf_weight[i] == -1) {
cpi->arf_weight[i] = max_level;
}
}
cpi->max_arf_level = max_level;
}
#if 0
if (cpi->multi_arf_enabled) {
printf("\nSchedule: ");
for (i = 0; i < cpi->new_frame_coding_order_period; ++i) {
printf("%4d ", cpi->frame_coding_order[i]);
}
printf("\n");
printf("ARFref: ");
for (i = 0; i < cpi->new_frame_coding_order_period; ++i) {
printf("%4d ", cpi->arf_buffer_idx[i]);
}
printf("\n");
printf("Weight: ");
for (i = 0; i < cpi->new_frame_coding_order_period; ++i) {
printf("%4d ", cpi->arf_weight[i]);
}
printf("\n");
}
#endif
#endif
// Reset the file position.
reset_fpf_position(twopass, start_pos);
@@ -1879,8 +1772,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
cpi->common.frame_type = KEY_FRAME;
// Reset the GF group data structures.
twopass->gf_group_index = 0;
vp9_zero(twopass->gf_group_bit_allocation);
vp9_zero(twopass->gf_group);
// Is this a forced key frame by interval.
rc->this_key_frame_forced = rc->next_key_frame_forced;
@@ -2071,7 +1963,9 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
twopass->kf_group_bits -= kf_bits;
// Save the bits to spend on the key frame.
twopass->gf_group_bit_allocation[0] = kf_bits;
twopass->gf_group.bit_allocation[0] = kf_bits;
twopass->gf_group.update_type[0] = KF_UPDATE;
twopass->gf_group.rf_level[0] = KF_STD;
// Note the total error score of the kf group minus the key frame itself.
twopass->kf_group_error_left = (int)(kf_group_err - kf_mod_err);
@@ -2099,6 +1993,44 @@ void vbr_rate_correction(int * this_frame_target,
}
}
// Define the reference buffers that will be updated post encode.
void configure_buffer_updates(VP9_COMP *cpi) {
TWO_PASS *const twopass = &cpi->twopass;
cpi->rc.is_src_frame_alt_ref = 0;
switch (twopass->gf_group.update_type[twopass->gf_group.index]) {
case KF_UPDATE:
cpi->refresh_last_frame = 1;
cpi->refresh_golden_frame = 1;
cpi->refresh_alt_ref_frame = 1;
break;
case LF_UPDATE:
cpi->refresh_last_frame = 1;
cpi->refresh_golden_frame = 0;
cpi->refresh_alt_ref_frame = 0;
break;
case GF_UPDATE:
cpi->refresh_last_frame = 1;
cpi->refresh_golden_frame = 1;
cpi->refresh_alt_ref_frame = 0;
break;
case OVERLAY_UPDATE:
cpi->refresh_last_frame = 0;
cpi->refresh_golden_frame = 1;
cpi->refresh_alt_ref_frame = 0;
cpi->rc.is_src_frame_alt_ref = 1;
break;
case ARF_UPDATE:
cpi->refresh_last_frame = 0;
cpi->refresh_golden_frame = 0;
cpi->refresh_alt_ref_frame = 1;
break;
default:
assert(0);
}
}
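Condensed from the switch above, the per-update-type refresh flags are:

/*
 * update type      last  golden  alt_ref  is_src_frame_alt_ref
 * KF_UPDATE         1      1       1              0
 * LF_UPDATE         1      0       0              0
 * GF_UPDATE         1      1       0              0
 * OVERLAY_UPDATE    0      1       0              1
 * ARF_UPDATE        0      0       1              0
 */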
void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
RATE_CONTROL *const rc = &cpi->rc;
@@ -2123,19 +2055,17 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
if (!twopass->stats_in)
return;
// Increment the gf group index.
++twopass->gf_group_index;
// If this is an arf frame then we dont want to read the stats file or
// advance the input pointer as we already have what we need.
if (cpi->refresh_alt_ref_frame) {
if (twopass->gf_group.update_type[twopass->gf_group.index] == ARF_UPDATE) {
int target_rate;
target_rate = twopass->gf_group_bit_allocation[twopass->gf_group_index];
configure_buffer_updates(cpi);
target_rate = twopass->gf_group.bit_allocation[twopass->gf_group.index];
target_rate = vp9_rc_clamp_pframe_target_size(cpi, target_rate);
rc->base_frame_target = target_rate;
#ifdef LONG_TERM_VBR_CORRECTION
// Correction to rate target based on prior over or under shoot.
if (cpi->oxcf.rc_mode == RC_MODE_VBR)
if (cpi->oxcf.rc_mode == VPX_VBR)
vbr_rate_correction(&target_rate, rc->vbr_bits_off_target);
#endif
vp9_rc_set_frame_target(cpi, target_rate);
@@ -2150,7 +2080,7 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
twopass->gf_intra_err_min = GF_MB_INTRA_MIN * cpi->common.MBs;
}
if (cpi->oxcf.rc_mode == RC_MODE_CONSTANT_QUALITY) {
if (cpi->oxcf.rc_mode == VPX_Q) {
twopass->active_worst_quality = cpi->oxcf.cq_level;
} else if (cm->current_video_frame == 0 ||
(is_spatial_svc && lc->current_video_frame_in_layer == 0)) {
@@ -2194,15 +2124,7 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
// Define a new GF/ARF group. (Should always enter here for key frames).
if (rc->frames_till_gf_update_due == 0) {
#if CONFIG_MULTIPLE_ARF
if (cpi->multi_arf_enabled) {
define_fixed_arf_period(cpi);
} else {
#endif
define_gf_group(cpi, &this_frame_copy);
#if CONFIG_MULTIPLE_ARF
}
#endif
define_gf_group(cpi, &this_frame_copy);
if (twopass->gf_zeromotion_pct > 995) {
// As long as max_thresh for encode breakout is small enough, it is ok
@@ -2226,7 +2148,9 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
}
}
target_rate = twopass->gf_group_bit_allocation[twopass->gf_group_index];
configure_buffer_updates(cpi);
target_rate = twopass->gf_group.bit_allocation[twopass->gf_group.index];
if (cpi->common.frame_type == KEY_FRAME)
target_rate = vp9_rc_clamp_iframe_target_size(cpi, target_rate);
else
@@ -2235,7 +2159,7 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
rc->base_frame_target = target_rate;
#ifdef LONG_TERM_VBR_CORRECTION
// Correction to rate target based on prior over or under shoot.
if (cpi->oxcf.rc_mode == RC_MODE_VBR)
if (cpi->oxcf.rc_mode == VPX_VBR)
vbr_rate_correction(&target_rate, rc->vbr_bits_off_target);
#endif
vp9_rc_set_frame_target(cpi, target_rate);
@@ -2289,4 +2213,7 @@ void vp9_twopass_postencode_update(VP9_COMP *cpi) {
twopass->kf_group_bits -= bits_used;
}
twopass->kf_group_bits = MAX(twopass->kf_group_bits, 0);
// Increment the gf group index ready for the next frame.
++twopass->gf_group.index;
}

(Some files were not shown because too many files have changed in this diff.)