Compare commits


13 Commits

Author SHA1 Message Date
Deb Mukherjee
6b0c636b96 Cosmetic changes in the supertx expt
Converts most negative !CONFIG_SUPERTX checks to positive ones.

Change-Id: I80b7f8c5d3483a7861f0de7fc7ebc425b9c68766
2014-11-10 17:34:00 -08:00
Yue Chen
b0aa2db2e7 Fixing rd loop bugs in supertx+ext_tx experiment
Change-Id: I891b108e591e01d5c7d588dec0bcc4b323d0b6a8
2014-11-07 16:43:12 -08:00
Yue Chen
95d0f87d6e Fixing sub-optimal rdloop when testing supertx on 8x4/4x8 blocks
Remove the early termination in vp9_rd_pick_inter_mode_sub8x8() in
order to complete mode selection for 8x4/4x8 blocks, which will
try supertx in a higher-level function.

Change-Id: I457505257332f70d9cd8d22db52ad32ff15f7f87
2014-10-20 21:52:55 -07:00
Yue Chen
c741a4fe03 Fixing skip flag bugs in recent experiments
The bugs were in vp9_rdopt.c.
Also did minor clean-ups in vp9_encodeframe.c.

Change-Id: I6fec18e349cd0b810b0772e506927b423db077b6
2014-10-20 14:51:20 -07:00
Deb Mukherjee
430c389f16 Miscellaneous fixes for recent experiments
Various cleanups for ext-tx, supertx, copy-coding experiments.

Change-Id: I8703d5fee57b1310d8d1aa1f26908e9a427b0502
2014-10-14 15:59:17 -07:00
Yue Chen
d8b0d40bf6 Allow blocks to directly copy motion information from neighbors
A new set of prediction modes, called copy modes, is implemented
to allow blocks to directly copy motion information from a neighbor.
The motivation is to create regions of arbitrary shape in which
blocks share the same motion parameters, and hence to save bits spent
on duplicated side information.
Compression gain:
derf: 0.894%; stdhd: 1.513%.
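
As a rough illustration of the idea (all names below are hypothetical,
not the experiment's actual code), a copy mode lets the bitstream signal
only a neighbor index instead of a full mv/ref pair:

  #include <stddef.h>

  /* Hypothetical sketch of a copy mode: inherit motion parameters from
   * an already-coded neighbor instead of signaling them explicitly. */
  typedef struct { int row, col; } MotionVector;
  typedef struct {
    MotionVector mv;
    int ref_frame;
    int is_inter;
  } BlockInfo;

  /* Returns 1 and fills *out when the chosen neighbor supplies usable
   * motion information; only the neighbor index needs to be coded. */
  static int copy_from_neighbor(const BlockInfo *above, const BlockInfo *left,
                                int neighbor_idx, BlockInfo *out) {
    const BlockInfo *cand = (neighbor_idx == 0) ? above : left;
    if (cand == NULL || !cand->is_inter) return 0;
    *out = *cand;  /* inherit mv and reference frame verbatim */
    return 1;
  }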

Change-Id: I5e026b12c902bc6985c199ec38f1b3b67ac7d930
2014-09-03 11:22:36 -07:00
Yue Chen
a4dfcd9a2d Implementing transform overlapping multiple blocks
We removed the restriction that transform blocks could not exceed
the size of prediction blocks. Smoothing masks are applied to reduce
the discontinuity between prediction blocks, in order to realize the
efficiency of large transforms.
A 0.997%/0.895% bit-rate reduction is achieved on the derf/stdhd sets.
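
A minimal sketch of the boundary smoothing, assuming illustrative 3:1
weights (the experiment's actual masks differ):

  #include <stdint.h>

  /* Soften the discontinuity across a vertical prediction-block edge
   * before applying a transform that spans both blocks. */
  static void blend_vertical_edge(uint8_t *pred, int stride,
                                  int rows, int edge_col) {
    int r;
    for (r = 0; r < rows; ++r) {
      uint8_t *p = pred + r * stride + edge_col;
      const int a = p[-1], b = p[0];
      p[-1] = (uint8_t)((3 * a + b + 2) >> 2);
      p[0] = (uint8_t)((a + 3 * b + 2) >> 2);
    }
  }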

Change-Id: I8db241bab9fe74d864809e95f76b771ee59a2def
2014-08-15 16:56:11 -07:00
Yue Chen
be17f1b338 Merge "Migrating old experiments into new playground branch and speedup" into playground 2014-06-26 15:36:44 -07:00
Yue Chen
ffdad39324 Merge "Squash commits from master to playground" into playground 2014-06-26 15:36:24 -07:00
Yue Chen
07ac101806 Migrating old experiments into new playground branch and speedup
The old interintra experiment is slow (0.2x the speed of the original
codec at speed 0).
We use the best inter mode to skip some reference-frame and NEWMV
searches when searching for the best joint mode.
Quality drop: ~0.1% on derf. Speed: 0.36x head.

Change-Id: If10453448284f86c14a0a41f20aeaf9ac838fa32
2014-06-26 14:22:53 -07:00
Yue Chen
a49d80bfc8 Squash commits from master to playground
Moving RD-opt related code from vp9_encoder.h to vp9_rdopt.h.

Squashed-Change-Id: I8fab776c8801e19d3f5027ed55a6aa69eee951de

gen_msvs_proj: fix in tree configure under cygwin

strip trailing '/' from paths, this is later converted to '\' which
causes execution errors for obj_int_extract/yasm. vs10+ wasn't affected
by this issue, but make the same change for consistency.

gen_msvs_proj:
+ add missing '"' to obj_int_extract call
  unlike gen_msvs_vcproj, the block is duplicated
  missed in: 1e3d9b9 build/msvs: fix builds in source dirs with spaces

Squashed-Change-Id: I76208e6cdc66dc5a0a7ffa8aa1edbefe31e4b130

Improve vp9_rb_bytes_read

Squashed-Change-Id: I69eba120eb3d8ec43b5552451c8a9bd009390795

Removing decode_one_iter() function.

When a superframe index is available, we rely on it completely and use
the frame size values from the index.

Squashed-Change-Id: I0011d08b223303a8b912c2bcc8a02b74d0426ee0

iosbuild.sh: Add vpx_config.h and vpx_version.h to VPX.framework.

- Rename build_targets to build_framework
- Add functions for creating the vpx_config shim and obtaining
  preproc symbols.

Squashed-Change-Id: Ieca6938b9779077eefa26bf4cfee64286d1840b0

Implemented vp9_denoiser_{alloc,free}()

Squashed-Change-Id: I79eba79f7c52eec19ef2356278597e06620d5e27

Update running avg for VP9 denoiser

Squashed-Change-Id: I9577d648542064052795bf5770428fbd5c276b7b

Changed buf_2ds in vp9 denoiser to YV12 buffers

Changed alloc, free, and running average code as necessary.

Squashed-Change-Id: Ifc4d9ccca462164214019963b3768a457791b9c1

sse4 regular quantize

Squashed-Change-Id: Ibd95df0adf9cc9143006ee9032b4cb2ebfd5dd1b

Modify non-rd intra mode checking

Speed 6 uses a small tx size, namely 8x8. max_intra_bsize needs to
be modified accordingly to ensure valid intra mode checking. A Borg
test on the RTC set showed an overall PSNR gain of 0.335% at speed -6.

This also changes speed -5 encoding by allowing DC_PRED checking
for block32x32. A Borg test on the RTC set showed a slight PSNR gain
of 0.145%, and no noticeable speed change.

Squashed-Change-Id: I1502978d8fbe265b3bb235db0f9c35ba0703cd45

Implemented COPY_BLOCK case for vp9 denoiser

Squashed-Change-Id: Ie89ad1e3aebbd474e1a0db69c1961b4d1ddcd33e

Improved vp9 denoiser running avg update.

Squashed-Change-Id: Ie0aa41fb7957755544321897b3bb2dd92f392027

Separate rate-distortion modeling for DC and AC coefficients

This is the first step in reworking the rate-distortion modeling used
in the rtc coding mode. The overall goal is to make the modeling
customized for the statistics encountered in rtc coding.

This commit makes the encoder perform rate-distortion modeling for
DC and AC coefficients separately. No speed changes were observed.
The coding performance for pedestrian_area_1080p is largely
improved:

speed -5, from 79558 b/f, 37.871 dB -> 79598 b/f, 38.600 dB
speed -6, from 79515 b/f, 37.822 dB -> 79544 b/f, 38.130 dB

Overall performance for rtc set at speed -6 is improved by 0.67%.
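
As a sketch of what "separate modeling" means here (the functional form
below is a generic proxy, not libvpx's actual model):

  #include <math.h>

  /* Estimate rate/distortion from SSE and quantizer, applied once to
   * the DC coefficient and once to the remaining AC coefficients. */
  static void model_rd(double sse, double q, int num_coeffs,
                       double *rate, double *dist) {
    const double x = sse / (q * q * num_coeffs + 1e-9);
    *rate = num_coeffs * log2(1.0 + x);  /* crude entropy proxy */
    *dist = sse / (1.0 + x);             /* residual after quantization */
  }

  static void model_rd_dc_ac(double dc_sse, double ac_sse, double q,
                             int num_coeffs, double *rate, double *dist) {
    double rd, dd, ra, da;
    model_rd(dc_sse, q, 1, &rd, &dd);
    model_rd(ac_sse, q, num_coeffs - 1, &ra, &da);
    *rate = rd + ra;
    *dist = dd + da;
  }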

Squashed-Change-Id: I9153444567e5f75ccdcaac043c2365992c005c0c

Add superframe support for frame parallel decoding.

A superframe is a bundle of frames packed together as one frame. It is
mostly used to combine one or more non-displayable frames and one
displayable frame.

For frame-parallel decoding, the libvpx decoder will only support
decoding one normal frame or a superframe with a superframe index.

If an application passes a superframe without a superframe index, or a
chunk of displayable frames without a superframe index, to the libvpx
decoder, libvpx will not decode it in frame-parallel mode, but the
decoder can still decode it in serial mode.

Squashed-Change-Id: I04c9f2c828373d64e880a8c7bcade5307015ce35

Fixes in VP9 alloc, free, and COPY_FRAME case

Squashed-Change-Id: I1216f17e2206ef521fe219b6d72d8e41d1ba1147

Remove labels from quantize

Use break instead of goto for early exit. Unbreaks Visual Studio
builds.
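
The shape of the change, on a stand-in loop (not the actual quantizer
code):

  /* Before: `if (!x) goto done;` with a `done:` label after the loop.
   * After: a plain break, which avoids the label entirely. */
  static int count_until_zero(const int *coeff, int n) {
    int i, count = 0;
    for (i = 0; i < n; ++i) {
      if (coeff[i] == 0) break;  /* early exit, formerly a goto */
      ++count;
    }
    return count;
  }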

Squashed-Change-Id: I96dee43a3c82145d4abe0d6a99af6e6e1a3991b5

Added CFLAG for outputting vp9 denoised signal

Squashed-Change-Id: Iab9b4e11cad927f3282e486c203564e1a658f377

Allow key frame more flexibility in mode search

This commit allows the key frame to search through more prediction
modes and more flexible block sizes. No speed change was observed. The
coding performance for the rtc set is improved by 1.7% for speed -5
and 3.0% for speed -6.

Squashed-Change-Id: Ifd1bc28558017851b210b4004f2d80838938bcc5

VP9 denoiser bugfixes

s/stdint.h/vpx\/vpx_int.h

Added missing 'break;'s

Also included other minor changes, mostly cosmetic.

Squashed-Change-Id: I852bba3e85e794f1d4af854c45c16a23a787e6a3

Don't return value for void functions

Clears "warning: 'return' with a value, in function returning void"

Squashed-Change-Id: I93972610d67e243ec772a1021d2fdfcfc689c8c2

Include type defines

Clears error: unknown type name 'uint8_t'

Squashed-Change-Id: I9b6eff66a5c69bc24aeaeb5ade29255a164ef0e2

Validate error checking code in decoder.

This patch adds a mechanism for ensuring error checking on invalid
files by creating a unit test that runs the decoder and checks that
the error code matches what's expected for each frame in the decoder.

Disabled for now, as this unit test will segfault with existing code.
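
A sketch of the mechanism, with the decode callback left abstract
(names are illustrative):

  #include <stddef.h>

  /* Decode callback: returns the codec error code for one frame. */
  typedef int (*decode_frame_fn)(const unsigned char *buf, size_t sz);

  /* Decode every frame of an invalid file and verify that the decoder
   * returns exactly the expected error code for each one. */
  static int check_invalid_file(decode_frame_fn decode,
                                const unsigned char *const *frames,
                                const size_t *sizes,
                                const int *expected_err, int n) {
    int i;
    for (i = 0; i < n; ++i) {
      if (decode(frames[i], sizes[i]) != expected_err[i]) return 0;
    }
    return 1;
  }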

Squashed-Change-Id: I896f9686d9ebcbf027426933adfbea7b8c5d956e

Introduce FrameWorker for decoding.

When decoding in serial mode, there will be only one FrameWorker
doing the decoding. When decoding in parallel mode, several
FrameWorkers will decode in parallel.
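
Illustrative shape of the abstraction (field names are assumptions, not
the actual FrameWorker layout):

  typedef struct FrameWorker {
    void *decoder_state;        /* per-worker decoder context */
    const unsigned char *data;  /* compressed frame assigned to it */
    unsigned long data_size;
    int frame_decoded;          /* completion flag polled by the owner */
  } FrameWorker;

  /* Serial mode: one worker, decode inline. Parallel mode: launch one
   * of several workers and synchronize on frame_decoded later. */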

Squashed-Change-Id: If53fc5c49c7a0bf5e773f1ce7008b8a62fdae257

Add back libmkv ebml writer files.

Another project in ChromeOS is using these files. To make libvpx
rolls simpler, add these files back until the other project removes
the dependency.

crbug.com/387246 tracking bug to remove dependency.

Squashed-Change-Id: If9c197081c845c4a4e5c5488d4e0190380bcb1e4

Added a test vector that exercises more show-existing-frame cases.

Squashed-Change-Id: I0ddd7dd55313ee62d231ed4b9040e08c3761b3fe

fix peek_si to enable 1 byte show existing frames.

The test for this is in the test vector code (show-existing frames
will fail). I can't check it in disabled, as I'm changing the generic
test code to do this:

https://gerrit.chromium.org/gerrit/#/c/70569/

Squashed-Change-Id: I5ab324f0cb7df06316a949af0f7fc089f4a3d466

Fix bug in error handling that causes segfault

See: https://code.google.com/p/chromium/issues/detail?id=362697

The code properly catches an invalid stream but segfaults instead of
returning an error, due to a buffer not having been initialized. This
change fixes that.

Squashed-Change-Id: I695595e742cb08807e1dfb2f00bc097b3eae3a9b

Revert 3 patches from Hangyu to get Chrome to build:

Avoids failures:
MSE_ClearKey/EncryptedMediaTest.Playback_VP9Video_WebM/0
MSE_ClearKey_Prefixed/EncryptedMediaTest.Playback_VP9Video_WebM/0
MSE_ExternalClearKey_Prefixed/EncryptedMediaTest.Playback_VP9Video_WebM/0
MSE_ExternalClearKey/EncryptedMediaTest.Playback_VP9Video_WebM/0
MSE_ExternalClearKeyDecryptOnly/EncryptedMediaTest.Playback_VP9Video_WebM/0
MSE_ExternalClearKeyDecryptOnly_Prefixed/EncryptedMediaTest.Playback_VP9Video_WebM/0
SRC_ExternalClearKey/EncryptedMediaTest.Playback_VP9Video_WebM/0
SRC_ExternalClearKey_Prefixed/EncryptedMediaTest.Playback_VP9Video_WebM/0
SRC_ClearKey_Prefixed/EncryptedMediaTest.Playback_VP9Video_WebM/0

The patches are:
This reverts commit 9bc040859b
This reverts commit 6f5aba069a
This reverts commit 9bc040859b

I1f250441	Revert "Refactor the vp9_get_frame code for frame parallel."
Ibfdddce5	Revert "Delay decreasing reference count in frame-parallel decoding."
I00ce6771	Revert "Introduce FrameWorker for decoding."

Need better testing in libvpx for these commits

Squashed-Change-Id: Ifa1f279b0cabf4b47c051ec26018f9301c1e130e

error check vp9 superframe parsing

This patch ensures that a chunk whose last byte contains a valid
superframe marker byte actually has a proper superframe index.
If not, it returns an error.
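
The layout being validated is the standard VP9 superframe index: a
marker byte (110xxxxx), the per-frame sizes, then the same marker byte
repeated at the very end of the chunk. A sketch of the check
(illustrative helper, not libvpx's parser):

  #include <stddef.h>
  #include <stdint.h>

  static int parse_superframe_index(const uint8_t *data, size_t data_sz,
                                    uint32_t sizes[8], int *count) {
    uint8_t marker;
    if (data_sz == 0) return -1;
    marker = data[data_sz - 1];
    if ((marker & 0xe0) == 0xc0) {              /* top bits 110 */
      const int frames = (marker & 0x7) + 1;
      const int mag = ((marker >> 3) & 0x3) + 1;  /* bytes per size */
      const size_t index_sz = 2 + (size_t)mag * frames;
      /* the index must start with the same marker byte it ends with */
      if (data_sz >= index_sz && data[data_sz - index_sz] == marker) {
        const uint8_t *x = &data[data_sz - index_sz + 1];
        int i, j;
        for (i = 0; i < frames; ++i) {
          uint32_t sz = 0;
          for (j = 0; j < mag; ++j) sz |= (uint32_t)(*x++) << (j * 8);
          sizes[i] = sz;
        }
        *count = frames;
        return 0;
      }
    }
    return -1;  /* no valid index at the end of the chunk */
  }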

As part of this, the file vp90-2-15-fuzz-flicker.webm now fails
to decode properly and moves to the invalid-file tests from the test
vector suite.

Squashed-Change-Id: I5f1da7eb37282ec0c6394df5c73251a2df9c1744

Remove unused vp9_init_quant_tables function

This function is not effectively used, so it has been removed.

Squashed-Change-Id: I2e8e48fa07c7518931690f3b04bae920cb360e49

Actually skip blocks in skip segments in non-rd encoder.

Copy split from macroblock to pick mode context so it doesn't get lost.

Squashed-Change-Id: Ie37aa12558dbe65c4f8076cf808250fffb7f27a8

Add Check for Peek Stream validity to decoder test.

Squashed-Change-Id: I9b745670a9f842582c47e6001dc77480b31fb6a1

Allocate buffers based on correct chroma format

The encoder currently allocates frame buffers before
it establishes what the chroma sub-sampling factor is,
always allocating based on the 4:4:4 format.

This patch detects the chroma format as early as
possible, allowing the encoder to allocate buffers of
the correct size.

Future patches will change the encoder to allocate
frame buffers on demand to further reduce the memory
profile of the encoder and rationalize the buffer
management in the encoder and decoder.
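
A minimal sketch of the size computation, assuming 8-bit planes and
subsampling shifts of 0 or 1 per axis (4:2:0 needs half the bytes that
a 4:4:4 allocation reserves):

  #include <stddef.h>

  /* Bytes needed for a Y + U + V frame given the actual chroma
   * subsampling, instead of always assuming 4:4:4 (shifts of 0). */
  static size_t frame_buffer_bytes(int w, int h,
                                   int subsampling_x, int subsampling_y) {
    const size_t luma = (size_t)w * h;
    const size_t chroma =
        (size_t)((w + subsampling_x) >> subsampling_x) *
        (size_t)((h + subsampling_y) >> subsampling_y);
    return luma + 2 * chroma;
  }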

Squashed-Change-Id: Ifd41dd96e67d0011719ba40fada0bae74f3a0d57

Fork vp9_rd_pick_inter_mode_sb_seg_skip

Squashed-Change-Id: I549868725b789f0f4f89828005a65972c20df888

Switch active map implementation to segment based.

Squashed-Change-Id: Ibb841a1fa4d08d164cf5461246ec290f582b1f80

Experiment for mid group second arf.

This patch implements a mechanism for inserting a second
arf at the mid position of arf groups.

It is currently disabled by default using the flag multi_arf_enabled.

Results are currently down somewhat in initial testing if
multi-arf is enabled. Most of the loss is attributable to the
fact that the code to preserve the previous golden frame (in the
arf buffer) when coding an overlay frame is currently disabled in
the multi-arf case.

Squashed-Change-Id: I1d777318ca09f147db2e8c86d7315fe86168c865

Clean out old CONFIG_MULTIPLE_ARF code.

Remove the old experimental multi arf code that was under
the flag CONFIG_MULTIPLE_ARF.

Squashed-Change-Id: Ib24865abc11691d6ac8cb0434ada1da674368a61

Fix some bugs in multi-arf

Fix some bugs relating to the use of buffers
in the overlay frames.

Fix a bug where a mid-sequence overlay was
propagating large partition and transform sizes into
the subsequent frame because of:
  sf->last_partitioning_redo_frequency > 1 and
  sf->tx_size_search_method == USE_LARGESTALL

Squashed-Change-Id: Ibf9ef39a5a5150f8cbdd2c9275abb0316c67873a

Further dual arf changes: multi_arf_allowed.

Add a multi_arf_allowed flag.
Re-initialize buffer indices at every kf.
Add some const indicators.

Squashed-Change-Id: If86c39153517c427182691d2d4d4b7e90594be71

Fixed VP9 denoiser COPY_BLOCK case

Now copies the src to the correct location in the running average buffer.

Squashed-Change-Id: I9c83c96dc7a97f42c8df16ab4a9f18b733181f34

Fix test on maximum downscaling limits

There is a normative scaling range of (x1/2, x16)
for VP9. This patch fixes the maximum downscaling
tests that are applied in the convolve function.

The code used a maximum downscaling limit of x1/5
for historic reasons related to the scalable
coding work. Since the downsampling in this
application is non-normative, it will revert to
using a separate non-normative scaler.
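
A sketch of the normative bound (it mirrors the (x1/2, x16) range
stated above; the helper name is an assumption):

  /* A reference may be at most twice and at least one sixteenth the
   * size of the frame predicting from it, per dimension. */
  static int valid_ref_frame_size(int ref_w, int ref_h,
                                  int cur_w, int cur_h) {
    return 2 * cur_w >= ref_w && 2 * cur_h >= ref_h &&   /* x1/2 limit */
           cur_w <= 16 * ref_w && cur_h <= 16 * ref_h;   /* x16 limit */
  }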

Squashed-Change-Id: Ide80ed712cee82fe5cb3c55076ac428295a6019f

Add unit test to test user_priv parameter.

Squashed-Change-Id: I6ba6171e43e0a43331ee0a7b698590b143979c44

vp9: check tile column count

The max is 6; there are assumptions throughout the decoder regarding
this. Fixes a crash with a fuzzed bitstream:

$ zzuf -s 5861 -r 0.01:0.05 -b 6- \
  < vp90-2-00-quantizer-00.webm.ivf \
  | dd of=invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.ivf \
    bs=1 count=81883
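
Reading the "6" as the cap on log2(tile columns) used by VP9 (so up to
64 columns), a decoder-side sanity check might look like this sketch
(macro name is an assumption):

  #define MAX_LOG2_TILE_COLS 6  /* up to 64 tile columns */

  static int valid_tile_cols(int log2_tile_cols) {
    return log2_tile_cols >= 0 && log2_tile_cols <= MAX_LOG2_TILE_COLS;
  }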

Squashed-Change-Id: I6af41bb34252e88bc156a4c27c80d505d45f5642

Adjust arf Q limits with multi-arf.

Adjust the enforced minimum arf Q deltas for non-primary arfs
in the middle of an arf/gf group.

Squashed-Change-Id: Ie8034ffb3ac00f887d74ae1586d4cac91d6cace2

Dual ARF changes: Buffer index selection.

Add indirection to the selection of buffer indices.
This is to help simplify things in the future if we
have other codec features that switch indices.

Limit the max GF interval for static sections to fit
the gf_group structures.

Squashed-Change-Id: I38310daaf23fd906004c0e8ee3e99e15570f84cb

Reuse inter prediction result in real-time speed 6

In real-time speed 6, no partition search is done. The inter
prediction results obtained during mode picking can be reused in the
following encoding process. A speed feature, reuse_inter_pred_sby,
is added to enable the reuse only in speed 6.

This patch doesn't change the encoding result. RTC set tests showed
an encoding speed gain of 2% - 5%.
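
A sketch of the caching pattern (buffer handling is simplified; names
are illustrative):

  #include <stdint.h>
  #include <string.h>

  typedef struct {
    uint8_t pred[64 * 64];  /* predictor saved during mode picking */
    int valid;
  } PredCache;

  /* With reuse_inter_pred_sby on, the encode pass copies the cached
   * predictor instead of re-running inter prediction. n <= 64*64. */
  static void get_predictor(PredCache *cache, int reuse_inter_pred_sby,
                            void (*predict)(uint8_t *dst, int n),
                            uint8_t *dst, int n) {
    if (reuse_inter_pred_sby && cache->valid) {
      memcpy(dst, cache->pred, (size_t)n);
    } else {
      predict(dst, n);
      memcpy(cache->pred, dst, (size_t)n);
      cache->valid = 1;
    }
  }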

Squashed-Change-Id: I3884780f64ef95dd8be10562926542528713b92c

Add vp9_ prefix to mv_pred and setup_pred_block functions

Make these two functions accessible by both RD and non-RD coding
modes.

Squashed-Change-Id: Iecb39dbf3d65436286ea3c7ffaa9920d0b3aff85

Replace cpi->common with preset variable cm

This commit replaces a few use cases of cpi->common with the preset
variable cm, to avoid unnecessary pointer fetches in the non-RD
coding mode.

Squashed-Change-Id: I4038f1c1a47373b8fd7bc5d69af61346103702f6

[spatial svc]Implement lag in frames for spatial svc

Squashed-Change-Id: I930dced169c9d53f8044d2754a04332138347409

[spatial svc]Don't skip motion search in first pass encoding

Squashed-Change-Id: Ia6bcdaf5a5b80e68176f60d8d00e9b5cf3f9bfe3

decode_test_driver: fix type size warning

Like vpx_codec_decode(), vpx_codec_peek_stream_info() takes an
unsigned int, not a size_t, parameter for the buffer size.

Squashed-Change-Id: I4ce0e1fbbde461c2e1b8fcbaac3cd203ed707460

decode_test_driver: check HasFailure() in RunLoop

Avoids unnecessary errors due to, e.g., read (Next()) failures.

Squashed-Change-Id: I70b1d09766456f1c55367d98299b5abd7afff842

Allow lossless breakout in non-rd mode decision.

This is very helpful for large moving windows in screencasts.

Squashed-Change-Id: I91b5f9acb133281ee85ccd8f843e6bae5cadefca

Revert "Revert 3 patches from Hangyu to get Chrome to build:"

This patch reverts the previous revert from Jim and also adds a
variable user_priv in the FrameWorker to save the user_priv
passed from the application. In the decoder_get_frame function,
the user_priv will be bound to the img. This change is needed,
or it will fail the unit test added here:
https://gerrit.chromium.org/gerrit/#/c/70610/

This reverts commit 9be46e4565.

Squashed-Change-Id: I376d9a12ee196faffdf3c792b59e6137c56132c1

test.mk: remove renamed file

vp90-2-15-fuzz-flicker.webm was renamed in:
c3db2d8 error check vp9 superframe parsing

Squashed-Change-Id: I229dd6ca4c662802c457beea0f7b4128153a65dc

vp9cx.mk: move avx c files outside of x86inc block

same reasoning as:
9f3a0db vp9_rtcd: correct avx2 references

these are all intrinsics, so they don't depend on x86inc.asm

Squashed-Change-Id: I915beaef318a28f64bfa5469e5efe90e4af5b827

Dual arf: Name changes.

Cosmetic patch only, in response to comments on
previous patches suggesting a couple of name changes
for consistency and clarity.

Squashed-Change-Id: Ida3a359b0d5755345660d304a7697a3a3686b2a3

Make non-RD intra mode search txfm size dependent

This commit fixes a potential issue in the non-RD mode decision
flow that only checked part of the block when estimating the cost. It
was due to the use of a fixed transform size in place of the
largest transform block size. This commit enables per-transform-block
cost estimation of the intra prediction mode in the non-RD
mode decision.
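
A sketch of the fix's effect: accumulate the estimate over every
transform block covering the prediction block, rather than sampling one
fixed-size transform (the callback and names are illustrative):

  #include <stdint.h>

  /* Sum the mode cost over all tx_size x tx_size blocks tiling a
   * bw x bh prediction block. */
  static int64_t intra_mode_cost(int bw, int bh, int tx_size,
                                 int64_t (*tx_block_cost)(int r, int c)) {
    int64_t cost = 0;
    int r, c;
    for (r = 0; r < bh; r += tx_size)
      for (c = 0; c < bw; c += tx_size)
        cost += tx_block_cost(r, c);
    return cost;
  }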

Squashed-Change-Id: I14ff92065e193e3e731c2bbf7ec89db676f1e132

Fix quality regression for multi arf off case.

Bug introduced during multiple iterations on: I3831*

gf_group->arf_update_idx[] cannot currently be used
to select the arf buffer index if buffer flipping on overlays
is enabled (still currently the case when multi-arf is OFF).

Squashed-Change-Id: I4ce9ea08f1dd03ac3ad8b3e27375a91ee1d964dc

Enable real-time version reference motion vector search

This commit enables a fast reference motion vector search scheme.
It checks the nearest top and left neighboring blocks to decide the
most probable predicted motion vector. If it finds that the two have
the same motion vector, it then skips searching the exterior range for
the second most probable motion vector, and correspondingly skips
the check for NEARMV.

The runtime of speed -5 goes down
pedestrian at 1080p 29377 ms -> 27783 ms
vidyo at 720p       11830 ms -> 10990 ms
i.e., 6%-8% speed-up.

For the rtc set, the compression performance
goes down by about -1.3% for both speed -5 and -6.
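
A sketch of the neighbor scan (types and return convention are
illustrative, not the actual libvpx code):

  #include <stddef.h>

  typedef struct { short row, col; } Mv;

  static int mv_equal(Mv a, Mv b) {
    return a.row == b.row && a.col == b.col;
  }

  /* Returns the number of distinct candidates (0-2). A return of 1
   * with both neighbors present means they matched, so the exterior
   * scan and the NEARMV check can both be skipped. */
  static int fast_ref_mv_scan(const Mv *above, const Mv *left,
                              Mv *nearest, Mv *near_mv) {
    const Mv zero = { 0, 0 };
    *nearest = *near_mv = zero;
    if (above == NULL && left == NULL) return 0;
    if (above == NULL) { *nearest = *left; return 1; }
    if (left == NULL) { *nearest = *above; return 1; }
    *nearest = *above;
    if (mv_equal(*above, *left)) return 1;
    *near_mv = *left;
    return 2;
  }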

Squashed-Change-Id: I2a7794fa99734f739f8b30519ad4dfd511ab91a5

Add const mark to const values in non-RD coding mode

Squashed-Change-Id: I65209fd1e06fc06833f6647cb028b414391a7017

Change-Id: Ic0be67ac9ef48f64a8878a0b8f1b336f136bceac
2014-06-26 14:22:05 -07:00
Debargha Mukherjee
77a29953c5 Revert "Migrating old experiments into new playground branch"
This reverts commit 1a4b017fad

Change-Id: I7f54cf0489e592887b61eb3f7bda90f757b0aad7
2014-06-26 12:46:51 -07:00
Yue Chen
1a4b017fad Migrating old experiments into new playground branch
Change-Id: I28dc4acdf5415a1ea3d88213022d9e3d4fd5db46
2014-06-23 16:35:38 -07:00
103 changed files with 8576 additions and 2855 deletions

README
View File

@@ -55,7 +55,6 @@ COMPILING THE APPLICATIONS/LIBRARIES:
armv6-linux-rvct
armv6-linux-gcc
armv6-none-rvct
arm64-darwin-gcc
armv7-android-gcc
armv7-darwin-gcc
armv7-linux-rvct

View File

@@ -799,7 +799,7 @@ process_common_toolchain() {
arm*)
# on arm, isa versions are supersets
case ${tgt_isa} in
arm64|armv8)
armv8)
soft_enable neon
;;
armv7|armv7s)
@@ -1048,6 +1048,14 @@ EOF
esac
;;
x86*)
bits=32
enabled x86_64 && bits=64
check_cpp <<EOF && bits=x32
#ifndef __ILP32__
#error "not x32"
#endif
EOF
case ${tgt_os} in
win*)
enabled gcc && add_cflags -fno-common
@@ -1086,6 +1094,8 @@ EOF
esac
;;
gcc*)
add_cflags -m${bits}
add_ldflags -m${bits}
link_with_cc=gcc
tune_cflags="-march="
setup_gnu_toolchain
@@ -1110,20 +1120,6 @@ EOF
;;
esac
bits=32
enabled x86_64 && bits=64
check_cpp <<EOF && bits=x32
#ifndef __ILP32__
#error "not x32"
#endif
EOF
case ${tgt_cc} in
gcc*)
add_cflags -m${bits}
add_ldflags -m${bits}
;;
esac
soft_enable runtime_cpu_detect
# We can't use 'check_cflags' until the compiler is configured and CC is
# populated.
@@ -1226,12 +1222,10 @@ EOF
fi
fi
tgt_os_no_version=$(echo "${tgt_os}" | tr -d "[0-9]")
# Default use_x86inc to yes when we are 64 bit, non-pic, or on any
# non-Darwin target.
if [ "${tgt_isa}" = "x86_64" ] || [ "${pic}" != "yes" ] || \
[ "${tgt_os_no_version}" != "darwin" ]; then
soft_enable use_x86inc
# default use_x86inc to yes if pic is no or 64bit or we are not on darwin
if [ ${tgt_isa} = x86_64 -o ! "$pic" = "yes" -o \
"${tgt_os#darwin}" = "${tgt_os}" ]; then
soft_enable use_x86inc
fi
# Position Independent Code (PIC) support, for building relocatable

View File

@@ -25,8 +25,7 @@ MAKE_JOBS=1
LIBVPX_SOURCE_DIR=$(dirname "$0" | sed -e s,/build/make,,)
LIPO=$(xcrun -sdk iphoneos${SDK} -find lipo)
ORIG_PWD="$(pwd)"
TARGETS="arm64-darwin-gcc
armv6-darwin-gcc
TARGETS="armv6-darwin-gcc
armv7-darwin-gcc
armv7s-darwin-gcc
x86-iphonesimulator-gcc
@@ -55,9 +54,6 @@ build_target() {
target_to_preproc_symbol() {
target="$1"
case "${target}" in
arm64-*)
echo "__aarch64__"
;;
armv6-*)
echo "__ARM_ARCH_6__"
;;

configure
View File

@@ -96,7 +96,6 @@ all_platforms="${all_platforms} armv6-darwin-gcc"
all_platforms="${all_platforms} armv6-linux-rvct"
all_platforms="${all_platforms} armv6-linux-gcc"
all_platforms="${all_platforms} armv6-none-rvct"
all_platforms="${all_platforms} arm64-darwin-gcc"
all_platforms="${all_platforms} armv7-android-gcc" #neon Cortex-A8
all_platforms="${all_platforms} armv7-darwin-gcc" #neon Cortex-A8
all_platforms="${all_platforms} armv7-linux-rvct" #neon Cortex-A8
@@ -274,7 +273,13 @@ EXPERIMENT_LIST="
multiple_arf
spatial_svc
denoising
fp_mb_stats
masked_interinter
interintra
masked_interintra
filterintra
ext_tx
supertx
copy_coding
"
CONFIG_LIST="
external_build

View File

@@ -28,6 +28,16 @@
#include "vpx/vpx_encoder.h"
#include "./vpxstats.h"
static const struct arg_enum_list encoding_mode_enum[] = {
{"i", INTER_LAYER_PREDICTION_I},
{"alt-ip", ALT_INTER_LAYER_PREDICTION_IP},
{"ip", INTER_LAYER_PREDICTION_IP},
{"gf", USE_GOLDEN_FRAME},
{NULL, 0}
};
static const arg_def_t encoding_mode_arg = ARG_DEF_ENUM(
"m", "encoding-mode", 1, "Encoding mode algorithm", encoding_mode_enum);
static const arg_def_t skip_frames_arg =
ARG_DEF("s", "skip-frames", 1, "input frames to skip");
static const arg_def_t frames_arg =
@@ -48,6 +58,9 @@ static const arg_def_t quantizers_arg =
ARG_DEF("q", "quantizers", 1, "quantizers for non key frames, also will "
"be applied to key frames if -qn is not specified (lowest to "
"highest layer)");
static const arg_def_t quantizers_keyframe_arg =
ARG_DEF("qn", "quantizers-keyframe", 1, "quantizers for key frames (lowest "
"to highest layer)");
static const arg_def_t passes_arg =
ARG_DEF("p", "passes", 1, "Number of passes (1/2)");
static const arg_def_t pass_arg =
@@ -64,13 +77,16 @@ static const arg_def_t max_bitrate_arg =
ARG_DEF(NULL, "max-bitrate", 1, "Maximum bitrate");
static const arg_def_t *svc_args[] = {
&frames_arg, &width_arg, &height_arg,
&encoding_mode_arg, &frames_arg, &width_arg, &height_arg,
&timebase_arg, &bitrate_arg, &skip_frames_arg, &layers_arg,
&kf_dist_arg, &scale_factors_arg, &quantizers_arg, &passes_arg,
&pass_arg, &fpf_name_arg, &min_q_arg, &max_q_arg,
&min_bitrate_arg, &max_bitrate_arg, NULL
&kf_dist_arg, &scale_factors_arg, &quantizers_arg,
&quantizers_keyframe_arg, &passes_arg, &pass_arg,
&fpf_name_arg, &min_q_arg, &max_q_arg, &min_bitrate_arg,
&max_bitrate_arg, NULL
};
static const SVC_ENCODING_MODE default_encoding_mode =
INTER_LAYER_PREDICTION_IP;
static const uint32_t default_frames_to_skip = 0;
static const uint32_t default_frames_to_code = 60 * 60;
static const uint32_t default_width = 1920;
@@ -119,6 +135,7 @@ static void parse_command_line(int argc, const char **argv_,
// initialize SvcContext with parameters that will be passed to vpx_svc_init
svc_ctx->log_level = SVC_LOG_DEBUG;
svc_ctx->spatial_layers = default_spatial_layers;
svc_ctx->encoding_mode = default_encoding_mode;
// start with default encoder configuration
res = vpx_codec_enc_config_default(vpx_codec_vp9_cx(), enc_cfg, 0);
@@ -144,7 +161,9 @@ static void parse_command_line(int argc, const char **argv_,
for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) {
arg.argv_step = 1;
if (arg_match(&arg, &frames_arg, argi)) {
if (arg_match(&arg, &encoding_mode_arg, argi)) {
svc_ctx->encoding_mode = arg_parse_enum_or_int(&arg);
} else if (arg_match(&arg, &frames_arg, argi)) {
app_input->frames_to_code = arg_parse_uint(&arg);
} else if (arg_match(&arg, &width_arg, argi)) {
enc_cfg->g_w = arg_parse_uint(&arg);
@@ -164,7 +183,9 @@ static void parse_command_line(int argc, const char **argv_,
} else if (arg_match(&arg, &scale_factors_arg, argi)) {
vpx_svc_set_scale_factors(svc_ctx, arg.val);
} else if (arg_match(&arg, &quantizers_arg, argi)) {
vpx_svc_set_quantizers(svc_ctx, arg.val);
vpx_svc_set_quantizers(svc_ctx, arg.val, 0);
} else if (arg_match(&arg, &quantizers_keyframe_arg, argi)) {
vpx_svc_set_quantizers(svc_ctx, arg.val, 1);
} else if (arg_match(&arg, &passes_arg, argi)) {
passes = arg_parse_uint(&arg);
if (passes < 1 || passes > 2) {
@@ -249,12 +270,12 @@ static void parse_command_line(int argc, const char **argv_,
printf(
"Codec %s\nframes: %d, skip: %d\n"
"layers: %d\n"
"mode: %d, layers: %d\n"
"width %d, height: %d,\n"
"num: %d, den: %d, bitrate: %d,\n"
"gop size: %d\n",
vpx_codec_iface_name(vpx_codec_vp9_cx()), app_input->frames_to_code,
app_input->frames_to_skip,
app_input->frames_to_skip, svc_ctx->encoding_mode,
svc_ctx->spatial_layers, enc_cfg->g_w, enc_cfg->g_h,
enc_cfg->g_timebase.num, enc_cfg->g_timebase.den,
enc_cfg->rc_target_bitrate, enc_cfg->kf_max_dist);

View File

@@ -170,7 +170,7 @@ CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/vpx_once.h
CODEC_SRCS-$(BUILD_LIBVPX) += $(BUILD_PFX)vpx_config.c
INSTALL-SRCS-no += $(BUILD_PFX)vpx_config.c
ifeq ($(ARCH_X86)$(ARCH_X86_64),yes)
INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += third_party/x86inc/x86inc.asm
CODEC_SRCS-$(BUILD_LIBVPX) += third_party/x86inc/x86inc.asm
endif
CODEC_EXPORTS-$(BUILD_LIBVPX) += vpx/exports_com
CODEC_EXPORTS-$(CONFIG_ENCODERS) += vpx/exports_enc

View File

@@ -14,49 +14,30 @@
#include "test/encode_test_driver.h"
#include "test/i420_video_source.h"
#include "test/util.h"
#include "test/y4m_video_source.h"
namespace {
const int kMaxPSNR = 100;
class CpuSpeedTest : public ::libvpx_test::EncoderTest,
public ::libvpx_test::CodecTestWith2Params<
libvpx_test::TestMode, int> {
protected:
CpuSpeedTest()
: EncoderTest(GET_PARAM(0)),
encoding_mode_(GET_PARAM(1)),
set_cpu_used_(GET_PARAM(2)),
min_psnr_(kMaxPSNR) {}
CpuSpeedTest() : EncoderTest(GET_PARAM(0)) {}
virtual ~CpuSpeedTest() {}
virtual void SetUp() {
InitializeConfig();
SetMode(encoding_mode_);
if (encoding_mode_ != ::libvpx_test::kRealTime) {
cfg_.g_lag_in_frames = 25;
cfg_.rc_end_usage = VPX_VBR;
} else {
cfg_.g_lag_in_frames = 0;
cfg_.rc_end_usage = VPX_CBR;
}
}
virtual void BeginPassHook(unsigned int /*pass*/) {
min_psnr_ = kMaxPSNR;
SetMode(GET_PARAM(1));
set_cpu_used_ = GET_PARAM(2);
}
virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
::libvpx_test::Encoder *encoder) {
if (video->frame() == 1) {
encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
if (encoding_mode_ != ::libvpx_test::kRealTime) {
encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
encoder->Control(VP8E_SET_ARNR_STRENGTH, 5);
encoder->Control(VP8E_SET_ARNR_TYPE, 3);
}
encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
encoder->Control(VP8E_SET_ARNR_STRENGTH, 5);
encoder->Control(VP8E_SET_ARNR_TYPE, 3);
}
}
@@ -64,15 +45,7 @@ class CpuSpeedTest : public ::libvpx_test::EncoderTest,
if (pkt->data.frame.flags & VPX_FRAME_IS_KEY) {
}
}
virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
if (pkt->data.psnr.psnr[0] < min_psnr_)
min_psnr_ = pkt->data.psnr.psnr[0];
}
::libvpx_test::TestMode encoding_mode_;
int set_cpu_used_;
double min_psnr_;
};
TEST_P(CpuSpeedTest, TestQ0) {
@@ -80,6 +53,7 @@ TEST_P(CpuSpeedTest, TestQ0) {
// without a mismatch when passing in a very low max q. This pushes
// the encoder to producing lots of big partitions which will likely
// extend into the border and test the border condition.
cfg_.g_lag_in_frames = 25;
cfg_.rc_2pass_vbr_minsection_pct = 5;
cfg_.rc_2pass_vbr_minsection_pct = 2000;
cfg_.rc_target_bitrate = 400;
@@ -89,32 +63,16 @@ TEST_P(CpuSpeedTest, TestQ0) {
::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
20);
init_flags_ = VPX_CODEC_USE_PSNR;
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
EXPECT_GE(min_psnr_, kMaxPSNR);
}
TEST_P(CpuSpeedTest, TestScreencastQ0) {
::libvpx_test::Y4mVideoSource video("screendata.y4m", 0, 25);
cfg_.g_timebase = video.timebase();
cfg_.rc_2pass_vbr_minsection_pct = 5;
cfg_.rc_2pass_vbr_minsection_pct = 2000;
cfg_.rc_target_bitrate = 400;
cfg_.rc_max_quantizer = 0;
cfg_.rc_min_quantizer = 0;
init_flags_ = VPX_CODEC_USE_PSNR;
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
EXPECT_GE(min_psnr_, kMaxPSNR);
}
TEST_P(CpuSpeedTest, TestEncodeHighBitrate) {
// Validate that this non multiple of 64 wide clip encodes and decodes
// without a mismatch when passing in a very low max q. This pushes
// the encoder to producing lots of big partitions which will likely
// extend into the border and test the border condition.
cfg_.g_lag_in_frames = 25;
cfg_.rc_2pass_vbr_minsection_pct = 5;
cfg_.rc_2pass_vbr_minsection_pct = 2000;
cfg_.rc_target_bitrate = 12000;
@@ -131,6 +89,7 @@ TEST_P(CpuSpeedTest, TestLowBitrate) {
// when passing in a very high min q. This pushes the encoder to producing
// lots of small partitions which might will test the other condition.
cfg_.g_lag_in_frames = 25;
cfg_.rc_2pass_vbr_minsection_pct = 5;
cfg_.rc_2pass_vbr_minsection_pct = 2000;
cfg_.rc_target_bitrate = 200;
@@ -149,7 +108,6 @@ using std::tr1::make_tuple;
VP9_INSTANTIATE_TEST_CASE(
CpuSpeedTest,
::testing::Values(::libvpx_test::kTwoPassGood, ::libvpx_test::kOnePassGood,
::libvpx_test::kRealTime),
::testing::Values(::libvpx_test::kTwoPassGood, ::libvpx_test::kOnePassGood),
::testing::Range(0, 8));
} // namespace

View File

@@ -576,7 +576,7 @@ TEST_P(DatarateTestVP9Large, BasicRateTargeting3TemporalLayersFrameDropping) {
// Expect some frame drops in this test: for this 200 frames test,
// expect at least 10% and not more than 60% drops.
ASSERT_GE(num_drops_, 20);
ASSERT_LE(num_drops_, 130);
ASSERT_LE(num_drops_, 120);
}
}

View File

@@ -97,9 +97,6 @@ const char *const kVP9InvalidFileTests[] = {
"invalid-vp90-01.webm",
"invalid-vp90-02.webm",
"invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.ivf",
"invalid-vp90-03.webm",
"invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf",
"invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf",
};
#define NELEMENTS(x) static_cast<int>(sizeof(x) / sizeof(x[0]))

View File

@@ -28,11 +28,10 @@ class MD5 {
// plane, we never want to round down and thus skip a pixel so if
// we are shifting by 1 (chroma_shift) we add 1 before doing the shift.
// This works only for chroma_shift of 0 and 1.
const int bytes_per_sample = (img->fmt & VPX_IMG_FMT_HIGH) ? 2 : 1;
const int h = plane ? (img->d_h + img->y_chroma_shift) >>
img->y_chroma_shift : img->d_h;
const int w = (plane ? (img->d_w + img->x_chroma_shift) >>
img->x_chroma_shift : img->d_w) * bytes_per_sample;
const int w = plane ? (img->d_w + img->x_chroma_shift) >>
img->x_chroma_shift : img->d_w;
for (int y = 0; y < h; ++y) {
MD5Update(&md5_, buf, w);

View File

@@ -41,6 +41,7 @@ class SvcTest : public ::testing::Test {
virtual ~SvcTest() {}
virtual void SetUp() {
svc_.encoding_mode = INTER_LAYER_PREDICTION_IP;
svc_.log_level = SVC_LOG_DEBUG;
svc_.log_print = 0;
@@ -130,13 +131,22 @@ TEST_F(SvcTest, SetLayersOption) {
EXPECT_EQ(3, svc_.spatial_layers);
}
TEST_F(SvcTest, SetEncodingMode) {
vpx_codec_err_t res = vpx_svc_set_options(&svc_, "encoding-mode=alt-ip");
EXPECT_EQ(VPX_CODEC_OK, res);
res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
EXPECT_EQ(VPX_CODEC_OK, res);
codec_initialized_ = true;
EXPECT_EQ(ALT_INTER_LAYER_PREDICTION_IP, svc_.encoding_mode);
}
TEST_F(SvcTest, SetMultipleOptions) {
vpx_codec_err_t res =
vpx_svc_set_options(&svc_, "layers=2 scale-factors=1/3,2/3");
vpx_codec_err_t res = vpx_svc_set_options(&svc_, "layers=2 encoding-mode=ip");
res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
EXPECT_EQ(VPX_CODEC_OK, res);
codec_initialized_ = true;
EXPECT_EQ(2, svc_.spatial_layers);
EXPECT_EQ(INTER_LAYER_PREDICTION_IP, svc_.encoding_mode);
}
TEST_F(SvcTest, SetScaleFactorsOption) {
@@ -167,20 +177,48 @@ TEST_F(SvcTest, SetQuantizersOption) {
codec_initialized_ = true;
}
TEST_F(SvcTest, SetQuantizers) {
vpx_codec_err_t res = vpx_svc_set_quantizers(NULL, "40,30");
EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
res = vpx_svc_set_quantizers(&svc_, NULL);
EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
TEST_F(SvcTest, SetKeyFrameQuantizersOption) {
svc_.spatial_layers = 2;
res = vpx_svc_set_quantizers(&svc_, "40");
vpx_codec_err_t res = vpx_svc_set_options(&svc_,
"quantizers-keyframe=not-quantizers");
EXPECT_EQ(VPX_CODEC_OK, res);
res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
res = vpx_svc_set_quantizers(&svc_, "40,30");
vpx_svc_set_options(&svc_, "quantizers-keyframe=40,45");
res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
EXPECT_EQ(VPX_CODEC_OK, res);
codec_initialized_ = true;
}
TEST_F(SvcTest, SetQuantizers) {
vpx_codec_err_t res = vpx_svc_set_quantizers(NULL, "40,30", 0);
EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
res = vpx_svc_set_quantizers(&svc_, NULL, 0);
EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
svc_.spatial_layers = 2;
res = vpx_svc_set_quantizers(&svc_, "40", 0);
EXPECT_EQ(VPX_CODEC_OK, res);
res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
res = vpx_svc_set_quantizers(&svc_, "40,30", 0);
EXPECT_EQ(VPX_CODEC_OK, res);
res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
EXPECT_EQ(VPX_CODEC_OK, res);
codec_initialized_ = true;
}
TEST_F(SvcTest, SetKeyFrameQuantizers) {
vpx_codec_err_t res = vpx_svc_set_quantizers(NULL, "40,31", 1);
EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
res = vpx_svc_set_quantizers(&svc_, NULL, 1);
EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
res = vpx_svc_set_quantizers(&svc_, "40,30", 1);
EXPECT_EQ(VPX_CODEC_OK, res);
res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
EXPECT_EQ(VPX_CODEC_OK, res);
@@ -211,7 +249,7 @@ TEST_F(SvcTest, SetScaleFactors) {
TEST_F(SvcTest, FirstFrameHasLayers) {
svc_.spatial_layers = 2;
vpx_svc_set_scale_factors(&svc_, "4/16,16/16");
vpx_svc_set_quantizers(&svc_, "40,30");
vpx_svc_set_quantizers(&svc_, "40,30", 0);
vpx_codec_err_t res =
vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
@@ -246,7 +284,7 @@ TEST_F(SvcTest, FirstFrameHasLayers) {
TEST_F(SvcTest, EncodeThreeFrames) {
svc_.spatial_layers = 2;
vpx_svc_set_scale_factors(&svc_, "4/16,16/16");
vpx_svc_set_quantizers(&svc_, "40,30");
vpx_svc_set_quantizers(&svc_, "40,30", 0);
int decoded_frames = 0;
vpx_codec_err_t res_dec;
int frame_size;
@@ -322,7 +360,7 @@ TEST_F(SvcTest, EncodeThreeFrames) {
TEST_F(SvcTest, GetLayerResolution) {
svc_.spatial_layers = 2;
vpx_svc_set_scale_factors(&svc_, "4/16,8/16");
vpx_svc_set_quantizers(&svc_, "40,30");
vpx_svc_set_quantizers(&svc_, "40,30", 0);
vpx_codec_err_t res =
vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
@@ -361,7 +399,7 @@ TEST_F(SvcTest, TwoPassEncode) {
svc_.spatial_layers = 2;
codec_enc_.g_pass = VPX_RC_FIRST_PASS;
vpx_svc_set_scale_factors(&svc_, "4/16,16/16");
vpx_svc_set_quantizers(&svc_, "40,30");
vpx_svc_set_quantizers(&svc_, "40,30", 0);
vpx_codec_err_t res =
vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);

View File

@@ -4,17 +4,6 @@ fe346136b9b8c1e6f6084cc106485706915795e4 invalid-vp90-01.webm
25751f5d3b05ff03f0719ad42cd625348eb8961e invalid-vp90-01.webm.res
d78e2fceba5ac942246503ec8366f879c4775ca5 invalid-vp90-02.webm
2dadee5306245fa5eeb0f99652d0e17afbcba96d invalid-vp90-02.webm.res
df1a1453feb3c00d7d89746c7003b4163523bff3 invalid-vp90-03.webm
8fe6fd82bf537340f586f97a7ae31fb37ccda302 invalid-vp90-03.webm.res
a432f96ff0a787268e2f94a8092ab161a18d1b06 park_joy_90p_10_420.y4m
0b194cc312c3a2e84d156a221b0a5eb615dfddc5 park_joy_90p_10_422.y4m
ff0e0a21dc2adc95b8c1b37902713700655ced17 park_joy_90p_10_444.y4m
614c32ae1eca391e867c70d19974f0d62664dd99 park_joy_90p_12_420.y4m
c92825f1ea25c5c37855083a69faac6ac4641a9e park_joy_90p_12_422.y4m
b592189b885b6cc85db55cc98512a197d73d3b34 park_joy_90p_12_444.y4m
4e0eb61e76f0684188d9bc9f3ce61f6b6b77bb2c park_joy_90p_8_420.y4m
7a193ff7dfeb96ba5f82b2afd7afa9e1fe83d947 park_joy_90p_8_422.y4m
bdb7856e6bc93599bdda05c2e773a9f22b6c6d03 park_joy_90p_8_444.y4m
b1f1c3ec79114b9a0651af24ce634afb44a9a419 rush_hour_444.y4m
5184c46ddca8b1fadd16742e8500115bc8f749da vp80-00-comprehensive-001.ivf
65bf1bbbced81b97bd030f376d1b7f61a224793f vp80-00-comprehensive-002.ivf
@@ -544,6 +533,8 @@ b6524e4084d15b5d0caaa3d3d1368db30cbee69c vp90-2-03-deltaq.webm
7f6d8879336239a43dbb6c9f13178cb11cf7ed09 vp90-2-05-resize.ivf.md5
bf61ddc1f716eba58d4c9837d4e91031d9ce4ffe vp90-2-06-bilinear.webm
f6235f937552e11d8eb331ec55da6b3aa596b9ac vp90-2-06-bilinear.webm.md5
495256cfd123fe777b2c0406862ed8468a1f4677 vp91-2-04-yv444.webm
65e3a7ffef61ab340d9140f335ecc49125970c2c vp91-2-04-yv444.webm.md5
0c83a1e414fde3bccd6dc451bbaee68e59974c76 vp90-2-07-frame_parallel.webm
e5c2c9fb383e5bf3b563480adaeba5b7e3475ecd vp90-2-07-frame_parallel.webm.md5
086c7edcffd699ae7d99d710fd7e53b18910ca5b vp90-2-08-tile_1x2_frame_parallel.webm
@@ -653,11 +644,5 @@ e615575ded499ea1d992f3b38e3baa434509cdcd vp90-2-15-segkey.webm
e3ab35d4316c5e81325c50f5236ceca4bc0d35df vp90-2-15-segkey.webm.md5
9b7ca2cac09d34c4a5d296c1900f93b1e2f69d0d vp90-2-15-segkey_adpq.webm
8f46ba5f785d0c2170591a153e0d0d146a7c8090 vp90-2-15-segkey_adpq.webm.md5
0321d507ce62dedc8a51b4e9011f7a19aed9c3dc vp91-2-04-yuv444.webm
367e423dd41fdb49aa028574a2cfec5c2f325c5c vp91-2-04-yuv444.webm.md5
76024eb753cdac6a5e5703aaea189d35c3c30ac7 invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.ivf
d3964f9dad9f60363c81b688324d95b4ec7c8038 invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.ivf.res
83f50908c8dc0ef8760595447a2ff7727489542e invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf
456d1493e52d32a5c30edf44a27debc1fa6b253a invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf.res
c123d1f9f02fb4143abb5e271916e3a3080de8f6 invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf
456d1493e52d32a5c30edf44a27debc1fa6b253a invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf.res

View File

@@ -15,7 +15,7 @@ LIBVPX_TEST_SRCS-yes += video_source.h
##
## Black box tests only use the public API.
##
LIBVPX_TEST_SRCS-yes += ../md5_utils.h ../md5_utils.c
LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += ../md5_utils.h ../md5_utils.c
LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += ivf_video_source.h
LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += ../y4minput.h ../y4minput.c
LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += aq_segment_test.cc
@@ -42,9 +42,6 @@ LIBVPX_TEST_SRCS-yes += decode_test_driver.h
LIBVPX_TEST_SRCS-yes += encode_test_driver.cc
LIBVPX_TEST_SRCS-yes += encode_test_driver.h
## Y4m parsing.
LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += y4m_test.cc ../y4menc.c ../y4menc.h
## WebM Parsing
ifeq ($(CONFIG_WEBM_IO), yes)
LIBWEBM_PARSER_SRCS += ../third_party/libwebm/mkvparser.cpp
@@ -137,19 +134,7 @@ endif # CONFIG_SHARED
##
LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += hantro_collage_w352h288.yuv
LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += hantro_odd.yuv
LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_420.y4m
LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_422.y4m
LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_444.y4m
LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_420.y4m
LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_422.y4m
LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_444.y4m
LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_420.y4m
LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_422.y4m
LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_444.y4m
LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rush_hour_444.y4m
LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += screendata.y4m
LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-001.ivf
LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-001.ivf.md5
@@ -723,6 +708,8 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-12-droppable_3.ivf
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-12-droppable_3.ivf.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-13-largescaling.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-13-largescaling.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yv444.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yv444.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-2.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-2.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-4.webm
@@ -771,22 +758,14 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey_adpq.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey_adpq.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yuv444.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yuv444.webm.md5
# Invalid files for testing libvpx error checking.
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01.webm.res
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-02.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-02.webm.res
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-03.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-03.webm.res
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.ivf
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.ivf.res
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf.res
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf.res
ifeq ($(CONFIG_DECODE_PERF_TESTS),yes)
# BBB VP9 streams

View File

@@ -165,7 +165,7 @@ const char *const kVP9TestVectors[] = {
"vp90-2-11-size-351x287.webm", "vp90-2-11-size-351x288.webm",
"vp90-2-11-size-352x287.webm", "vp90-2-12-droppable_1.ivf",
"vp90-2-12-droppable_2.ivf", "vp90-2-12-droppable_3.ivf",
"vp90-2-13-largescaling.webm",
"vp90-2-13-largescaling.webm", "vp91-2-04-yv444.webm",
"vp90-2-14-resize-fp-tiles-1-16.webm",
"vp90-2-14-resize-fp-tiles-1-2-4-8-16.webm",
"vp90-2-14-resize-fp-tiles-1-2.webm", "vp90-2-14-resize-fp-tiles-1-4.webm",
@@ -180,7 +180,6 @@ const char *const kVP9TestVectors[] = {
"vp90-2-14-resize-fp-tiles-8-16.webm", "vp90-2-14-resize-fp-tiles-8-1.webm",
"vp90-2-14-resize-fp-tiles-8-2.webm", "vp90-2-14-resize-fp-tiles-8-4.webm",
"vp90-2-15-segkey.webm", "vp90-2-15-segkey_adpq.webm",
"vp91-2-04-yuv444.webm",
};
const int kNumVP9TestVectors = NELEMENTS(kVP9TestVectors);
#endif // CONFIG_VP9_DECODER

View File

@@ -78,7 +78,7 @@ string DecodeFile(const string &filename) {
ref.idx = rnd.Rand8() % 3;
decoder.Control(VP9_GET_REFERENCE, &ref);
CheckUserPrivateData(ref.img.user_priv, NULL);
CheckUserPrivateData(ref.img.user_priv, &frame_num);
}
md5.Add(img);
}

View File

@@ -50,15 +50,6 @@ static FILE *OpenTestDataFile(const std::string& file_name) {
return fopen(path_to_source.c_str(), "rb");
}
static FILE *OpenTestOutFile(const std::string& file_name) {
const std::string path_to_source = GetDataPath() + "/" + file_name;
return fopen(path_to_source.c_str(), "wb");
}
static FILE *OpenTempOutFile() {
return tmpfile();
}
// Abstract base class for test video sources, which provide a stream of
// vpx_image_t images with associated timestamps and duration.
class VideoSource {

View File

@@ -36,17 +36,6 @@ class LosslessTestLarge : public ::libvpx_test::EncoderTest,
SetMode(encoding_mode_);
}
virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
::libvpx_test::Encoder *encoder) {
if (video->frame() == 1) {
// Only call Control if quantizer > 0 to verify that using quantizer
// alone will activate lossless
if (cfg_.rc_max_quantizer > 0 || cfg_.rc_min_quantizer > 0) {
encoder->Control(VP9E_SET_LOSSLESS, 1);
}
}
}
virtual void BeginPassHook(unsigned int /*pass*/) {
psnr_ = kMaxPsnr;
nframes_ = 0;
@@ -102,24 +91,5 @@ TEST_P(LosslessTestLarge, TestLossLessEncoding444) {
EXPECT_GE(psnr_lossless, kMaxPsnr);
}
TEST_P(LosslessTestLarge, TestLossLessEncodingCtrl) {
const vpx_rational timebase = { 33333333, 1000000000 };
cfg_.g_timebase = timebase;
cfg_.rc_target_bitrate = 2000;
cfg_.g_lag_in_frames = 25;
// Intentionally set Q > 0, to make sure control can be used to activate
// lossless
cfg_.rc_min_quantizer = 10;
cfg_.rc_max_quantizer = 20;
init_flags_ = VPX_CODEC_USE_PSNR;
libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
timebase.den, timebase.num, 0, 10);
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
const double psnr_lossless = GetMinPsnr();
EXPECT_GE(psnr_lossless, kMaxPsnr);
}
VP9_INSTANTIATE_TEST_CASE(LosslessTestLarge, ALL_TEST_MODES);
} // namespace

View File

@@ -28,11 +28,11 @@ class VP9WorkerThreadTest : public ::testing::TestWithParam<bool> {
protected:
virtual ~VP9WorkerThreadTest() {}
virtual void SetUp() {
vp9_get_worker_interface()->init(&worker_);
vp9_worker_init(&worker_);
}
virtual void TearDown() {
vp9_get_worker_interface()->end(&worker_);
vp9_worker_end(&worker_);
}
VP9Worker worker_;
@@ -45,11 +45,10 @@ int ThreadHook(void* data, void* return_value) {
}
TEST_P(VP9WorkerThreadTest, HookSuccess) {
// should be a no-op.
EXPECT_NE(vp9_get_worker_interface()->sync(&worker_), 0);
EXPECT_NE(vp9_worker_sync(&worker_), 0); // should be a no-op.
for (int i = 0; i < 2; ++i) {
EXPECT_NE(vp9_get_worker_interface()->reset(&worker_), 0);
EXPECT_NE(vp9_worker_reset(&worker_), 0);
int hook_data = 0;
int return_value = 1; // return successfully from the hook
@@ -59,21 +58,20 @@ TEST_P(VP9WorkerThreadTest, HookSuccess) {
const bool synchronous = GetParam();
if (synchronous) {
vp9_get_worker_interface()->execute(&worker_);
vp9_worker_execute(&worker_);
} else {
vp9_get_worker_interface()->launch(&worker_);
vp9_worker_launch(&worker_);
}
EXPECT_NE(vp9_get_worker_interface()->sync(&worker_), 0);
EXPECT_NE(vp9_worker_sync(&worker_), 0);
EXPECT_FALSE(worker_.had_error);
EXPECT_EQ(5, hook_data);
// should be a no-op.
EXPECT_NE(vp9_get_worker_interface()->sync(&worker_), 0);
EXPECT_NE(vp9_worker_sync(&worker_), 0); // should be a no-op.
}
}
TEST_P(VP9WorkerThreadTest, HookFailure) {
EXPECT_NE(vp9_get_worker_interface()->reset(&worker_), 0);
EXPECT_NE(vp9_worker_reset(&worker_), 0);
int hook_data = 0;
int return_value = 0; // return failure from the hook
@@ -83,49 +81,26 @@ TEST_P(VP9WorkerThreadTest, HookFailure) {
const bool synchronous = GetParam();
if (synchronous) {
vp9_get_worker_interface()->execute(&worker_);
vp9_worker_execute(&worker_);
} else {
vp9_get_worker_interface()->launch(&worker_);
vp9_worker_launch(&worker_);
}
EXPECT_FALSE(vp9_get_worker_interface()->sync(&worker_));
EXPECT_FALSE(vp9_worker_sync(&worker_));
EXPECT_EQ(1, worker_.had_error);
// Ensure _reset() clears the error and _launch() can be called again.
return_value = 1;
EXPECT_NE(vp9_get_worker_interface()->reset(&worker_), 0);
EXPECT_NE(vp9_worker_reset(&worker_), 0);
EXPECT_FALSE(worker_.had_error);
vp9_get_worker_interface()->launch(&worker_);
EXPECT_NE(vp9_get_worker_interface()->sync(&worker_), 0);
vp9_worker_launch(&worker_);
EXPECT_NE(vp9_worker_sync(&worker_), 0);
EXPECT_FALSE(worker_.had_error);
}
TEST(VP9WorkerThreadTest, TestInterfaceAPI) {
EXPECT_EQ(0, vp9_set_worker_interface(NULL));
EXPECT_TRUE(vp9_get_worker_interface() != NULL);
for (int i = 0; i < 6; ++i) {
VP9WorkerInterface winterface = *vp9_get_worker_interface();
switch (i) {
default:
case 0: winterface.init = NULL; break;
case 1: winterface.reset = NULL; break;
case 2: winterface.sync = NULL; break;
case 3: winterface.launch = NULL; break;
case 4: winterface.execute = NULL; break;
case 5: winterface.end = NULL; break;
}
EXPECT_EQ(0, vp9_set_worker_interface(&winterface));
}
}
// -----------------------------------------------------------------------------
// Multi-threaded decode tests
#if CONFIG_WEBM_IO
struct FileList {
const char *name;
const char *expected_md5;
};
// Decodes |filename| with |num_threads|. Returns the md5 of the decoded frames.
string DecodeFile(const string& filename, int num_threads) {
libvpx_test::WebMVideoSource video(filename);
@@ -155,77 +130,39 @@ string DecodeFile(const string& filename, int num_threads) {
return string(md5.Get());
}
void DecodeFiles(const FileList files[]) {
for (const FileList *iter = files; iter->name != NULL; ++iter) {
SCOPED_TRACE(iter->name);
for (int t = 2; t <= 8; ++t) {
EXPECT_EQ(iter->expected_md5, DecodeFile(iter->name, t))
<< "threads = " << t;
}
}
}
// Trivial serialized thread worker interface implementation.
// Note any worker that requires synchronization between other workers will
// hang.
namespace impl {
void Init(VP9Worker *const worker) { memset(worker, 0, sizeof(*worker)); }
int Reset(VP9Worker *const /*worker*/) { return 1; }
int Sync(VP9Worker *const worker) { return !worker->had_error; }
void Execute(VP9Worker *const worker) {
worker->had_error |= worker->hook(worker->data1, worker->data2);
}
void Launch(VP9Worker *const worker) { Execute(worker); }
void End(VP9Worker *const /*worker*/) {}
} // namespace impl
TEST(VP9WorkerThreadTest, TestSerialInterface) {
static const VP9WorkerInterface serial_interface = {
impl::Init, impl::Reset, impl::Sync, impl::Launch, impl::Execute, impl::End
};
// TODO(jzern): Avoid using a file that will use the row-based thread
// loopfilter, with the simple serialized implementation it will hang. This is
// due to its expectation that rows will be run in parallel as they wait on
// progress in the row above before proceeding.
static const char expected_md5[] = "b35a1b707b28e82be025d960aba039bc";
static const char filename[] = "vp90-2-03-size-226x226.webm";
VP9WorkerInterface default_interface = *vp9_get_worker_interface();
EXPECT_NE(vp9_set_worker_interface(&serial_interface), 0);
EXPECT_EQ(expected_md5, DecodeFile(filename, 2));
// Reset the interface.
EXPECT_NE(vp9_set_worker_interface(&default_interface), 0);
EXPECT_EQ(expected_md5, DecodeFile(filename, 2));
}
TEST(VP9DecodeMultiThreadedTest, Decode) {
TEST(VP9DecodeMTTest, MTDecode) {
// no tiles or frame parallel; this exercises loop filter threading.
EXPECT_EQ("b35a1b707b28e82be025d960aba039bc",
DecodeFile("vp90-2-03-size-226x226.webm", 2));
EXPECT_STREQ("b35a1b707b28e82be025d960aba039bc",
DecodeFile("vp90-2-03-size-226x226.webm", 2).c_str());
}
TEST(VP9DecodeMultiThreadedTest, Decode2) {
static const FileList files[] = {
TEST(VP9DecodeMTTest, MTDecode2) {
static const struct {
const char *name;
const char *expected_md5;
} files[] = {
{ "vp90-2-08-tile_1x2_frame_parallel.webm",
"68ede6abd66bae0a2edf2eb9232241b6" },
{ "vp90-2-08-tile_1x4_frame_parallel.webm",
"368ebc6ebf3a5e478d85b2c3149b2848" },
{ "vp90-2-08-tile_1x8_frame_parallel.webm",
"17e439da2388aff3a0f69cb22579c6c1" },
{ NULL, NULL }
};
DecodeFiles(files);
for (int i = 0; i < static_cast<int>(sizeof(files) / sizeof(files[0])); ++i) {
for (int t = 2; t <= 8; ++t) {
EXPECT_STREQ(files[i].expected_md5, DecodeFile(files[i].name, t).c_str())
<< "threads = " << t;
}
}
}
// Test tile quantity changes within one file.
TEST(VP9DecodeMultiThreadedTest, Decode3) {
static const FileList files[] = {
TEST(VP9DecodeMTTest, MTDecode3) {
static const struct {
const char *name;
const char *expected_md5;
} files[] = {
{ "vp90-2-14-resize-fp-tiles-1-16.webm",
"0cd5e632c326297e975f38949c31ea94" },
{ "vp90-2-14-resize-fp-tiles-1-2-4-8-16.webm",
@@ -270,10 +207,14 @@ TEST(VP9DecodeMultiThreadedTest, Decode3) {
"ae96f21f21b6370cc0125621b441fc52" },
{ "vp90-2-14-resize-fp-tiles-8-4.webm",
"3eb4f24f10640d42218f7fd7b9fd30d4" },
{ NULL, NULL }
};
DecodeFiles(files);
for (int i = 0; i < static_cast<int>(sizeof(files) / sizeof(files[0])); ++i) {
for (int t = 2; t <= 8; ++t) {
EXPECT_STREQ(files[i].expected_md5, DecodeFile(files[i].name, t).c_str())
<< "threads = " << t;
}
}
}
#endif // CONFIG_WEBM_IO

View File

@@ -1,193 +0,0 @@
/*
* Copyright (c) 2012 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <string>
#include "test/md5_helper.h"
#include "test/util.h"
#include "test/y4m_video_source.h"
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "./vpx_config.h"
#include "./y4menc.h"
namespace {
using std::string;
using std::tr1::make_tuple;
static const unsigned int kWidth = 160;
static const unsigned int kHeight = 90;
static const unsigned int kFrames = 10;
typedef std::tr1::tuple<const char *, const unsigned int,
const vpx_img_fmt, const char *> test_entry_type;
static const test_entry_type kY4mTestVectors[] = {
make_tuple("park_joy_90p_8_420.y4m", 8, VPX_IMG_FMT_I420,
"e5406275b9fc6bb3436c31d4a05c1cab"),
make_tuple("park_joy_90p_8_422.y4m", 8, VPX_IMG_FMT_I422,
"284a47a47133b12884ec3a14e959a0b6"),
make_tuple("park_joy_90p_8_444.y4m", 8, VPX_IMG_FMT_I444,
"90517ff33843d85de712fd4fe60dbed0"),
make_tuple("park_joy_90p_10_420.y4m", 10, VPX_IMG_FMT_I42016,
"63f21f9f717d8b8631bd2288ee87137b"),
make_tuple("park_joy_90p_10_422.y4m", 10, VPX_IMG_FMT_I42216,
"48ab51fb540aed07f7ff5af130c9b605"),
make_tuple("park_joy_90p_10_444.y4m", 10, VPX_IMG_FMT_I44416,
"067bfd75aa85ff9bae91fa3e0edd1e3e"),
make_tuple("park_joy_90p_12_420.y4m", 12, VPX_IMG_FMT_I42016,
"9e6d8f6508c6e55625f6b697bc461cef"),
make_tuple("park_joy_90p_12_422.y4m", 12, VPX_IMG_FMT_I42216,
"b239c6b301c0b835485be349ca83a7e3"),
make_tuple("park_joy_90p_12_444.y4m", 12, VPX_IMG_FMT_I44416,
"5a6481a550821dab6d0192f5c63845e9")
};
static void write_image_file(const vpx_image_t *img, FILE *file) {
int plane, y;
for (plane = 0; plane < 3; ++plane) {
const unsigned char *buf = img->planes[plane];
const int stride = img->stride[plane];
const int bytes_per_sample = (img->fmt & VPX_IMG_FMT_HIGH) ? 2 : 1;
const int h = (plane ? (img->d_h + img->y_chroma_shift) >>
img->y_chroma_shift : img->d_h);
const int w = (plane ? (img->d_w + img->x_chroma_shift) >>
img->x_chroma_shift : img->d_w);
for (y = 0; y < h; ++y) {
fwrite(buf, bytes_per_sample, w, file);
buf += stride;
}
}
}
class Y4mVideoSourceTest
: public ::testing::TestWithParam<test_entry_type>,
public ::libvpx_test::Y4mVideoSource {
protected:
Y4mVideoSourceTest() : Y4mVideoSource("", 0, 0) {}
virtual ~Y4mVideoSourceTest() {
CloseSource();
}
virtual void Init(const std::string &file_name, int limit) {
file_name_ = file_name;
start_ = 0;
limit_ = limit;
frame_ = 0;
Begin();
}
// Checks y4m header information
void HeaderChecks(unsigned int bit_depth, vpx_img_fmt_t fmt) {
ASSERT_TRUE(input_file_ != NULL);
ASSERT_EQ(y4m_.pic_w, (int)kWidth);
ASSERT_EQ(y4m_.pic_h, (int)kHeight);
ASSERT_EQ(img()->d_w, kWidth);
ASSERT_EQ(img()->d_h, kHeight);
ASSERT_EQ(y4m_.bit_depth, bit_depth);
ASSERT_EQ(y4m_.vpx_fmt, fmt);
if (fmt == VPX_IMG_FMT_I420 || fmt == VPX_IMG_FMT_I42016) {
ASSERT_EQ(y4m_.bps, (int)y4m_.bit_depth * 3 / 2);
ASSERT_EQ(img()->x_chroma_shift, 1U);
ASSERT_EQ(img()->y_chroma_shift, 1U);
}
if (fmt == VPX_IMG_FMT_I422 || fmt == VPX_IMG_FMT_I42216) {
ASSERT_EQ(y4m_.bps, (int)y4m_.bit_depth * 2);
ASSERT_EQ(img()->x_chroma_shift, 1U);
ASSERT_EQ(img()->y_chroma_shift, 0U);
}
if (fmt == VPX_IMG_FMT_I444 || fmt == VPX_IMG_FMT_I44416) {
ASSERT_EQ(y4m_.bps, (int)y4m_.bit_depth * 3);
ASSERT_EQ(img()->x_chroma_shift, 0U);
ASSERT_EQ(img()->y_chroma_shift, 0U);
}
}
// Checks MD5 of the raw frame data
void Md5Check(const string &expected_md5) {
ASSERT_TRUE(input_file_ != NULL);
libvpx_test::MD5 md5;
for (unsigned int i = start_; i < limit_; i++) {
md5.Add(img());
Next();
}
ASSERT_EQ(string(md5.Get()), expected_md5);
}
};
TEST_P(Y4mVideoSourceTest, SourceTest) {
const char *filename = GET_PARAM(0);
const unsigned int bit_depth = GET_PARAM(1);
const vpx_img_fmt format = GET_PARAM(2);
const char *md5raw = GET_PARAM(3);
Init(filename, kFrames);
HeaderChecks(bit_depth, format);
Md5Check(md5raw);
}
INSTANTIATE_TEST_CASE_P(C, Y4mVideoSourceTest,
::testing::ValuesIn(kY4mTestVectors));
class Y4mVideoWriteTest
: public Y4mVideoSourceTest {
protected:
Y4mVideoWriteTest() : Y4mVideoSourceTest() {}
virtual void ReplaceInputFp(FILE *input_file) {
CloseSource();
frame_ = 0;
input_file_ = input_file;
rewind(input_file_);
ReadSourceToStart();
}
// Writes out a y4m file and then reads it back
void WriteY4mAndReadBack() {
ASSERT_TRUE(input_file_ != NULL);
char buf[Y4M_BUFFER_SIZE] = {0};
const struct VpxRational framerate = {y4m_.fps_n, y4m_.fps_d};
FILE *out_file = libvpx_test::OpenTempOutFile();
ASSERT_TRUE(out_file != NULL);
y4m_write_file_header(buf, sizeof(buf),
kWidth, kHeight,
&framerate, y4m_.vpx_fmt,
y4m_.bit_depth);
fputs(buf, out_file);
for (unsigned int i = start_; i < limit_; i++) {
y4m_write_frame_header(buf, sizeof(buf));
fputs(buf, out_file);
write_image_file(img(), out_file);
Next();
}
ReplaceInputFp(out_file);
}
virtual void Init(const std::string &file_name, int limit) {
Y4mVideoSourceTest::Init(file_name, limit);
WriteY4mAndReadBack();
}
};
TEST_P(Y4mVideoWriteTest, WriteTest) {
const char *filename = GET_PARAM(0);
const unsigned int bit_depth = GET_PARAM(1);
const vpx_img_fmt format = GET_PARAM(2);
const char *md5raw = GET_PARAM(3);
Init(filename, kFrames);
HeaderChecks(bit_depth, format);
Md5Check(md5raw);
}
INSTANTIATE_TEST_CASE_P(C, Y4mVideoWriteTest,
::testing::ValuesIn(kY4mTestVectors));
} // namespace
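For reference, WriteY4mAndReadBack() above produces a stream in the plain Y4M layout: one file header line, then a FRAME marker before each raw picture. A hedged sketch of what the 8-bit 4:2:0 header is expected to look like; the exact colorspace tag and framerate emitted by y4m_write_file_header() are assumptions here, not dumped output:
/* Illustrative Y4M stream layout for the 160x90 8-bit 4:2:0 vectors.
 * The precise tags written by y4m_write_file_header() may differ. */
static const char kY4mFileHeader[] =
    "YUV4MPEG2 W160 H90 F30:1 Ip A0:0 C420jpeg\n";
static const char kY4mFrameHeader[] = "FRAME\n";  /* precedes each frame */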

View File

@@ -38,28 +38,22 @@ class Y4mVideoSource : public VideoSource {
CloseSource();
}
virtual void Begin() {
CloseSource();
input_file_ = OpenTestDataFile(file_name_);
ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: "
<< file_name_;
ASSERT_FALSE(y4m_input_open(&y4m_, input_file_, NULL, 0, 0));
framerate_numerator_ = y4m_.fps_n;
framerate_denominator_ = y4m_.fps_d;
frame_ = 0;
for (unsigned int i = 0; i < start_; i++) {
Next();
}
FillFrame();
}
virtual void Next() {

View File

@@ -90,7 +90,6 @@ struct VpxInputContext {
uint32_t width;
uint32_t height;
vpx_img_fmt_t fmt;
vpx_bit_depth_t bit_depth;
int only_i420;
uint32_t fourcc;
struct VpxRational framerate;

View File

@@ -393,12 +393,12 @@ void vp8_de_noise(VP8_COMMON *cm,
int low_var_thresh,
int flag)
{
double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
int ppl = (int)(level + .5);
int mb_rows = source->y_height >> 4;
int mb_cols = source->y_width >> 4;
unsigned char *limits = cm->pp_limits_buffer;
int mbr, mbc;
(void) post;
(void) low_var_thresh;
(void) flag;

View File

@@ -552,9 +552,6 @@ $vp8_yv12_copy_partial_frame_neon_asm=vp8_yv12_copy_partial_frame_neon;
if (vpx_config("CONFIG_TEMPORAL_DENOISING") eq "yes") {
add_proto qw/int vp8_denoiser_filter/, "unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising";
specialize qw/vp8_denoiser_filter sse2 neon/;
add_proto qw/int vp8_denoiser_filter_uv/, "unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising";
specialize qw/vp8_denoiser_filter_uv sse2/;
}
# End of encoder only functions

View File

@@ -18,18 +18,18 @@ extern "C" {
#if HAVE_EDSP
void vp8cx_pack_tokens_armv5(vp8_writer *w, const TOKENEXTRA *p, int xcount,
const vp8_token *,
const vp8_extra_bit_struct *,
const vp8_tree_index *);
void vp8cx_pack_tokens_into_partitions_armv5(VP8_COMP *,
unsigned char * cx_data,
const unsigned char *cx_data_end,
int num_parts,
const vp8_token *,
const vp8_extra_bit_struct *,
const vp8_tree_index *);
void vp8cx_pack_mb_row_tokens_armv5(VP8_COMP *cpi, vp8_writer *w,
const vp8_token *,
const vp8_extra_bit_struct *,
const vp8_tree_index *);
# define pack_tokens(a,b,c) \

View File

@@ -191,148 +191,6 @@ int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride,
return FILTER_BLOCK;
}
int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv,
int mc_avg_uv_stride,
unsigned char *running_avg_uv,
int avg_uv_stride,
unsigned char *sig,
int sig_stride,
unsigned int motion_magnitude,
int increase_denoising) {
unsigned char *running_avg_uv_start = running_avg_uv;
unsigned char *sig_start = sig;
int sum_diff_thresh;
int r, c;
int sum_diff = 0;
int sum_block = 0;
int adj_val[3] = {3, 4, 6};
int shift_inc1 = 0;
int shift_inc2 = 1;
/* If motion_magnitude is small, make the denoiser more aggressive by
* increasing the adjustment for each level. Add another increment for
* blocks that are labeled for increased denoising. */
if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV) {
if (increase_denoising) {
shift_inc1 = 1;
shift_inc2 = 2;
}
adj_val[0] += shift_inc2;
adj_val[1] += shift_inc2;
adj_val[2] += shift_inc2;
}
// Avoid denoising the color signal if it is close to the average level.
for (r = 0; r < 8; ++r) {
for (c = 0; c < 8; ++c) {
sum_block += sig[c];
}
sig += sig_stride;
}
if (abs(sum_block - (128 * 8 * 8)) < SUM_DIFF_FROM_AVG_THRESH_UV) {
return COPY_BLOCK;
}
sig -= sig_stride * 8;
for (r = 0; r < 8; ++r) {
for (c = 0; c < 8; ++c) {
int diff = 0;
int adjustment = 0;
int absdiff = 0;
diff = mc_running_avg_uv[c] - sig[c];
absdiff = abs(diff);
// When |diff| <= |3 + shift_inc1|, use pixel value from
// last denoised raw.
if (absdiff <= 3 + shift_inc1) {
running_avg_uv[c] = mc_running_avg_uv[c];
sum_diff += diff;
} else {
if (absdiff >= 4 && absdiff <= 7)
adjustment = adj_val[0];
else if (absdiff >= 8 && absdiff <= 15)
adjustment = adj_val[1];
else
adjustment = adj_val[2];
if (diff > 0) {
if ((sig[c] + adjustment) > 255)
running_avg_uv[c] = 255;
else
running_avg_uv[c] = sig[c] + adjustment;
sum_diff += adjustment;
} else {
if ((sig[c] - adjustment) < 0)
running_avg_uv[c] = 0;
else
running_avg_uv[c] = sig[c] - adjustment;
sum_diff -= adjustment;
}
}
}
/* Update pointers for next iteration. */
sig += sig_stride;
mc_running_avg_uv += mc_avg_uv_stride;
running_avg_uv += avg_uv_stride;
}
sum_diff_thresh = SUM_DIFF_THRESHOLD_UV;
if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH_UV;
if (abs(sum_diff) > sum_diff_thresh) {
// Before returning to copy the block (i.e., apply no denoising), check
// if we can still apply some (weaker) temporal filtering to this block,
// that would otherwise not be denoised at all. Simplest is to apply
// an additional adjustment to running_avg_uv to bring it closer to sig.
// The adjustment is capped by a maximum delta, and chosen such that
// in most cases the resulting sum_diff will be within the
// acceptable range given by sum_diff_thresh.
// The delta is set by the excess of absolute pixel diff over threshold.
int delta = ((abs(sum_diff) - sum_diff_thresh) >> 8) + 1;
// Only apply the adjustment for max delta up to 3.
if (delta < 4) {
sig -= sig_stride * 8;
mc_running_avg_uv -= mc_avg_uv_stride * 8;
running_avg_uv -= avg_uv_stride * 8;
for (r = 0; r < 8; ++r) {
for (c = 0; c < 8; ++c) {
int diff = mc_running_avg_uv[c] - sig[c];
int adjustment = abs(diff);
if (adjustment > delta)
adjustment = delta;
if (diff > 0) {
// Bring denoised signal down.
if (running_avg_uv[c] - adjustment < 0)
running_avg_uv[c] = 0;
else
running_avg_uv[c] = running_avg_uv[c] - adjustment;
sum_diff -= adjustment;
} else if (diff < 0) {
// Bring denoised signal up.
if (running_avg_uv[c] + adjustment > 255)
running_avg_uv[c] = 255;
else
running_avg_uv[c] = running_avg_uv[c] + adjustment;
sum_diff += adjustment;
}
}
// TODO(marpan): Check here if abs(sum_diff) has gone below the
// threshold sum_diff_thresh, and if so, we can exit the row loop.
sig += sig_stride;
mc_running_avg_uv += mc_avg_uv_stride;
running_avg_uv += avg_uv_stride;
}
if (abs(sum_diff) > sum_diff_thresh)
return COPY_BLOCK;
} else {
return COPY_BLOCK;
}
}
vp8_copy_mem8x8(running_avg_uv_start, avg_uv_stride, sig_start,
sig_stride);
return FILTER_BLOCK;
}
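The weak-filtering fallback above is easier to follow with numbers. A small standalone example of the delta computation, with an illustrative sum_diff of 300 (SUM_DIFF_THRESHOLD_UV is 96, as defined in the denoiser header):
#include <stdio.h>
#include <stdlib.h>

int main(void) {
  const int sum_diff = 300;        /* illustrative accumulated difference */
  const int sum_diff_thresh = 96;  /* SUM_DIFF_THRESHOLD_UV */
  /* Excess over the threshold, scaled down by 256, plus one. */
  const int delta = ((abs(sum_diff) - sum_diff_thresh) >> 8) + 1;
  /* (300 - 96) >> 8 == 0, so delta == 1: each pixel of the running
   * average may move at most 1 toward the source before the threshold
   * test is repeated. A delta of 4 or more gives up and copies. */
  printf("delta = %d\n", delta);
  return 0;
}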
int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height,
int num_mb_rows, int num_mb_cols)
{
@@ -383,7 +241,6 @@ void vp8_denoiser_free(VP8_DENOISER *denoiser)
vp8_yv12_de_alloc_frame_buffer(&denoiser->yv12_running_avg[i]);
}
vp8_yv12_de_alloc_frame_buffer(&denoiser->yv12_mc_running_avg);
vpx_free(denoiser->denoise_state);
}
@@ -403,8 +260,6 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
unsigned int motion_magnitude2;
unsigned int sse_thresh;
int sse_diff_thresh = 0;
// Whether to also denoise the UV channels (currently disabled).
int apply_color_denoise = 0;
// Spatial loop filter: only applied selectively based on
// temporal filter state of block relative to top/left neighbors.
int apply_spatial_loop_filter = 1;
@@ -412,8 +267,6 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
MV_REFERENCE_FRAME zero_frame = x->best_zeromv_reference_frame;
enum vp8_denoiser_decision decision = FILTER_BLOCK;
enum vp8_denoiser_decision decision_u = FILTER_BLOCK;
enum vp8_denoiser_decision decision_v = FILTER_BLOCK;
if (zero_frame)
{
@@ -523,37 +376,11 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
/* Filter. */
decision = vp8_denoiser_filter(mc_running_avg_y, mc_avg_y_stride,
running_avg_y, avg_y_stride,
x->thismb, 16, motion_magnitude2,
x->increase_denoising);
denoiser->denoise_state[block_index] = motion_magnitude2 > 0 ?
kFilterNonZeroMV : kFilterZeroMV;
// Only denoise UV for zero motion, and if y channel was denoised.
if (apply_color_denoise &&
motion_magnitude2 == 0 &&
decision == FILTER_BLOCK) {
unsigned char *mc_running_avg_u =
denoiser->yv12_mc_running_avg.u_buffer + recon_uvoffset;
unsigned char *running_avg_u =
denoiser->yv12_running_avg[INTRA_FRAME].u_buffer + recon_uvoffset;
unsigned char *mc_running_avg_v =
denoiser->yv12_mc_running_avg.v_buffer + recon_uvoffset;
unsigned char *running_avg_v =
denoiser->yv12_running_avg[INTRA_FRAME].v_buffer + recon_uvoffset;
int mc_avg_uv_stride = denoiser->yv12_mc_running_avg.uv_stride;
int avg_uv_stride = denoiser->yv12_running_avg[INTRA_FRAME].uv_stride;
int signal_stride = x->block[16].src_stride;
decision_u =
vp8_denoiser_filter_uv(mc_running_avg_u, mc_avg_uv_stride,
running_avg_u, avg_uv_stride,
x->block[16].src + *x->block[16].base_src,
signal_stride, motion_magnitude2, 0);
decision_v =
vp8_denoiser_filter_uv(mc_running_avg_v, mc_avg_uv_stride,
running_avg_v, avg_uv_stride,
x->block[20].src + *x->block[20].base_src,
signal_stride, motion_magnitude2, 0);
}
}
if (decision == COPY_BLOCK)
{
@@ -566,21 +393,7 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
denoiser->yv12_running_avg[INTRA_FRAME].y_stride);
denoiser->denoise_state[block_index] = kNoFilter;
}
if (apply_color_denoise) {
if (decision_u == COPY_BLOCK) {
vp8_copy_mem8x8(
x->block[16].src + *x->block[16].base_src, x->block[16].src_stride,
denoiser->yv12_running_avg[INTRA_FRAME].u_buffer + recon_uvoffset,
denoiser->yv12_running_avg[INTRA_FRAME].uv_stride);
}
if (decision_v == COPY_BLOCK) {
vp8_copy_mem8x8(
x->block[20].src + *x->block[20].base_src, x->block[16].src_stride,
denoiser->yv12_running_avg[INTRA_FRAME].v_buffer + recon_uvoffset,
denoiser->yv12_running_avg[INTRA_FRAME].uv_stride);
}
}
// Option to selectively deblock the denoised signal, for y channel only.
if (apply_spatial_loop_filter) {
loop_filter_info lfi;
int apply_filter_col = 0;

View File

@@ -22,11 +22,6 @@ extern "C" {
#define SUM_DIFF_THRESHOLD_HIGH (16 * 16 * 3)
#define MOTION_MAGNITUDE_THRESHOLD (8*3)
#define SUM_DIFF_THRESHOLD_UV (96) // (8 * 8 * 1.5)
#define SUM_DIFF_THRESHOLD_HIGH_UV (8 * 8 * 2)
#define SUM_DIFF_FROM_AVG_THRESH_UV (8 * 8 * 4)
#define MOTION_MAGNITUDE_THRESHOLD_UV (8*3)
enum vp8_denoiser_decision
{
COPY_BLOCK,

View File

@@ -17,23 +17,10 @@
#include <emmintrin.h>
#include "vpx_ports/emmintrin_compat.h"
/* Compute the sum of all pixel differences of this MB. */
static INLINE unsigned int abs_sum_diff_16x1(__m128i acc_diff) {
const __m128i k_1 = _mm_set1_epi16(1);
const __m128i acc_diff_lo = _mm_srai_epi16(
_mm_unpacklo_epi8(acc_diff, acc_diff), 8);
const __m128i acc_diff_hi = _mm_srai_epi16(
_mm_unpackhi_epi8(acc_diff, acc_diff), 8);
const __m128i acc_diff_16 = _mm_add_epi16(acc_diff_lo, acc_diff_hi);
const __m128i hg_fe_dc_ba = _mm_madd_epi16(acc_diff_16, k_1);
const __m128i hgfe_dcba = _mm_add_epi32(hg_fe_dc_ba,
_mm_srli_si128(hg_fe_dc_ba, 8));
const __m128i hgfedcba = _mm_add_epi32(hgfe_dcba,
_mm_srli_si128(hgfe_dcba, 4));
unsigned int sum_diff = _mm_cvtsi128_si32(hgfedcba);
return abs(sum_diff);
}
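abs_sum_diff_16x1() horizontally sums the sixteen signed byte lanes of acc_diff and returns the absolute value; the union-based summation kept elsewhere in this diff is its scalar form. A standalone scalar reference, useful for sanity-checking the intrinsics (a sketch, not part of the library):
#include <stdlib.h>

/* Scalar reference for abs_sum_diff_16x1(): sum 16 signed chars and
 * return the absolute value of the total. */
static unsigned int abs_sum_diff_16x1_ref(const signed char e[16]) {
  int sum = 0, i;
  for (i = 0; i < 16; ++i) sum += e[i];
  return (unsigned int)abs(sum);
}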
union sum_union {
__m128i v;
signed char e[16];
};
int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y,
int mc_avg_y_stride,
@@ -44,7 +31,7 @@ int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y,
{
unsigned char *running_avg_y_start = running_avg_y;
unsigned char *sig_start = sig;
int sum_diff_thresh;
int r;
int shift_inc = (increase_denoising &&
motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 1 : 0;
@@ -116,10 +103,16 @@ int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y,
{
/* Compute the sum of all pixel differences of this MB. */
union sum_union s;
int sum_diff = 0;
s.v = acc_diff;
sum_diff = s.e[0] + s.e[1] + s.e[2] + s.e[3] + s.e[4] + s.e[5]
+ s.e[6] + s.e[7] + s.e[8] + s.e[9] + s.e[10] + s.e[11]
+ s.e[12] + s.e[13] + s.e[14] + s.e[15];
sum_diff_thresh = SUM_DIFF_THRESHOLD;
if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH;
if (abs(sum_diff) > sum_diff_thresh) {
// Before returning to copy the block (i.e., apply no denoising),
// check if we can still apply some (weaker) temporal filtering to
// this block, that would otherwise not be denoised at all. Simplest
@@ -130,7 +123,7 @@ int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y,
// The delta is set by the excess of absolute pixel diff over the
// threshold.
int delta = ((abs(sum_diff) - sum_diff_thresh) >> 8) + 1;
// Only apply the adjustment for max delta up to 3.
if (delta < 4) {
const __m128i k_delta = _mm_set1_epi8(delta);
@@ -169,9 +162,16 @@ int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y,
mc_running_avg_y += mc_avg_y_stride;
running_avg_y += avg_y_stride;
}
{
// Update the sum of all pixel differences of this MB.
union sum_union s;
s.v = acc_diff;
sum_diff = s.e[0] + s.e[1] + s.e[2] + s.e[3] + s.e[4] + s.e[5]
+ s.e[6] + s.e[7] + s.e[8] + s.e[9] + s.e[10] + s.e[11]
+ s.e[12] + s.e[13] + s.e[14] + s.e[15];
if (abs(sum_diff) > sum_diff_thresh) {
return COPY_BLOCK;
}
}
} else {
return COPY_BLOCK;
@@ -182,198 +182,3 @@ int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y,
vp8_copy_mem16x16(running_avg_y_start, avg_y_stride, sig_start, sig_stride);
return FILTER_BLOCK;
}
int vp8_denoiser_filter_uv_sse2(unsigned char *mc_running_avg,
int mc_avg_stride,
unsigned char *running_avg, int avg_stride,
unsigned char *sig, int sig_stride,
unsigned int motion_magnitude,
int increase_denoising) {
unsigned char *running_avg_start = running_avg;
unsigned char *sig_start = sig;
unsigned int sum_diff_thresh;
int r;
int shift_inc = (increase_denoising &&
motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV) ? 1 : 0;
__m128i acc_diff = _mm_setzero_si128();
const __m128i k_0 = _mm_setzero_si128();
const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);
const __m128i k_8 = _mm_set1_epi8(8);
const __m128i k_16 = _mm_set1_epi8(16);
/* Modify each level's adjustment according to motion_magnitude. */
const __m128i l3 = _mm_set1_epi8(
(motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV) ?
7 + shift_inc : 6);
/* Difference between level 3 and level 2 is 2. */
const __m128i l32 = _mm_set1_epi8(2);
/* Difference between level 2 and level 1 is 1. */
const __m128i l21 = _mm_set1_epi8(1);
{
const __m128i k_1 = _mm_set1_epi16(1);
__m128i vec_sum_block = _mm_setzero_si128();
// Avoid denoising the color signal if it is close to the average level.
for (r = 0; r < 8; ++r) {
const __m128i v_sig = _mm_loadl_epi64((__m128i *)(&sig[0]));
const __m128i v_sig_unpack = _mm_unpacklo_epi8(v_sig, k_0);
vec_sum_block = _mm_add_epi16(vec_sum_block, v_sig_unpack);
sig += sig_stride;
}
sig -= sig_stride * 8;
{
const __m128i hg_fe_dc_ba = _mm_madd_epi16(vec_sum_block, k_1);
const __m128i hgfe_dcba = _mm_add_epi32(hg_fe_dc_ba,
_mm_srli_si128(hg_fe_dc_ba, 8));
const __m128i hgfedcba = _mm_add_epi32(hgfe_dcba,
_mm_srli_si128(hgfe_dcba, 4));
const int sum_block = _mm_cvtsi128_si32(hgfedcba);
if (abs(sum_block - (128 * 8 * 8)) < SUM_DIFF_FROM_AVG_THRESH_UV) {
return COPY_BLOCK;
}
}
}
for (r = 0; r < 4; ++r) {
/* Calculate differences */
const __m128i v_sig_low = _mm_castpd_si128(
_mm_load_sd((double *)(&sig[0])));
const __m128i v_sig = _mm_castpd_si128(
_mm_loadh_pd(_mm_castsi128_pd(v_sig_low),
(double *)(&sig[sig_stride])));
const __m128i v_mc_running_avg_low = _mm_castpd_si128(
_mm_load_sd((double *)(&mc_running_avg[0])));
const __m128i v_mc_running_avg = _mm_castpd_si128(
_mm_loadh_pd(_mm_castsi128_pd(v_mc_running_avg_low),
(double *)(&mc_running_avg[mc_avg_stride])));
const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg, v_sig);
const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg);
/* Obtain the sign. FF if diff is negative. */
const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
/* Clamp absolute difference to 16 to be used to get mask. Doing this
* allows us to use _mm_cmpgt_epi8, which operates on signed byte. */
const __m128i clamped_absdiff = _mm_min_epu8(
_mm_or_si128(pdiff, ndiff), k_16);
/* Get masks for l2 l1 and l0 adjustments */
const __m128i mask2 = _mm_cmpgt_epi8(k_16, clamped_absdiff);
const __m128i mask1 = _mm_cmpgt_epi8(k_8, clamped_absdiff);
const __m128i mask0 = _mm_cmpgt_epi8(k_4, clamped_absdiff);
/* Get adjustments for l2, l1, and l0 */
__m128i adj2 = _mm_and_si128(mask2, l32);
const __m128i adj1 = _mm_and_si128(mask1, l21);
const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff);
__m128i adj, padj, nadj;
__m128i v_running_avg;
/* Combine the adjustments and get absolute adjustments. */
adj2 = _mm_add_epi8(adj2, adj1);
adj = _mm_sub_epi8(l3, adj2);
adj = _mm_andnot_si128(mask0, adj);
adj = _mm_or_si128(adj, adj0);
/* Restore the sign and get positive and negative adjustments. */
padj = _mm_andnot_si128(diff_sign, adj);
nadj = _mm_and_si128(diff_sign, adj);
/* Calculate filtered value. */
v_running_avg = _mm_adds_epu8(v_sig, padj);
v_running_avg = _mm_subs_epu8(v_running_avg, nadj);
_mm_storel_pd((double *)&running_avg[0],
_mm_castsi128_pd(v_running_avg));
_mm_storeh_pd((double *)&running_avg[avg_stride],
_mm_castsi128_pd(v_running_avg));
/* Adjustments <=7, and each element in acc_diff can fit in signed
* char.
*/
acc_diff = _mm_adds_epi8(acc_diff, padj);
acc_diff = _mm_subs_epi8(acc_diff, nadj);
/* Update pointers for next iteration. */
sig += sig_stride * 2;
mc_running_avg += mc_avg_stride * 2;
running_avg += avg_stride * 2;
}
{
unsigned int abs_sum_diff = abs_sum_diff_16x1(acc_diff);
sum_diff_thresh = SUM_DIFF_THRESHOLD_UV;
if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH_UV;
if (abs_sum_diff > sum_diff_thresh) {
// Before returning to copy the block (i.e., apply no denoising),
// check if we can still apply some (weaker) temporal filtering to
// this block, that would otherwise not be denoised at all. Simplest
// is to apply an additional adjustment to running_avg to bring it
// closer to sig. The adjustment is capped by a maximum delta, and
// chosen such that in most cases the resulting sum_diff will be
// within the acceptable range given by sum_diff_thresh.
// The delta is set by the excess of absolute pixel diff over the
// threshold.
int delta = ((abs_sum_diff - sum_diff_thresh) >> 8) + 1;
// Only apply the adjustment for max delta up to 3.
if (delta < 4) {
const __m128i k_delta = _mm_set1_epi8(delta);
sig -= sig_stride * 8;
mc_running_avg -= mc_avg_stride * 8;
running_avg -= avg_stride * 8;
for (r = 0; r < 4; ++r) {
// Calculate differences.
const __m128i v_sig_low = _mm_castpd_si128(
_mm_load_sd((double *)(&sig[0])));
const __m128i v_sig = _mm_castpd_si128(
_mm_loadh_pd(_mm_castsi128_pd(v_sig_low),
(double *)(&sig[sig_stride])));
const __m128i v_mc_running_avg_low = _mm_castpd_si128(
_mm_load_sd((double *)(&mc_running_avg[0])));
const __m128i v_mc_running_avg = _mm_castpd_si128(
_mm_loadh_pd(_mm_castsi128_pd(v_mc_running_avg_low),
(double *)(&mc_running_avg[mc_avg_stride])));
const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg, v_sig);
const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg);
// Obtain the sign. FF if diff is negative.
const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
// Clamp absolute difference to delta to get the adjustment.
const __m128i adj =
_mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta);
// Restore the sign and get positive and negative adjustments.
__m128i padj, nadj;
const __m128i v_running_avg_low = _mm_castpd_si128(
_mm_load_sd((double *)(&running_avg[0])));
__m128i v_running_avg = _mm_castpd_si128(
_mm_loadh_pd(_mm_castsi128_pd(v_running_avg_low),
(double *)(&running_avg[avg_stride])));
padj = _mm_andnot_si128(diff_sign, adj);
nadj = _mm_and_si128(diff_sign, adj);
// Calculate filtered value.
v_running_avg = _mm_subs_epu8(v_running_avg, padj);
v_running_avg = _mm_adds_epu8(v_running_avg, nadj);
_mm_storel_pd((double *)&running_avg[0],
_mm_castsi128_pd(v_running_avg));
_mm_storeh_pd((double *)&running_avg[avg_stride],
_mm_castsi128_pd(v_running_avg));
// Accumulate the adjustments.
acc_diff = _mm_subs_epi8(acc_diff, padj);
acc_diff = _mm_adds_epi8(acc_diff, nadj);
// Update pointers for next iteration.
sig += sig_stride * 2;
mc_running_avg += mc_avg_stride * 2;
running_avg += avg_stride * 2;
}
abs_sum_diff = abs_sum_diff_16x1(acc_diff);
if (abs_sum_diff > sum_diff_thresh) {
return COPY_BLOCK;
}
} else {
return COPY_BLOCK;
}
}
}
vp8_copy_mem8x8(running_avg_start, avg_stride, sig_start, sig_stride);
return FILTER_BLOCK;
}

View File

@@ -97,15 +97,14 @@ static void free_mi(VP9_COMMON *cm) {
void vp9_free_frame_buffers(VP9_COMMON *cm) {
int i;
for (i = 0; i < FRAME_BUFFERS; ++i) {
vp9_free_frame_buffer(&cm->frame_bufs[i].buf);
if (cm->frame_bufs[i].ref_count > 0 &&
cm->frame_bufs[i].raw_frame_buffer.data != NULL) {
cm->release_fb_cb(cm->cb_priv, &cm->frame_bufs[i].raw_frame_buffer);
cm->frame_bufs[i].ref_count = 0;
}
}
@@ -128,15 +127,12 @@ void vp9_free_context_buffers(VP9_COMMON *cm) {
int vp9_resize_frame_buffers(VP9_COMMON *cm, int width, int height) {
const int aligned_width = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2);
const int aligned_height = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2);
#if CONFIG_INTERNAL_STATS || CONFIG_VP9_POSTPROC
const int ss_x = cm->subsampling_x;
const int ss_y = cm->subsampling_y;
// TODO(agrange): this should be conditionally allocated.
if (vp9_realloc_frame_buffer(&cm->post_proc_buffer, width, height, ss_x, ss_y,
VP9_DEC_BORDER_IN_PIXELS, NULL, NULL, NULL) < 0)
goto fail;
#endif
set_mb_mi(cm, aligned_width, aligned_height);
@@ -177,14 +173,13 @@ int vp9_resize_frame_buffers(VP9_COMMON *cm, int width, int height) {
static void init_frame_bufs(VP9_COMMON *cm) {
int i;
cm->new_fb_idx = FRAME_BUFFERS - 1;
cm->frame_bufs[cm->new_fb_idx].ref_count = 1;
for (i = 0; i < REF_FRAMES; ++i) {
cm->ref_frame_map[i] = i;
cm->frame_bufs[i].ref_count = 1;
}
}
@@ -192,24 +187,21 @@ int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) {
int i;
const int ss_x = cm->subsampling_x;
const int ss_y = cm->subsampling_y;
vp9_free_frame_buffers(cm);
for (i = 0; i < FRAME_BUFFERS; ++i) {
cm->frame_bufs[i].ref_count = 0;
if (vp9_alloc_frame_buffer(&cm->frame_bufs[i].buf, width, height,
ss_x, ss_y, VP9_ENC_BORDER_IN_PIXELS) < 0)
goto fail;
}
init_frame_bufs(cm);
#if CONFIG_INTERNAL_STATS || CONFIG_VP9_POSTPROC
if (vp9_alloc_frame_buffer(&cm->post_proc_buffer, width, height, ss_x, ss_y,
VP9_ENC_BORDER_IN_PIXELS) < 0)
goto fail;
#endif
return 0;
@@ -259,7 +251,7 @@ int vp9_alloc_context_buffers(VP9_COMMON *cm, int width, int height) {
void vp9_remove_common(VP9_COMMON *cm) {
vp9_free_frame_buffers(cm);
vp9_free_context_buffers(cm);
vp9_free_internal_frame_buffers(&cm->int_frame_buffers);
}
void vp9_update_frame_size(VP9_COMMON *cm) {

View File

@@ -44,7 +44,7 @@ void vp9_foreach_transformed_block_in_plane(
// block and transform sizes, in number of 4x4 blocks log 2 ("*_b")
// 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8
// transform size varies per plane, look it up in a common way.
const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi)
: mbmi->tx_size;
const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];

View File

@@ -32,6 +32,9 @@ extern "C" {
#define BLOCK_SIZE_GROUPS 4
#define SKIP_CONTEXTS 3
#define INTER_MODE_CONTEXTS 7
#if CONFIG_COPY_CODING
#define COPY_MODE_CONTEXTS 5
#endif
/* Segment Feature Masks */
#define MAX_MV_REF_CANDIDATES 2
@@ -79,6 +82,16 @@ typedef enum {
MB_MODE_COUNT
} PREDICTION_MODE;
#if CONFIG_COPY_CODING
typedef enum {
NOREF,
REF0,
REF1,
REF2,
COPY_MODE_COUNT
} COPY_MODE;
#endif
static INLINE int is_inter_mode(PREDICTION_MODE mode) {
return mode >= NEARESTMV && mode <= NEWMV;
}
@@ -118,11 +131,86 @@ static INLINE int mi_width_log2(BLOCK_SIZE sb_type) {
return mi_width_log2_lookup[sb_type];
}
#if CONFIG_SUPERTX
static INLINE TX_SIZE bsize_to_tx_size(BLOCK_SIZE bsize) {
const TX_SIZE tx_size_lookup[BLOCK_SIZES] = {
TX_4X4, TX_4X4, TX_4X4,
TX_8X8, TX_8X8, TX_8X8,
TX_16X16, TX_16X16, TX_16X16,
TX_32X32, TX_32X32, TX_32X32, TX_32X32};
return tx_size_lookup[bsize];
}
#endif
#if CONFIG_MASKED_INTERINTER
#define MASK_BITS_SML 3
#define MASK_BITS_MED 4
#define MASK_BITS_BIG 5
#define MASK_NONE -1
static INLINE int get_mask_bits(BLOCK_SIZE sb_type) {
if (sb_type < BLOCK_8X8)
return 0;
if (sb_type <= BLOCK_8X8)
return MASK_BITS_SML;
else if (sb_type <= BLOCK_32X32)
return MASK_BITS_MED;
else
return MASK_BITS_BIG;
}
#endif
#if CONFIG_INTERINTRA
static INLINE TX_SIZE intra_size_log2_for_interintra(int bs) {
switch (bs) {
case 4:
return TX_4X4;
case 8:
return TX_8X8;
case 16:
return TX_16X16;
case 32:
default:
return TX_32X32;
}
}
static INLINE int is_interintra_allowed(BLOCK_SIZE sb_type) {
return ((sb_type >= BLOCK_8X8) && (sb_type < BLOCK_64X64));
}
#if CONFIG_MASKED_INTERINTRA
#define MASK_BITS_SML_INTERINTRA 3
#define MASK_BITS_MED_INTERINTRA 4
#define MASK_BITS_BIG_INTERINTRA 5
#define MASK_NONE_INTERINTRA -1
static INLINE int get_mask_bits_interintra(BLOCK_SIZE sb_type) {
if (sb_type == BLOCK_4X4)
return 0;
if (sb_type <= BLOCK_8X8)
return MASK_BITS_SML_INTERINTRA;
else if (sb_type <= BLOCK_32X32)
return MASK_BITS_MED_INTERINTRA;
else
return MASK_BITS_BIG_INTERINTRA;
}
#endif
#endif
// This structure now relates to 8x8 block regions.
typedef struct {
// Common for both INTER and INTRA blocks
BLOCK_SIZE sb_type;
PREDICTION_MODE mode;
#if CONFIG_FILTERINTRA
int filterbit, uv_filterbit;
#endif
TX_SIZE tx_size;
uint8_t skip;
uint8_t segment_id;
@@ -137,10 +225,34 @@ typedef struct {
int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
uint8_t mode_context[MAX_REF_FRAMES];
INTERP_FILTER interp_filter;
#if CONFIG_EXT_TX
EXT_TX_TYPE ext_txfrm;
#endif
#if CONFIG_MASKED_INTERINTER
int use_masked_interinter;
int mask_index;
#endif
#if CONFIG_INTERINTRA
PREDICTION_MODE interintra_mode, interintra_uv_mode;
#if CONFIG_MASKED_INTERINTRA
int interintra_mask_index;
int interintra_uv_mask_index;
int use_masked_interintra;
#endif
#endif
#if CONFIG_COPY_CODING
COPY_MODE copy_mode;
int inter_ref_count;
#endif
} MB_MODE_INFO;
typedef struct {
MB_MODE_INFO mbmi;
#if CONFIG_FILTERINTRA
int b_filter_info[4];
#endif
b_mode_info bmi[4];
} MODE_INFO;
@@ -149,6 +261,16 @@ static INLINE PREDICTION_MODE get_y_mode(const MODE_INFO *mi, int block) {
: mi->mbmi.mode;
}
#if CONFIG_FILTERINTRA
static INLINE int is_filter_allowed(PREDICTION_MODE mode) {
return 1;
}
static INLINE int is_filter_enabled(TX_SIZE txsize) {
return (txsize <= TX_32X32);
}
#endif
static INLINE int is_inter_block(const MB_MODE_INFO *mbmi) {
return mbmi->ref_frame[0] > INTRA_FRAME;
}
@@ -240,6 +362,13 @@ typedef struct macroblockd {
PARTITION_CONTEXT left_seg_context[8];
} MACROBLOCKD;
#if CONFIG_SUPERTX
static INLINE int supertx_enabled(const MB_MODE_INFO *mbmi) {
return mbmi->tx_size >
MIN(b_width_log2(mbmi->sb_type), b_height_log2(mbmi->sb_type));
}
#endif
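supertx_enabled() flags a block whose transform exceeds its prediction size: both are compared on a log2-of-4x4-blocks scale, where TX_16X16 == 2 matches the width of a 16-pixel edge. A standalone restatement with the lookups replaced by plain integers (illustration only):
/* bwl/bhl stand in for b_width_log2()/b_height_log2() of the block. */
static int supertx_enabled_sketch(int tx_size, int bwl, int bhl) {
  const int min_bl = bwl < bhl ? bwl : bhl;
  return tx_size > min_bl;
}
/* BLOCK_16X16 (bwl == bhl == 2): TX_16X16 (2) -> 0, TX_32X32 (3) -> 1.
 * BLOCK_16X8 (min 1): TX_16X16 (2) -> 1, i.e. supertx covers the pair. */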
static INLINE BLOCK_SIZE get_subsize(BLOCK_SIZE bsize,
PARTITION_TYPE partition) {
const BLOCK_SIZE subsize = subsize_lookup[partition][bsize];
@@ -253,8 +382,20 @@ static INLINE TX_TYPE get_tx_type(PLANE_TYPE plane_type,
const MACROBLOCKD *xd) {
const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
#if !CONFIG_EXT_TX
if (plane_type != PLANE_TYPE_Y || is_inter_block(mbmi))
return DCT_DCT;
#else
if (plane_type != PLANE_TYPE_Y)
return DCT_DCT;
if (is_inter_block(mbmi)) {
if (mbmi->ext_txfrm == NORM || mbmi->tx_size >= TX_32X32)
return DCT_DCT;
else
return ADST_ADST;
}
#endif
return intra_mode_to_tx_type_lookup[mbmi->mode];
}
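Under CONFIG_EXT_TX, get_tx_type() gives inter blocks a single alternative: ext_txfrm == ALT selects ADST_ADST for transforms below 32x32, and everything else stays DCT_DCT. The inter-block rule in isolation, as a sketch using the enums declared in these headers:
static INLINE TX_TYPE inter_ext_tx_type_sketch(EXT_TX_TYPE ext_txfrm,
                                               TX_SIZE tx_size) {
  if (ext_txfrm == NORM || tx_size >= TX_32X32)
    return DCT_DCT;
  return ADST_ADST;  /* ALT transform, only below 32x32 */
}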
@@ -262,28 +403,46 @@ static INLINE TX_TYPE get_tx_type_4x4(PLANE_TYPE plane_type,
const MACROBLOCKD *xd, int ib) {
const MODE_INFO *const mi = xd->mi[0];
#if !CONFIG_EXT_TX
if (plane_type != PLANE_TYPE_Y || xd->lossless || is_inter_block(&mi->mbmi))
return DCT_DCT;
#else
if (plane_type != PLANE_TYPE_Y || xd->lossless)
return DCT_DCT;
if (is_inter_block(&mi->mbmi)) {
if (mi->mbmi.ext_txfrm == NORM)
return DCT_DCT;
else
return ADST_ADST;
}
#endif
return intra_mode_to_tx_type_lookup[get_y_mode(mi, ib)];
}
void vp9_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y);
static INLINE TX_SIZE get_uv_tx_size_impl(TX_SIZE y_tx_size, BLOCK_SIZE bsize) {
if (bsize < BLOCK_8X8) {
return TX_4X4;
} else {
// TODO(dkovalev): Assuming YUV420 (ss_x == 1, ss_y == 1)
const BLOCK_SIZE plane_bsize = ss_size_lookup[bsize][1][1];
return MIN(y_tx_size, max_txsize_lookup[plane_bsize]);
}
}
static INLINE TX_SIZE get_uv_tx_size(const MB_MODE_INFO *mbmi) {
#if CONFIG_SUPERTX
if (!supertx_enabled(mbmi)) {
#endif
return get_uv_tx_size_impl(mbmi->tx_size, mbmi->sb_type);
#if CONFIG_SUPERTX
} else {
return uvsupertx_size_lookup[mbmi->tx_size];
}
#endif
}
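A worked example of the chroma clamp above, under the hard-coded YUV420 assumption:
/* BLOCK_16X16 with a TX_16X16 luma transform, 4:2:0:
 *   ss_size_lookup[BLOCK_16X16][1][1] == BLOCK_8X8
 *   max_txsize_lookup[BLOCK_8X8] == TX_8X8
 *   -> get_uv_tx_size_impl() returns MIN(TX_16X16, TX_8X8) == TX_8X8.
 * With supertx enabled, uvsupertx_size_lookup[TX_16X16] == TX_8X8 too. */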
static INLINE BLOCK_SIZE get_plane_block_size(BLOCK_SIZE bsize,

View File

@@ -133,6 +133,15 @@ const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2] = {
{{BLOCK_64X64, BLOCK_64X32}, {BLOCK_32X64, BLOCK_32X32}},
};
#if CONFIG_SUPERTX
const TX_SIZE uvsupertx_size_lookup[TX_SIZES] = {
TX_4X4,
TX_4X4,
TX_8X8,
TX_16X16
};
#endif
// Generates 4 bit field in which each bit set to 1 represents
// a blocksize partition 1111 means we split 64x64, 32x32, 16x16
// and 8x8. 1000 means we just split the 64x64 to 32x32

View File

@@ -31,6 +31,9 @@ extern const BLOCK_SIZE subsize_lookup[PARTITION_TYPES][BLOCK_SIZES];
extern const TX_SIZE max_txsize_lookup[BLOCK_SIZES];
extern const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES];
extern const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2];
#if CONFIG_SUPERTX
extern const TX_SIZE uvsupertx_size_lookup[TX_SIZES];
#endif
#ifdef __cplusplus
} // extern "C"

View File

@@ -13,6 +13,84 @@
#include "vp9/common/vp9_onyxc_int.h"
#include "vp9/common/vp9_seg_common.h"
#if CONFIG_MASKED_INTERINTER
static const vp9_prob default_masked_interinter_prob[BLOCK_SIZES] = {
192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192
};
#endif
#if CONFIG_INTERINTRA
static const vp9_prob default_interintra_prob[BLOCK_SIZES] = {
192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192
};
#if CONFIG_MASKED_INTERINTRA
static const vp9_prob default_masked_interintra_prob[BLOCK_SIZES] = {
192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192
};
#endif
#endif
#if CONFIG_FILTERINTRA
static const vp9_prob default_filterintra_prob[TX_SIZES][INTRA_MODES] = {
// DC V H D45 D135 D117 D153 D207 D63 TM
{153, 171, 147, 150, 129, 101, 100, 153, 132, 111},
{171, 173, 185, 131, 70, 53, 70, 148, 127, 114},
{175, 203, 213, 86, 45, 71, 41, 150, 125, 154},
{235, 230, 154, 202, 154, 205, 37, 128, 0, 202}
};
#endif
#if CONFIG_EXT_TX
static const vp9_prob default_ext_tx_prob = 178;
#endif
#if CONFIG_SUPERTX
static const vp9_prob default_supertx_prob[TX_SIZES] = {
255, 160, 160, 160
};
static const vp9_prob default_supertxsplit_prob[TX_SIZES] = {
255, 200, 200, 200
};
#endif
#if CONFIG_COPY_CODING
static const vp9_prob default_copy_noref_prob[COPY_MODE_CONTEXTS]
[BLOCK_SIZES] = {
{255, 255, 255, 82, 148, 182, 65, 193, 158, 70, 138, 101, 23},
{255, 255, 255, 118, 153, 161, 123, 169, 157, 82, 101, 123, 88},
{255, 255, 255, 130, 178, 226, 194, 196, 174, 173, 135, 144, 141},
{255, 255, 255, 178, 218, 225, 197, 230, 222, 215, 220, 220, 220},
{255, 255, 255, 243, 248, 241, 233, 249, 249, 249, 249, 249, 249}
};
static const vp9_prob default_copy_mode_probs_l2[COPY_MODE_CONTEXTS][1] = {
{207},
{135},
{141},
{189},
{209}
};
const vp9_tree_index vp9_copy_mode_tree_l2[TREE_SIZE(2)] = {
-(REF0 - REF0), -(REF1 - REF0)
};
static const vp9_prob default_copy_mode_probs[COPY_MODE_CONTEXTS]
[COPY_MODE_COUNT - 2] = {
{130, 159},
{126, 176},
{120, 150},
{158, 183},
{149, 125}
};
const vp9_tree_index vp9_copy_mode_tree[TREE_SIZE(COPY_MODE_COUNT - 1)] = {
-(REF0 - REF0), 2,
-(REF1 - REF0), -(REF2 - REF0)
};
#endif
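The copy-mode trees follow the usual vp9 convention: a non-positive entry is a leaf holding the negated symbol, a positive entry is the index of the next node pair, and each interior node consumes one coded bit. Written out long-hand for vp9_copy_mode_tree; read_bit() stands in for the arithmetic decoder with the matching probability, so this is a sketch, not decoder code:
/* Decode REF0/REF1/REF2 from the 2-level tree above. prob_index 0 and 1
 * correspond to copy_mode_probs[ctx][0] and [1]. */
static int read_copy_mode_sketch(int (*read_bit)(int prob_index)) {
  if (!read_bit(0))
    return 0;                 /* leaf -(REF0 - REF0) */
  return read_bit(1) ? 2 : 1; /* leaves -(REF2 - REF0) / -(REF1 - REF0) */
}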
const vp9_prob vp9_kf_y_mode_prob[INTRA_MODES][INTRA_MODES][INTRA_MODES - 1] = {
{ // above = dc
{ 137, 30, 42, 148, 151, 207, 70, 52, 91 }, // left = dc
@@ -245,7 +323,11 @@ const vp9_tree_index vp9_partition_tree[TREE_SIZE(PARTITION_TYPES)] = {
};
static const vp9_prob default_intra_inter_p[INTRA_INTER_CONTEXTS] = {
#if !CONFIG_COPY_CODING
9, 102, 187, 225
#else
35, 112, 187, 225
#endif
};
static const vp9_prob default_comp_inter_p[COMP_INTER_CONTEXTS] = {
@@ -326,6 +408,30 @@ void vp9_init_mode_probs(FRAME_CONTEXT *fc) {
fc->tx_probs = default_tx_probs;
vp9_copy(fc->skip_probs, default_skip_probs);
vp9_copy(fc->inter_mode_probs, default_inter_mode_probs);
#if CONFIG_MASKED_INTERINTER
vp9_copy(fc->masked_interinter_prob, default_masked_interinter_prob);
#endif
#if CONFIG_INTERINTRA
vp9_copy(fc->interintra_prob, default_interintra_prob);
#if CONFIG_MASKED_INTERINTRA
vp9_copy(fc->masked_interintra_prob, default_masked_interintra_prob);
#endif
#endif
#if CONFIG_FILTERINTRA
vp9_copy(fc->filterintra_prob, default_filterintra_prob);
#endif
#if CONFIG_EXT_TX
fc->ext_tx_prob = default_ext_tx_prob;
#endif
#if CONFIG_SUPERTX
vp9_copy(fc->supertx_prob, default_supertx_prob);
vp9_copy(fc->supertxsplit_prob, default_supertxsplit_prob);
#endif
#if CONFIG_COPY_CODING
vp9_copy(fc->copy_noref_prob, default_copy_noref_prob);
vp9_copy(fc->copy_mode_probs_l2, default_copy_mode_probs_l2);
vp9_copy(fc->copy_mode_probs, default_copy_mode_probs);
#endif
}
const vp9_tree_index vp9_switchable_interp_tree
@@ -416,6 +522,73 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) {
for (i = 0; i < SKIP_CONTEXTS; ++i)
fc->skip_probs[i] = adapt_prob(pre_fc->skip_probs[i], counts->skip[i]);
#if CONFIG_MASKED_INTERINTER
if (cm->use_masked_interinter) {
for (i = 0; i < BLOCK_SIZES; ++i) {
if (get_mask_bits(i))
fc->masked_interinter_prob[i] = adapt_prob(
pre_fc->masked_interinter_prob[i],
counts->masked_interinter[i]);
}
}
#endif
#if CONFIG_INTERINTRA
if (cm->use_interintra) {
for (i = 0; i < BLOCK_SIZES; ++i) {
if (is_interintra_allowed(i))
fc->interintra_prob[i] = adapt_prob(pre_fc->interintra_prob[i],
counts->interintra[i]);
}
#if CONFIG_MASKED_INTERINTRA
if (cm->use_masked_interintra) {
for (i = 0; i < BLOCK_SIZES; ++i) {
if (is_interintra_allowed(i) && get_mask_bits_interintra(i))
fc->masked_interintra_prob[i] = adapt_prob(
pre_fc->masked_interintra_prob[i],
counts->masked_interintra[i]);
}
}
#endif
}
#endif
#if CONFIG_FILTERINTRA
for (i = 0; i < TX_SIZES; ++i)
for (j = 0; j < INTRA_MODES; ++j)
fc->filterintra_prob[i][j] = adapt_prob(pre_fc->filterintra_prob[i][j],
counts->filterintra[i][j]);
#endif
#if CONFIG_EXT_TX
fc->ext_tx_prob = adapt_prob(pre_fc->ext_tx_prob, counts->ext_tx);
#endif
#if CONFIG_SUPERTX
for (i = 1; i < TX_SIZES; ++i) {
fc->supertx_prob[i] = adapt_prob(pre_fc->supertx_prob[i],
counts->supertx[i]);
}
for (i = 1; i < TX_SIZES; ++i) {
fc->supertxsplit_prob[i] = adapt_prob(pre_fc->supertxsplit_prob[i],
counts->supertxsplit[i]);
}
#endif
#if CONFIG_COPY_CODING
for (i = 0; i < COPY_MODE_CONTEXTS; i++) {
for (j = BLOCK_8X8; j < BLOCK_SIZES; j++) {
fc->copy_noref_prob[i][j] =
adapt_prob(pre_fc->copy_noref_prob[i][j], counts->copy_noref[i][j]);
}
adapt_probs(vp9_copy_mode_tree_l2, pre_fc->copy_mode_probs_l2[i],
counts->copy_mode_l2[i], fc->copy_mode_probs_l2[i]);
adapt_probs(vp9_copy_mode_tree, pre_fc->copy_mode_probs[i],
counts->copy_mode[i], fc->copy_mode_probs[i]);
}
#endif
}
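Every adaptation above funnels through adapt_prob(), which blends last frame's probability with one estimated from the new counts, weighting the update by how many samples were seen. A hedged sketch of that blend; the saturation constant (20) and maximum update factor (128) are the values commonly used in vp9 and are assumptions here:
static vp9_prob adapt_prob_sketch(vp9_prob pre, const unsigned int ct[2]) {
  const unsigned int den = ct[0] + ct[1];
  /* Probability observed this frame, rounded, clamped to [1, 255]. */
  unsigned int observed = den ? (ct[0] * 256 + (den >> 1)) / den : 128;
  const unsigned int count = den < 20 ? den : 20;    /* saturate counts */
  const unsigned int factor = 128 * count / 20;      /* update weight /256 */
  if (observed < 1) observed = 1;
  if (observed > 255) observed = 255;
  return (vp9_prob)((pre * (256 - factor) + observed * factor + 128) >> 8);
}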
static void set_default_lf_deltas(struct loopfilter *lf) {

View File

@@ -52,6 +52,30 @@ typedef struct frame_contexts {
struct tx_probs tx_probs;
vp9_prob skip_probs[SKIP_CONTEXTS];
nmv_context nmvc;
#if CONFIG_MASKED_INTERINTER
vp9_prob masked_interinter_prob[BLOCK_SIZES];
#endif
#if CONFIG_INTERINTRA
vp9_prob interintra_prob[BLOCK_SIZES];
#if CONFIG_MASKED_INTERINTRA
vp9_prob masked_interintra_prob[BLOCK_SIZES];
#endif
#endif
#if CONFIG_FILTERINTRA
vp9_prob filterintra_prob[TX_SIZES][INTRA_MODES];
#endif
#if CONFIG_EXT_TX
vp9_prob ext_tx_prob;
#endif
#if CONFIG_SUPERTX
vp9_prob supertx_prob[TX_SIZES];
vp9_prob supertxsplit_prob[TX_SIZES];
#endif
#if CONFIG_COPY_CODING
vp9_prob copy_noref_prob[COPY_MODE_CONTEXTS][BLOCK_SIZES];
vp9_prob copy_mode_probs_l2[COPY_MODE_CONTEXTS][1];
vp9_prob copy_mode_probs[COPY_MODE_CONTEXTS][COPY_MODE_COUNT - 2];
#endif
} FRAME_CONTEXT;
typedef struct {
@@ -71,6 +95,31 @@ typedef struct {
struct tx_counts tx;
unsigned int skip[SKIP_CONTEXTS][2];
nmv_context_counts mv;
#if CONFIG_MASKED_INTERINTER
unsigned int masked_interinter[BLOCK_SIZES][2];
#endif
#if CONFIG_INTERINTRA
unsigned int interintra[BLOCK_SIZES][2];
#if CONFIG_MASKED_INTERINTRA
unsigned int masked_interintra[BLOCK_SIZES][2];
#endif
#endif
#if CONFIG_FILTERINTRA
unsigned int filterintra[TX_SIZES][INTRA_MODES][2];
#endif
#if CONFIG_EXT_TX
unsigned int ext_tx[2];
#endif
#if CONFIG_SUPERTX
unsigned int supertx[TX_SIZES][2];
unsigned int supertxsplit[TX_SIZES][2];
unsigned int supertx_size[BLOCK_SIZES];
#endif
#if CONFIG_COPY_CODING
unsigned int copy_noref[COPY_MODE_CONTEXTS][BLOCK_SIZES][2];
unsigned int copy_mode_l2[COPY_MODE_CONTEXTS][2];
unsigned int copy_mode[COPY_MODE_CONTEXTS][COPY_MODE_COUNT - 1];
#endif
} FRAME_COUNTS;
extern const vp9_prob vp9_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
@@ -83,6 +132,10 @@ extern const vp9_tree_index vp9_inter_mode_tree[TREE_SIZE(INTER_MODES)];
extern const vp9_tree_index vp9_partition_tree[TREE_SIZE(PARTITION_TYPES)];
extern const vp9_tree_index vp9_switchable_interp_tree
[TREE_SIZE(SWITCHABLE_FILTERS)];
#if CONFIG_COPY_CODING
extern const vp9_tree_index vp9_copy_mode_tree_l2[TREE_SIZE(2)];
extern const vp9_tree_index vp9_copy_mode_tree[TREE_SIZE(COPY_MODE_COUNT - 1)];
#endif
void vp9_setup_past_independence(struct VP9Common *cm);

View File

@@ -100,6 +100,14 @@ typedef enum {
TX_TYPES = 4
} TX_TYPE;
#if CONFIG_EXT_TX
typedef enum {
NORM = 0,
ALT = 1,
EXT_TX_TYPES = 2
} EXT_TX_TYPE;
#endif
typedef enum {
UNKNOWN = 0,
BT_601 = 1, // YUV

View File

@@ -76,7 +76,6 @@ int vp9_get_frame_buffer(void *cb_priv, size_t min_size,
int vp9_release_frame_buffer(void *cb_priv, vpx_codec_frame_buffer_t *fb) {
InternalFrameBuffer *const int_fb = (InternalFrameBuffer *)fb->priv;
(void)cb_priv;
if (int_fb)
int_fb->in_use = 0;
return 0;
}

View File

@@ -206,6 +206,13 @@ static const int mode_lf_lut[MB_MODE_COUNT] = {
1, 1, 0, 1 // INTER_MODES (ZEROMV == 0)
};
#if CONFIG_SUPERTX
static int supertx_enabled_lpf(const MB_MODE_INFO *mbmi) {
return mbmi->tx_size >
MIN(b_width_log2(mbmi->sb_type), b_height_log2(mbmi->sb_type));
}
#endif
static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) {
int lvl;
@@ -502,7 +509,7 @@ static void build_masks(const loop_filter_info_n *const lfi_n,
const MB_MODE_INFO *mbmi = &mi->mbmi;
const BLOCK_SIZE block_size = mbmi->sb_type;
const TX_SIZE tx_size_y = mbmi->tx_size;
const TX_SIZE tx_size_uv = get_uv_tx_size(mbmi);
const int filter_level = get_filter_level(lfi_n, mbmi);
uint64_t *const left_y = &lfm->left_y[tx_size_y];
uint64_t *const above_y = &lfm->above_y[tx_size_y];
@@ -572,6 +579,85 @@ static void build_masks(const loop_filter_info_n *const lfi_n,
*int_4x4_uv |= (size_mask_uv[block_size] & 0xffff) << shift_uv;
}
#if CONFIG_SUPERTX
static void build_masks_supertx(const loop_filter_info_n *const lfi_n,
const MODE_INFO *mi, const int shift_y,
const int shift_uv,
LOOP_FILTER_MASK *lfm) {
const MB_MODE_INFO *mbmi = &mi->mbmi;
const TX_SIZE tx_size_y = mbmi->tx_size;
const TX_SIZE tx_size_uv = get_uv_tx_size(mbmi);
const BLOCK_SIZE block_size = 3 * (int)tx_size_y;
const int filter_level = get_filter_level(lfi_n, mbmi);
uint64_t *const left_y = &lfm->left_y[tx_size_y];
uint64_t *const above_y = &lfm->above_y[tx_size_y];
uint64_t *const int_4x4_y = &lfm->int_4x4_y;
uint16_t *const left_uv = &lfm->left_uv[tx_size_uv];
uint16_t *const above_uv = &lfm->above_uv[tx_size_uv];
uint16_t *const int_4x4_uv = &lfm->int_4x4_uv;
int i;
// If filter level is 0 we don't loop filter.
if (!filter_level) {
return;
} else {
const int w = num_8x8_blocks_wide_lookup[block_size];
const int h = num_8x8_blocks_high_lookup[block_size];
int index = shift_y;
for (i = 0; i < h; i++) {
vpx_memset(&lfm->lfl_y[index], filter_level, w);
index += 8;
}
}
// These set 1 in the current block size for the block size edges.
// For instance if the block size is 32x16, we'll set:
// above = 1111
// 0000
// and
// left = 1000
// 1000
// NOTE: in this example the low bit is leftmost, so (1000) is stored
// as 1, not 8.
//
// U and V masks set bits on a 16-bit scale.
//
*above_y |= above_prediction_mask[block_size] << shift_y;
*above_uv |= above_prediction_mask_uv[block_size] << shift_uv;
*left_y |= left_prediction_mask[block_size] << shift_y;
*left_uv |= left_prediction_mask_uv[block_size] << shift_uv;
// If the block has no coefficients and is not intra we skip applying
// the loop filter on block edges.
if (mbmi->skip && is_inter_block(mbmi))
return;
// Here we are adding a mask for the transform size. The transform
// size mask is set to be correct for a 64x64 prediction block size. We
// mask to match the size of the block we are working on and then shift it
// into place..
*above_y |= (size_mask[block_size] &
above_64x64_txform_mask[tx_size_y]) << shift_y;
*above_uv |= (size_mask_uv[block_size] &
above_64x64_txform_mask_uv[tx_size_uv]) << shift_uv;
*left_y |= (size_mask[block_size] &
left_64x64_txform_mask[tx_size_y]) << shift_y;
*left_uv |= (size_mask_uv[block_size] &
left_64x64_txform_mask_uv[tx_size_uv]) << shift_uv;
// Here we are trying to determine what to do with the internal 4x4 block
// boundaries. These differ from the 4x4 boundaries on the outside edge of
// an 8x8 in that the internal ones can be skipped and don't depend on
// the prediction block size.
if (tx_size_y == TX_4X4)
*int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffff) << shift_y;
if (tx_size_uv == TX_4X4)
*int_4x4_uv |= (size_mask_uv[block_size] & 0xffff) << shift_uv;
}
#endif
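A note on the block_size derivation in build_masks_supertx(): square sizes sit three slots apart in the BLOCK_SIZE enum, so 3 * tx_size_y lands exactly on the square block the super transform covers:
/* BLOCK_SIZE enum order assumed: BLOCK_4X4 = 0, BLOCK_8X8 = 3,
 * BLOCK_16X16 = 6, BLOCK_32X32 = 9.
 *   TX_4X4   (0) * 3 -> BLOCK_4X4
 *   TX_8X8   (1) * 3 -> BLOCK_8X8
 *   TX_16X16 (2) * 3 -> BLOCK_16X16
 *   TX_32X32 (3) * 3 -> BLOCK_32X32 */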
// This function does the same thing as the one above with the exception that
// it only affects the y masks. It exists because for blocks < 16x16 in size,
// we only update u and v masks on the first block.
@@ -615,6 +701,48 @@ static void build_y_mask(const loop_filter_info_n *const lfi_n,
*int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffff) << shift_y;
}
#if CONFIG_SUPERTX
static void build_y_mask_supertx(const loop_filter_info_n *const lfi_n,
const MODE_INFO *mi, const int shift_y,
LOOP_FILTER_MASK *lfm) {
const MB_MODE_INFO *mbmi = &mi->mbmi;
const TX_SIZE tx_size_y = mbmi->tx_size;
const BLOCK_SIZE block_size = 3 * (int)tx_size_y;
const int filter_level = get_filter_level(lfi_n, mbmi);
uint64_t *const left_y = &lfm->left_y[tx_size_y];
uint64_t *const above_y = &lfm->above_y[tx_size_y];
uint64_t *const int_4x4_y = &lfm->int_4x4_y;
int i;
if (!filter_level) {
return;
} else {
const int w = num_8x8_blocks_wide_lookup[block_size];
const int h = num_8x8_blocks_high_lookup[block_size];
int index = shift_y;
for (i = 0; i < h; i++) {
vpx_memset(&lfm->lfl_y[index], filter_level, w);
index += 8;
}
}
*above_y |= above_prediction_mask[block_size] << shift_y;
*left_y |= left_prediction_mask[block_size] << shift_y;
if (mbmi->skip && is_inter_block(mbmi))
return;
*above_y |= (size_mask[block_size] &
above_64x64_txform_mask[tx_size_y]) << shift_y;
*left_y |= (size_mask[block_size] &
left_64x64_txform_mask[tx_size_y]) << shift_y;
if (tx_size_y == TX_4X4)
*int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffff) << shift_y;
}
#endif
// This function sets up the bit masks for the entire 64x64 region represented
// by mi_row, mi_col.
// TODO(JBB): This function only works for yv12.
@@ -650,6 +778,9 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
cm->mi_rows - mi_row : MI_BLOCK_SIZE);
const int max_cols = (mi_col + MI_BLOCK_SIZE > cm->mi_cols ?
cm->mi_cols - mi_col : MI_BLOCK_SIZE);
#if CONFIG_SUPERTX
int supertx;
#endif
vp9_zero(*lfm);
@@ -687,20 +818,43 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
break;
case BLOCK_32X16:
#if CONFIG_SUPERTX
supertx = supertx_enabled_lpf(&mip[0]->mbmi);
if (!supertx) {
#endif
build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
if (mi_32_row_offset + 2 >= max_rows)
continue;
mip2 = mip + mode_info_stride * 2;
build_masks(lfi_n, mip2[0], shift_y + 16, shift_uv + 4, lfm);
#if CONFIG_SUPERTX
} else {
build_masks_supertx(lfi_n, mip[0], shift_y, shift_uv, lfm);
}
#endif
break;
case BLOCK_16X32:
#if CONFIG_SUPERTX
supertx = supertx_enabled_lpf(&mip[0]->mbmi);
if (!supertx) {
#endif
build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
if (mi_32_col_offset + 2 >= max_cols)
continue;
mip2 = mip + 2;
build_masks(lfi_n, mip2[0], shift_y + 2, shift_uv + 1, lfm);
#if CONFIG_SUPERTX
} else {
build_masks_supertx(lfi_n, mip[0], shift_y, shift_uv, lfm);
}
#endif
break;
default:
#if CONFIG_SUPERTX
if (mip[0]->mbmi.tx_size == TX_32X32) {
build_masks_supertx(lfi_n, mip[0], shift_y, shift_uv, lfm);
} else {
#endif
for (idx_16 = 0; idx_16 < 4; mip += offset_16[idx_16], ++idx_16) {
const int shift_y = shift_32_y[idx_32] + shift_16_y[idx_16];
const int shift_uv = shift_32_uv[idx_32] + shift_16_uv[idx_16];
@@ -717,24 +871,56 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
break;
case BLOCK_16X8:
#if CONFIG_SUPERTX
supertx = supertx_enabled_lpf(&mip[0]->mbmi);
if (!supertx) {
#endif
build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
if (mi_16_row_offset + 1 >= max_rows)
continue;
mip2 = mip + mode_info_stride;
build_y_mask(lfi_n, mip2[0], shift_y+8, lfm);
#if CONFIG_SUPERTX
} else {
build_masks_supertx(lfi_n, mip[0], shift_y, shift_uv, lfm);
}
#endif
break;
case BLOCK_8X16:
#if CONFIG_SUPERTX
supertx = supertx_enabled_lpf(&mip[0]->mbmi);
if (!supertx) {
#endif
build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
if (mi_16_col_offset + 1 >= max_cols)
continue;
mip2 = mip + 1;
build_y_mask(lfi_n, mip2[0], shift_y+1, lfm);
#if CONFIG_SUPERTX
} else {
build_masks_supertx(lfi_n, mip[0], shift_y, shift_uv, lfm);
}
#endif
break;
default: {
#if CONFIG_SUPERTX
if (mip[0]->mbmi.tx_size == TX_16X16) {
build_masks_supertx(lfi_n, mip[0], shift_y, shift_uv, lfm);
} else {
#endif
const int shift_y = shift_32_y[idx_32] +
shift_16_y[idx_16] +
shift_8_y[0];
#if CONFIG_SUPERTX
supertx = supertx_enabled_lpf(&mip[0]->mbmi);
if (!supertx) {
#endif
build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
#if CONFIG_SUPERTX
} else {
build_masks_supertx(lfi_n, mip[0], shift_y, shift_uv, lfm);
}
#endif
mip += offset[0];
for (idx_8 = 1; idx_8 < 4; mip += offset[idx_8], ++idx_8) {
const int shift_y = shift_32_y[idx_32] +
@@ -748,12 +934,26 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
if (mi_8_col_offset >= max_cols ||
mi_8_row_offset >= max_rows)
continue;
#if CONFIG_SUPERTX
supertx = supertx_enabled_lpf(&mip[0]->mbmi);
if (!supertx)
#endif
build_y_mask(lfi_n, mip[0], shift_y, lfm);
#if CONFIG_SUPERTX
else
build_y_mask_supertx(lfi_n, mip[0], shift_y, lfm);
#endif
}
#if CONFIG_SUPERTX
}
#endif
break;
}
}
}
#if CONFIG_SUPERTX
}
#endif
break;
}
}
@@ -939,7 +1139,7 @@ static void filter_block_plane_non420(VP9_COMMON *cm,
!(r & (num_8x8_blocks_high_lookup[sb_type] - 1)) : 1;
const int skip_this_r = skip_this && !block_edge_above;
const TX_SIZE tx_size = (plane->plane_type == PLANE_TYPE_UV)
? get_uv_tx_size(&mi[0].mbmi)
: mi[0].mbmi.tx_size;
const int skip_border_4x4_c = ss_x && mi_col + c == cm->mi_cols - 1;
const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;

View File

@@ -188,3 +188,176 @@ void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
assert("Invalid block index.");
}
}
#if CONFIG_COPY_CODING
static int compare_interinfo(MB_MODE_INFO *mbmi, MB_MODE_INFO *ref_mbmi) {
if (mbmi == ref_mbmi) {
return 1;
} else {
int is_same;
#if CONFIG_INTERINTRA
MV_REFERENCE_FRAME mbmi_ref1_backup = mbmi->ref_frame[1];
MV_REFERENCE_FRAME refmbmi_ref1_backup = ref_mbmi->ref_frame[1];
if (mbmi->ref_frame[1] == INTRA_FRAME)
mbmi->ref_frame[1] = NONE;
if (ref_mbmi->ref_frame[1] == INTRA_FRAME)
ref_mbmi->ref_frame[1] = NONE;
#endif
if (mbmi->ref_frame[0] == ref_mbmi->ref_frame[0] &&
mbmi->ref_frame[1] == ref_mbmi->ref_frame[1]) {
if (mbmi->ref_frame[1] > INTRA_FRAME)
is_same = mbmi->mv[0].as_int == ref_mbmi->mv[0].as_int &&
mbmi->mv[1].as_int == ref_mbmi->mv[1].as_int &&
mbmi->interp_filter == ref_mbmi->interp_filter;
else
is_same = mbmi->mv[0].as_int == ref_mbmi->mv[0].as_int &&
mbmi->interp_filter == ref_mbmi->interp_filter;
} else {
is_same = 0;
}
#if CONFIG_INTERINTRA
mbmi->ref_frame[1] = mbmi_ref1_backup;
ref_mbmi->ref_frame[1] = refmbmi_ref1_backup;
#endif
return is_same;
}
}
static int check_inside(VP9_COMMON *cm, int mi_row, int mi_col) {
return mi_row >= 0 && mi_col >= 0 &&
mi_row < cm->mi_rows && mi_col < cm->mi_cols;
}
static int is_right_available(BLOCK_SIZE bsize, int mi_row, int mi_col) {
int depth, max_depth = 4 - MIN(b_width_log2(bsize), b_height_log2(bsize));
int block[4] = {0};
if (bsize == BLOCK_64X64)
return 1;
mi_row = mi_row % 8;
mi_col = mi_col % 8;
for (depth = 1; depth <= max_depth; depth++) {
block[depth] = (mi_row >> (3 - depth)) * 2 + (mi_col >> (3 - depth));
mi_row = mi_row % (8 >> depth);
mi_col = mi_col % (8 >> depth);
}
if (b_width_log2(bsize) < b_height_log2(bsize)) {
if (block[max_depth] == 0)
return 1;
} else if (b_width_log2(bsize) > b_height_log2(bsize)) {
if (block[max_depth] > 0)
return 0;
} else {
if (block[max_depth] == 0 || block[max_depth] == 2)
return 1;
else if (block[max_depth] == 3)
return 0;
}
for (depth = max_depth - 1; depth > 0; depth--) {
if (block[depth] == 0 || block[depth] == 2)
return 1;
else if (block[depth] == 3)
return 0;
}
return 1;
}
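A worked pass through is_right_available(), for a BLOCK_16X16 at mi_row == 2, mi_col == 2 inside its 64x64 superblock:
/* max_depth = 4 - MIN(2, 2) = 2
 * depth 1: block[1] = (2 >> 2) * 2 + (2 >> 2) = 0; row = 2 % 4, col = 2 % 4
 * depth 2: block[2] = (2 >> 1) * 2 + (2 >> 1) = 3
 * Square block and block[max_depth] == 3: this is the bottom-right
 * quadrant, so the top-right neighbor is not yet decoded -> returns 0. */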
static int is_second_rec(int mi_row, int mi_col, BLOCK_SIZE bsize) {
int bw = 4 << b_width_log2(bsize);
int bh = 4 << b_height_log2(bsize);
if (bw < bh)
return (mi_col << 3) % (bw << 1) == 0 ? 0 : 1;
else if (bh < bw)
return (mi_row << 3) % (bh << 1) == 0 ? 0 : 2;
else
return 0;
}
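And a worked case for is_second_rec(), which reports whether a rectangular block is the trailing half of its parent split (1 for a right vertical half, 2 for a lower horizontal half):
/* BLOCK_8X16 at mi_col == 1: bw = 8, bh = 16 pixels.
 * Pixel column = 1 << 3 = 8; parent width = bw << 1 = 16.
 * 8 % 16 != 0 -> second (right) rectangle -> returns 1.
 * The 16x8 case runs the same test on rows and returns 2. */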
int vp9_construct_ref_inter_list(VP9_COMMON *cm, MACROBLOCKD *xd,
BLOCK_SIZE bsize, int mi_row, int mi_col,
MB_MODE_INFO *ref_list[18]) {
int bw = 4 << b_width_log2(bsize);
int bh = 4 << b_height_log2(bsize);
int row_offset, col_offset;
int mi_offset;
MB_MODE_INFO *ref_mbmi;
int ref_index, ref_num = 0;
int row_offset_cand[18], col_offset_cand[18];
int offset_num = 0, i, switchflag;
int is_sec_rec = is_second_rec(mi_row, mi_col, bsize);
if (is_sec_rec != 2) {
row_offset_cand[offset_num] = -1; col_offset_cand[offset_num] = 0;
offset_num++;
}
if (is_sec_rec != 1) {
row_offset_cand[offset_num] = bh / 16; col_offset_cand[offset_num] = -1;
offset_num++;
}
row_offset = bh / 8 - 1;
col_offset = 1;
if (is_sec_rec < 2)
switchflag = 1;
else
switchflag = 0;
while ((is_sec_rec == 0 && ((row_offset >= 0) || col_offset < (bw / 8 + 1))) ||
(is_sec_rec == 1 && col_offset < (bw / 8 + 1)) ||
(is_sec_rec == 2 && row_offset >= 0)) {
switch (switchflag) {
case 0:
if (row_offset >= 0) {
if (row_offset != bh / 16) {
row_offset_cand[offset_num] = row_offset;
col_offset_cand[offset_num] = -1;
offset_num++;
}
row_offset--;
}
break;
case 1:
if (col_offset < (bw / 8 + 1)) {
row_offset_cand[offset_num] = -1;
col_offset_cand[offset_num] = col_offset;
offset_num++;
col_offset++;
}
break;
default:
assert(0);
}
if (is_sec_rec == 0)
switchflag = 1 - switchflag;
}
row_offset_cand[offset_num] = -1;
col_offset_cand[offset_num] = -1;
offset_num++;
for (i = 0; i < offset_num; i++) {
row_offset = row_offset_cand[i];
col_offset = col_offset_cand[i];
if ((col_offset < (bw / 8) ||
(col_offset == (bw / 8) && is_right_available(bsize, mi_row, mi_col)))
&& check_inside(cm, mi_row + row_offset, mi_col + col_offset)) {
mi_offset = row_offset * cm->mi_stride + col_offset;
ref_mbmi = &xd->mi[mi_offset]->mbmi;
if (is_inter_block(ref_mbmi)) {
for (ref_index = 0; ref_index < ref_num; ref_index++) {
if (compare_interinfo(ref_mbmi, ref_list[ref_index]))
break;
}
if (ref_index == ref_num) {
ref_list[ref_num] = ref_mbmi;
ref_num++;
}
}
}
}
return ref_num;
}
#endif
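A hypothetical caller of vp9_construct_ref_inter_list(); the surrounding locals (cm, xd, mbmi, mi_row, mi_col) are assumed to exist as in the rd loop, so this is a fragment for illustration, not encoder code:
MB_MODE_INFO *ref_list[18] = { NULL };
const int ref_num = vp9_construct_ref_inter_list(cm, xd, mbmi->sb_type,
                                                 mi_row, mi_col, ref_list);
/* COPY_MODE REF0/REF1/REF2 indexes the list; NOREF means no copying. */
if (mbmi->copy_mode != NOREF && ref_num > mbmi->copy_mode - REF0) {
  const MB_MODE_INFO *src_mbmi = ref_list[mbmi->copy_mode - REF0];
  (void)src_mbmi;  /* motion info would be copied from here */
}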

View File

@@ -220,6 +220,12 @@ void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
int block, int ref, int mi_row, int mi_col,
int_mv *nearest, int_mv *near);
#if CONFIG_COPY_CODING
int vp9_construct_ref_inter_list(VP9_COMMON *cm, MACROBLOCKD *xd,
BLOCK_SIZE bsize, int mi_row, int mi_col,
MB_MODE_INFO *ref_list[18]);
#endif
#ifdef __cplusplus
} // extern "C"
#endif

View File

@@ -11,7 +11,6 @@
#ifndef VP9_COMMON_VP9_ONYXC_INT_H_
#define VP9_COMMON_VP9_ONYXC_INT_H_
#include "./vpx_config.h"
#include "vpx/internal/vpx_codec_internal.h"
#include "./vp9_rtcd.h"
@@ -62,40 +61,8 @@ typedef struct {
int ref_count;
vpx_codec_frame_buffer_t raw_frame_buffer;
YV12_BUFFER_CONFIG buf;
// The following variables will only be used in frame parallel decode.
// owner_thread_id indicates which FrameWorker owns this buffer. -1 means
// that no FrameWorker owns, or is decoding, this buffer.
int owner_worker_id;
// Buffer has been decoded up to the (row, col) position. When decoding
// first starts, both are reset to -1. Once a frame has been fully decoded,
// row and col are set to INT_MAX.
int row;
int col;
} RefCntBuffer;
typedef struct {
// Protect BufferPool from being accessed by several FrameWorkers at
// the same time during frame parallel decode.
// TODO(hkuang): Try to use atomic variable instead of locking the whole pool.
#if CONFIG_MULTITHREAD
pthread_mutex_t pool_mutex;
#endif
// Private data associated with the frame buffer callbacks.
void *cb_priv;
vpx_get_frame_buffer_cb_fn_t get_fb_cb;
vpx_release_frame_buffer_cb_fn_t release_fb_cb;
RefCntBuffer frame_bufs[FRAME_BUFFERS];
// Handles memory for the codec.
InternalFrameBufferList int_frame_buffers;
} BufferPool;
typedef struct VP9Common {
struct vpx_internal_error_info error;
@@ -122,10 +89,9 @@ typedef struct VP9Common {
YV12_BUFFER_CONFIG *frame_to_show;
int ref_frame_map[REF_FRAMES]; /* maps fb_idx to reference slot */
RefCntBuffer frame_bufs[FRAME_BUFFERS];
// Prepare ref_frame_map for next frame. Only used in frame parallel decode.
int next_ref_frame_map[REF_FRAMES];
int ref_frame_map[REF_FRAMES]; /* maps fb_idx to reference slot */
// TODO(jkoleszar): could expand active_ref_idx to 4, with 0 as intra, and
// roll new_fb_idx into it.
@@ -236,33 +202,40 @@ typedef struct VP9Common {
int log2_tile_cols, log2_tile_rows;
// External BufferPool passed from outside.
BufferPool *buffer_pool;
// Private data associated with the frame buffer callbacks.
void *cb_priv;
vpx_get_frame_buffer_cb_fn_t get_fb_cb;
vpx_release_frame_buffer_cb_fn_t release_fb_cb;
// Handles memory for the codec.
InternalFrameBufferList int_frame_buffers;
PARTITION_CONTEXT *above_seg_context;
ENTROPY_CONTEXT *above_context;
#if CONFIG_MASKED_INTERINTER
int use_masked_interinter;
#endif
#if CONFIG_INTERINTRA
int use_interintra;
#if CONFIG_MASKED_INTERINTRA
int use_masked_interintra;
#endif
#endif
} VP9_COMMON;
static INLINE YV12_BUFFER_CONFIG *get_frame_new_buffer(VP9_COMMON *cm) {
return &cm->buffer_pool->frame_bufs[cm->new_fb_idx].buf;
return &cm->frame_bufs[cm->new_fb_idx].buf;
}
static INLINE int get_free_fb(VP9_COMMON *cm) {
RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
int i;
#if CONFIG_MULTITHREAD
pthread_mutex_lock(&cm->buffer_pool->pool_mutex);
#endif
for (i = 0; i < FRAME_BUFFERS; i++)
if (frame_bufs[i].ref_count == 0)
if (cm->frame_bufs[i].ref_count == 0)
break;
assert(i < FRAME_BUFFERS);
frame_bufs[i].ref_count = 1;
#if CONFIG_MULTITHREAD
pthread_mutex_unlock(&cm->buffer_pool->pool_mutex);
#endif
cm->frame_bufs[i].ref_count = 1;
return i;
}

View File

@@ -383,3 +383,47 @@ int vp9_get_segment_id(VP9_COMMON *cm, const uint8_t *segment_ids,
assert(segment_id >= 0 && segment_id < MAX_SEGMENTS);
return segment_id;
}
#if CONFIG_COPY_CODING
int vp9_get_copy_mode_context(const MACROBLOCKD *xd) {
const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd));
const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd));
const int has_above = above_mbmi != NULL;
const int has_left = left_mbmi != NULL;
if (has_above && has_left) {
const int above_intra = !is_inter_block(above_mbmi);
const int left_intra = !is_inter_block(left_mbmi);
if (above_intra && left_intra) {
return 4;
} else if (above_intra || left_intra) {
return 3;
} else {
const int above_predict = above_mbmi->copy_mode != NOREF;
const int left_predict = left_mbmi->copy_mode != NOREF;
if (above_predict && left_predict)
return 0;
else if (above_predict || left_predict)
return 1;
else
return 2;
}
} else if (has_above || has_left) {
const MB_MODE_INFO *const ref_mbmi = has_above ? above_mbmi : left_mbmi;
const int ref_intra = !is_inter_block(ref_mbmi);
if (ref_intra) {
return 3;
} else {
const int ref_predict = ref_mbmi->copy_mode != NOREF;
if (ref_predict)
return 0;
else
return 1;
}
} else {
return 0;
}
}
#endif
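For reference, the context mapping that vp9_get_copy_mode_context() implements (lower contexts mean copying is more likely):

context 0 - both neighbours copy, a single inter neighbour copies, or no neighbours exist
context 1 - exactly one of two inter neighbours copies, or a single inter neighbour does not copy
context 2 - both neighbours are inter but neither copies
context 3 - one of two neighbours is intra, or the single neighbour is intra
context 4 - both neighbours are intra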

View File

@@ -134,6 +134,10 @@ static INLINE unsigned int *get_tx_counts(TX_SIZE max_tx_size, int ctx,
}
}
#if CONFIG_COPY_CODING
int vp9_get_copy_mode_context(const MACROBLOCKD *xd);
#endif
#ifdef __cplusplus
} // extern "C"
#endif

File diff suppressed because it is too large

View File

@@ -65,6 +65,60 @@ void vp9_setup_pre_planes(MACROBLOCKD *xd, int idx,
const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
const struct scale_factors *sf);
#if CONFIG_MASKED_INTERINTER
void vp9_generate_masked_weight(int mask_index, BLOCK_SIZE sb_type,
int h, int w, uint8_t *mask, int stride);
void vp9_generate_hard_mask(int mask_index, BLOCK_SIZE sb_type,
int h, int w, uint8_t *mask, int stride);
#endif
#if CONFIG_SUPERTX
void vp9_build_inter_predictors_sby_sub8x8_extend(MACROBLOCKD *xd,
int mi_row, int mi_col,
int mi_row_ori,
int mi_col_ori,
BLOCK_SIZE top_bsize,
PARTITION_TYPE partition);
void vp9_build_inter_predictors_sbuv_sub8x8_extend(MACROBLOCKD *xd,
#if CONFIG_MASKED_INTERINTER
int mi_row, int mi_col,
#endif
int mi_row_ori,
int mi_col_ori,
BLOCK_SIZE top_bsize);
void vp9_build_masked_inter_predictor_complex(uint8_t *dst, int dst_stride,
uint8_t *dst2, int dst2_stride,
int plane,
int mi_row, int mi_col,
int mi_row_ori, int mi_col_ori,
BLOCK_SIZE bsize,
BLOCK_SIZE top_bsize,
PARTITION_TYPE partition);
void vp9_dec_build_inter_predictors_sby_sub8x8_extend(MACROBLOCKD *xd,
int mi_row, int mi_col,
int mi_row_ori,
int mi_col_ori,
BLOCK_SIZE top_bsize,
PARTITION_TYPE p);
void vp9_dec_build_inter_predictors_sbuv_sub8x8_extend(MACROBLOCKD *xd,
#if CONFIG_MASKED_INTERINTER
int mi_row, int mi_col,
#endif
int mi_row_ori,
int mi_col_ori,
BLOCK_SIZE top_bsize);
#if CONFIG_MASKED_INTERINTER
void vp9_build_inter_predictors_sb_extend(MACROBLOCKD *xd,
int mi_row, int mi_col,
int mi_row_ori, int mi_col_ori,
BLOCK_SIZE bsize);
void vp9_dec_build_inter_predictors_sb_extend(MACROBLOCKD *xd,
int mi_row, int mi_col,
int mi_row_ori, int mi_col_ori,
BLOCK_SIZE bsize);
#endif
#endif
#ifdef __cplusplus
} // extern "C"
#endif

View File

@@ -444,8 +444,227 @@ static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref,
}
}
#if CONFIG_FILTERINTRA
static void filter_intra_predictors_4tap(uint8_t *ypred_ptr, int y_stride,
int bs,
const uint8_t *yabove_row,
const uint8_t *yleft_col,
int mode) {
static const int prec_bits = 10;
static const int round_val = 511;
int k, r, c;
int pred[33][65];
int mean, ipred;
int taps4_4[10][4] = {
{735, 881, -537, -54},
{1005, 519, -488, -11},
{383, 990, -343, -6},
{442, 805, -542, 319},
{658, 616, -133, -116},
{875, 442, -141, -151},
{386, 741, -23, -80},
{390, 1027, -446, 51},
{679, 606, -523, 262},
{903, 922, -778, -23}
};
int taps4_8[10][4] = {
{648, 803, -444, 16},
{972, 620, -576, 7},
{561, 967, -499, -5},
{585, 762, -468, 144},
{596, 619, -182, -9},
{895, 459, -176, -153},
{557, 722, -126, -129},
{601, 839, -523, 105},
{562, 709, -499, 251},
{803, 872, -695, 43}
};
int taps4_16[10][4] = {
{423, 728, -347, 111},
{963, 685, -665, 23},
{281, 1024, -480, 216},
{640, 596, -437, 78},
{429, 669, -259, 99},
{740, 646, -415, 23},
{568, 771, -346, 40},
{404, 833, -486, 209},
{398, 712, -423, 307},
{939, 935, -887, 17}
};
int taps4_32[10][4] = {
{477, 737, -393, 150},
{881, 630, -546, 67},
{506, 984, -443, -20},
{114, 459, -270, 528},
{433, 528, 14, 3},
{837, 470, -301, -30},
{181, 777, 89, -107},
{-29, 716, -232, 259},
{589, 646, -495, 255},
{740, 884, -728, 77}
};
const int c1 = (bs >= 32) ? taps4_32[mode][0] : ((bs >= 16) ?
taps4_16[mode][0] : ((bs >= 8) ? taps4_8[mode][0] : taps4_4[mode][0]));
const int c2 = (bs >= 32) ? taps4_32[mode][1] : ((bs >= 16) ?
taps4_16[mode][1] : ((bs >= 8) ? taps4_8[mode][1] : taps4_4[mode][1]));
const int c3 = (bs >= 32) ? taps4_32[mode][2] : ((bs >= 16) ?
taps4_16[mode][2] : ((bs >= 8) ? taps4_8[mode][2] : taps4_4[mode][2]));
const int c4 = (bs >= 32) ? taps4_32[mode][3] : ((bs >= 16) ?
taps4_16[mode][3] : ((bs >= 8) ? taps4_8[mode][3] : taps4_4[mode][3]));
k = 0;
mean = 0;
while (k < bs) {
mean = mean + (int)yleft_col[k];
mean = mean + (int)yabove_row[k];
k++;
}
mean = (mean + bs) / (2 * bs);
for (r = 0; r < bs; r++)
pred[r + 1][0] = (int)yleft_col[r] - mean;
for (c = 0; c < 2 * bs + 1; c++)
pred[0][c] = (int)yabove_row[c - 1] - mean;
for (r = 1; r < bs + 1; r++)
for (c = 1; c < 2 * bs + 1 - r; c++) {
ipred = c1 * pred[r - 1][c] + c2 * pred[r][c - 1]
+ c3 * pred[r - 1][c - 1] + c4 * pred[r - 1][c + 1];
pred[r][c] = ipred < 0 ? -((-ipred + round_val) >> prec_bits) :
((ipred + round_val) >> prec_bits);
}
for (r = 0; r < bs; r++) {
for (c = 0; c < bs; c++) {
ipred = pred[r + 1][c + 1] + mean;
ypred_ptr[c] = clip_pixel(ipred);
}
ypred_ptr += y_stride;
}
}
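Each filtered sample is a 4-tap combination of its north, west, north-west and north-east neighbours, computed on mean-removed data with taps in Q10 (1/1024) precision. A minimal sketch of one recursion step, matching the rounding used above:

/* taps[] = {c1, c2, c3, c4} in Q10; round_val = 511, prec_bits = 10 */
static int filter_step(int north, int west, int northwest, int northeast,
                       const int taps[4]) {
  const int v = taps[0] * north + taps[1] * west +
                taps[2] * northwest + taps[3] * northeast;
  return v < 0 ? -((-v + 511) >> 10) : (v + 511) >> 10;
}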
static void build_filter_intra_predictors(const MACROBLOCKD *xd,
const uint8_t *ref, int ref_stride,
uint8_t *dst, int dst_stride,
PREDICTION_MODE mode, TX_SIZE tx_size,
int up_available, int left_available,
int right_available, int x, int y,
int plane) {
int i;
DECLARE_ALIGNED_ARRAY(16, uint8_t, left_col, 64);
DECLARE_ALIGNED_ARRAY(16, uint8_t, above_data, 128 + 16);
uint8_t *above_row = above_data + 16;
const uint8_t *const_above_row = above_row;
const int bs = 4 << tx_size;
int frame_width, frame_height;
int x0, y0;
const struct macroblockd_plane *const pd = &xd->plane[plane];
// Get current frame pointer, width and height.
if (plane == 0) {
frame_width = xd->cur_buf->y_width;
frame_height = xd->cur_buf->y_height;
} else {
frame_width = xd->cur_buf->uv_width;
frame_height = xd->cur_buf->uv_height;
}
// Get block position in current frame.
x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x;
y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y;
vpx_memset(left_col, 129, 64);
// left
if (left_available) {
if (xd->mb_to_bottom_edge < 0) {
/* slower path if the block needs border extension */
if (y0 + bs <= frame_height) {
for (i = 0; i < bs; ++i)
left_col[i] = ref[i * ref_stride - 1];
} else {
const int extend_bottom = frame_height - y0;
for (i = 0; i < extend_bottom; ++i)
left_col[i] = ref[i * ref_stride - 1];
for (; i < bs; ++i)
left_col[i] = ref[(extend_bottom - 1) * ref_stride - 1];
}
} else {
/* faster path if the block does not need extension */
for (i = 0; i < bs; ++i)
left_col[i] = ref[i * ref_stride - 1];
}
}
// TODO(hkuang) do not extend 2*bs pixels for all modes.
// above
if (up_available) {
const uint8_t *above_ref = ref - ref_stride;
if (xd->mb_to_right_edge < 0) {
/* slower path if the block needs border extension */
if (x0 + 2 * bs <= frame_width) {
if (right_available && bs == 4) {
vpx_memcpy(above_row, above_ref, 2 * bs);
} else {
vpx_memcpy(above_row, above_ref, bs);
vpx_memset(above_row + bs, above_row[bs - 1], bs);
}
} else if (x0 + bs <= frame_width) {
const int r = frame_width - x0;
if (right_available && bs == 4) {
vpx_memcpy(above_row, above_ref, r);
vpx_memset(above_row + r, above_row[r - 1],
x0 + 2 * bs - frame_width);
} else {
vpx_memcpy(above_row, above_ref, bs);
vpx_memset(above_row + bs, above_row[bs - 1], bs);
}
} else if (x0 <= frame_width) {
const int r = frame_width - x0;
if (right_available && bs == 4) {
vpx_memcpy(above_row, above_ref, r);
vpx_memset(above_row + r, above_row[r - 1],
x0 + 2 * bs - frame_width);
} else {
vpx_memcpy(above_row, above_ref, r);
vpx_memset(above_row + r, above_row[r - 1],
x0 + 2 * bs - frame_width);
}
}
above_row[-1] = left_available ? above_ref[-1] : 129;
} else {
/* faster path if the block does not need extension */
if (bs == 4 && right_available && left_available) {
const_above_row = above_ref;
} else {
vpx_memcpy(above_row, above_ref, bs);
if (bs == 4 && right_available)
vpx_memcpy(above_row + bs, above_ref + bs, bs);
else
vpx_memset(above_row + bs, above_row[bs - 1], bs);
above_row[-1] = left_available ? above_ref[-1] : 129;
}
}
} else {
vpx_memset(above_row, 127, bs * 2);
above_row[-1] = 127;
}
// predict
filter_intra_predictors_4tap(dst, dst_stride, bs, const_above_row, left_col,
mode);
}
#endif
void vp9_predict_intra_block(const MACROBLOCKD *xd, int block_idx, int bwl_in,
TX_SIZE tx_size, PREDICTION_MODE mode,
#if CONFIG_FILTERINTRA
int filterbit,
#endif
const uint8_t *ref, int ref_stride,
uint8_t *dst, int dst_stride,
int aoff, int loff, int plane) {
@@ -456,8 +675,708 @@ void vp9_predict_intra_block(const MACROBLOCKD *xd, int block_idx, int bwl_in,
const int have_right = ((block_idx & wmask) != wmask);
const int x = aoff * 4;
const int y = loff * 4;
#if CONFIG_FILTERINTRA
const int filterflag = is_filter_allowed(mode) && is_filter_enabled(tx_size)
&& filterbit;
#endif
assert(bwl >= 0);
#if CONFIG_FILTERINTRA
if (!filterflag) {
#endif
build_intra_predictors(xd, ref, ref_stride, dst, dst_stride, mode, tx_size,
have_top, have_left, have_right, x, y, plane);
#if CONFIG_FILTERINTRA
} else {
build_filter_intra_predictors(xd, ref, ref_stride, dst, dst_stride, mode,
tx_size, have_top, have_left, have_right, x, y, plane);
}
#endif
}
#if CONFIG_INTERINTRA
#if CONFIG_MASKED_INTERINTRA
#define MASK_WEIGHT_BITS_INTERINTRA 6
static int get_masked_weight_interintra(int m) {
#define SMOOTHER_LEN_INTERINTRA 32
static const uint8_t smoothfn[2 * SMOOTHER_LEN_INTERINTRA + 1] = {
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1,
1, 1, 2, 2, 3, 4, 5, 6,
8, 9, 12, 14, 17, 21, 24, 28,
32,
36, 40, 43, 47, 50, 52, 55, 56,
58, 59, 60, 61, 62, 62, 63, 63,
63, 63, 63, 64, 64, 64, 64, 64,
64, 64, 64, 64, 64, 64, 64, 64,
};
if (m < -SMOOTHER_LEN_INTERINTRA)
return 0;
else if (m > SMOOTHER_LEN_INTERINTRA)
return (1 << MASK_WEIGHT_BITS_INTERINTRA);
else
return smoothfn[m + SMOOTHER_LEN_INTERINTRA];
}
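The soft mask is driven by the signed distance f(x, y) = a[0]*(x - a[2]*w/4) + a[1]*(y - a[3]*h/4): pixels well inside one half-plane get weight 0, the other half-plane gets the full 64 (MASK_WEIGHT_BITS_INTERINTRA is 6), and a 32-sample ramp smooths the seam. A worked example for one pixel of an 8x8 block, using the first small-mask entry from the tables below:

#include <stdio.h>

int main(void) {
  const int a[4] = {-1, 2, 2, 2};        /* first mask_params_sml entry below */
  const int w = 8, h = 8, i = 1, j = 5;  /* pixel at (row 1, col 5) */
  const int x = j - (a[2] * w) / 4;      /* 5 - 4 = 1 */
  const int y = i - (a[3] * h) / 4;      /* 1 - 4 = -3 */
  const int m = a[0] * x + a[1] * y;     /* -1 - 6 = -7 */
  printf("m = %d\n", m);  /* get_masked_weight_interintra(-7) = smoothfn[25] = 9 */
  return 0;
}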
static int get_hard_mask_interintra(int m) {
return m > 0;
}
// Equation of line: f(x, y) = a[0]*(x - a[2]*w/4) + a[1]*(y - a[3]*h/4) = 0
// The soft mask is obtained by computing f(x, y) and then calling
// get_masked_weight_interintra(f(x, y)).
static const int mask_params_sml_interintra[1 << MASK_BITS_SML_INTERINTRA]
[4] = {
{-1, 2, 2, 2},
{ 1, -2, 2, 2},
{-2, 1, 2, 2},
{ 2, -1, 2, 2},
{ 2, 1, 2, 2},
{-2, -1, 2, 2},
{ 1, 2, 2, 2},
{-1, -2, 2, 2},
};
static const int mask_params_med_hgtw_interintra[1 << MASK_BITS_MED_INTERINTRA]
[4] = {
{-1, 2, 2, 2},
{ 1, -2, 2, 2},
{-2, 1, 2, 2},
{ 2, -1, 2, 2},
{ 2, 1, 2, 2},
{-2, -1, 2, 2},
{ 1, 2, 2, 2},
{-1, -2, 2, 2},
{-1, 2, 2, 1},
{ 1, -2, 2, 1},
{-1, 2, 2, 3},
{ 1, -2, 2, 3},
{ 1, 2, 2, 1},
{-1, -2, 2, 1},
{ 1, 2, 2, 3},
{-1, -2, 2, 3},
};
static const int mask_params_med_hltw_interintra[1 << MASK_BITS_MED_INTERINTRA]
[4] = {
{-1, 2, 2, 2},
{ 1, -2, 2, 2},
{-2, 1, 2, 2},
{ 2, -1, 2, 2},
{ 2, 1, 2, 2},
{-2, -1, 2, 2},
{ 1, 2, 2, 2},
{-1, -2, 2, 2},
{-2, 1, 1, 2},
{ 2, -1, 1, 2},
{-2, 1, 3, 2},
{ 2, -1, 3, 2},
{ 2, 1, 1, 2},
{-2, -1, 1, 2},
{ 2, 1, 3, 2},
{-2, -1, 3, 2},
};
static const int mask_params_med_heqw_interintra[1 << MASK_BITS_MED_INTERINTRA]
[4] = {
{-1, 2, 2, 2},
{ 1, -2, 2, 2},
{-2, 1, 2, 2},
{ 2, -1, 2, 2},
{ 2, 1, 2, 2},
{-2, -1, 2, 2},
{ 1, 2, 2, 2},
{-1, -2, 2, 2},
{ 0, 2, 0, 1},
{ 0, -2, 0, 1},
{ 0, 2, 0, 3},
{ 0, -2, 0, 3},
{ 2, 0, 1, 0},
{-2, 0, 1, 0},
{ 2, 0, 3, 0},
{-2, 0, 3, 0},
};
static const int mask_params_big_hgtw_interintra[1 << MASK_BITS_BIG_INTERINTRA]
[4] = {
{-1, 2, 2, 2},
{ 1, -2, 2, 2},
{-2, 1, 2, 2},
{ 2, -1, 2, 2},
{ 2, 1, 2, 2},
{-2, -1, 2, 2},
{ 1, 2, 2, 2},
{-1, -2, 2, 2},
{-1, 2, 2, 1},
{ 1, -2, 2, 1},
{-1, 2, 2, 3},
{ 1, -2, 2, 3},
{ 1, 2, 2, 1},
{-1, -2, 2, 1},
{ 1, 2, 2, 3},
{-1, -2, 2, 3},
{-2, 1, 1, 2},
{ 2, -1, 1, 2},
{-2, 1, 3, 2},
{ 2, -1, 3, 2},
{ 2, 1, 1, 2},
{-2, -1, 1, 2},
{ 2, 1, 3, 2},
{-2, -1, 3, 2},
{ 0, 2, 0, 1},
{ 0, -2, 0, 1},
{ 0, 2, 0, 2},
{ 0, -2, 0, 2},
{ 0, 2, 0, 3},
{ 0, -2, 0, 3},
{ 2, 0, 2, 0},
{-2, 0, 2, 0},
};
static const int mask_params_big_hltw_interintra[1 << MASK_BITS_BIG_INTERINTRA]
[4] = {
{-1, 2, 2, 2},
{ 1, -2, 2, 2},
{-2, 1, 2, 2},
{ 2, -1, 2, 2},
{ 2, 1, 2, 2},
{-2, -1, 2, 2},
{ 1, 2, 2, 2},
{-1, -2, 2, 2},
{-1, 2, 2, 1},
{ 1, -2, 2, 1},
{-1, 2, 2, 3},
{ 1, -2, 2, 3},
{ 1, 2, 2, 1},
{-1, -2, 2, 1},
{ 1, 2, 2, 3},
{-1, -2, 2, 3},
{-2, 1, 1, 2},
{ 2, -1, 1, 2},
{-2, 1, 3, 2},
{ 2, -1, 3, 2},
{ 2, 1, 1, 2},
{-2, -1, 1, 2},
{ 2, 1, 3, 2},
{-2, -1, 3, 2},
{ 0, 2, 0, 2},
{ 0, -2, 0, 2},
{ 2, 0, 1, 0},
{-2, 0, 1, 0},
{ 2, 0, 2, 0},
{-2, 0, 2, 0},
{ 2, 0, 3, 0},
{-2, 0, 3, 0},
};
static const int mask_params_big_heqw_interintra[1 << MASK_BITS_BIG_INTERINTRA]
[4] = {
{-1, 2, 2, 2},
{ 1, -2, 2, 2},
{-2, 1, 2, 2},
{ 2, -1, 2, 2},
{ 2, 1, 2, 2},
{-2, -1, 2, 2},
{ 1, 2, 2, 2},
{-1, -2, 2, 2},
{-1, 2, 2, 1},
{ 1, -2, 2, 1},
{-1, 2, 2, 3},
{ 1, -2, 2, 3},
{ 1, 2, 2, 1},
{-1, -2, 2, 1},
{ 1, 2, 2, 3},
{-1, -2, 2, 3},
{-2, 1, 1, 2},
{ 2, -1, 1, 2},
{-2, 1, 3, 2},
{ 2, -1, 3, 2},
{ 2, 1, 1, 2},
{-2, -1, 1, 2},
{ 2, 1, 3, 2},
{-2, -1, 3, 2},
{ 0, 2, 0, 1},
{ 0, -2, 0, 1},
{ 0, 2, 0, 3},
{ 0, -2, 0, 3},
{ 2, 0, 1, 0},
{-2, 0, 1, 0},
{ 2, 0, 3, 0},
{-2, 0, 3, 0},
};
static const int *get_mask_params_interintra(int mask_index,
BLOCK_SIZE sb_type,
int h, int w) {
const int *a;
const int mask_bits = get_mask_bits_interintra(sb_type);
if (mask_index == MASK_NONE_INTERINTRA)
return NULL;
if (mask_bits == MASK_BITS_SML_INTERINTRA) {
a = mask_params_sml_interintra[mask_index];
} else if (mask_bits == MASK_BITS_MED_INTERINTRA) {
if (h > w)
a = mask_params_med_hgtw_interintra[mask_index];
else if (h < w)
a = mask_params_med_hltw_interintra[mask_index];
else
a = mask_params_med_heqw_interintra[mask_index];
} else if (mask_bits == MASK_BITS_BIG_INTERINTRA) {
if (h > w)
a = mask_params_big_hgtw_interintra[mask_index];
else if (h < w)
a = mask_params_big_hltw_interintra[mask_index];
else
a = mask_params_big_heqw_interintra[mask_index];
} else {
assert(0);
}
return a;
}
void vp9_generate_masked_weight_interintra(int mask_index,
BLOCK_SIZE sb_type,
int h, int w,
uint8_t *mask, int stride) {
int i, j;
const int *a = get_mask_params_interintra(mask_index, sb_type, h, w);
if (!a) return;
for (i = 0; i < h; ++i)
for (j = 0; j < w; ++j) {
int x = (j - (a[2] * w) / 4);
int y = (i - (a[3] * h) / 4);
int m = a[0] * x + a[1] * y;
mask[i * stride + j] = get_masked_weight_interintra(m);
}
}
void vp9_generate_hard_mask_interintra(int mask_index, BLOCK_SIZE sb_type,
int h, int w, uint8_t *mask, int stride) {
int i, j;
const int *a = get_mask_params_interintra(mask_index, sb_type, h, w);
if (!a) return;
for (i = 0; i < h; ++i)
for (j = 0; j < w; ++j) {
int x = (j - (a[2] * w) / 4);
int y = (i - (a[3] * h) / 4);
int m = a[0] * x + a[1] * y;
mask[i * stride + j] = get_hard_mask_interintra(m);
}
}
#endif
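Usage sketch for the generator above (assuming the usual vp9 headers are in scope): filling a 16x16 soft mask whose entries are blend weights in 0..64.

uint8_t mask[16 * 16];
/* mask index 3, 16x16 block, stride 16 */
vp9_generate_masked_weight_interintra(3, BLOCK_16X16, 16, 16, mask, 16);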
static void combine_interintra(PREDICTION_MODE mode,
#if CONFIG_MASKED_INTERINTRA
int use_masked_interintra,
int mask_index,
BLOCK_SIZE bsize,
#endif
uint8_t *comppred,
int compstride,
uint8_t *interpred,
int interstride,
uint8_t *intrapred,
int intrastride,
int bw, int bh) {
static const int scale_bits = 8;
static const int scale_max = 256;
static const int scale_round = 127;
static const int weights1d[64] = {
128, 125, 122, 119, 116, 114, 111, 109,
107, 105, 103, 101, 99, 97, 96, 94,
93, 91, 90, 89, 88, 86, 85, 84,
83, 82, 81, 81, 80, 79, 78, 78,
77, 76, 76, 75, 75, 74, 74, 73,
73, 72, 72, 71, 71, 71, 70, 70,
70, 70, 69, 69, 69, 69, 68, 68,
68, 68, 68, 67, 67, 67, 67, 67,
};
int size = MAX(bw, bh);
int size_scale = (size >= 64 ? 1 :
size == 32 ? 2 :
size == 16 ? 4 :
size == 8 ? 8 : 16);
int i, j;
#if CONFIG_MASKED_INTERINTRA
if (use_masked_interintra && get_mask_bits_interintra(bsize)) {
uint8_t mask[4096];
vp9_generate_masked_weight_interintra(mask_index, bsize, bh, bw, mask, bw);
for (i = 0; i < bh; ++i) {
for (j = 0; j < bw; ++j) {
int m = mask[i * bw + j];
comppred[i * compstride + j] =
(intrapred[i * intrastride + j] * m +
interpred[i * interstride + j] *
((1 << MASK_WEIGHT_BITS_INTERINTRA) - m) +
(1 << (MASK_WEIGHT_BITS_INTERINTRA - 1))) >>
MASK_WEIGHT_BITS_INTERINTRA;
}
}
return;
}
#endif
switch (mode) {
case V_PRED:
for (i = 0; i < bh; ++i) {
for (j = 0; j < bw; ++j) {
int scale = weights1d[i * size_scale];
comppred[i * compstride + j] =
((scale_max - scale) * interpred[i * interstride + j] +
scale * intrapred[i * intrastride + j] + scale_round)
>> scale_bits;
}
}
break;
case H_PRED:
for (i = 0; i < bh; ++i) {
for (j = 0; j < bw; ++j) {
int scale = weights1d[j * size_scale];
comppred[i * compstride + j] =
((scale_max - scale) * interpred[i * interstride + j] +
scale * intrapred[i * intrastride + j] + scale_round)
>> scale_bits;
}
}
break;
case D63_PRED:
case D117_PRED:
for (i = 0; i < bh; ++i) {
for (j = 0; j < bw; ++j) {
int scale = (weights1d[i * size_scale] * 3 +
weights1d[j * size_scale]) >> 2;
comppred[i * compstride + j] =
((scale_max - scale) * interpred[i * interstride + j] +
scale * intrapred[i * intrastride + j] + scale_round)
>> scale_bits;
}
}
break;
case D207_PRED:
case D153_PRED:
for (i = 0; i < bh; ++i) {
for (j = 0; j < bw; ++j) {
int scale = (weights1d[j * size_scale] * 3 +
weights1d[i * size_scale]) >> 2;
comppred[i * compstride + j] =
((scale_max - scale) * interpred[i * interstride + j] +
scale * intrapred[i * intrastride + j] + scale_round)
>> scale_bits;
}
}
break;
case D135_PRED:
for (i = 0; i < bh; ++i) {
for (j = 0; j < bw; ++j) {
int scale = weights1d[(i < j ? i : j) * size_scale];
comppred[i * compstride + j] =
((scale_max - scale) * interpred[i * interstride + j] +
scale * intrapred[i * intrastride + j] + scale_round)
>> scale_bits;
}
}
break;
case D45_PRED:
for (i = 0; i < bh; ++i) {
for (j = 0; j < bw; ++j) {
int scale = (weights1d[i * size_scale] +
weights1d[j * size_scale]) >> 1;
comppred[i * compstride + j] =
((scale_max - scale) * interpred[i * interstride + j] +
scale * intrapred[i * intrastride + j] + scale_round)
>> scale_bits;
}
}
break;
case TM_PRED:
case DC_PRED:
default:
for (i = 0; i < bh; ++i) {
for (j = 0; j < bw; ++j) {
comppred[i * compstride + j] = (interpred[i * interstride + j] +
intrapred[i * intrastride + j]) >> 1;
}
}
break;
}
}
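combine_interintra() fades between the two predictors along the intra mode's dominant direction: weights1d decays from 128 to 67, so the intra predictor contributes at most half near the known edge and progressively less far from it, with the final pixel given by ((256 - s) * inter + s * intra + 127) >> 8. A worked example, hedged to V_PRED on a 16x16 block:

#include <stdio.h>

int main(void) {
  /* Row i = 2 of a 16x16 block: size_scale = 4, so s = weights1d[8] = 107. */
  const int s = 107;
  const int inter = 100, intra = 200;  /* example predictor samples */
  const int blended = ((256 - s) * inter + s * intra + 127) >> 8;
  printf("%d\n", blended);  /* 142: the intra weight (107/256) decays with i */
  return 0;
}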
static void build_intra_predictors_for_2nd_block_interintra
(const MACROBLOCKD *xd, const uint8_t *ref,
int ref_stride, uint8_t *dst, int dst_stride,
PREDICTION_MODE mode, TX_SIZE tx_size,
int up_available, int left_available,
int right_available, int bwltbh,
int x, int y, int plane) {
int i;
DECLARE_ALIGNED_ARRAY(16, uint8_t, left_col, 64);
DECLARE_ALIGNED_ARRAY(16, uint8_t, above_data, 128 + 16);
uint8_t *above_row = above_data + 16;
const uint8_t *const_above_row = above_row;
const int bs = 4 << tx_size;
int frame_width, frame_height;
int x0, y0;
const struct macroblockd_plane *const pd = &xd->plane[plane];
const uint8_t *ref_fi;
int ref_stride_fi;
// 127 127 127 .. 127 127 127 127 127 127
// 129 A B .. Y Z
// 129 C D .. W X
// 129 E F .. U V
// 129 G H .. S T T T T T
// ..
once(init_intra_pred_fn_ptrs);
// Get current frame pointer, width and height.
if (plane == 0) {
frame_width = xd->cur_buf->y_width;
frame_height = xd->cur_buf->y_height;
} else {
frame_width = xd->cur_buf->uv_width;
frame_height = xd->cur_buf->uv_height;
}
// Get block position in current frame.
x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x;
y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y;
vpx_memset(left_col, 129, 64);
// left
if (left_available) {
if (bwltbh) {
ref_fi = ref;
ref_stride_fi = ref_stride;
} else {
ref_fi = dst;
ref_stride_fi = dst_stride;
}
if (xd->mb_to_bottom_edge < 0) {
/* slower path if the block needs border extension */
if (y0 + bs <= frame_height) {
for (i = 0; i < bs; ++i)
left_col[i] = ref_fi[i * ref_stride_fi - 1];
} else {
const int extend_bottom = frame_height - y0;
assert(extend_bottom >= 0);
for (i = 0; i < extend_bottom; ++i)
left_col[i] = ref_fi[i * ref_stride_fi - 1];
for (; i < bs; ++i)
left_col[i] = ref_fi[(extend_bottom - 1) * ref_stride_fi - 1];
}
} else {
/* faster path if the block does not need extension */
for (i = 0; i < bs; ++i)
left_col[i] = ref_fi[i * ref_stride_fi - 1];
}
}
// TODO(hkuang) do not extend 2*bs pixels for all modes.
// above
if (up_available) {
const uint8_t *above_ref;
if (bwltbh) {
ref_fi = dst;
ref_stride_fi = dst_stride;
above_row[-1] = left_available ? ref[-ref_stride-1] : 129;
} else {
ref_fi = ref;
ref_stride_fi = ref_stride;
above_row[-1] = ref[-ref_stride-1];
}
above_ref = ref_fi - ref_stride_fi;
if (xd->mb_to_right_edge < 0) {
/* slower path if the block needs border extension */
if (x0 + 2 * bs <= frame_width) {
if (right_available && bs == 4) {
vpx_memcpy(above_row, above_ref, 2 * bs);
} else {
vpx_memcpy(above_row, above_ref, bs);
vpx_memset(above_row + bs, above_row[bs - 1], bs);
}
} else if (x0 + bs <= frame_width) {
const int r = frame_width - x0;
if (right_available && bs == 4) {
vpx_memcpy(above_row, above_ref, r);
vpx_memset(above_row + r, above_row[r - 1],
x0 + 2 * bs - frame_width);
} else {
vpx_memcpy(above_row, above_ref, bs);
vpx_memset(above_row + bs, above_row[bs - 1], bs);
}
} else if (x0 <= frame_width) {
const int r = frame_width - x0;
assert(r >= 0);
if (right_available && bs == 4) {
vpx_memcpy(above_row, above_ref, r);
vpx_memset(above_row + r, above_row[r - 1],
x0 + 2 * bs - frame_width);
} else {
vpx_memcpy(above_row, above_ref, r);
vpx_memset(above_row + r, above_row[r - 1],
x0 + 2 * bs - frame_width);
}
}
} else {
/* faster path if the block does not need extension */
if (bs == 4 && right_available && left_available) {
const_above_row = above_ref;
} else {
vpx_memcpy(above_row, above_ref, bs);
if (bs == 4 && right_available)
vpx_memcpy(above_row + bs, above_ref + bs, bs);
else
vpx_memset(above_row + bs, above_row[bs - 1], bs);
}
}
} else {
vpx_memset(above_row, 127, bs * 2);
above_row[-1] = 127;
}
// predict
if (mode == DC_PRED) {
dc_pred[left_available][up_available][tx_size](dst, dst_stride,
const_above_row, left_col);
} else {
pred[mode][tx_size](dst, dst_stride, const_above_row, left_col);
}
}
// Break down rectangular intra prediction for joint spatio-temporal prediction
// into two square intra predictions.
static void build_intra_predictors_for_interintra(MACROBLOCKD *xd,
uint8_t *src, int src_stride,
uint8_t *pred_ptr, int stride,
PREDICTION_MODE mode,
int bw, int bh,
int up_available, int left_available,
int right_available, int plane) {
if (bw == bh) {
build_intra_predictors(xd, src, src_stride, pred_ptr, stride,
mode, intra_size_log2_for_interintra(bw),
up_available, left_available, right_available,
0, 0, plane);
} else if (bw < bh) {
uint8_t *src_bottom = src + bw * src_stride;
uint8_t *pred_ptr_bottom = pred_ptr + bw * stride;
build_intra_predictors(xd, src, src_stride, pred_ptr, stride,
mode, intra_size_log2_for_interintra(bw),
up_available, left_available, right_available,
0, 0, plane);
build_intra_predictors_for_2nd_block_interintra(xd, src_bottom, src_stride,
pred_ptr_bottom, stride,
mode, intra_size_log2_for_interintra(bw),
up_available, left_available, 0, 1,
0, bw, plane);
} else {
uint8_t *src_right = src + bh;
uint8_t *pred_ptr_right = pred_ptr + bh;
build_intra_predictors(xd, src, src_stride, pred_ptr, stride,
mode, intra_size_log2_for_interintra(bh),
up_available, left_available, 1,
0, 0, plane);
build_intra_predictors_for_2nd_block_interintra(xd, src_right, src_stride,
pred_ptr_right, stride,
mode, intra_size_log2_for_interintra(bh),
up_available, left_available, right_available, 0,
bh, 0, plane);
}
}
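Rectangular interintra blocks are predicted as two squares. For a tall block (bw < bh) the second square sits bw rows down and, via the bwltbh flag, borrows its above row from the first square's freshly built prediction rather than from the reconstructed frame; wide blocks mirror this on the left column. A sketch of the geometry with hypothetical names:

#include <stdint.h>

/* For a bw x bh rectangle with bw < bh: square 0 covers rows [0, bw),
 * square 1 covers rows [bw, 2*bw) and reuses square 0 as its top border. */
static void split_tall_block(uint8_t *pred, int stride, int bw,
                             uint8_t **square0, uint8_t **square1) {
  *square0 = pred;                /* bordered by frame neighbours */
  *square1 = pred + bw * stride;  /* bordered above by square0's prediction */
}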
void vp9_build_interintra_predictors_sby(MACROBLOCKD *xd,
uint8_t *ypred,
int ystride,
BLOCK_SIZE bsize) {
int bw = 4 << b_width_log2(bsize);
int bh = 4 << b_height_log2(bsize);
uint8_t intrapredictor[4096];
build_intra_predictors_for_interintra(
xd, xd->plane[0].dst.buf, xd->plane[0].dst.stride,
intrapredictor, bw,
xd->mi[0]->mbmi.interintra_mode, bw, bh,
xd->up_available, xd->left_available, 0, 0);
combine_interintra(xd->mi[0]->mbmi.interintra_mode,
#if CONFIG_MASKED_INTERINTRA
xd->mi[0]->mbmi.use_masked_interintra,
xd->mi[0]->mbmi.interintra_mask_index,
bsize,
#endif
xd->plane[0].dst.buf, xd->plane[0].dst.stride,
ypred, ystride, intrapredictor, bw, bw, bh);
}
void vp9_build_interintra_predictors_sbuv(MACROBLOCKD *xd,
uint8_t *upred,
uint8_t *vpred,
int ustride, int vstride,
BLOCK_SIZE bsize) {
int bwl = b_width_log2(bsize), bw = 2 << bwl;
int bhl = b_height_log2(bsize), bh = 2 << bhl;
uint8_t uintrapredictor[1024];
uint8_t vintrapredictor[1024];
build_intra_predictors_for_interintra(
xd, xd->plane[1].dst.buf, xd->plane[1].dst.stride,
uintrapredictor, bw,
xd->mi[0]->mbmi.interintra_uv_mode, bw, bh,
xd->up_available, xd->left_available, 0, 1);
build_intra_predictors_for_interintra(
xd, xd->plane[2].dst.buf, xd->plane[2].dst.stride,
vintrapredictor, bw,
xd->mi[0]->mbmi.interintra_uv_mode, bw, bh,
xd->up_available, xd->left_available, 0, 2);
combine_interintra(xd->mi[0]->mbmi.interintra_uv_mode,
#if CONFIG_MASKED_INTERINTRA
xd->mi[0]->mbmi.use_masked_interintra,
xd->mi[0]->mbmi.interintra_uv_mask_index,
bsize,
#endif
xd->plane[1].dst.buf, xd->plane[1].dst.stride,
upred, ustride, uintrapredictor, bw, bw, bh);
combine_interintra(xd->mi[0]->mbmi.interintra_uv_mode,
#if CONFIG_MASKED_INTERINTRA
xd->mi[0]->mbmi.use_masked_interintra,
xd->mi[0]->mbmi.interintra_uv_mask_index,
bsize,
#endif
xd->plane[2].dst.buf, xd->plane[2].dst.stride,
vpred, vstride, vintrapredictor, bw, bw, bh);
}
void vp9_build_interintra_predictors(MACROBLOCKD *xd,
uint8_t *ypred,
uint8_t *upred,
uint8_t *vpred,
int ystride, int ustride, int vstride,
BLOCK_SIZE bsize) {
vp9_build_interintra_predictors_sby(xd, ypred, ystride, bsize);
vp9_build_interintra_predictors_sbuv(xd, upred, vpred,
ustride, vstride, bsize);
}
#endif

View File

@@ -20,9 +20,37 @@ extern "C" {
void vp9_predict_intra_block(const MACROBLOCKD *xd, int block_idx, int bwl_in,
TX_SIZE tx_size, PREDICTION_MODE mode,
#if CONFIG_FILTERINTRA
int filterbit,
#endif
const uint8_t *ref, int ref_stride,
uint8_t *dst, int dst_stride,
int aoff, int loff, int plane);
#if CONFIG_INTERINTRA
void vp9_build_interintra_predictors(MACROBLOCKD *xd,
uint8_t *ypred,
uint8_t *upred,
uint8_t *vpred,
int ystride,
int ustride,
int vstride,
BLOCK_SIZE bsize);
void vp9_build_interintra_predictors_sby(MACROBLOCKD *xd,
uint8_t *ypred,
int ystride,
BLOCK_SIZE bsize);
void vp9_build_interintra_predictors_sbuv(MACROBLOCKD *xd,
uint8_t *upred,
uint8_t *vpred,
int ustride, int vstride,
BLOCK_SIZE bsize);
#if CONFIG_MASKED_INTERINTRA
void vp9_generate_masked_weight_interintra(int mask_index,
BLOCK_SIZE sb_type,
int h, int w,
uint8_t *mask, int stride);
#endif
#endif
#ifdef __cplusplus
} // extern "C"
#endif

View File

@@ -506,6 +506,125 @@ specialize qw/vp9_sub_pixel_variance8x8/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_sub_pixel_avg_variance8x8/, "$sse2_x86inc", "$ssse3_x86inc";
if ((vpx_config("CONFIG_MASKED_INTERINTER") eq "yes") || ((vpx_config("CONFIG_INTERINTRA") eq "yes") && (vpx_config("CONFIG_MASKED_INTERINTRA") eq "yes"))) {
add_proto qw/unsigned int vp9_masked_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_variance32x16/;
add_proto qw/unsigned int vp9_masked_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_variance16x32/;
add_proto qw/unsigned int vp9_masked_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_variance64x32/;
add_proto qw/unsigned int vp9_masked_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_variance32x64/;
add_proto qw/unsigned int vp9_masked_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_variance32x32/;
add_proto qw/unsigned int vp9_masked_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_variance64x64/;
add_proto qw/unsigned int vp9_masked_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_variance16x16/;
add_proto qw/unsigned int vp9_masked_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_variance16x8/;
add_proto qw/unsigned int vp9_masked_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_variance8x16/;
add_proto qw/unsigned int vp9_masked_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_variance8x8/;
add_proto qw/unsigned int vp9_masked_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_variance8x4/;
add_proto qw/unsigned int vp9_masked_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_variance4x8/;
add_proto qw/unsigned int vp9_masked_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_variance4x4/;
add_proto qw/unsigned int vp9_masked_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_sub_pixel_variance64x64/;
add_proto qw/unsigned int vp9_masked_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_sub_pixel_variance32x64/;
add_proto qw/unsigned int vp9_masked_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_sub_pixel_variance64x32/;
add_proto qw/unsigned int vp9_masked_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_sub_pixel_variance32x16/;
add_proto qw/unsigned int vp9_masked_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_sub_pixel_variance16x32/;
add_proto qw/unsigned int vp9_masked_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_sub_pixel_variance32x32/;
add_proto qw/unsigned int vp9_masked_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_sub_pixel_variance16x16/;
add_proto qw/unsigned int vp9_masked_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_sub_pixel_variance8x16/;
add_proto qw/unsigned int vp9_masked_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_sub_pixel_variance16x8/;
add_proto qw/unsigned int vp9_masked_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_sub_pixel_variance8x8/;
add_proto qw/unsigned int vp9_masked_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_sub_pixel_variance8x4/;
add_proto qw/unsigned int vp9_masked_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_sub_pixel_variance4x8/;
add_proto qw/unsigned int vp9_masked_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_sub_pixel_variance4x4/;
add_proto qw/unsigned int vp9_masked_sad64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
specialize qw/vp9_masked_sad64x64/;
add_proto qw/unsigned int vp9_masked_sad32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
specialize qw/vp9_masked_sad32x64/;
add_proto qw/unsigned int vp9_masked_sad64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
specialize qw/vp9_masked_sad64x32/;
add_proto qw/unsigned int vp9_masked_sad32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
specialize qw/vp9_masked_sad32x16/;
add_proto qw/unsigned int vp9_masked_sad16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
specialize qw/vp9_masked_sad16x32/;
add_proto qw/unsigned int vp9_masked_sad32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
specialize qw/vp9_masked_sad32x32/;
add_proto qw/unsigned int vp9_masked_sad16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
specialize qw/vp9_masked_sad16x16/;
add_proto qw/unsigned int vp9_masked_sad16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
specialize qw/vp9_masked_sad16x8/;
add_proto qw/unsigned int vp9_masked_sad8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
specialize qw/vp9_masked_sad8x16/;
add_proto qw/unsigned int vp9_masked_sad8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
specialize qw/vp9_masked_sad8x8/;
add_proto qw/unsigned int vp9_masked_sad8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
specialize qw/vp9_masked_sad8x4/;
add_proto qw/unsigned int vp9_masked_sad4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
specialize qw/vp9_masked_sad4x8/;
add_proto qw/unsigned int vp9_masked_sad4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
specialize qw/vp9_masked_sad4x4/;
}
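The prototypes above register C fallbacks for mask-weighted SAD and variance. Conceptually, a masked SAD is an absolute-difference sum weighted by the per-pixel mask; a plain-C sketch of the idea (assuming 6-bit mask weights, i.e. values 0..64 as in the interintra masks above; the shipped kernels may renormalize differently):

#include <stdint.h>
#include <stdlib.h>

static unsigned int masked_sad(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride,
                               const uint8_t *mask, int mask_stride,
                               int w, int h) {
  unsigned int sad = 0;
  int i, j;
  for (i = 0; i < h; i++)
    for (j = 0; j < w; j++)
      sad += mask[i * mask_stride + j] *
             abs(src[i * src_stride + j] - ref[i * ref_stride + j]);
  return (sad + 31) >> 6;  /* one plausible renormalization by the 0..64 range */
}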
# TODO(jingning): need to convert 8x4/4x8 functions into mmx/sse form
add_proto qw/unsigned int vp9_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_sub_pixel_variance8x4/, "$sse2_x86inc", "$ssse3_x86inc";
@@ -714,9 +833,6 @@ specialize qw/vp9_block_error avx2/, "$sse2_x86inc";
add_proto qw/void vp9_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
specialize qw/vp9_subtract_block/, "$sse2_x86inc";
add_proto qw/void vp9_quantize_fp/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_quantize_fp/, "$ssse3_x86_64";
add_proto qw/void vp9_quantize_b/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_quantize_b/, "$ssse3_x86_64";

View File

@@ -33,6 +33,14 @@ static int get_fixed_point_scale_factor(int other_size, int this_size) {
return (other_size << REF_SCALE_SHIFT) / this_size;
}
static int check_scale_factors(int other_w, int other_h,
int this_w, int this_h) {
return 2 * this_w >= other_w &&
2 * this_h >= other_h &&
this_w <= 16 * other_w &&
this_h <= 16 * other_h;
}
MV32 vp9_scale_mv(const MV *mv, int x, int y, const struct scale_factors *sf) {
const int x_off_q4 = scaled_x(x << SUBPEL_BITS, sf) & SUBPEL_MASK;
const int y_off_q4 = scaled_y(y << SUBPEL_BITS, sf) & SUBPEL_MASK;
@@ -46,7 +54,7 @@ MV32 vp9_scale_mv(const MV *mv, int x, int y, const struct scale_factors *sf) {
void vp9_setup_scale_factors_for_frame(struct scale_factors *sf,
int other_w, int other_h,
int this_w, int this_h) {
if (!valid_ref_frame_size(other_w, other_h, this_w, this_h)) {
if (!check_scale_factors(other_w, other_h, this_w, this_h)) {
sf->x_scale_fp = REF_INVALID_SCALE;
sf->y_scale_fp = REF_INVALID_SCALE;
return;

View File

@@ -50,14 +50,6 @@ static INLINE int vp9_is_scaled(const struct scale_factors *sf) {
(sf->x_scale_fp != REF_NO_SCALE || sf->y_scale_fp != REF_NO_SCALE);
}
static INLINE int valid_ref_frame_size(int ref_width, int ref_height,
int this_width, int this_height) {
return 2 * this_width >= ref_width &&
2 * this_height >= ref_height &&
this_width <= 16 * ref_width &&
this_height <= 16 * ref_height;
}
#ifdef __cplusplus
} // extern "C"
#endif

View File

@@ -254,11 +254,24 @@ static void predict_and_reconstruct_intra_block(int plane, int block,
: mi->mbmi.uv_mode;
int x, y;
uint8_t *dst;
#if CONFIG_FILTERINTRA
int fbit;
if (plane == 0)
if (mi->mbmi.sb_type < BLOCK_8X8)
fbit = mi->b_filter_info[block];
else
fbit = is_filter_enabled(tx_size) ? mi->mbmi.filterbit : 0;
else
fbit = is_filter_enabled(tx_size) ? mi->mbmi.uv_filterbit : 0;
#endif
txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y);
dst = &pd->dst.buf[4 * y * pd->dst.stride + 4 * x];
vp9_predict_intra_block(xd, block >> (tx_size << 1),
b_width_log2(plane_bsize), tx_size, mode,
#if CONFIG_FILTERINTRA
fbit,
#endif
dst, pd->dst.stride, dst, pd->dst.stride,
x, y, plane);
@@ -322,6 +335,84 @@ static MB_MODE_INFO *set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd,
return &xd->mi[0]->mbmi;
}
#if CONFIG_SUPERTX
static void set_offsets_extend(VP9_COMMON *const cm, MACROBLOCKD *const xd,
const TileInfo *const tile,
BLOCK_SIZE top_bsize,
int mi_row, int mi_col,
int mi_row_ori, int mi_col_ori) {
const int bw = num_8x8_blocks_wide_lookup[top_bsize];
const int bh = num_8x8_blocks_high_lookup[top_bsize];
const int offset = mi_row * cm->mi_stride + mi_col;
xd->mi = cm->mi_grid_visible + offset;
xd->mi[0] = &cm->mi[offset];
set_mi_row_col(xd, tile, mi_row_ori, bh, mi_col_ori, bw,
cm->mi_rows, cm->mi_cols);
}
static void set_mb_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd,
const TileInfo *const tile,
BLOCK_SIZE bsize, int mi_row, int mi_col) {
const int bw = num_8x8_blocks_wide_lookup[bsize];
const int bh = num_8x8_blocks_high_lookup[bsize];
const int x_mis = MIN(bw, cm->mi_cols - mi_col);
const int y_mis = MIN(bh, cm->mi_rows - mi_row);
const int offset = mi_row * cm->mi_stride + mi_col;
int x, y;
xd->mi = cm->mi_grid_visible + offset;
xd->mi[0] = &cm->mi[offset];
xd->mi[0]->mbmi.sb_type = bsize;
for (y = 0; y < y_mis; ++y)
for (x = !y; x < x_mis; ++x)
xd->mi[y * cm->mi_stride + x] = xd->mi[0];
set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
}
static void set_offsets_topblock(VP9_COMMON *const cm, MACROBLOCKD *const xd,
const TileInfo *const tile,
BLOCK_SIZE bsize, int mi_row, int mi_col) {
const int bw = num_8x8_blocks_wide_lookup[bsize];
const int bh = num_8x8_blocks_high_lookup[bsize];
const int offset = mi_row * cm->mi_stride + mi_col;
xd->mi = cm->mi_grid_visible + offset;
xd->mi[0] = &cm->mi[offset];
set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
vp9_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col);
}
static void set_param_topblock(VP9_COMMON *const cm, MACROBLOCKD *const xd,
BLOCK_SIZE bsize, int mi_row, int mi_col,
#if CONFIG_EXT_TX
int txfm,
#endif
int skip) {
const int bw = num_8x8_blocks_wide_lookup[bsize];
const int bh = num_8x8_blocks_high_lookup[bsize];
const int x_mis = MIN(bw, cm->mi_cols - mi_col);
const int y_mis = MIN(bh, cm->mi_rows - mi_row);
const int offset = mi_row * cm->mi_stride + mi_col;
int x, y;
xd->mi = cm->mi_grid_visible + offset;
xd->mi[0] = &cm->mi[offset];
for (y = 0; y < y_mis; ++y)
for (x = 0; x < x_mis; ++x) {
xd->mi[y * cm->mi_stride + x]->mbmi.skip = skip;
#if CONFIG_EXT_TX
xd->mi[y * cm->mi_stride + x]->mbmi.ext_txfrm = txfm;
#endif
}
}
#endif
static void set_ref(VP9_COMMON *const cm, MACROBLOCKD *const xd,
int idx, int mi_row, int mi_col) {
MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
@@ -335,14 +426,246 @@ static void set_ref(VP9_COMMON *const cm, MACROBLOCKD *const xd,
xd->corrupted |= ref_buffer->buf->corrupted;
}
#if CONFIG_SUPERTX
static void dec_predict_b_extend(VP9_COMMON *const cm, MACROBLOCKD *const xd,
const TileInfo *const tile,
int mi_row, int mi_col,
int mi_row_ori, int mi_col_ori,
BLOCK_SIZE top_bsize) {
set_offsets_extend(cm, xd, tile, top_bsize, mi_row, mi_col,
mi_row_ori, mi_col_ori);
set_ref(cm, xd, 0, mi_row_ori, mi_col_ori);
if (has_second_ref(&xd->mi[0]->mbmi))
set_ref(cm, xd, 1, mi_row_ori, mi_col_ori);
xd->mi[0]->mbmi.tx_size = b_width_log2(top_bsize);
#if !CONFIG_MASKED_INTERINTER
vp9_dec_build_inter_predictors_sb(xd, mi_row_ori, mi_col_ori, top_bsize);
#else
vp9_dec_build_inter_predictors_sb_extend(xd, mi_row, mi_col,
mi_row_ori, mi_col_ori, top_bsize);
#endif
}
static void dec_predict_b_sub8x8_extend(VP9_COMMON *const cm,
MACROBLOCKD *const xd,
const TileInfo *const tile,
int mi_row, int mi_col,
int mi_row_ori, int mi_col_ori,
BLOCK_SIZE top_bsize,
PARTITION_TYPE partition) {
set_offsets_extend(cm, xd, tile, top_bsize, mi_row, mi_col,
mi_row_ori, mi_col_ori);
set_ref(cm, xd, 0, mi_row_ori, mi_col_ori);
if (has_second_ref(&xd->mi[0]->mbmi))
set_ref(cm, xd, 1, mi_row_ori, mi_col_ori);
xd->mi[0]->mbmi.tx_size = b_width_log2(top_bsize);
vp9_dec_build_inter_predictors_sby_sub8x8_extend(xd, mi_row, mi_col,
mi_row_ori, mi_col_ori,
top_bsize, partition);
vp9_dec_build_inter_predictors_sbuv_sub8x8_extend(xd,
#if CONFIG_MASKED_INTERINTER
mi_row, mi_col,
#endif
mi_row_ori, mi_col_ori,
top_bsize);
}
static void dec_predict_sb_complex(VP9_COMMON *const cm, MACROBLOCKD *const xd,
const TileInfo *const tile,
int mi_row, int mi_col,
int mi_row_ori, int mi_col_ori,
BLOCK_SIZE bsize, BLOCK_SIZE top_bsize,
uint8_t *dst_buf[3], int dst_stride[3]) {
const int bsl = b_width_log2(bsize), hbs = (1 << bsl) / 4;
PARTITION_TYPE partition;
BLOCK_SIZE subsize;
MB_MODE_INFO *mbmi;
int i, offset = mi_row * cm->mi_stride + mi_col;
DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf1, MAX_MB_PLANE * 32 * 32);
DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf2, MAX_MB_PLANE * 32 * 32);
DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf3, MAX_MB_PLANE * 32 * 32);
uint8_t *dst_buf1[3] = {tmp_buf1, tmp_buf1 + 32 * 32, tmp_buf1 + 2 * 32 * 32};
uint8_t *dst_buf2[3] = {tmp_buf2, tmp_buf2 + 32 * 32, tmp_buf2 + 2 * 32 * 32};
uint8_t *dst_buf3[3] = {tmp_buf3, tmp_buf3 + 32 * 32, tmp_buf3 + 2 * 32 * 32};
int dst_stride1[3] = {32, 32, 32};
int dst_stride2[3] = {32, 32, 32};
int dst_stride3[3] = {32, 32, 32};
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
xd->mi = cm->mi_grid_visible + offset;
xd->mi[0] = &cm->mi[offset];
mbmi = &xd->mi[0]->mbmi;
partition = partition_lookup[bsl][mbmi->sb_type];
subsize = get_subsize(bsize, partition);
for (i = 0; i < MAX_MB_PLANE; i++) {
xd->plane[i].dst.buf = dst_buf[i];
xd->plane[i].dst.stride = dst_stride[i];
}
switch (partition) {
case PARTITION_NONE:
assert(bsize < top_bsize);
dec_predict_b_extend(cm, xd, tile, mi_row, mi_col, mi_row_ori, mi_col_ori,
top_bsize);
break;
case PARTITION_HORZ:
if (bsize > BLOCK_8X8) {
dec_predict_b_extend(cm, xd, tile, mi_row, mi_col, mi_row_ori,
mi_col_ori, top_bsize);
} else {
dec_predict_b_sub8x8_extend(cm, xd, tile, mi_row, mi_col,
mi_row_ori, mi_col_ori,
top_bsize, partition);
}
if (mi_row + hbs < cm->mi_rows && bsize > BLOCK_8X8) {
for (i = 0; i < MAX_MB_PLANE; i++) {
xd->plane[i].dst.buf = tmp_buf1 + i * 32 * 32;
xd->plane[i].dst.stride = 32;
}
dec_predict_b_extend(cm, xd, tile, mi_row + hbs, mi_col,
mi_row_ori, mi_col_ori, top_bsize);
for (i = 0; i < MAX_MB_PLANE; i++) {
xd->plane[i].dst.buf = dst_buf[i];
xd->plane[i].dst.stride = dst_stride[i];
vp9_build_masked_inter_predictor_complex(dst_buf[i], dst_stride[i],
dst_buf1[i], dst_stride1[i],
i,
mi_row, mi_col,
mi_row_ori, mi_col_ori,
bsize, top_bsize,
PARTITION_HORZ);
}
}
break;
case PARTITION_VERT:
if (bsize > BLOCK_8X8) {
dec_predict_b_extend(cm, xd, tile, mi_row, mi_col, mi_row_ori,
mi_col_ori, top_bsize);
} else {
dec_predict_b_sub8x8_extend(cm, xd, tile, mi_row, mi_col,
mi_row_ori, mi_col_ori,
top_bsize, partition);
}
if (mi_col + hbs < cm->mi_cols && bsize > BLOCK_8X8) {
for (i = 0; i < MAX_MB_PLANE; i++) {
xd->plane[i].dst.buf = tmp_buf1 + i * 32 * 32;
xd->plane[i].dst.stride = 32;
}
dec_predict_b_extend(cm, xd, tile, mi_row, mi_col + hbs, mi_row_ori,
mi_col_ori, top_bsize);
for (i = 0; i < MAX_MB_PLANE; i++) {
xd->plane[i].dst.buf = dst_buf[i];
xd->plane[i].dst.stride = dst_stride[i];
vp9_build_masked_inter_predictor_complex(dst_buf[i], dst_stride[i],
dst_buf1[i], dst_stride1[i],
i,
mi_row, mi_col,
mi_row_ori, mi_col_ori,
bsize, top_bsize,
PARTITION_VERT);
}
}
break;
case PARTITION_SPLIT:
if (bsize == BLOCK_8X8) {
dec_predict_b_sub8x8_extend(cm, xd, tile, mi_row, mi_col,
mi_row_ori, mi_col_ori,
top_bsize, partition);
} else {
dec_predict_sb_complex(cm, xd, tile, mi_row, mi_col,
mi_row_ori, mi_col_ori, subsize, top_bsize,
dst_buf, dst_stride);
if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols)
dec_predict_sb_complex(cm, xd, tile, mi_row, mi_col + hbs,
mi_row_ori, mi_col_ori, subsize, top_bsize,
dst_buf1, dst_stride1);
if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols)
dec_predict_sb_complex(cm, xd, tile, mi_row + hbs, mi_col,
mi_row_ori, mi_col_ori, subsize, top_bsize,
dst_buf2, dst_stride2);
if (mi_row + hbs < cm->mi_rows && mi_col + hbs < cm->mi_cols)
dec_predict_sb_complex(cm, xd, tile, mi_row + hbs, mi_col + hbs,
mi_row_ori, mi_col_ori, subsize, top_bsize,
dst_buf3, dst_stride3);
for (i = 0; i < MAX_MB_PLANE; i++) {
if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols) {
vp9_build_masked_inter_predictor_complex(dst_buf[i], dst_stride[i],
dst_buf1[i],
dst_stride1[i],
i, mi_row, mi_col,
mi_row_ori, mi_col_ori,
bsize, top_bsize,
PARTITION_VERT);
if (mi_row + hbs < cm->mi_rows) {
vp9_build_masked_inter_predictor_complex(dst_buf2[i],
dst_stride2[i],
dst_buf3[i],
dst_stride3[i],
i, mi_row, mi_col,
mi_row_ori, mi_col_ori,
bsize, top_bsize,
PARTITION_VERT);
vp9_build_masked_inter_predictor_complex(dst_buf[i],
dst_stride[i],
dst_buf2[i],
dst_stride2[i],
i, mi_row, mi_col,
mi_row_ori, mi_col_ori,
bsize, top_bsize,
PARTITION_HORZ);
}
} else if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols) {
vp9_build_masked_inter_predictor_complex(dst_buf[i],
dst_stride[i],
dst_buf2[i],
dst_stride2[i],
i, mi_row, mi_col,
mi_row_ori, mi_col_ori,
bsize, top_bsize,
PARTITION_HORZ);
}
}
}
break;
default:
assert(0);
}
}
#endif
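dec_predict_sb_complex() recurses down the coded partition tree, rendering each sub-block into a 32x32 scratch buffer, then smooths every internal seam with vp9_build_masked_inter_predictor_complex so the supertx transform sees a predictor without hard partition edges. For a 4-way split the blend order is (pseudo-C, blend() standing in for the masked-blend call):

/* tl = dst_buf, tr = dst_buf1, bl = dst_buf2, br = dst_buf3 */
blend(tl, tr, PARTITION_VERT);  /* top-left absorbs top-right   */
blend(bl, br, PARTITION_VERT);  /* bottom-left absorbs bottom-right */
blend(tl, bl, PARTITION_HORZ);  /* top half absorbs bottom half */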
static void decode_block(VP9_COMMON *const cm, MACROBLOCKD *const xd,
const TileInfo *const tile,
#if CONFIG_SUPERTX
int supertx_enabled,
#endif
int mi_row, int mi_col,
vp9_reader *r, BLOCK_SIZE bsize) {
const int less8x8 = bsize < BLOCK_8X8;
#if !CONFIG_SUPERTX
MB_MODE_INFO *mbmi = set_offsets(cm, xd, tile, bsize, mi_row, mi_col);
vp9_read_mode_info(cm, xd, tile, mi_row, mi_col, r);
#else
MB_MODE_INFO *mbmi;
if (!supertx_enabled) {
mbmi = set_offsets(cm, xd, tile, bsize, mi_row, mi_col);
} else {
set_mb_offsets(cm, xd, tile, bsize, mi_row, mi_col);
}
#endif
vp9_read_mode_info(cm, xd, tile,
#if CONFIG_SUPERTX
supertx_enabled,
#endif
mi_row, mi_col, r);
#if CONFIG_SUPERTX
if (!supertx_enabled) {
#endif
if (less8x8)
bsize = BLOCK_8X8;
@@ -376,6 +699,9 @@ static void decode_block(VP9_COMMON *const cm, MACROBLOCKD *const xd,
mbmi->skip = 1; // skip loopfilter
}
}
#if CONFIG_SUPERTX
}
#endif
xd->corrupted |= vp9_reader_has_error(r);
}
@@ -406,49 +732,161 @@ static PARTITION_TYPE read_partition(VP9_COMMON *cm, MACROBLOCKD *xd, int hbs,
static void decode_partition(VP9_COMMON *const cm, MACROBLOCKD *const xd,
const TileInfo *const tile,
#if CONFIG_SUPERTX
int read_token, int supertx_enabled,
#endif
int mi_row, int mi_col,
vp9_reader* r, BLOCK_SIZE bsize) {
const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
PARTITION_TYPE partition;
BLOCK_SIZE subsize, uv_subsize;
BLOCK_SIZE subsize;
#if CONFIG_SUPERTX
int skip = 0;
#if CONFIG_EXT_TX
int txfm = 0;
#endif
#endif
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
partition = read_partition(cm, xd, hbs, mi_row, mi_col, bsize, r);
subsize = get_subsize(bsize, partition);
uv_subsize = ss_size_lookup[subsize][cm->subsampling_x][cm->subsampling_y];
if (subsize >= BLOCK_8X8 && uv_subsize == BLOCK_INVALID)
vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
"Invalid block size.");
#if CONFIG_SUPERTX
if (cm->frame_type != KEY_FRAME &&
partition != PARTITION_NONE &&
bsize <= BLOCK_32X32 &&
!supertx_enabled) {
TX_SIZE supertx_size = b_width_log2(bsize);
if (partition == PARTITION_SPLIT) {
supertx_enabled = vp9_read(r, cm->fc.supertxsplit_prob[supertx_size]);
cm->counts.supertxsplit[supertx_size][supertx_enabled]++;
} else {
supertx_enabled = vp9_read(r, cm->fc.supertx_prob[supertx_size]);
cm->counts.supertx[supertx_size][supertx_enabled]++;
}
}
if (supertx_enabled && read_token) {
int offset = mi_row * cm->mi_stride + mi_col;
xd->mi = cm->mi_grid_visible + offset;
xd->mi[0] = &cm->mi[offset];
set_mi_row_col(xd, tile, mi_row, num_8x8_blocks_high_lookup[bsize],
mi_col, num_8x8_blocks_wide_lookup[bsize],
cm->mi_rows, cm->mi_cols);
set_skip_context(xd, mi_row, mi_col);
// Here we assume mbmi->segment_id = 0
skip = read_skip(cm, xd, 0, r);
if (skip)
reset_skip_context(xd, bsize);
#if CONFIG_EXT_TX
if (bsize <= BLOCK_16X16 && !skip) {
txfm = vp9_read(r, cm->fc.ext_tx_prob);
if (!cm->frame_parallel_decoding_mode)
++cm->counts.ext_tx[txfm];
}
#endif
}
#endif
if (subsize < BLOCK_8X8) {
decode_block(cm, xd, tile, mi_row, mi_col, r, subsize);
decode_block(cm, xd, tile,
#if CONFIG_SUPERTX
supertx_enabled,
#endif
mi_row, mi_col, r, subsize);
} else {
switch (partition) {
case PARTITION_NONE:
decode_block(cm, xd, tile, mi_row, mi_col, r, subsize);
decode_block(cm, xd, tile,
#if CONFIG_SUPERTX
supertx_enabled,
#endif
mi_row, mi_col, r, subsize);
break;
case PARTITION_HORZ:
decode_block(cm, xd, tile, mi_row, mi_col, r, subsize);
decode_block(cm, xd, tile,
#if CONFIG_SUPERTX
supertx_enabled,
#endif
mi_row, mi_col, r, subsize);
if (mi_row + hbs < cm->mi_rows)
decode_block(cm, xd, tile, mi_row + hbs, mi_col, r, subsize);
decode_block(cm, xd, tile,
#if CONFIG_SUPERTX
supertx_enabled,
#endif
mi_row + hbs, mi_col, r, subsize);
break;
case PARTITION_VERT:
decode_block(cm, xd, tile, mi_row, mi_col, r, subsize);
decode_block(cm, xd, tile,
#if CONFIG_SUPERTX
supertx_enabled,
#endif
mi_row, mi_col, r, subsize);
if (mi_col + hbs < cm->mi_cols)
decode_block(cm, xd, tile, mi_row, mi_col + hbs, r, subsize);
decode_block(cm, xd, tile,
#if CONFIG_SUPERTX
supertx_enabled,
#endif
mi_row, mi_col + hbs, r, subsize);
break;
case PARTITION_SPLIT:
decode_partition(cm, xd, tile, mi_row, mi_col, r, subsize);
decode_partition(cm, xd, tile, mi_row, mi_col + hbs, r, subsize);
decode_partition(cm, xd, tile, mi_row + hbs, mi_col, r, subsize);
decode_partition(cm, xd, tile, mi_row + hbs, mi_col + hbs, r, subsize);
decode_partition(cm, xd, tile,
#if CONFIG_SUPERTX
!supertx_enabled, supertx_enabled,
#endif
mi_row, mi_col, r, subsize);
decode_partition(cm, xd, tile,
#if CONFIG_SUPERTX
!supertx_enabled, supertx_enabled,
#endif
mi_row, mi_col + hbs, r, subsize);
decode_partition(cm, xd, tile,
#if CONFIG_SUPERTX
!supertx_enabled, supertx_enabled,
#endif
mi_row + hbs, mi_col, r, subsize);
decode_partition(cm, xd, tile,
#if CONFIG_SUPERTX
!supertx_enabled, supertx_enabled,
#endif
mi_row + hbs, mi_col + hbs, r, subsize);
break;
default:
assert(0 && "Invalid partition type");
}
}
#if CONFIG_SUPERTX
if (supertx_enabled && read_token) {
uint8_t *dst_buf[3];
int dst_stride[3], i;
vp9_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col);
for (i = 0; i < MAX_MB_PLANE; i++) {
dst_buf[i] = xd->plane[i].dst.buf;
dst_stride[i] = xd->plane[i].dst.stride;
}
dec_predict_sb_complex(cm, xd, tile, mi_row, mi_col, mi_row, mi_col,
bsize, bsize, dst_buf, dst_stride);
if (!skip) {
int eobtotal = 0;
struct inter_args arg = { cm, xd, r, &eobtotal };
set_offsets_topblock(cm, xd, tile, bsize, mi_row, mi_col);
#if CONFIG_EXT_TX
xd->mi[0]->mbmi.ext_txfrm = txfm;
#endif
vp9_foreach_transformed_block(xd, bsize, reconstruct_inter_block, &arg);
if (!(subsize < BLOCK_8X8) && eobtotal == 0)
skip = 1;
}
set_param_topblock(cm, xd, bsize, mi_row, mi_col,
#if CONFIG_EXT_TX
txfm,
#endif
skip);
}
#endif
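// Summary of the supertx reconstruction path above: prediction is first run
// for the whole region at once (dec_predict_sb_complex), and a single
// transform spanning the region is then decoded and added on top of it
// (reconstruct_inter_block), instead of one transform per prediction block.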
// update partition context
if (bsize >= BLOCK_8X8 &&
(bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
@@ -621,7 +1059,6 @@ static void setup_display_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
}
static void apply_frame_size(VP9_COMMON *cm, int width, int height) {
BufferPool *const pool = cm->buffer_pool;
if (cm->width != width || cm->height != height) {
// Change in frame size.
// TODO(agrange) Don't test width/height, check overall size.
@@ -641,8 +1078,8 @@ static void apply_frame_size(VP9_COMMON *cm, int width, int height) {
if (vp9_realloc_frame_buffer(
get_frame_new_buffer(cm), cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y, VP9_DEC_BORDER_IN_PIXELS,
&pool->frame_bufs[cm->new_fb_idx].raw_frame_buffer, pool->get_fb_cb,
pool->cb_priv)) {
&cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer, cm->get_fb_cb,
cm->cb_priv)) {
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate frame buffer");
}
@@ -672,17 +1109,9 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm,
if (!found)
read_frame_size(rb, &width, &height);
// Check that each of the frames that this frame references has valid
// dimensions.
for (i = 0; i < REFS_PER_FRAME; ++i) {
RefBuffer *const ref_frame = &cm->frame_refs[i];
const int ref_width = ref_frame->buf->y_width;
const int ref_height = ref_frame->buf->y_height;
if (!valid_ref_frame_size(ref_width, ref_height, width, height))
vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
"Referenced frame has invalid size");
}
if (width <= 0 || height <= 0)
vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
"Referenced frame with invalid size");
apply_frame_size(cm, width, height);
setup_display_size(cm, rb);
@@ -772,7 +1201,6 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
const uint8_t *data,
const uint8_t *data_end) {
VP9_COMMON *const cm = &pbi->common;
const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
const int aligned_cols = mi_cols_aligned_to_sb(cm->mi_cols);
const int tile_cols = 1 << cm->log2_tile_cols;
const int tile_rows = 1 << cm->log2_tile_rows;
@@ -785,7 +1213,7 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
CHECK_MEM_ERROR(cm, pbi->lf_worker.data1,
vpx_memalign(32, sizeof(LFWorkerData)));
pbi->lf_worker.hook = (VP9WorkerHook)vp9_loop_filter_worker;
if (pbi->max_threads > 1 && !winterface->reset(&pbi->lf_worker)) {
if (pbi->max_threads > 1 && !vp9_worker_reset(&pbi->lf_worker)) {
vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
"Loop filter thread creation failed");
}
@@ -856,7 +1284,11 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
vp9_zero(tile_data->xd.left_seg_context);
for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end;
mi_col += MI_BLOCK_SIZE) {
decode_partition(tile_data->cm, &tile_data->xd, &tile, mi_row, mi_col,
decode_partition(tile_data->cm, &tile_data->xd, &tile,
#if CONFIG_SUPERTX
1, 0,
#endif
mi_row, mi_col,
&tile_data->bit_reader, BLOCK_64X64);
}
}
@@ -871,13 +1303,13 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
// decoding has completed: finish up the loop filter in this thread.
if (mi_row + MI_BLOCK_SIZE >= cm->mi_rows) continue;
winterface->sync(&pbi->lf_worker);
vp9_worker_sync(&pbi->lf_worker);
lf_data->start = lf_start;
lf_data->stop = mi_row;
if (pbi->max_threads > 1) {
winterface->launch(&pbi->lf_worker);
vp9_worker_launch(&pbi->lf_worker);
} else {
winterface->execute(&pbi->lf_worker);
vp9_worker_execute(&pbi->lf_worker);
}
}
}
@@ -886,10 +1318,10 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
// Loopfilter remaining rows in the frame.
if (cm->lf.filter_level) {
LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
winterface->sync(&pbi->lf_worker);
vp9_worker_sync(&pbi->lf_worker);
lf_data->start = lf_data->stop;
lf_data->stop = cm->mi_rows;
winterface->execute(&pbi->lf_worker);
vp9_worker_execute(&pbi->lf_worker);
}
// Get last tile data.
@@ -910,6 +1342,9 @@ static int tile_worker_hook(void *arg1, void *arg2) {
for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
mi_col += MI_BLOCK_SIZE) {
decode_partition(tile_data->cm, &tile_data->xd, tile,
#if CONFIG_SUPERTX
1, 0,
#endif
mi_row, mi_col, &tile_data->bit_reader, BLOCK_64X64);
}
}
@@ -933,7 +1368,6 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi,
const uint8_t *data,
const uint8_t *data_end) {
VP9_COMMON *const cm = &pbi->common;
const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
const uint8_t *bit_reader_end = NULL;
const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
const int tile_cols = 1 << cm->log2_tile_cols;
@@ -960,11 +1394,11 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi,
VP9Worker *const worker = &pbi->tile_workers[i];
++pbi->num_tile_workers;
winterface->init(worker);
vp9_worker_init(worker);
CHECK_MEM_ERROR(cm, worker->data1,
vpx_memalign(32, sizeof(TileWorkerData)));
CHECK_MEM_ERROR(cm, worker->data2, vpx_malloc(sizeof(TileInfo)));
if (i < num_threads - 1 && !winterface->reset(worker)) {
if (i < num_threads - 1 && !vp9_worker_reset(worker)) {
vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
"Tile decoder thread creation failed");
}
@@ -1027,9 +1461,9 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi,
worker->had_error = 0;
if (i == num_workers - 1 || n == tile_cols - 1) {
winterface->execute(worker);
vp9_worker_execute(worker);
} else {
winterface->launch(worker);
vp9_worker_launch(worker);
}
if (buf->col == tile_cols - 1) {
@@ -1041,7 +1475,7 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi,
for (; i > 0; --i) {
VP9Worker *const worker = &pbi->tile_workers[i - 1];
pbi->mb.corrupted |= !winterface->sync(worker);
pbi->mb.corrupted |= !vp9_worker_sync(worker);
}
if (final_worker > -1) {
TileWorkerData *const tile_data =
@@ -1077,9 +1511,8 @@ static BITSTREAM_PROFILE read_profile(struct vp9_read_bit_buffer *rb) {
static size_t read_uncompressed_header(VP9Decoder *pbi,
struct vp9_read_bit_buffer *rb) {
VP9_COMMON *const cm = &pbi->common;
RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
int mask, i, ref_index = 0;
size_t sz;
int i;
cm->last_frame_type = cm->frame_type;
@@ -1097,18 +1530,12 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
// Show an existing frame directly.
const int frame_to_show = cm->ref_frame_map[vp9_rb_read_literal(rb, 3)];
#if CONFIG_MULTITHREAD
pthread_mutex_lock(&cm->buffer_pool->pool_mutex);
#endif
if (frame_to_show < 0 || frame_bufs[frame_to_show].ref_count < 1)
if (frame_to_show < 0 || cm->frame_bufs[frame_to_show].ref_count < 1)
vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
"Buffer %d does not contain a decoded frame",
frame_to_show);
ref_cnt_fb(frame_bufs, &cm->new_fb_idx, frame_to_show);
#if CONFIG_MULTITHREAD
pthread_mutex_unlock(&cm->buffer_pool->pool_mutex);
#endif
ref_cnt_fb(cm->frame_bufs, &cm->new_fb_idx, frame_to_show);
pbi->refresh_frame_flags = 0;
cm->lf.filter_level = 0;
cm->show_frame = 1;
@@ -1164,12 +1591,12 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
setup_frame_size(cm, rb);
} else {
pbi->refresh_frame_flags = vp9_rb_read_literal(rb, REF_FRAMES);
for (i = 0; i < REFS_PER_FRAME; ++i) {
const int ref = vp9_rb_read_literal(rb, REF_FRAMES_LOG2);
const int idx = cm->ref_frame_map[ref];
RefBuffer *const ref_frame = &cm->frame_refs[i];
ref_frame->idx = idx;
ref_frame->buf = &frame_bufs[idx].buf;
cm->frame_refs[i].idx = idx;
cm->frame_refs[i].buf = &cm->frame_bufs[idx].buf;
cm->ref_frame_sign_bias[LAST_FRAME + i] = vp9_rb_read_bit(rb);
}
@@ -1204,28 +1631,6 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
// below, forcing the use of context 0 for those frame types.
cm->frame_context_idx = vp9_rb_read_literal(rb, FRAME_CONTEXTS_LOG2);
// Update next_ref_frame_map in frame parallel decode.
if (pbi->frame_parallel_decode) {
for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) {
if (mask & 1) {
cm->next_ref_frame_map[ref_index] = cm->new_fb_idx;
#if CONFIG_MULTITHREAD
pthread_mutex_lock(&cm->buffer_pool->pool_mutex);
#endif
++cm->buffer_pool->frame_bufs[cm->new_fb_idx].ref_count;
#if CONFIG_MULTITHREAD
pthread_mutex_unlock(&cm->buffer_pool->pool_mutex);
#endif
} else {
cm->next_ref_frame_map[ref_index] = cm->ref_frame_map[ref_index];
}
++ref_index;
}
for (; ref_index < REF_FRAMES; ++ref_index)
cm->next_ref_frame_map[ref_index] = cm->ref_frame_map[ref_index];
}
if (frame_is_intra_only(cm) || cm->error_resilient_mode)
vp9_setup_past_independence(cm);
@@ -1290,6 +1695,62 @@ static int read_compressed_header(VP9Decoder *pbi, const uint8_t *data,
vp9_diff_update_prob(&r, &fc->partition_prob[j][i]);
read_mv_probs(nmvc, cm->allow_high_precision_mv, &r);
#if CONFIG_EXT_TX
vp9_diff_update_prob(&r, &fc->ext_tx_prob);
#endif
#if CONFIG_MASKED_INTERINTER
if (cm->reference_mode != SINGLE_REFERENCE) {
cm->use_masked_interinter = vp9_read_bit(&r);
if (cm->use_masked_interinter) {
for (i = 0; i < BLOCK_SIZES; i++) {
if (get_mask_bits(i))
vp9_diff_update_prob(&r, &fc->masked_interinter_prob[i]);
}
}
} else {
cm->use_masked_interinter = 0;
}
#endif
#if CONFIG_INTERINTRA
if (cm->reference_mode != COMPOUND_REFERENCE) {
cm->use_interintra = vp9_read_bit(&r);
if (cm->use_interintra) {
for (i = 0; i < BLOCK_SIZES; i++) {
if (is_interintra_allowed(i)) {
vp9_diff_update_prob(&r, &fc->interintra_prob[i]);
}
}
#if CONFIG_MASKED_INTERINTRA
cm->use_masked_interintra = vp9_read_bit(&r);
if (cm->use_masked_interintra) {
for (i = 0; i < BLOCK_SIZES; i++) {
if (is_interintra_allowed(i) && get_mask_bits_interintra(i))
vp9_diff_update_prob(&r, &fc->masked_interintra_prob[i]);
}
}
} else {
cm->use_masked_interintra = 0;
#endif
}
} else {
cm->use_interintra = 0;
#if CONFIG_MASKED_INTERINTRA
cm->use_masked_interintra = 0;
#endif
}
#endif
#if CONFIG_COPY_CODING
for (j = 0; j < COPY_MODE_CONTEXTS; j++) {
for (i = 0; i < 1; i++)
vp9_diff_update_prob(&r, &fc->copy_mode_probs_l2[j][i]);
for (i = 0; i < 2; i++)
vp9_diff_update_prob(&r, &fc->copy_mode_probs[j][i]);
}
#endif
}
return vp9_reader_has_error(&r);
@@ -1341,6 +1802,10 @@ static void debug_check_frame_counts(const VP9_COMMON *const cm) {
assert(!memcmp(&cm->counts.tx, &zero_counts.tx, sizeof(cm->counts.tx)));
assert(!memcmp(cm->counts.skip, zero_counts.skip, sizeof(cm->counts.skip)));
assert(!memcmp(&cm->counts.mv, &zero_counts.mv, sizeof(cm->counts.mv)));
#if CONFIG_EXT_TX
assert(!memcmp(cm->counts.ext_tx, zero_counts.ext_tx,
sizeof(cm->counts.ext_tx)));
#endif
}
#endif // NDEBUG
@@ -1370,8 +1835,7 @@ void vp9_decode_frame(VP9Decoder *pbi,
const uint8_t **p_data_end) {
VP9_COMMON *const cm = &pbi->common;
MACROBLOCKD *const xd = &pbi->mb;
struct vp9_read_bit_buffer rb = { NULL, NULL, 0, NULL, 0};
struct vp9_read_bit_buffer rb = { 0 };
uint8_t clear_data[MAX_VP9_HEADER_SIZE];
const size_t first_partition_size = read_uncompressed_header(pbi,
init_read_bit_buffer(pbi, &rb, data, data_end, clear_data));
@@ -1422,17 +1886,6 @@ void vp9_decode_frame(VP9Decoder *pbi,
new_fb->corrupted |= xd->corrupted;
// Update progress in frame parallel decode.
if (pbi->frame_parallel_decode) {
VP9Worker *worker = pbi->owner_frame_worker;
FrameWorkerData *const worker_data = worker->data1;
pthread_mutex_lock(&worker_data->stats_mutex);
pbi->cur_buf->row = INT_MAX;
pbi->cur_buf->col = INT_MAX;
pthread_cond_signal(&worker_data->stats_cond);
pthread_mutex_unlock(&worker_data->stats_mutex);
}
if (!new_fb->corrupted) {
if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode) {
vp9_adapt_coef_probs(cm);


@@ -54,6 +54,36 @@ static PREDICTION_MODE read_inter_mode(VP9_COMMON *cm, vp9_reader *r, int ctx) {
return NEARESTMV + mode;
}
#if CONFIG_COPY_CODING
static COPY_MODE read_copy_mode(VP9_COMMON *cm, vp9_reader *r,
int num_candidate, int ctx) {
COPY_MODE mode;
switch (num_candidate) {
case 0:
assert(0);
break;
case 1:
mode = REF0;
break;
case 2:
mode = REF0 + vp9_read_tree(r, vp9_copy_mode_tree_l2,
cm->fc.copy_mode_probs_l2[ctx]);
if (!cm->frame_parallel_decoding_mode)
++cm->counts.copy_mode_l2[ctx][mode - REF0];
break;
default:
mode = REF0 + vp9_read_tree(r, vp9_copy_mode_tree,
cm->fc.copy_mode_probs[ctx]);
if (!cm->frame_parallel_decoding_mode)
++cm->counts.copy_mode[ctx][mode - REF0];
break;
}
return mode;
}
#endif
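#if CONFIG_COPY_CODING
// A minimal sketch of how read_copy_mode() is meant to be driven (it mirrors
// the logic added to read_inter_frame_mode_info() further down): a NOREF
// flag is read first, and the copy-mode tree is consulted only when it is
// set. Variable names follow that function:
//
//   int ctx = vp9_get_copy_mode_context(xd);
//   if (vp9_read(r, cm->fc.copy_noref_prob[ctx][mbmi->sb_type]))
//     mbmi->copy_mode = read_copy_mode(cm, r, num_candidate, ctx);
//   else
//     mbmi->copy_mode = NOREF;
#endif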
static int read_segment_id(vp9_reader *r, const struct segmentation *seg) {
return vp9_read_tree(r, vp9_segment_tree, seg->tree_probs);
}
@@ -144,7 +174,11 @@ static int read_inter_segment_id(VP9_COMMON *const cm, MACROBLOCKD *const xd,
return segment_id;
}
#if CONFIG_SUPERTX
int read_skip(VP9_COMMON *cm, const MACROBLOCKD *xd,
#else
static int read_skip(VP9_COMMON *cm, const MACROBLOCKD *xd,
#endif
int segment_id, vp9_reader *r) {
if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
return 1;
@@ -175,29 +209,85 @@ static void read_intra_frame_mode_info(VP9_COMMON *const cm,
switch (bsize) {
case BLOCK_4X4:
#if !CONFIG_FILTERINTRA
for (i = 0; i < 4; ++i)
#else
for (i = 0; i < 4; ++i) {
#endif
mi->bmi[i].as_mode =
read_intra_mode(r, get_y_mode_probs(mi, above_mi, left_mi, i));
#if CONFIG_FILTERINTRA
if (is_filter_allowed(mi->bmi[i].as_mode))
mi->b_filter_info[i] =
vp9_read(r, cm->fc.filterintra_prob[0][mi->bmi[i].as_mode]);
else
mi->b_filter_info[i] = 0;
}
mbmi->filterbit = mi->b_filter_info[3];
#endif
mbmi->mode = mi->bmi[3].as_mode;
break;
case BLOCK_4X8:
mi->bmi[0].as_mode = mi->bmi[2].as_mode =
read_intra_mode(r, get_y_mode_probs(mi, above_mi, left_mi, 0));
#if CONFIG_FILTERINTRA
if (is_filter_allowed(mi->bmi[0].as_mode))
mi->b_filter_info[0] = mi->b_filter_info[2] =
vp9_read(r, cm->fc.filterintra_prob[0][mi->bmi[0].as_mode]);
else
mi->b_filter_info[0] = mi->b_filter_info[2] = 0;
#endif
mi->bmi[1].as_mode = mi->bmi[3].as_mode = mbmi->mode =
read_intra_mode(r, get_y_mode_probs(mi, above_mi, left_mi, 1));
#if CONFIG_FILTERINTRA
if (is_filter_allowed(mi->bmi[1].as_mode))
mi->b_filter_info[1] = mi->b_filter_info[3] = mbmi->filterbit =
vp9_read(r, cm->fc.filterintra_prob[0][mi->bmi[1].as_mode]);
else
mi->b_filter_info[1] = mi->b_filter_info[3] = mbmi->filterbit = 0;
#endif
break;
case BLOCK_8X4:
mi->bmi[0].as_mode = mi->bmi[1].as_mode =
read_intra_mode(r, get_y_mode_probs(mi, above_mi, left_mi, 0));
#if CONFIG_FILTERINTRA
if (is_filter_allowed(mi->bmi[0].as_mode))
mi->b_filter_info[0] = mi->b_filter_info[1] =
vp9_read(r, cm->fc.filterintra_prob[0][mi->bmi[0].as_mode]);
else
mi->b_filter_info[0] = mi->b_filter_info[1] = 0;
#endif
mi->bmi[2].as_mode = mi->bmi[3].as_mode = mbmi->mode =
read_intra_mode(r, get_y_mode_probs(mi, above_mi, left_mi, 2));
#if CONFIG_FILTERINTRA
if (is_filter_allowed(mi->bmi[2].as_mode))
mi->b_filter_info[2] = mi->b_filter_info[3] = mbmi->filterbit =
vp9_read(r, cm->fc.filterintra_prob[0][mi->bmi[2].as_mode]);
else
mi->b_filter_info[2] = mi->b_filter_info[3] = mbmi->filterbit = 0;
#endif
break;
default:
mbmi->mode = read_intra_mode(r,
get_y_mode_probs(mi, above_mi, left_mi, 0));
#if CONFIG_FILTERINTRA
if (is_filter_enabled(mbmi->tx_size) && is_filter_allowed(mbmi->mode))
mbmi->filterbit = vp9_read(r,
cm->fc.filterintra_prob[mbmi->tx_size][mbmi->mode]);
else
mbmi->filterbit = 0;
#endif
}
mbmi->uv_mode = read_intra_mode(r, vp9_kf_uv_mode_prob[mbmi->mode]);
#if CONFIG_FILTERINTRA
if (is_filter_enabled(get_uv_tx_size(mbmi)) &&
is_filter_allowed(mbmi->uv_mode))
mbmi->uv_filterbit = vp9_read(r,
cm->fc.filterintra_prob[get_uv_tx_size(mbmi)][mbmi->uv_mode]);
else
mbmi->uv_filterbit = 0;
#endif
}
static int read_mv_component(vp9_reader *r,
@@ -335,25 +425,97 @@ static void read_intra_block_mode_info(VP9_COMMON *const cm, MODE_INFO *mi,
switch (bsize) {
case BLOCK_4X4:
#if !CONFIG_FILTERINTRA
for (i = 0; i < 4; ++i)
#else
for (i = 0; i < 4; ++i) {
#endif
mi->bmi[i].as_mode = read_intra_mode_y(cm, r, 0);
#if CONFIG_FILTERINTRA
if (is_filter_allowed(mi->bmi[i].as_mode)) {
mi->b_filter_info[i] =
vp9_read(r, cm->fc.filterintra_prob[0][mi->bmi[i].as_mode]);
cm->counts.filterintra[0][mi->bmi[i].as_mode]
[mi->b_filter_info[i]]++;
} else {
mi->b_filter_info[i] = 0;
}
}
mbmi->filterbit = mi->b_filter_info[3];
#endif
mbmi->mode = mi->bmi[3].as_mode;
break;
case BLOCK_4X8:
mi->bmi[0].as_mode = mi->bmi[2].as_mode = read_intra_mode_y(cm, r, 0);
#if CONFIG_FILTERINTRA
if (is_filter_allowed(mi->bmi[0].as_mode)) {
mi->b_filter_info[0] = mi->b_filter_info[2] =
vp9_read(r, cm->fc.filterintra_prob[0][mi->bmi[0].as_mode]);
cm->counts.filterintra[0][mi->bmi[0].as_mode][mi->b_filter_info[0]]++;
} else {
mi->b_filter_info[0] = mi->b_filter_info[2] = 0;
}
#endif
mi->bmi[1].as_mode = mi->bmi[3].as_mode = mbmi->mode =
read_intra_mode_y(cm, r, 0);
#if CONFIG_FILTERINTRA
if (is_filter_allowed(mi->bmi[1].as_mode)) {
mi->b_filter_info[1] = mi->b_filter_info[3] = mbmi->filterbit =
vp9_read(r, cm->fc.filterintra_prob[0][mi->bmi[1].as_mode]);
cm->counts.filterintra[0][mi->bmi[1].as_mode][mi->b_filter_info[1]]++;
} else {
mi->b_filter_info[1] = mi->b_filter_info[3] = mbmi->filterbit = 0;
}
#endif
break;
case BLOCK_8X4:
mi->bmi[0].as_mode = mi->bmi[1].as_mode = read_intra_mode_y(cm, r, 0);
#if CONFIG_FILTERINTRA
if (is_filter_allowed(mi->bmi[0].as_mode)) {
mi->b_filter_info[0] = mi->b_filter_info[1] =
vp9_read(r, cm->fc.filterintra_prob[0][mi->bmi[0].as_mode]);
cm->counts.filterintra[0][mi->bmi[0].as_mode][mi->b_filter_info[0]]++;
} else {
mi->b_filter_info[0] = mi->b_filter_info[1] = 0;
}
#endif
mi->bmi[2].as_mode = mi->bmi[3].as_mode = mbmi->mode =
read_intra_mode_y(cm, r, 0);
#if CONFIG_FILTERINTRA
if (is_filter_allowed(mi->bmi[2].as_mode)) {
mi->b_filter_info[2] = mi->b_filter_info[3] = mbmi->filterbit =
vp9_read(r, cm->fc.filterintra_prob[0][mi->bmi[2].as_mode]);
cm->counts.filterintra[0][mi->bmi[2].as_mode][mi->b_filter_info[2]]++;
} else {
mi->b_filter_info[2] = mi->b_filter_info[3] = mbmi->filterbit = 0;
}
#endif
break;
default:
mbmi->mode = read_intra_mode_y(cm, r, size_group_lookup[bsize]);
#if CONFIG_FILTERINTRA
if (is_filter_allowed(mbmi->mode) && is_filter_enabled(mbmi->tx_size)) {
mbmi->filterbit = vp9_read(r,
cm->fc.filterintra_prob[mbmi->tx_size][mbmi->mode]);
cm->counts.filterintra[mbmi->tx_size][mbmi->mode][mbmi->filterbit]++;
} else {
mbmi->filterbit = 0;
}
#endif
}
mbmi->uv_mode = read_intra_mode_uv(cm, r, mbmi->mode);
#if CONFIG_FILTERINTRA
if (is_filter_allowed(mbmi->uv_mode) &&
is_filter_enabled(get_uv_tx_size(mbmi))) {
mbmi->uv_filterbit = vp9_read(r,
cm->fc.filterintra_prob[get_uv_tx_size(mbmi)][mbmi->uv_mode]);
cm->counts.filterintra[get_uv_tx_size(mbmi)]
[mbmi->uv_mode][mbmi->uv_filterbit]++;
} else {
mbmi->uv_filterbit = 0;
}
#endif
}
static INLINE int is_mv_valid(const MV *mv) {
@@ -422,6 +584,9 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm,
MACROBLOCKD *const xd,
const TileInfo *const tile,
MODE_INFO *const mi,
#if CONFIG_SUPERTX && CONFIG_EXT_TX
int supertx_enabled,
#endif
int mi_row, int mi_col, vp9_reader *r) {
MB_MODE_INFO *const mbmi = &mi->mbmi;
const BLOCK_SIZE bsize = mbmi->sb_type;
@@ -464,6 +629,37 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm,
? read_switchable_interp_filter(cm, xd, r)
: cm->interp_filter;
#if CONFIG_INTERINTRA
if ((cm->use_interintra) &&
is_interintra_allowed(bsize) &&
is_inter_mode(mbmi->mode) &&
(mbmi->ref_frame[1] <= INTRA_FRAME)) {
mbmi->ref_frame[1] = vp9_read(r, cm->fc.interintra_prob[bsize]) ?
INTRA_FRAME : NONE;
cm->counts.interintra[bsize][mbmi->ref_frame[1] == INTRA_FRAME]++;
#if CONFIG_MASKED_INTERINTRA
mbmi->use_masked_interintra = 0;
#endif
if (mbmi->ref_frame[1] == INTRA_FRAME) {
mbmi->interintra_mode =
read_intra_mode_y(cm, r, size_group_lookup[bsize]);
mbmi->interintra_uv_mode = mbmi->interintra_mode;
#if CONFIG_MASKED_INTERINTRA
if (cm->use_masked_interintra && get_mask_bits_interintra(bsize)) {
mbmi->use_masked_interintra = vp9_read(r,
cm->fc.masked_interintra_prob[bsize]);
cm->counts.masked_interintra[bsize][mbmi->use_masked_interintra]++;
if (mbmi->use_masked_interintra) {
mbmi->interintra_mask_index = vp9_read_literal(r,
get_mask_bits_interintra(bsize));
mbmi->interintra_uv_mask_index = mbmi->interintra_mask_index;
}
}
#endif
}
}
#endif
if (bsize < BLOCK_8X8) {
const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; // 1 or 2
const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; // 1 or 2
@@ -508,35 +704,160 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm,
xd->corrupted |= !assign_mv(cm, mbmi->mode, mbmi->mv, nearestmv,
nearestmv, nearmv, is_compound, allow_hp, r);
}
#if CONFIG_MASKED_INTERINTER
mbmi->use_masked_interinter = 0;
if (cm->use_masked_interinter &&
cm->reference_mode != SINGLE_REFERENCE &&
is_inter_mode(mbmi->mode) &&
get_mask_bits(bsize) &&
mbmi->ref_frame[1] > INTRA_FRAME) {
mbmi->use_masked_interinter =
vp9_read(r, cm->fc.masked_interinter_prob[bsize]);
cm->counts.masked_interinter[bsize][mbmi->use_masked_interinter]++;
if (mbmi->use_masked_interinter) {
mbmi->mask_index = vp9_read_literal(r, get_mask_bits(bsize));
}
}
#endif
}
static void read_inter_frame_mode_info(VP9_COMMON *const cm,
MACROBLOCKD *const xd,
const TileInfo *const tile,
#if CONFIG_SUPERTX
int supertx_enabled,
#endif
int mi_row, int mi_col, vp9_reader *r) {
MODE_INFO *const mi = xd->mi[0];
MB_MODE_INFO *const mbmi = &mi->mbmi;
int inter_block;
#if CONFIG_COPY_CODING
int num_candidate = 0;
MB_MODE_INFO *inter_ref_list[18] = {NULL};
#endif
mbmi->mv[0].as_int = 0;
mbmi->mv[1].as_int = 0;
#if CONFIG_COPY_CODING
if (mbmi->sb_type >= BLOCK_8X8)
num_candidate = vp9_construct_ref_inter_list(cm, xd, mbmi->sb_type,
mi_row, mi_col, inter_ref_list);
if (mbmi->sb_type >= BLOCK_8X8 && num_candidate > 0) {
int ctx = vp9_get_copy_mode_context(xd);
int is_copy = vp9_read(r, cm->fc.copy_noref_prob[ctx][mbmi->sb_type]);
++cm->counts.copy_noref[ctx][mbmi->sb_type][is_copy];
if (!is_copy) {
mbmi->copy_mode = NOREF;
} else {
mbmi->copy_mode = read_copy_mode(cm, r, num_candidate, ctx);
}
} else {
mbmi->copy_mode = NOREF;
}
if (mbmi->copy_mode != NOREF) {
BLOCK_SIZE bsize_backup = mbmi->sb_type;
int skip_backup = mbmi->skip;
COPY_MODE copy_mode_backup = mbmi->copy_mode;
#if CONFIG_SUPERTX
TX_SIZE tx_size_backup = mbmi->tx_size;
#endif
#if CONFIG_EXT_TX
EXT_TX_TYPE ext_txfrm_backup = mbmi->ext_txfrm;
#endif
inter_block = 1;
*mbmi = *inter_ref_list[mbmi->copy_mode - REF0];
#if CONFIG_MASKED_INTERINTER
mbmi->use_masked_interinter = 0;
#endif
#if CONFIG_INTERINTRA
if (mbmi->ref_frame[1] == INTRA_FRAME)
mbmi->ref_frame[1] = NONE;
#endif
#if CONFIG_SUPERTX
mbmi->tx_size = tx_size_backup;
#endif
#if CONFIG_EXT_TX
mbmi->ext_txfrm = ext_txfrm_backup;
#endif
mbmi->sb_type = bsize_backup;
mbmi->mode = NEARESTMV;
mbmi->skip = skip_backup;
mbmi->copy_mode = copy_mode_backup;
}
#endif
#if CONFIG_SUPERTX
if (!supertx_enabled) {
#endif
mbmi->segment_id = read_inter_segment_id(cm, xd, mi_row, mi_col, r);
mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r);
#if CONFIG_COPY_CODING
if (mbmi->copy_mode == NOREF)
#endif
inter_block = read_is_inter_block(cm, xd, mbmi->segment_id, r);
mbmi->tx_size = read_tx_size(cm, xd, cm->tx_mode, mbmi->sb_type,
!mbmi->skip || !inter_block, r);
#if CONFIG_EXT_TX
if (inter_block &&
mbmi->tx_size <= TX_16X16 &&
mbmi->sb_type >= BLOCK_8X8 &&
#if CONFIG_SUPERTX
!supertx_enabled &&
#endif
!vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP) &&
!mbmi->skip) {
mbmi->ext_txfrm = vp9_read(r, cm->fc.ext_tx_prob);
if (!cm->frame_parallel_decoding_mode)
++cm->counts.ext_tx[mbmi->ext_txfrm];
} else {
mbmi->ext_txfrm = NORM;
}
#endif
#if CONFIG_SUPERTX
} else {
const int ctx = vp9_get_intra_inter_context(xd);
mbmi->segment_id = 0;
inter_block = 1;
if (!cm->frame_parallel_decoding_mode)
#if CONFIG_COPY_CODING
if (mbmi->copy_mode == NOREF)
#endif
++cm->counts.intra_inter[ctx][1];
}
#endif
#if CONFIG_COPY_CODING
if (mbmi->copy_mode == NOREF) {
#endif
if (inter_block)
read_inter_block_mode_info(cm, xd, tile, mi, mi_row, mi_col, r);
read_inter_block_mode_info(cm, xd, tile, mi,
#if CONFIG_SUPERTX && CONFIG_EXT_TX
supertx_enabled,
#endif
mi_row, mi_col, r);
else
read_intra_block_mode_info(cm, mi, r);
#if CONFIG_COPY_CODING
}
#endif
}
void vp9_read_mode_info(VP9_COMMON *cm, MACROBLOCKD *xd,
const TileInfo *const tile,
#if CONFIG_SUPERTX
int supertx_enabled,
#endif
int mi_row, int mi_col, vp9_reader *r) {
if (frame_is_intra_only(cm))
read_intra_frame_mode_info(cm, xd, mi_row, mi_col, r);
else
read_inter_frame_mode_info(cm, xd, tile, mi_row, mi_col, r);
read_inter_frame_mode_info(cm, xd, tile,
#if CONFIG_SUPERTX
supertx_enabled,
#endif
mi_row, mi_col, r);
}


@@ -21,8 +21,16 @@ struct TileInfo;
void vp9_read_mode_info(VP9_COMMON *cm, MACROBLOCKD *xd,
const struct TileInfo *const tile,
#if CONFIG_SUPERTX
int supertx_enabled,
#endif
int mi_row, int mi_col, vp9_reader *r);
#if CONFIG_SUPERTX
int read_skip(VP9_COMMON *cm, const MACROBLOCKD *xd,
int segment_id, vp9_reader *r);
#endif
#ifdef __cplusplus
} // extern "C"
#endif


@@ -41,7 +41,7 @@ static void initialize_dec() {
}
}
VP9Decoder *vp9_decoder_create(BufferPool *const pool) {
VP9Decoder *vp9_decoder_create() {
VP9Decoder *const pbi = vpx_memalign(32, sizeof(*pbi));
VP9_COMMON *const cm = pbi ? &pbi->common : NULL;
@@ -63,11 +63,9 @@ VP9Decoder *vp9_decoder_create(BufferPool *const pool) {
// Initialize the references to not point to any frame buffers.
vpx_memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
vpx_memset(&cm->next_ref_frame_map, -1, sizeof(cm->next_ref_frame_map));
cm->current_video_frame = 0;
pbi->ready_for_new_data = 1;
pbi->common.buffer_pool = pool;
// vp9_init_dequantizer() is first called here. Add check in
// frame_init_dequantizer() to avoid unnecessary calling of
@@ -78,7 +76,7 @@ VP9Decoder *vp9_decoder_create(BufferPool *const pool) {
cm->error.setjmp = 0;
vp9_get_worker_interface()->init(&pbi->lf_worker);
vp9_worker_init(&pbi->lf_worker);
return pbi;
}
@@ -88,12 +86,12 @@ void vp9_decoder_remove(VP9Decoder *pbi) {
int i;
vp9_remove_common(cm);
vp9_get_worker_interface()->end(&pbi->lf_worker);
vp9_worker_end(&pbi->lf_worker);
vpx_free(pbi->lf_worker.data1);
vpx_free(pbi->tile_data);
for (i = 0; i < pbi->num_tile_workers; ++i) {
VP9Worker *const worker = &pbi->tile_workers[i];
vp9_get_worker_interface()->end(worker);
vp9_worker_end(worker);
vpx_free(worker->data1);
vpx_free(worker->data2);
}
@@ -126,7 +124,7 @@ vpx_codec_err_t vp9_copy_reference_dec(VP9Decoder *pbi,
*/
if (ref_frame_flag == VP9_LAST_FLAG) {
const YV12_BUFFER_CONFIG *const cfg =
&cm->buffer_pool->frame_bufs[cm->ref_frame_map[0]].buf;
&cm->frame_bufs[cm->ref_frame_map[0]].buf;
if (!equal_dimensions(cfg, sd))
vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
"Incorrect buffer dimensions");
@@ -145,7 +143,6 @@ vpx_codec_err_t vp9_set_reference_dec(VP9_COMMON *cm,
VP9_REFFRAME ref_frame_flag,
YV12_BUFFER_CONFIG *sd) {
RefBuffer *ref_buf = NULL;
RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
// TODO(jkoleszar): The decoder doesn't have any real knowledge of what the
// encoder is using the frame buffers for. This is just a stub to keep the
@@ -173,11 +170,11 @@ vpx_codec_err_t vp9_set_reference_dec(VP9_COMMON *cm,
const int free_fb = get_free_fb(cm);
// Decrease ref_count since it will be increased again in
// ref_cnt_fb() below.
--frame_bufs[free_fb].ref_count;
cm->frame_bufs[free_fb].ref_count--;
// Manage the reference counters and copy image.
ref_cnt_fb(frame_bufs, ref_fb_ptr, free_fb);
ref_buf->buf = &frame_bufs[*ref_fb_ptr].buf;
ref_cnt_fb(cm->frame_bufs, ref_fb_ptr, free_fb);
ref_buf->buf = &cm->frame_bufs[*ref_fb_ptr].buf;
vp8_yv12_copy_frame(sd, ref_buf->buf);
}
@@ -187,44 +184,35 @@ vpx_codec_err_t vp9_set_reference_dec(VP9_COMMON *cm,
int vp9_get_reference_dec(VP9Decoder *pbi, int index, YV12_BUFFER_CONFIG **fb) {
VP9_COMMON *cm = &pbi->common;
RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
if (index < 0 || index >= REF_FRAMES)
return -1;
*fb = &frame_bufs[cm->ref_frame_map[index]].buf;
*fb = &cm->frame_bufs[cm->ref_frame_map[index]].buf;
return 0;
}
/* If any buffer updating is signaled it should be done here. */
static void swap_frame_buffers(VP9Decoder *pbi) {
int ref_index = 0, mask;
VP9_COMMON * const cm = &pbi->common;
BufferPool * const pool = cm->buffer_pool;
RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
VP9_COMMON *const cm = &pbi->common;
for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) {
if (mask & 1) {
const int old_idx = cm->ref_frame_map[ref_index];
#if CONFIG_MULTITHREAD
pthread_mutex_lock(&cm->buffer_pool->pool_mutex);
#endif
ref_cnt_fb(frame_bufs, &cm->ref_frame_map[ref_index],
ref_cnt_fb(cm->frame_bufs, &cm->ref_frame_map[ref_index],
cm->new_fb_idx);
if (old_idx >= 0 && frame_bufs[old_idx].ref_count == 0)
pool->release_fb_cb(pool->cb_priv,
&frame_bufs[old_idx].raw_frame_buffer);
if (old_idx >= 0 && cm->frame_bufs[old_idx].ref_count == 0)
cm->release_fb_cb(cm->cb_priv,
&cm->frame_bufs[old_idx].raw_frame_buffer);
}
#if CONFIG_MULTITHREAD
pthread_mutex_unlock(&cm->buffer_pool->pool_mutex);
#endif
++ref_index;
}
cm->frame_to_show = get_frame_new_buffer(cm);
if (!pbi->frame_parallel_decode || !cm->show_frame) {
--frame_bufs[cm->new_fb_idx].ref_count;
--cm->frame_bufs[cm->new_fb_idx].ref_count;
}
// Invalidate these references until the next frame starts.
@@ -235,8 +223,6 @@ static void swap_frame_buffers(VP9Decoder *pbi) {
int vp9_receive_compressed_data(VP9Decoder *pbi,
size_t size, const uint8_t **psource) {
VP9_COMMON *const cm = &pbi->common;
BufferPool *const pool = cm->buffer_pool;
RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
const uint8_t *source = *psource;
int retcode = 0;
@@ -258,22 +244,11 @@ int vp9_receive_compressed_data(VP9Decoder *pbi,
// Check if the previous frame was a frame without any references to it.
// Release frame buffer if not decoding in frame parallel mode.
if (!pbi->frame_parallel_decode && cm->new_fb_idx >= 0
&& frame_bufs[cm->new_fb_idx].ref_count == 0)
pool->release_fb_cb(pool->cb_priv,
&frame_bufs[cm->new_fb_idx].raw_frame_buffer);
&& cm->frame_bufs[cm->new_fb_idx].ref_count == 0)
cm->release_fb_cb(cm->cb_priv,
&cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer);
cm->new_fb_idx = get_free_fb(cm);
if (pbi->frame_parallel_decode) {
VP9Worker *worker = pbi->owner_frame_worker;
FrameWorkerData *const worker_data = worker->data1;
pbi->cur_buf = &pool->frame_bufs[cm->new_fb_idx];
pool->frame_bufs[cm->new_fb_idx].owner_worker_id = worker_data->worker_id;
// Reset the decoding progress.
pbi->cur_buf->row = -1;
pbi->cur_buf->col = -1;
}
if (setjmp(cm->error.jmp)) {
cm->error.setjmp = 0;
@@ -287,8 +262,8 @@ int vp9_receive_compressed_data(VP9Decoder *pbi,
if (cm->frame_refs[0].idx != INT_MAX && cm->frame_refs[0].buf != NULL)
cm->frame_refs[0].buf->corrupted = 1;
if (frame_bufs[cm->new_fb_idx].ref_count > 0)
--frame_bufs[cm->new_fb_idx].ref_count;
if (cm->new_fb_idx > 0 && cm->frame_bufs[cm->new_fb_idx].ref_count > 0)
cm->frame_bufs[cm->new_fb_idx].ref_count--;
return -1;
}


@@ -45,14 +45,8 @@ typedef struct VP9Decoder {
int frame_parallel_decode; // frame-based threading.
// TODO(hkuang): Combine this with cur_buf in macroblockd as they are
// the same.
RefCntBuffer *cur_buf; // current decoding reference buffer.
VP9Worker lf_worker;
VP9Worker *tile_workers;
VP9Worker *owner_frame_worker; // frame_worker that owns this pbi.
int num_tile_workers;
TileData *tile_data;
@@ -84,7 +78,7 @@ vpx_codec_err_t vp9_set_reference_dec(VP9_COMMON *cm,
int vp9_get_reference_dec(struct VP9Decoder *pbi,
int index, YV12_BUFFER_CONFIG **fb);
struct VP9Decoder *vp9_decoder_create(BufferPool *const pool);
struct VP9Decoder *vp9_decoder_create();
void vp9_decoder_remove(struct VP9Decoder *pbi);


@@ -124,7 +124,7 @@ static void loop_filter_rows_mt(const YV12_BUFFER_CONFIG *const frame_buffer,
static int loop_filter_row_worker(void *arg1, void *arg2) {
TileWorkerData *const tile_data = (TileWorkerData*)arg1;
LFWorkerData *const lf_data = &tile_data->lfdata;
(void) arg2;
loop_filter_rows_mt(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
lf_data->start, lf_data->stop, lf_data->y_only,
lf_data->lf_sync, lf_data->num_lf_workers);
@@ -138,7 +138,6 @@ void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
int frame_filter_level,
int y_only) {
VP9LfSync *const lf_sync = &pbi->lf_row_sync;
const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
// Number of superblock rows and cols
const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
const int tile_cols = 1 << cm->log2_tile_cols;
@@ -198,15 +197,15 @@ void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
// Start loopfiltering
if (i == num_workers - 1) {
winterface->execute(worker);
vp9_worker_execute(worker);
} else {
winterface->launch(worker);
vp9_worker_launch(worker);
}
}
// Wait till all rows are finished
for (i = 0; i < num_workers; ++i) {
winterface->sync(&pbi->tile_workers[i]);
vp9_worker_sync(&pbi->tile_workers[i]);
}
}
@@ -279,78 +278,3 @@ void vp9_loop_filter_dealloc(VP9LfSync *lf_sync, int rows) {
vp9_zero(*lf_sync);
}
}
void vp9_frameworker_wait(VP9Worker* const worker, int row, int col,
RefCntBuffer *ref_buf) {
FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
const VP9Decoder *const pbi = worker_data->pbi;
const RefCntBuffer *const cur_buf = pbi->cur_buf;
// Check whether the worker has already released the ref_buf.
if (!worker || ref_buf->owner_worker_id == -1) return;
pthread_mutex_lock(&worker_data->stats_mutex);
while (!(cur_buf->row >= row && cur_buf->col >= col)
&& pbi->cur_buf == ref_buf && ref_buf->owner_worker_id != -1) {
pthread_cond_wait(&worker_data->stats_cond, &worker_data->stats_mutex);
}
pthread_mutex_unlock(&worker_data->stats_mutex);
}
void vp9_frameworker_broadcast(VP9Worker* const worker, int row, int col) {
FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
const VP9Decoder *const pbi = worker_data->pbi;
RefCntBuffer *const cur_buf = pbi->cur_buf;
pthread_mutex_lock(&worker_data->stats_mutex);
cur_buf->row = row;
cur_buf->col = col;
pthread_cond_signal(&worker_data->stats_cond);
pthread_mutex_unlock(&worker_data->stats_mutex);
}
void vp9_frameworker_copy_context(VP9Worker *const dst_worker,
const VP9Worker *const src_worker) {
FrameWorkerData *const src_worker_data =
(FrameWorkerData *)dst_worker->data1;
FrameWorkerData *const dst_worker_data =
(FrameWorkerData *)src_worker->data1;
const VP9_COMMON *const src_cm = &src_worker_data->pbi->common;
VP9_COMMON *const dst_cm = &dst_worker_data->pbi->common;
int i;
// Wait until source frame's context is ready.
pthread_mutex_lock(&src_worker_data->stats_mutex);
while (!src_worker_data->frame_context_ready) {
pthread_cond_wait(&src_worker_data->stats_cond,
&src_worker_data->stats_mutex);
}
pthread_mutex_unlock(&src_worker_data->stats_mutex);
dst_cm->last_width = src_cm->width;
dst_cm->last_height = src_cm->height;
dst_cm->subsampling_x = src_cm->subsampling_x;
dst_cm->subsampling_y = src_cm->subsampling_y;
for (i = 0; i < REF_FRAMES; ++i)
dst_cm->ref_frame_map[i] = src_cm->next_ref_frame_map[i];
dst_cm->last_show_frame = src_cm->show_frame;
dst_cm->prev_mip = src_cm->mip;
dst_cm->prev_mi = src_cm->mi;
dst_cm->prev_mi_grid_base = src_cm->mi_grid_base;
dst_cm->prev_mi_grid_visible = src_cm->mi_grid_visible;
dst_cm->lf.last_sharpness_level = src_cm->lf.sharpness_level;
for (i = 0; i < MAX_REF_LF_DELTAS; ++i) {
dst_cm->lf.last_ref_deltas[i] = src_cm->lf.ref_deltas[i];
dst_cm->lf.ref_deltas[i] = src_cm->lf.ref_deltas[i];
}
for (i = 0; i < MAX_MODE_LF_DELTAS; ++i)
dst_cm-> lf.last_mode_deltas[i] = src_cm->lf.mode_deltas[i];
for (i = 0; i < FRAME_CONTEXTS; ++i)
dst_cm-> frame_contexts[i] = src_cm->frame_contexts[i];
}


@@ -55,11 +55,6 @@ typedef struct FrameWorkerData {
// It is used to make a copy of the compressed data.
uint8_t *scratch_buffer;
size_t scratch_buffer_size;
pthread_mutex_t stats_mutex;
pthread_cond_t stats_cond;
int frame_context_ready; // Current frame's context is ready to read.
} FrameWorkerData;
// Allocate memory for loopfilter row synchronization.
@@ -76,19 +71,4 @@ void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
int frame_filter_level,
int y_only);
// Wait for the FrameWorker to finish decoding ref_buf up to position (r, c).
// Note: the worker may have already finished decoding ref_buf and released it
// in order to start decoding the next frame, so we need to check whether the
// worker is still decoding ref_buf.
void vp9_frameworker_wait(VP9Worker* const worker, int row, int col,
RefCntBuffer *ref_buf);
// A FrameWorker broadcasts its decoding progress so that other workers
// waiting on it can resume decoding.
void vp9_frameworker_broadcast(VP9Worker* const worker, int row, int col);
// Copy necessary decoding context from src worker to dst worker.
void vp9_frameworker_copy_context(VP9Worker *const dst_worker,
const VP9Worker *const src_worker);
#endif // VP9_DECODER_VP9_DTHREAD_H_


@@ -11,79 +11,71 @@
//
// Original source:
// http://git.chromium.org/webm/libwebp.git
// 100644 blob 08ad4e1fecba302bf1247645e84a7d2779956bc3 src/utils/thread.c
// 100644 blob eff8f2a8c20095aade3c292b0e9292dac6cb3587 src/utils/thread.c
#include <assert.h>
#include <string.h> // for memset()
#include "./vp9_thread.h"
#include "vpx_mem/vpx_mem.h"
#if defined(__cplusplus) || defined(c_plusplus)
extern "C" {
#endif
#if CONFIG_MULTITHREAD
struct VP9WorkerImpl {
pthread_mutex_t mutex_;
pthread_cond_t condition_;
pthread_t thread_;
};
//------------------------------------------------------------------------------
static void execute(VP9Worker *const worker); // Forward declaration.
static THREADFN thread_loop(void *ptr) {
VP9Worker *const worker = (VP9Worker*)ptr;
static THREADFN thread_loop(void *ptr) { // thread loop
VP9Worker* const worker = (VP9Worker*)ptr;
int done = 0;
while (!done) {
pthread_mutex_lock(&worker->impl_->mutex_);
pthread_mutex_lock(&worker->mutex_);
while (worker->status_ == OK) { // wait in idling mode
pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_);
pthread_cond_wait(&worker->condition_, &worker->mutex_);
}
if (worker->status_ == WORK) {
execute(worker);
vp9_worker_execute(worker);
worker->status_ = OK;
} else if (worker->status_ == NOT_OK) { // finish the worker
done = 1;
}
// signal to the main thread that we're done (for sync())
pthread_cond_signal(&worker->impl_->condition_);
pthread_mutex_unlock(&worker->impl_->mutex_);
// signal to the main thread that we're done (for Sync())
pthread_cond_signal(&worker->condition_);
pthread_mutex_unlock(&worker->mutex_);
}
return THREAD_RETURN(NULL); // Thread is finished
}
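// The status_ state machine implied by the loop above (assuming the usual
// VP9WorkerStatus values): NOT_OK means the thread has not been spawned or
// has been terminated, OK means it is idle and waiting on condition_, and
// WORK means a job is pending and the hook is about to be invoked.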
// main thread state control
static void change_state(VP9Worker *const worker,
static void change_state(VP9Worker* const worker,
VP9WorkerStatus new_status) {
// No-op when attempting to change state on a thread that didn't come up.
// Checking status_ without acquiring the lock first would result in a data
// race.
if (worker->impl_ == NULL) return;
// no-op when attempting to change state on a thread that didn't come up
if (worker->status_ < OK) return;
pthread_mutex_lock(&worker->impl_->mutex_);
if (worker->status_ >= OK) {
// wait for the worker to finish
while (worker->status_ != OK) {
pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_);
}
// assign new status and release the working thread if needed
if (new_status != OK) {
worker->status_ = new_status;
pthread_cond_signal(&worker->impl_->condition_);
}
pthread_mutex_lock(&worker->mutex_);
// wait for the worker to finish
while (worker->status_ != OK) {
pthread_cond_wait(&worker->condition_, &worker->mutex_);
}
pthread_mutex_unlock(&worker->impl_->mutex_);
// assign new status and release the working thread if needed
if (new_status != OK) {
worker->status_ = new_status;
pthread_cond_signal(&worker->condition_);
}
pthread_mutex_unlock(&worker->mutex_);
}
#endif // CONFIG_MULTITHREAD
//------------------------------------------------------------------------------
static void init(VP9Worker *const worker) {
void vp9_worker_init(VP9Worker* const worker) {
memset(worker, 0, sizeof(*worker));
worker->status_ = NOT_OK;
}
static int sync(VP9Worker *const worker) {
int vp9_worker_sync(VP9Worker* const worker) {
#if CONFIG_MULTITHREAD
change_state(worker, OK);
#endif
@@ -91,93 +83,59 @@ static int sync(VP9Worker *const worker) {
return !worker->had_error;
}
static int reset(VP9Worker *const worker) {
int vp9_worker_reset(VP9Worker* const worker) {
int ok = 1;
worker->had_error = 0;
if (worker->status_ < OK) {
#if CONFIG_MULTITHREAD
worker->impl_ = (VP9WorkerImpl*)vpx_calloc(1, sizeof(*worker->impl_));
if (worker->impl_ == NULL) {
if (pthread_mutex_init(&worker->mutex_, NULL) ||
pthread_cond_init(&worker->condition_, NULL)) {
return 0;
}
if (pthread_mutex_init(&worker->impl_->mutex_, NULL)) {
goto Error;
}
if (pthread_cond_init(&worker->impl_->condition_, NULL)) {
pthread_mutex_destroy(&worker->impl_->mutex_);
goto Error;
}
pthread_mutex_lock(&worker->impl_->mutex_);
ok = !pthread_create(&worker->impl_->thread_, NULL, thread_loop, worker);
pthread_mutex_lock(&worker->mutex_);
ok = !pthread_create(&worker->thread_, NULL, thread_loop, worker);
if (ok) worker->status_ = OK;
pthread_mutex_unlock(&worker->impl_->mutex_);
if (!ok) {
pthread_mutex_destroy(&worker->impl_->mutex_);
pthread_cond_destroy(&worker->impl_->condition_);
Error:
vpx_free(worker->impl_);
worker->impl_ = NULL;
return 0;
}
pthread_mutex_unlock(&worker->mutex_);
#else
worker->status_ = OK;
#endif
} else if (worker->status_ > OK) {
ok = sync(worker);
ok = vp9_worker_sync(worker);
}
assert(!ok || (worker->status_ == OK));
return ok;
}
static void execute(VP9Worker *const worker) {
void vp9_worker_execute(VP9Worker* const worker) {
if (worker->hook != NULL) {
worker->had_error |= !worker->hook(worker->data1, worker->data2);
}
}
static void launch(VP9Worker *const worker) {
void vp9_worker_launch(VP9Worker* const worker) {
#if CONFIG_MULTITHREAD
change_state(worker, WORK);
#else
execute(worker);
vp9_worker_execute(worker);
#endif
}
static void end(VP9Worker *const worker) {
void vp9_worker_end(VP9Worker* const worker) {
if (worker->status_ >= OK) {
#if CONFIG_MULTITHREAD
change_state(worker, NOT_OK);
pthread_join(worker->impl_->thread_, NULL);
pthread_mutex_destroy(&worker->impl_->mutex_);
pthread_cond_destroy(&worker->impl_->condition_);
pthread_join(worker->thread_, NULL);
pthread_mutex_destroy(&worker->mutex_);
pthread_cond_destroy(&worker->condition_);
#else
worker->status_ = NOT_OK;
#endif
}
vpx_free(worker->impl_);
worker->impl_ = NULL;
assert(worker->status_ == NOT_OK);
}
//------------------------------------------------------------------------------
static VP9WorkerInterface g_worker_interface = {
init, reset, sync, launch, execute, end
};
int vp9_set_worker_interface(const VP9WorkerInterface* const winterface) {
if (winterface == NULL ||
winterface->init == NULL || winterface->reset == NULL ||
winterface->sync == NULL || winterface->launch == NULL ||
winterface->execute == NULL || winterface->end == NULL) {
return 0;
}
g_worker_interface = *winterface;
return 1;
}
const VP9WorkerInterface *vp9_get_worker_interface(void) {
return &g_worker_interface;
}
//------------------------------------------------------------------------------
#if defined(__cplusplus) || defined(c_plusplus)
} // extern "C"
#endif


@@ -11,7 +11,8 @@
//
// Original source:
// http://git.chromium.org/webm/libwebp.git
// 100644 blob 7bd451b124ae3b81596abfbcc823e3cb129d3a38 src/utils/thread.h
// 100644 blob 13a61a4c84194c3374080cbf03d881d3cd6af40d src/utils/thread.h
#ifndef VP9_DECODER_VP9_THREAD_H_
#define VP9_DECODER_VP9_THREAD_H_
@@ -162,53 +163,40 @@ typedef enum {
// arguments (data1 and data2), and should return false in case of error.
typedef int (*VP9WorkerHook)(void*, void*);
// Platform-dependent implementation details for the worker.
typedef struct VP9WorkerImpl VP9WorkerImpl;
// Synchronization object used to launch job in the worker thread
// Synchronization object used to launch a job in the worker thread
typedef struct {
VP9WorkerImpl *impl_;
#if CONFIG_MULTITHREAD
pthread_mutex_t mutex_;
pthread_cond_t condition_;
pthread_t thread_;
#endif
VP9WorkerStatus status_;
VP9WorkerHook hook; // hook to call
void *data1; // first argument passed to 'hook'
void *data2; // second argument passed to 'hook'
void* data1; // first argument passed to 'hook'
void* data2; // second argument passed to 'hook'
int had_error; // return value of the last call to 'hook'
} VP9Worker;
// The interface for all thread-worker related functions. All these functions
// must be implemented.
typedef struct {
// Must be called first, before any other method.
void (*init)(VP9Worker *const worker);
// Must be called to initialize the object and spawn the thread. Re-entrant.
// Will potentially launch the thread. Returns false in case of error.
int (*reset)(VP9Worker *const worker);
// Makes sure the previous work is finished. Returns true if worker->had_error
// was not set and no error condition was triggered by the working thread.
int (*sync)(VP9Worker *const worker);
// Triggers the thread to call hook() with data1 and data2 arguments. These
// hook/data1/data2 values can be changed at any time before calling this
// function, but must not be changed afterward until the next call to Sync().
void (*launch)(VP9Worker *const worker);
// This function is similar to launch() except that it calls the
// hook directly instead of using a thread. Convenient to bypass the thread
// mechanism while still using the VP9Worker structs. sync() must
// still be called afterward (for error reporting).
void (*execute)(VP9Worker *const worker);
// Kill the thread and terminate the object. To use the object again, one
// must call reset() again.
void (*end)(VP9Worker *const worker);
} VP9WorkerInterface;
// Install a new set of threading functions, overriding the defaults. This
// should be done before any workers are started, i.e., before any encoding or
// decoding takes place. The contents of the interface struct are copied; it
// is safe to free the corresponding memory after this call. This function is
// not thread-safe. Return false in case of invalid pointer or methods.
int vp9_set_worker_interface(const VP9WorkerInterface *const winterface);
// Retrieve the currently set thread worker interface.
const VP9WorkerInterface *vp9_get_worker_interface(void);
// Must be called first, before any other method.
void vp9_worker_init(VP9Worker* const worker);
// Must be called to initialize the object and spawn the thread. Re-entrant.
// Will potentially launch the thread. Returns false in case of error.
int vp9_worker_reset(VP9Worker* const worker);
// Makes sure the previous work is finished. Returns true if worker->had_error
// was not set and no error condition was triggered by the working thread.
int vp9_worker_sync(VP9Worker* const worker);
// Triggers the thread to call hook() with data1 and data2 arguments. These
// hook/data1/data2 values can be changed at any time before calling this
// function, but must not be changed afterward until the next call to
// vp9_worker_sync().
void vp9_worker_launch(VP9Worker* const worker);
// This function is similar to vp9_worker_launch() except that it calls the
// hook directly instead of using a thread. Convenient to bypass the thread
// mechanism while still using the VP9Worker structs. vp9_worker_sync() must
// still be called afterward (for error reporting).
void vp9_worker_execute(VP9Worker* const worker);
// Kill the thread and terminate the object. To use the object again, one
// must call vp9_worker_reset() again.
void vp9_worker_end(VP9Worker* const worker);
//------------------------------------------------------------------------------
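// A minimal usage sketch of the direct-call API above (hypothetical hook and
// function names; not part of this change). vp9_worker_sync() must follow
// every launch or execute before the worker's results are consumed:
static int example_hook(void *data1, void *data2) {
  (void)data1;
  (void)data2;
  return 1;  // a VP9WorkerHook returns non-zero on success
}

static void example_usage(void) {
  VP9Worker worker;
  vp9_worker_init(&worker);                 // always first
  if (!vp9_worker_reset(&worker)) return;   // spawns the thread if possible
  worker.hook = example_hook;
  worker.data1 = NULL;
  worker.data2 = NULL;
  vp9_worker_launch(&worker);               // hook runs on the worker thread
  if (!vp9_worker_sync(&worker)) {
    // the hook reported an error (worker.had_error is set)
  }
  vp9_worker_end(&worker);                  // joins the thread and tears down
}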


@@ -15,19 +15,8 @@
#include "vp9/encoder/vp9_segmentation.h"
#define AQ_C_SEGMENTS 3
#define AQ_C_STRENGTHS 3
static const int aq_c_active_segments[AQ_C_STRENGTHS] = {1, 2, 3};
static const double aq_c_q_adj_factor[AQ_C_STRENGTHS][AQ_C_SEGMENTS] =
{{1.0, 1.0, 1.0}, {1.0, 2.0, 1.0}, {1.0, 1.5, 2.5}};
static const double aq_c_transitions[AQ_C_STRENGTHS][AQ_C_SEGMENTS] =
{{1.0, 1.0, 1.0}, {1.0, 0.25, 0.0}, {1.0, 0.5, 0.25}};
static int get_aq_c_strength(int q_index) {
// Approximate base quantizer (truncated to int).
int base_quant = vp9_ac_quant(q_index, 0) / 4;
return (base_quant > 20) + (base_quant > 45);
}
static const double in_frame_q_adj_ratio[MAX_SEGMENTS] =
{1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
void vp9_setup_in_frame_q_adj(VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
@@ -40,8 +29,6 @@ void vp9_setup_in_frame_q_adj(VP9_COMP *cpi) {
cpi->refresh_alt_ref_frame ||
(cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
int segment;
const int aq_strength = get_aq_c_strength(cm->base_qindex);
const int active_segments = aq_c_active_segments[aq_strength];
// Clear down the segment map.
vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
@@ -49,16 +36,8 @@ void vp9_setup_in_frame_q_adj(VP9_COMP *cpi) {
// Clear down the complexity map used for rd.
vpx_memset(cpi->complexity_map, 0, cm->mi_rows * cm->mi_cols);
vp9_clearall_segfeatures(seg);
// Segmentation only makes sense if the target bits per SB is above a
// threshold. Below this the overheads will usually outweigh any benefit.
if (cpi->rc.sb64_target_rate < 256) {
vp9_disable_segmentation(seg);
return;
}
vp9_enable_segmentation(seg);
vp9_clearall_segfeatures(seg);
// Select delta coding method.
seg->abs_delta = SEGMENT_DELTADATA;
@@ -67,14 +46,14 @@ void vp9_setup_in_frame_q_adj(VP9_COMP *cpi) {
vp9_disable_segfeature(seg, 0, SEG_LVL_ALT_Q);
// Use some of the segments for in frame Q adjustment.
for (segment = 1; segment < active_segments; ++segment) {
for (segment = 1; segment < 2; segment++) {
int qindex_delta =
vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, cm->base_qindex,
aq_c_q_adj_factor[aq_strength][segment]);
in_frame_q_adj_ratio[segment]);
// For AQ complexity mode, we don't allow Q0 in a segment if the base
// Q is not 0. Q0 (lossless) implies 4x4 only and in AQ mode 2 a segment
// Q delta is sometimes applied without going back around the rd loop.
// For AQ mode 2, we don't allow Q0 in a segment if the base Q is not 0.
// Q0 (lossless) implies 4x4 only and in AQ mode 2 a segment Q delta
// is sometimes applied without going back around the rd loop.
// This could lead to an illegal combination of partition size and q.
if ((cm->base_qindex != 0) && ((cm->base_qindex + qindex_delta) == 0)) {
qindex_delta = -cm->base_qindex + 1;
@@ -87,15 +66,10 @@ void vp9_setup_in_frame_q_adj(VP9_COMP *cpi) {
}
}
// Select a segment for the current SB64 block.
// The choice of segment for a block depends on the ratio of the projected
// bits for the block vs a target average.
// An "aq_strength" value determines how many segments are supported,
// the set of transition points to use and the extent of the quantizer
// adjustment for each segment (configured in vp9_setup_in_frame_q_adj()).
// Select a segment for the current SB64
void vp9_select_in_frame_q_segment(VP9_COMP *cpi,
int mi_row, int mi_col,
int output_enabled, int projected_rate) {
int mi_row, int mi_col,
int output_enabled, int projected_rate) {
VP9_COMMON *const cm = &cpi->common;
const int mi_offset = mi_row * cm->mi_cols + mi_col;
@@ -115,22 +89,11 @@ void vp9_select_in_frame_q_segment(VP9_COMP *cpi,
// It is converted to bits * 256 units.
const int target_rate = (cpi->rc.sb64_target_rate * xmis * ymis * 256) /
(bw * bh);
const int aq_strength = get_aq_c_strength(cm->base_qindex);
const int active_segments = aq_c_active_segments[aq_strength];
// The number of segments considered and the transition points used to
// select them is determined by the "aq_strength" value.
// Currently this loop only supports segments that reduce Q (i.e. where
// there is undershoot).
// The loop counts down towards segment 0 which is the default segment
// with no Q adjustment.
segment = active_segments - 1;
while (segment > 0) {
if (projected_rate <
(target_rate * aq_c_transitions[aq_strength][segment])) {
break;
}
--segment;
if (projected_rate < (target_rate / 4)) {
segment = 1;
} else {
segment = 0;
}
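// Illustrative arithmetic for the threshold above (example numbers only):
// for a complete 64x64 SB, xmis == bw and ymis == bh, so target_rate
// reduces to sb64_target_rate * 256. With sb64_target_rate = 100 bits,
// target_rate = 25600, and a projected_rate below 25600 / 4 = 6400
// (i.e. under 25 bits) selects segment 1.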
if (target_rate > 0) {


@@ -38,12 +38,32 @@ static struct vp9_token intra_mode_encodings[INTRA_MODES];
static struct vp9_token switchable_interp_encodings[SWITCHABLE_FILTERS];
static struct vp9_token partition_encodings[PARTITION_TYPES];
static struct vp9_token inter_mode_encodings[INTER_MODES];
#if CONFIG_COPY_CODING
static struct vp9_token copy_mode_encodings_l2[2];
static struct vp9_token copy_mode_encodings[COPY_MODE_COUNT - 1];
#endif
#if CONFIG_SUPERTX
static int vp9_check_supertx(VP9_COMMON *cm, int mi_row, int mi_col,
BLOCK_SIZE bsize) {
MODE_INFO **mi;
mi = cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col);
return mi[0]->mbmi.tx_size == bsize_to_tx_size(bsize) &&
mi[0]->mbmi.sb_type < bsize;
}
#endif
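// Reading the predicate above: a block counts as supertx-coded when its
// transform spans the whole partition (tx_size matches bsize) while the
// prediction block itself is smaller than bsize.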
void vp9_entropy_mode_init() {
vp9_tokens_from_tree(intra_mode_encodings, vp9_intra_mode_tree);
vp9_tokens_from_tree(switchable_interp_encodings, vp9_switchable_interp_tree);
vp9_tokens_from_tree(partition_encodings, vp9_partition_tree);
vp9_tokens_from_tree(inter_mode_encodings, vp9_inter_mode_tree);
#if CONFIG_COPY_CODING
vp9_tokens_from_tree(copy_mode_encodings_l2, vp9_copy_mode_tree_l2);
vp9_tokens_from_tree(copy_mode_encodings, vp9_copy_mode_tree);
#endif
}
static void write_intra_mode(vp9_writer *w, PREDICTION_MODE mode,
@@ -58,6 +78,21 @@ static void write_inter_mode(vp9_writer *w, PREDICTION_MODE mode,
&inter_mode_encodings[INTER_OFFSET(mode)]);
}
#if CONFIG_COPY_CODING
static void write_copy_mode(VP9_COMMON *cm, vp9_writer *w, COPY_MODE mode,
int inter_ref_count, int copy_mode_context) {
if (inter_ref_count == 2) {
vp9_write_token(w, vp9_copy_mode_tree_l2,
cm->fc.copy_mode_probs_l2[copy_mode_context],
&copy_mode_encodings_l2[mode - REF0]);
} else if (inter_ref_count > 2) {
vp9_write_token(w, vp9_copy_mode_tree,
cm->fc.copy_mode_probs[copy_mode_context],
&copy_mode_encodings[mode - REF0]);
}
}
#endif
static void encode_unsigned_max(struct vp9_write_bit_buffer *wb,
int data, int max) {
vp9_wb_write_literal(wb, data, get_unsigned_bits(max));
@@ -225,6 +260,9 @@ static void write_ref_frames(const VP9_COMP *cpi, vp9_writer *w) {
}
static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
#if CONFIG_SUPERTX
int supertx_enabled,
#endif
vp9_writer *w) {
VP9_COMMON *const cm = &cpi->common;
const nmv_context *nmvc = &cm->fc.nmvc;
@@ -239,7 +277,19 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
const int is_inter = is_inter_block(mbmi);
const int is_compound = has_second_ref(mbmi);
int skip, ref;
#if CONFIG_COPY_CODING
int copy_mode_context = vp9_get_copy_mode_context(xd);
#endif
#if CONFIG_COPY_CODING
if (bsize >= BLOCK_8X8 && mbmi->inter_ref_count > 0) {
vp9_write(w, mbmi->copy_mode != NOREF,
cm->fc.copy_noref_prob[copy_mode_context][bsize]);
if (mbmi->copy_mode != NOREF)
write_copy_mode(cm, w, mbmi->copy_mode, mbmi->inter_ref_count,
copy_mode_context);
}
#endif
if (seg->update_map) {
if (seg->temporal_update) {
const int pred_flag = mbmi->seg_id_predicted;
@@ -252,20 +302,57 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
}
}
#if CONFIG_SUPERTX
if (!supertx_enabled)
#endif
skip = write_skip(cpi, segment_id, mi, w);
#if CONFIG_SUPERTX
else
skip = mbmi->skip;
#endif
#if CONFIG_SUPERTX
if (!supertx_enabled) {
#endif
#if CONFIG_COPY_CODING
if (mbmi->copy_mode == NOREF)
#endif
if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
vp9_write(w, is_inter, vp9_get_intra_inter_prob(cm, xd));
#if CONFIG_SUPERTX
}
#endif
if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT &&
#if CONFIG_SUPERTX
!supertx_enabled &&
#endif
!(is_inter &&
(skip || vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)))) {
write_selected_tx_size(cpi, mbmi->tx_size, bsize, w);
}
#if CONFIG_EXT_TX
if (is_inter &&
mbmi->tx_size <= TX_16X16 &&
bsize >= BLOCK_8X8 &&
#if CONFIG_SUPERTX
!supertx_enabled &&
#endif
!mbmi->skip &&
!vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
vp9_write(w, mbmi->ext_txfrm, cm->fc.ext_tx_prob);
}
#endif
if (!is_inter) {
if (bsize >= BLOCK_8X8) {
write_intra_mode(w, mode, cm->fc.y_mode_prob[size_group_lookup[bsize]]);
#if CONFIG_FILTERINTRA
if (is_filter_allowed(mode) && is_filter_enabled(mbmi->tx_size)) {
vp9_write(w, mbmi->filterbit,
cm->fc.filterintra_prob[mbmi->tx_size][mode]);
}
#endif
} else {
int idx, idy;
const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
@@ -274,11 +361,28 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
for (idx = 0; idx < 2; idx += num_4x4_w) {
const PREDICTION_MODE b_mode = mi->bmi[idy * 2 + idx].as_mode;
write_intra_mode(w, b_mode, cm->fc.y_mode_prob[0]);
#if CONFIG_FILTERINTRA
if (is_filter_allowed(b_mode)) {
vp9_write(w, mi->b_filter_info[idy * 2 + idx],
cm->fc.filterintra_prob[0][b_mode]);
}
#endif
}
}
}
write_intra_mode(w, mbmi->uv_mode, cm->fc.uv_mode_prob[mode]);
#if CONFIG_FILTERINTRA
if (is_filter_allowed(mbmi->uv_mode) &&
is_filter_enabled(get_uv_tx_size(mbmi))) {
vp9_write(w, mbmi->uv_filterbit,
cm->fc.filterintra_prob[get_uv_tx_size(mbmi)][mbmi->uv_mode]);
}
#endif
#if !CONFIG_COPY_CODING
} else {
#else
} else if (mbmi->copy_mode == NOREF) {
#endif
const int mode_ctx = mbmi->mode_context[mbmi->ref_frame[0]];
const vp9_prob *const inter_probs = cm->fc.inter_mode_probs[mode_ctx];
write_ref_frames(cpi, w);
@@ -300,6 +404,32 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
assert(mbmi->interp_filter == cm->interp_filter);
}
#if CONFIG_INTERINTRA
if ((cm->use_interintra) &&
cpi->common.reference_mode != COMPOUND_REFERENCE &&
is_interintra_allowed(bsize) &&
is_inter_mode(mode) &&
(mbmi->ref_frame[1] <= INTRA_FRAME)) {
vp9_write(w, mbmi->ref_frame[1] == INTRA_FRAME,
cm->fc.interintra_prob[bsize]);
if (mbmi->ref_frame[1] == INTRA_FRAME) {
write_intra_mode(w, mbmi->interintra_mode,
cm->fc.y_mode_prob[size_group_lookup[bsize]]);
#if CONFIG_MASKED_INTERINTRA
if (get_mask_bits_interintra(bsize) &&
cm->use_masked_interintra) {
vp9_write(w, mbmi->use_masked_interintra,
cm->fc.masked_interintra_prob[bsize]);
if (mbmi->use_masked_interintra) {
vp9_write_literal(w, mbmi->interintra_mask_index,
get_mask_bits_interintra(bsize));
}
}
#endif
}
}
#endif
if (bsize < BLOCK_8X8) {
const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
@@ -326,6 +456,18 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
allow_hp);
}
}
#if CONFIG_MASKED_INTERINTER
if (cm->use_masked_interinter &&
cm->reference_mode != SINGLE_REFERENCE &&
is_inter_mode(mode) &&
get_mask_bits(mbmi->sb_type) &&
mbmi->ref_frame[1] > INTRA_FRAME) {
vp9_write(w, mbmi->use_masked_interinter,
cm->fc.masked_interinter_prob[bsize]);
if (mbmi->use_masked_interinter)
vp9_write_literal(w, mbmi->mask_index, get_mask_bits(mbmi->sb_type));
}
#endif
}
}
@@ -350,6 +492,11 @@ static void write_mb_modes_kf(const VP9_COMP *cpi, MODE_INFO **mi_8x8,
if (bsize >= BLOCK_8X8) {
write_intra_mode(w, mbmi->mode, get_y_mode_probs(mi, above_mi, left_mi, 0));
#if CONFIG_FILTERINTRA
if (is_filter_allowed(mbmi->mode) && is_filter_enabled(mbmi->tx_size))
vp9_write(w, mbmi->filterbit,
cm->fc.filterintra_prob[mbmi->tx_size][mbmi->mode]);
#endif
} else {
const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
@@ -360,15 +507,29 @@ static void write_mb_modes_kf(const VP9_COMP *cpi, MODE_INFO **mi_8x8,
const int block = idy * 2 + idx;
write_intra_mode(w, mi->bmi[block].as_mode,
get_y_mode_probs(mi, above_mi, left_mi, block));
#if CONFIG_FILTERINTRA
if (is_filter_allowed(mi->bmi[block].as_mode))
vp9_write(w, mi->b_filter_info[block],
cm->fc.filterintra_prob[0][mi->bmi[block].as_mode]);
#endif
}
}
}
write_intra_mode(w, mbmi->uv_mode, vp9_kf_uv_mode_prob[mbmi->mode]);
#if CONFIG_FILTERINTRA
if (is_filter_allowed(mbmi->uv_mode) &&
is_filter_enabled(get_uv_tx_size(mbmi)))
vp9_write(w, mbmi->uv_filterbit,
cm->fc.filterintra_prob[get_uv_tx_size(mbmi)][mbmi->uv_mode]);
#endif
}
static void write_modes_b(VP9_COMP *cpi, const TileInfo *const tile,
vp9_writer *w, TOKENEXTRA **tok, TOKENEXTRA *tok_end,
#if CONFIG_SUPERTX
int supertx_enabled,
#endif
int mi_row, int mi_col) {
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &cpi->mb.e_mbd;
@@ -384,11 +545,21 @@ static void write_modes_b(VP9_COMP *cpi, const TileInfo *const tile,
if (frame_is_intra_only(cm)) {
write_mb_modes_kf(cpi, xd->mi, w);
} else {
#if CONFIG_SUPERTX
pack_inter_mode_mvs(cpi, m, supertx_enabled, w);
#else
pack_inter_mode_mvs(cpi, m, w);
#endif
}
#if CONFIG_SUPERTX
if (!supertx_enabled) {
#endif
assert(*tok < tok_end);
pack_mb_tokens(w, tok, tok_end);
#if CONFIG_SUPERTX
}
#endif
}
static void write_partition(VP9_COMMON *cm, MACROBLOCKD *xd,
@@ -415,6 +586,9 @@ static void write_partition(VP9_COMMON *cm, MACROBLOCKD *xd,
static void write_modes_sb(VP9_COMP *cpi,
const TileInfo *const tile,
vp9_writer *w, TOKENEXTRA **tok, TOKENEXTRA *tok_end,
#if CONFIG_SUPERTX
int pack_token, int supertx_enabled,
#endif
int mi_row, int mi_col, BLOCK_SIZE bsize) {
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &cpi->mb.e_mbd;
@@ -431,36 +605,105 @@ static void write_modes_sb(VP9_COMP *cpi,
partition = partition_lookup[bsl][m->mbmi.sb_type];
write_partition(cm, xd, bs, mi_row, mi_col, partition, bsize, w);
subsize = get_subsize(bsize, partition);
#if CONFIG_SUPERTX
xd->mi = cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col);
set_mi_row_col(xd, tile,
mi_row, num_8x8_blocks_high_lookup[bsize],
mi_col, num_8x8_blocks_wide_lookup[bsize],
cm->mi_rows, cm->mi_cols);
if (!supertx_enabled && cm->frame_type != KEY_FRAME &&
partition != PARTITION_NONE && bsize <= BLOCK_32X32) {
TX_SIZE supertx_size = bsize_to_tx_size(bsize);
vp9_prob prob = partition == PARTITION_SPLIT ?
cm->fc.supertxsplit_prob[supertx_size] :
cm->fc.supertx_prob[supertx_size];
supertx_enabled = (xd->mi[0]->mbmi.tx_size == supertx_size);
vp9_write(w, supertx_enabled, prob);
if (supertx_enabled) {
vp9_write(w, xd->mi[0]->mbmi.skip, vp9_get_skip_prob(cm, xd));
#if CONFIG_EXT_TX
if (supertx_size <= TX_16X16 && !xd->mi[0]->mbmi.skip)
vp9_write(w, xd->mi[0]->mbmi.ext_txfrm, cm->fc.ext_tx_prob);
#endif
}
}
#endif
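/* Condensed restatement of the signaling above (sketch): a supertx flag is
 * coded only when the writer is not already inside a supertx block, the
 * frame is not a key frame, the block is actually partitioned, and the
 * block is at most 32x32, i.e.
 *
 *   codes_flag = !supertx_enabled && cm->frame_type != KEY_FRAME &&
 *                partition != PARTITION_NONE && bsize <= BLOCK_32X32;
 *
 * When the flag is set, the shared skip flag (and, under CONFIG_EXT_TX,
 * the transform type) is coded once here for the whole block. */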
if (subsize < BLOCK_8X8) {
write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
write_modes_b(cpi, tile, w, tok, tok_end,
#if CONFIG_SUPERTX
supertx_enabled,
#endif
mi_row, mi_col);
} else {
switch (partition) {
case PARTITION_NONE:
write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
write_modes_b(cpi, tile, w, tok, tok_end,
#if CONFIG_SUPERTX
supertx_enabled,
#endif
mi_row, mi_col);
break;
case PARTITION_HORZ:
write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
write_modes_b(cpi, tile, w, tok, tok_end,
#if CONFIG_SUPERTX
supertx_enabled,
#endif
mi_row, mi_col);
if (mi_row + bs < cm->mi_rows)
write_modes_b(cpi, tile, w, tok, tok_end, mi_row + bs, mi_col);
write_modes_b(cpi, tile, w, tok, tok_end,
#if CONFIG_SUPERTX
supertx_enabled,
#endif
mi_row + bs, mi_col);
break;
case PARTITION_VERT:
write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
write_modes_b(cpi, tile, w, tok, tok_end,
#if CONFIG_SUPERTX
supertx_enabled,
#endif
mi_row, mi_col);
if (mi_col + bs < cm->mi_cols)
write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + bs);
write_modes_b(cpi, tile, w, tok, tok_end,
#if CONFIG_SUPERTX
supertx_enabled,
#endif
mi_row, mi_col + bs);
break;
case PARTITION_SPLIT:
write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, subsize);
write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col + bs,
write_modes_sb(cpi, tile, w, tok, tok_end,
#if CONFIG_SUPERTX
!supertx_enabled, supertx_enabled,
#endif
mi_row, mi_col, subsize);
write_modes_sb(cpi, tile, w, tok, tok_end,
#if CONFIG_SUPERTX
!supertx_enabled, supertx_enabled,
#endif
mi_row, mi_col + bs,
subsize);
write_modes_sb(cpi, tile, w, tok, tok_end, mi_row + bs, mi_col,
write_modes_sb(cpi, tile, w, tok, tok_end,
#if CONFIG_SUPERTX
!supertx_enabled, supertx_enabled,
#endif
mi_row + bs, mi_col,
subsize);
write_modes_sb(cpi, tile, w, tok, tok_end, mi_row + bs, mi_col + bs,
write_modes_sb(cpi, tile, w, tok, tok_end,
#if CONFIG_SUPERTX
!supertx_enabled, supertx_enabled,
#endif
mi_row + bs, mi_col + bs,
subsize);
break;
default:
assert(0);
}
}
#if CONFIG_SUPERTX
if (partition != PARTITION_NONE && supertx_enabled && pack_token) {
assert(*tok < tok_end);
pack_mb_tokens(w, tok, tok_end);
}
#endif
// update partition context
if (bsize >= BLOCK_8X8 &&
@@ -478,7 +721,11 @@ static void write_modes(VP9_COMP *cpi,
vp9_zero(cpi->mb.e_mbd.left_seg_context);
for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
mi_col += MI_BLOCK_SIZE)
write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col,
write_modes_sb(cpi, tile, w, tok, tok_end,
#if CONFIG_SUPERTX
1, 0,
#endif
mi_row, mi_col,
BLOCK_64X64);
}
}
@@ -1176,6 +1423,104 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
cm->counts.partition[i], PARTITION_TYPES, &header_bc);
vp9_write_nmv_probs(cm, cm->allow_high_precision_mv, &header_bc);
#if CONFIG_EXT_TX
vp9_cond_prob_diff_update(&header_bc, &fc->ext_tx_prob, cm->counts.ext_tx);
#endif
#if CONFIG_MASKED_INTERINTER
if (cm->reference_mode != SINGLE_REFERENCE) {
if (!cpi->dummy_packing && cm->use_masked_interinter) {
cm->use_masked_interinter = 0;
for (i = 0; i < BLOCK_SIZES; i++)
if (get_mask_bits(i) && (cm->counts.masked_interinter[i][1] > 0)) {
cm->use_masked_interinter = 1;
break;
}
}
vp9_write_bit(&header_bc, cm->use_masked_interinter);
if (cm->use_masked_interinter) {
for (i = 0; i < BLOCK_SIZES; i++)
if (get_mask_bits(i))
vp9_cond_prob_diff_update(&header_bc,
&fc->masked_interinter_prob[i],
cm->counts.masked_interinter[i]);
} else {
vp9_zero(cm->counts.masked_interinter);
}
} else {
if (!cpi->dummy_packing)
cm->use_masked_interinter = 0;
vp9_zero(cm->counts.masked_interinter);
}
#endif
#if CONFIG_INTERINTRA
if (cm->reference_mode != COMPOUND_REFERENCE) {
if (!cpi->dummy_packing && cm->use_interintra) {
cm->use_interintra = 0;
for (i = 0; i < BLOCK_SIZES; i++) {
if (is_interintra_allowed(i) && (cm->counts.interintra[i][1] > 0)) {
cm->use_interintra = 1;
break;
}
}
}
vp9_write_bit(&header_bc, cm->use_interintra);
if (cm->use_interintra) {
for (i = 0; i < BLOCK_SIZES; i++) {
if (is_interintra_allowed(i)) {
vp9_cond_prob_diff_update(&header_bc,
&fc->interintra_prob[i],
cm->counts.interintra[i]);
}
}
#if CONFIG_MASKED_INTERINTRA
if (!cpi->dummy_packing && cm->use_masked_interintra) {
cm->use_masked_interintra = 0;
for (i = 0; i < BLOCK_SIZES; i++) {
if (is_interintra_allowed(i) && get_mask_bits_interintra(i) &&
(cm->counts.masked_interintra[i][1] > 0)) {
cm->use_masked_interintra = 1;
break;
}
}
}
vp9_write_bit(&header_bc, cm->use_masked_interintra);
if (cm->use_masked_interintra) {
for (i = 0; i < BLOCK_SIZES; i++) {
if (is_interintra_allowed(i) && get_mask_bits_interintra(i))
vp9_cond_prob_diff_update(&header_bc,
&fc->masked_interintra_prob[i],
cm->counts.masked_interintra[i]);
}
} else {
vp9_zero(cm->counts.masked_interintra);
}
#endif
} else {
vp9_zero(cm->counts.interintra);
}
} else {
if (!cpi->dummy_packing)
cm->use_interintra = 0;
vp9_zero(cm->counts.interintra);
#if CONFIG_MASKED_INTERINTRA
if (!cpi->dummy_packing)
cm->use_masked_interintra = 0;
vp9_zero(cm->counts.masked_interintra);
#endif
}
#endif
#if CONFIG_COPY_CODING
for (i = 0; i < COPY_MODE_CONTEXTS; i++) {
prob_diff_update(vp9_copy_mode_tree_l2, cm->fc.copy_mode_probs_l2[i],
cm->counts.copy_mode_l2[i], 2, &header_bc);
prob_diff_update(vp9_copy_mode_tree, cm->fc.copy_mode_probs[i],
cm->counts.copy_mode[i], 3, &header_bc);
}
#endif
}
vp9_stop_encode(&header_bc);


@@ -20,12 +20,6 @@
extern "C" {
#endif
typedef struct {
unsigned int sse;
int sum;
unsigned int var;
} diff;
struct macroblock_plane {
DECLARE_ALIGNED(16, int16_t, src_diff[64 * 64]);
int16_t *qcoeff;
@@ -35,7 +29,6 @@ struct macroblock_plane {
// Quantizer settings
int16_t *quant_fp;
int16_t *round_fp;
int16_t *quant;
int16_t *quant_shift;
int16_t *zbin;
@@ -111,9 +104,6 @@ struct macroblock {
int use_lp32x32fdct;
int skip_encode;
// use fast quantization process
int quant_fp;
// skip forward transform and quantization
int skip_txfm;


@@ -9,26 +9,21 @@
*/
#include <assert.h>
#include <limits.h>
#include "vpx_scale/yv12config.h"
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_reconinter.h"
#include "vp9/encoder/vp9_denoiser.h"
#ifdef OUTPUT_YUV_DENOISED
static void make_grayscale(YV12_BUFFER_CONFIG *yuv);
#endif
static const int widths[] = {4, 4, 8, 8, 8, 16, 16, 16, 32, 32, 32, 64, 64};
static const int heights[] = {4, 8, 4, 8, 16, 8, 16, 32, 16, 32, 64, 32, 64};
static VP9_DENOISER_DECISION update_running_avg(const uint8_t *mc_avg,
int mc_avg_stride,
uint8_t *avg, int avg_stride,
const uint8_t *sig,
int sig_stride,
int increase_denoising,
BLOCK_SIZE bs) {
int vp9_denoiser_filter() {
return 0;
}
static int update_running_avg(const uint8_t *mc_avg, int mc_avg_stride,
uint8_t *avg, int avg_stride,
const uint8_t *sig, int sig_stride,
int increase_denoising, BLOCK_SIZE bs) {
int r, c;
int diff, adj, absdiff;
int shift_inc1 = 0, shift_inc2 = 1;
@@ -93,144 +88,9 @@ void copy_block(uint8_t *dest, int dest_stride,
}
}
static VP9_DENOISER_DECISION perform_motion_compensation(VP9_DENOISER *denoiser,
MACROBLOCK *mb,
BLOCK_SIZE bs,
int increase_denoising,
int mi_row,
int mi_col) {
// constants
// TODO(tkopp): empirically determine good constants, or functions of block
// size.
int NOISE_MOTION_THRESHOLD = 25 * 25;
int SSE_DIFF_THRESHOLD = heights[bs] * widths[bs] * 20;
unsigned int SSE_THRESH = heights[bs] * widths[bs] * 40;
unsigned int SSE_THRESH_HI = heights[bs] * widths[bs] * 60;
int mv_col, mv_row;
int sse_diff = denoiser->zero_mv_sse - denoiser->best_sse;
int sse_diff_thresh;
int sse_thresh;
MV_REFERENCE_FRAME frame;
MACROBLOCKD *filter_mbd = &mb->e_mbd;
MB_MODE_INFO *mbmi = &filter_mbd->mi[0]->mbmi;
// We will restore these after motion compensation.
MB_MODE_INFO saved_mbmi = *mbmi;
struct buf_2d saved_dst = filter_mbd->plane[0].dst;
struct buf_2d saved_pre[2];
saved_pre[0] = filter_mbd->plane[0].pre[0];
saved_pre[1] = filter_mbd->plane[0].pre[1];
// Decide the threshold for sum squared error.
mv_col = denoiser->best_sse_mv.as_mv.col;
mv_row = denoiser->best_sse_mv.as_mv.row;
if (mv_row * mv_row + mv_col * mv_col > NOISE_MOTION_THRESHOLD) {
sse_diff_thresh = 0;
} else {
sse_diff_thresh = SSE_DIFF_THRESHOLD;
}
frame = denoiser->best_reference_frame;
// If the best reference frame uses inter-prediction and there is enough of a
// difference in sum-squared-error, use it.
if (frame != INTRA_FRAME && sse_diff > sse_diff_thresh) {
mbmi->ref_frame[0] = denoiser->best_reference_frame;
mbmi->mode = denoiser->best_sse_inter_mode;
mbmi->mv[0] = denoiser->best_sse_mv;
} else {
// Otherwise, use the zero reference frame.
frame = denoiser->best_zeromv_reference_frame;
mbmi->ref_frame[0] = denoiser->best_zeromv_reference_frame;
mbmi->mode = ZEROMV;
mbmi->mv[0].as_int = 0;
denoiser->best_sse_inter_mode = ZEROMV;
denoiser->best_sse_mv.as_int = 0;
denoiser->best_sse = denoiser->zero_mv_sse;
}
// Set the pointers in the MACROBLOCKD to point to the buffers in the denoiser
// struct.
filter_mbd->plane[0].pre[0].buf =
block_start(denoiser->running_avg_y[frame].y_buffer,
denoiser->running_avg_y[frame].y_stride,
mi_row, mi_col);
filter_mbd->plane[0].pre[0].stride = denoiser->running_avg_y[frame].y_stride;
filter_mbd->plane[1].pre[0].buf =
block_start(denoiser->running_avg_y[frame].u_buffer,
denoiser->running_avg_y[frame].uv_stride,
mi_row, mi_col);
filter_mbd->plane[1].pre[0].stride = denoiser->running_avg_y[frame].uv_stride;
filter_mbd->plane[2].pre[0].buf =
block_start(denoiser->running_avg_y[frame].v_buffer,
denoiser->running_avg_y[frame].uv_stride,
mi_row, mi_col);
filter_mbd->plane[2].pre[0].stride = denoiser->running_avg_y[frame].uv_stride;
filter_mbd->plane[0].pre[1].buf =
block_start(denoiser->running_avg_y[frame].y_buffer,
denoiser->running_avg_y[frame].y_stride,
mi_row, mi_col);
filter_mbd->plane[0].pre[1].stride = denoiser->running_avg_y[frame].y_stride;
filter_mbd->plane[1].pre[1].buf =
block_start(denoiser->running_avg_y[frame].u_buffer,
denoiser->running_avg_y[frame].uv_stride,
mi_row, mi_col);
filter_mbd->plane[1].pre[1].stride = denoiser->running_avg_y[frame].uv_stride;
filter_mbd->plane[2].pre[1].buf =
block_start(denoiser->running_avg_y[frame].v_buffer,
denoiser->running_avg_y[frame].uv_stride,
mi_row, mi_col);
filter_mbd->plane[2].pre[1].stride = denoiser->running_avg_y[frame].uv_stride;
filter_mbd->plane[0].dst.buf =
block_start(denoiser->mc_running_avg_y.y_buffer,
denoiser->mc_running_avg_y.y_stride,
mi_row, mi_col);
filter_mbd->plane[0].dst.stride = denoiser->mc_running_avg_y.y_stride;
filter_mbd->plane[1].dst.buf =
block_start(denoiser->mc_running_avg_y.u_buffer,
denoiser->mc_running_avg_y.uv_stride,
mi_row, mi_col);
filter_mbd->plane[1].dst.stride = denoiser->mc_running_avg_y.y_stride;
filter_mbd->plane[2].dst.buf =
block_start(denoiser->mc_running_avg_y.v_buffer,
denoiser->mc_running_avg_y.uv_stride,
mi_row, mi_col);
filter_mbd->plane[2].dst.stride = denoiser->mc_running_avg_y.y_stride;
vp9_build_inter_predictors_sby(filter_mbd, mv_row, mv_col, bs);
// Restore everything to its original state
filter_mbd->plane[0].pre[0] = saved_pre[0];
filter_mbd->plane[0].pre[1] = saved_pre[1];
filter_mbd->plane[0].dst = saved_dst;
*mbmi = saved_mbmi;
mv_row = denoiser->best_sse_mv.as_mv.row;
mv_col = denoiser->best_sse_mv.as_mv.col;
sse_thresh = denoiser->increase_denoising ? SSE_THRESH_HI : SSE_THRESH;
// TODO(tkopp) why 8?
if (denoiser->best_sse > sse_thresh ||
mv_row * mv_row + mv_col * mv_col > 8 * NOISE_MOTION_THRESHOLD) {
return COPY_BLOCK;
}
return FILTER_BLOCK;
}
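The COPY/FILTER decision above reduces to two per-block thresholds; a self-contained sketch using the same constants (25*25 motion threshold, 40 or 60 SSE per pixel):
static int toy_denoiser_decision(unsigned int best_sse, int mv_row, int mv_col,
                                 int width, int height,
                                 int increase_denoising) {
  const int noise_motion_thresh = 25 * 25;
  const unsigned int sse_thresh =
      (unsigned int)(width * height) * (increase_denoising ? 60 : 40);
  // Copy (do not filter) when the block is too noisy or moves too much.
  if (best_sse > sse_thresh ||
      mv_row * mv_row + mv_col * mv_col > 8 * noise_motion_thresh)
    return 0;  // COPY_BLOCK
  return 1;    // FILTER_BLOCK
}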
void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb,
int mi_row, int mi_col, BLOCK_SIZE bs) {
VP9_DENOISER_DECISION decision = FILTER_BLOCK;
int decision = COPY_BLOCK;
YV12_BUFFER_CONFIG avg = denoiser->running_avg_y[INTRA_FRAME];
YV12_BUFFER_CONFIG mc_avg = denoiser->mc_running_avg_y;
@@ -239,9 +99,7 @@ void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb,
mi_row, mi_col);
struct buf_2d src = mb->plane[0].src;
decision = perform_motion_compensation(denoiser, mb, bs,
denoiser->increase_denoising,
mi_row, mi_col);
update_running_avg(mc_avg_start, mc_avg.y_stride, avg_start, avg.y_stride,
mb->plane[0].src.buf, mb->plane[0].src.stride, 0, bs);
@@ -298,25 +156,7 @@ void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser,
}
}
void vp9_denoiser_reset_frame_stats(VP9_DENOISER *denoiser) {
denoiser->zero_mv_sse = UINT_MAX;
denoiser->best_sse = UINT_MAX;
}
void vp9_denoiser_update_frame_stats(VP9_DENOISER *denoiser, MB_MODE_INFO *mbmi,
unsigned int sse, PREDICTION_MODE mode) {
// TODO(tkopp): Use both MVs if possible
if (mbmi->mv[0].as_int == 0 && sse < denoiser->zero_mv_sse) {
denoiser->zero_mv_sse = sse;
denoiser->best_zeromv_reference_frame = mbmi->ref_frame[0];
}
if (mbmi->mv[0].as_int != 0 && sse < denoiser->best_sse) {
denoiser->best_sse = sse;
denoiser->best_sse_inter_mode = mode;
denoiser->best_sse_mv = mbmi->mv[0];
denoiser->best_reference_frame = mbmi->ref_frame[0];
}
void vp9_denoiser_update_frame_stats() {
}
int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height,
@@ -331,9 +171,6 @@ int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height,
vp9_denoiser_free(denoiser);
return 1;
}
#ifdef OUTPUT_YUV_DENOISED
make_grayscale(&denoiser->running_avg_y[i]);
#endif
}
fail = vp9_alloc_frame_buffer(&denoiser->mc_running_avg_y, width, height,
@@ -342,10 +179,6 @@ int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height,
vp9_denoiser_free(denoiser);
return 1;
}
#ifdef OUTPUT_YUV_DENOISED
make_grayscale(&denoiser->running_avg_y[i]);
#endif
denoiser->increase_denoising = 0;
return 0;
}
@@ -364,22 +197,3 @@ void vp9_denoiser_free(VP9_DENOISER *denoiser) {
vp9_free_frame_buffer(&denoiser->mc_running_avg_y);
}
}
#ifdef OUTPUT_YUV_DENOISED
static void make_grayscale(YV12_BUFFER_CONFIG *yuv) {
int r, c;
uint8_t *u = yuv->u_buffer;
uint8_t *v = yuv->v_buffer;
// The '/2's are there because we have a 440 buffer, but we want to output
// 420.
for (r = 0; r < yuv->uv_height / 2; ++r) {
for (c = 0; c < yuv->uv_width / 2; ++c) {
u[c] = UINT8_MAX / 2;
v[c] = UINT8_MAX / 2;
}
u += yuv->uv_stride + yuv->uv_width / 2;
v += yuv->uv_stride + yuv->uv_width / 2;
}
}
#endif


@@ -18,22 +18,14 @@
extern "C" {
#endif
typedef enum vp9_denoiser_decision {
enum vp9_denoiser_decision {
COPY_BLOCK,
FILTER_BLOCK
} VP9_DENOISER_DECISION;
};
typedef struct vp9_denoiser {
YV12_BUFFER_CONFIG running_avg_y[MAX_REF_FRAMES];
YV12_BUFFER_CONFIG mc_running_avg_y;
unsigned int zero_mv_sse;
unsigned int best_sse;
int increase_denoising;
PREDICTION_MODE best_sse_inter_mode;
int_mv best_sse_mv;
MV_REFERENCE_FRAME best_reference_frame;
MV_REFERENCE_FRAME best_zeromv_reference_frame;
} VP9_DENOISER;
void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser,
@@ -46,10 +38,7 @@ void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser,
void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb,
int mi_row, int mi_col, BLOCK_SIZE bs);
void vp9_denoiser_reset_frame_stats(VP9_DENOISER *denoiser);
void vp9_denoiser_update_frame_stats(VP9_DENOISER *denoiser, MB_MODE_INFO *mbmi,
unsigned int sse, PREDICTION_MODE mode);
void vp9_denoiser_update_frame_stats();
int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height,
int ssx, int ssy, int border);

(File diff suppressed because it is too large.)


@@ -20,13 +20,6 @@ struct macroblock;
struct yv12_buffer_config;
struct VP9_COMP;
// Constants used in SOURCE_VAR_BASED_PARTITION
#define VAR_HIST_MAX_BG_VAR 1000
#define VAR_HIST_FACTOR 10
#define VAR_HIST_BINS (VAR_HIST_MAX_BG_VAR / VAR_HIST_FACTOR + 1)
#define VAR_HIST_LARGE_CUT_OFF 75
#define VAR_HIST_SMALL_CUT_OFF 45
void vp9_setup_src_planes(struct macroblock *x,
const struct yv12_buffer_config *src,
int mi_row, int mi_col);


@@ -306,56 +306,6 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,
MACROBLOCKD *const xd = &x->e_mbd;
const struct macroblock_plane *const p = &x->plane[plane];
const struct macroblockd_plane *const pd = &xd->plane[plane];
const scan_order *const scan_order = &vp9_default_scan_orders[tx_size];
int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);
int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
uint16_t *const eob = &p->eobs[block];
const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
int i, j;
const int16_t *src_diff;
txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
src_diff = &p->src_diff[4 * (j * diff_stride + i)];
switch (tx_size) {
case TX_32X32:
fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
pd->dequant, p->zbin_extra, eob, scan_order->scan,
scan_order->iscan);
break;
case TX_16X16:
vp9_fdct16x16(src_diff, coeff, diff_stride);
vp9_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
pd->dequant, p->zbin_extra, eob,
scan_order->scan, scan_order->iscan);
break;
case TX_8X8:
vp9_fdct8x8(src_diff, coeff, diff_stride);
vp9_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp,
p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
pd->dequant, p->zbin_extra, eob,
scan_order->scan, scan_order->iscan);
break;
case TX_4X4:
x->fwd_txm4x4(src_diff, coeff, diff_stride);
vp9_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp,
p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
pd->dequant, p->zbin_extra, eob,
scan_order->scan, scan_order->iscan);
break;
default:
assert(0);
}
}
void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block,
BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
MACROBLOCKD *const xd = &x->e_mbd;
const struct macroblock_plane *const p = &x->plane[plane];
const struct macroblockd_plane *const pd = &xd->plane[plane];
int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);
int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
@@ -410,6 +360,9 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
int i, j;
const int16_t *src_diff;
#if CONFIG_EXT_TX
MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
#endif
txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
src_diff = &p->src_diff[4 * (j * diff_stride + i)];
@@ -422,21 +375,45 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
scan_order->iscan);
break;
case TX_16X16:
#if CONFIG_EXT_TX
if (plane != 0 || mbmi->ext_txfrm == NORM) {
#endif
vp9_fdct16x16(src_diff, coeff, diff_stride);
#if CONFIG_EXT_TX
} else {
vp9_fht16x16(src_diff, coeff, diff_stride, ADST_ADST);
}
#endif
vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
pd->dequant, p->zbin_extra, eob,
scan_order->scan, scan_order->iscan);
break;
case TX_8X8:
#if CONFIG_EXT_TX
if (plane != 0 || mbmi->ext_txfrm == NORM) {
#endif
vp9_fdct8x8(src_diff, coeff, diff_stride);
#if CONFIG_EXT_TX
} else {
vp9_fht8x8(src_diff, coeff, diff_stride, ADST_ADST);
}
#endif
vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
pd->dequant, p->zbin_extra, eob,
scan_order->scan, scan_order->iscan);
break;
case TX_4X4:
#if CONFIG_EXT_TX
if (plane != 0 || mbmi->ext_txfrm == NORM) {
#endif
x->fwd_txm4x4(src_diff, coeff, diff_stride);
#if CONFIG_EXT_TX
} else {
vp9_fht4x4(src_diff, coeff, diff_stride, ADST_ADST);
}
#endif
vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
pd->dequant, p->zbin_extra, eob,
@@ -459,6 +436,9 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
int i, j;
uint8_t *dst;
ENTROPY_CONTEXT *a, *l;
#if CONFIG_EXT_TX
MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
#endif
txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
dst = &pd->dst.buf[4 * j * pd->dst.stride + 4 * i];
a = &ctx->ta[plane][i];
@@ -474,15 +454,11 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
if (x->skip_txfm == 0) {
// full forward transform and quantization
if (!x->skip_recode) {
if (x->quant_fp)
vp9_xform_quant_fp(x, plane, block, plane_bsize, tx_size);
else
vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
}
if (!x->skip_recode)
vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
} else if (x->skip_txfm == 2) {
// fast path forward transform and quantization
vp9_xform_quant_dc(x, plane, block, plane_bsize, tx_size);
vp9_xform_quant_fp(x, plane, block, plane_bsize, tx_size);
} else {
// skip forward transform
p->eobs[block] = 0;
@@ -508,16 +484,43 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
vp9_idct32x32_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
break;
case TX_16X16:
#if CONFIG_EXT_TX
if (plane != 0 || mbmi->ext_txfrm == NORM) {
#endif
vp9_idct16x16_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
#if CONFIG_EXT_TX
} else {
vp9_iht16x16_add(ADST_ADST, dqcoeff, dst, pd->dst.stride,
p->eobs[block]);
}
#endif
break;
case TX_8X8:
#if CONFIG_EXT_TX
if (plane != 0 || mbmi->ext_txfrm == NORM) {
#endif
vp9_idct8x8_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
#if CONFIG_EXT_TX
} else {
vp9_iht8x8_add(ADST_ADST, dqcoeff, dst, pd->dst.stride,
p->eobs[block]);
}
#endif
break;
case TX_4X4:
#if CONFIG_EXT_TX
if (plane != 0 || mbmi->ext_txfrm == NORM) {
#endif
// this is like vp9_short_idct4x4 but has a special case around eob<=1
// which is significant (not just an optimization) for the lossless
// case.
x->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
#if CONFIG_EXT_TX
} else {
vp9_iht4x4_add(ADST_ADST, dqcoeff, dst, pd->dst.stride,
p->eobs[block]);
}
#endif
break;
default:
assert(0 && "Invalid transform size");
@@ -533,6 +536,10 @@ static void encode_block_pass1(int plane, int block, BLOCK_SIZE plane_bsize,
int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
int i, j;
uint8_t *dst;
#if CONFIG_EXT_TX
MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
mbmi->ext_txfrm = NORM;
#endif
txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
dst = &pd->dst.buf[4 * j * pd->dst.stride + 4 * i];
@@ -561,7 +568,7 @@ void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) {
if (x->optimize && (!x->skip_recode || !x->skip_optimize)) {
const struct macroblockd_plane* const pd = &xd->plane[plane];
const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi, pd) : mbmi->tx_size;
const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi) : mbmi->tx_size;
vp9_get_entropy_contexts(bsize, tx_size, pd,
ctx.ta[plane], ctx.tl[plane]);
}
@@ -571,6 +578,26 @@ void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) {
}
}
#if CONFIG_SUPERTX
void vp9_encode_sb_supertx(MACROBLOCK *x, BLOCK_SIZE bsize) {
MACROBLOCKD *const xd = &x->e_mbd;
struct optimize_ctx ctx;
MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
struct encode_b_args arg = {x, &ctx, &mbmi->skip};
int plane;
for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
BLOCK_SIZE plane_size = bsize - 3 * (plane > 0);
const struct macroblockd_plane* const pd = &xd->plane[plane];
const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi) : mbmi->tx_size;
vp9_subtract_plane(x, bsize, plane);
vp9_get_entropy_contexts(bsize, tx_size, pd,
ctx.ta[plane], ctx.tl[plane]);
encode_block(plane, 0, plane_size, bsize_to_tx_size(plane_size), &arg);
}
}
#endif
static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
TX_SIZE tx_size, void *arg) {
struct encode_b_args* const args = arg;
@@ -585,6 +612,9 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
const scan_order *scan_order;
TX_TYPE tx_type;
PREDICTION_MODE mode;
#if CONFIG_FILTERINTRA
int fbit = 0;
#endif
const int bwl = b_width_log2(plane_bsize);
const int diff_stride = 4 * (1 << bwl);
uint8_t *src, *dst;
@@ -598,11 +628,20 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
src = &p->src.buf[4 * (j * src_stride + i)];
src_diff = &p->src_diff[4 * (j * diff_stride + i)];
#if CONFIG_FILTERINTRA
if (mbmi->sb_type < BLOCK_8X8 && plane == 0)
fbit = xd->mi[0]->b_filter_info[block];
else
fbit = plane == 0 ? mbmi->filterbit : mbmi->uv_filterbit;
#endif
switch (tx_size) {
case TX_32X32:
scan_order = &vp9_default_scan_orders[TX_32X32];
mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
vp9_predict_intra_block(xd, block >> 6, bwl, TX_32X32, mode,
#if CONFIG_FILTERINTRA
fbit,
#endif
x->skip_encode ? src : dst,
x->skip_encode ? src_stride : dst_stride,
dst, dst_stride, i, j, plane);
@@ -623,6 +662,9 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
scan_order = &vp9_scan_orders[TX_16X16][tx_type];
mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
vp9_predict_intra_block(xd, block >> 4, bwl, TX_16X16, mode,
#if CONFIG_FILTERINTRA
fbit,
#endif
x->skip_encode ? src : dst,
x->skip_encode ? src_stride : dst_stride,
dst, dst_stride, i, j, plane);
@@ -643,6 +685,9 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
scan_order = &vp9_scan_orders[TX_8X8][tx_type];
mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
vp9_predict_intra_block(xd, block >> 2, bwl, TX_8X8, mode,
#if CONFIG_FILTERINTRA
fbit,
#endif
x->skip_encode ? src : dst,
x->skip_encode ? src_stride : dst_stride,
dst, dst_stride, i, j, plane);
@@ -663,6 +708,9 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
scan_order = &vp9_scan_orders[TX_4X4][tx_type];
mode = plane == 0 ? get_y_mode(xd->mi[0], block) : mbmi->uv_mode;
vp9_predict_intra_block(xd, block, bwl, TX_4X4, mode,
#if CONFIG_FILTERINTRA
fbit,
#endif
x->skip_encode ? src : dst,
x->skip_encode ? src_stride : dst_stride,
dst, dst_stride, i, j, plane);


@@ -21,11 +21,12 @@ extern "C" {
#endif
void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize);
#if CONFIG_SUPERTX
void vp9_encode_sb_supertx(MACROBLOCK *x, BLOCK_SIZE bsize);
#endif
void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize);
void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,
BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block,
BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
BLOCK_SIZE plane_bsize, TX_SIZE tx_size);


@@ -194,18 +194,6 @@ static void dealloc_compressor_data(VP9_COMP *cpi) {
lc->rc_twopass_stats_in.buf = NULL;
lc->rc_twopass_stats_in.sz = 0;
}
if (cpi->source_diff_var != NULL) {
vpx_free(cpi->source_diff_var);
cpi->source_diff_var = NULL;
}
#if CONFIG_FP_MB_STATS
if (cpi->use_fp_mb_stats) {
vpx_free(cpi->twopass.this_frame_mb_stats.mb_stats);
cpi->twopass.this_frame_mb_stats.mb_stats = NULL;
}
#endif
}
static void save_coding_context(VP9_COMP *cpi) {
@@ -665,7 +653,12 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
#if CONFIG_DENOISING
vp9_denoiser_alloc(&(cpi->denoiser), cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
// TODO(tkopp) An unrelated bug causes
// cm->subsampling_{x,y} to be uninitialized at this point
// in execution. For now we assume YUV-420, which is x/y
// subsampling of 1.
1, 1,
// cm->subsampling_x, cm->subsampling_y,
VP9_ENC_BORDER_IN_PIXELS);
#endif
}
@@ -713,8 +706,7 @@ static void cal_nmvsadcosts_hp(int *mvsadcost[2]) {
}
VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
BufferPool *const pool) {
VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) {
unsigned int i, j;
VP9_COMP *const cpi = vpx_memalign(32, sizeof(VP9_COMP));
VP9_COMMON *const cm = cpi != NULL ? &cpi->common : NULL;
@@ -735,7 +727,6 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
vp9_rtcd();
cpi->use_svc = 0;
cpi->common.buffer_pool = pool;
init_config(cpi, oxcf);
vp9_rc_init(&cpi->oxcf, cpi->pass, &cpi->rc);
@@ -749,8 +740,6 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
cpi->alt_is_last = 0;
cpi->gold_is_alt = 0;
cpi->skippable_frame = 0;
// Create the encoder segmentation map and set all entries to 0
CHECK_MEM_ERROR(cm, cpi->segmentation_map,
vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
@@ -775,17 +764,6 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
sizeof(*cpi->mbgraph_stats[i].mb_stats), 1));
}
#if CONFIG_FP_MB_STATS
cpi->use_fp_mb_stats = 0;
if (cpi->use_fp_mb_stats) {
// a place holder for the mb stats obtained from the first pass
CHECK_MEM_ERROR(cm, cpi->twopass.this_frame_mb_stats.mb_stats,
vpx_calloc(cm->MBs * sizeof(FIRSTPASS_MB_STATS), 1));
} else {
cpi->twopass.this_frame_mb_stats.mb_stats = NULL;
}
#endif
cpi->refresh_alt_ref_frame = 0;
// Note that at the moment multi_arf will not work with svc.
@@ -932,12 +910,6 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
set_speed_features(cpi);
// Allocate memory to store variances for a frame.
CHECK_MEM_ERROR(cm, cpi->source_diff_var,
vpx_calloc(cm->MBs, sizeof(diff)));
cpi->source_var_thresh = 0;
cpi->frames_till_next_var_check = 0;
// Default rd threshold factors for mode selection
for (i = 0; i < BLOCK_SIZES; ++i) {
for (j = 0; j < MAX_MODES; ++j)
@@ -1013,6 +985,41 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
vp9_sub_pixel_avg_variance4x4,
vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d)
#if ((CONFIG_MASKED_INTERINTRA && CONFIG_INTERINTRA) || \
CONFIG_MASKED_INTERINTER)
#define MBFP(BT, MSDF, MVF, MSVF) \
cpi->fn_ptr[BT].msdf = MSDF; \
cpi->fn_ptr[BT].mvf = MVF; \
cpi->fn_ptr[BT].msvf = MSVF;
MBFP(BLOCK_64X64, vp9_masked_sad64x64, vp9_masked_variance64x64,
vp9_masked_sub_pixel_variance64x64)
MBFP(BLOCK_64X32, vp9_masked_sad64x32, vp9_masked_variance64x32,
vp9_masked_sub_pixel_variance64x32)
MBFP(BLOCK_32X64, vp9_masked_sad32x64, vp9_masked_variance32x64,
vp9_masked_sub_pixel_variance32x64)
MBFP(BLOCK_32X32, vp9_masked_sad32x32, vp9_masked_variance32x32,
vp9_masked_sub_pixel_variance32x32)
MBFP(BLOCK_32X16, vp9_masked_sad32x16, vp9_masked_variance32x16,
vp9_masked_sub_pixel_variance32x16)
MBFP(BLOCK_16X32, vp9_masked_sad16x32, vp9_masked_variance16x32,
vp9_masked_sub_pixel_variance16x32)
MBFP(BLOCK_16X16, vp9_masked_sad16x16, vp9_masked_variance16x16,
vp9_masked_sub_pixel_variance16x16)
MBFP(BLOCK_16X8, vp9_masked_sad16x8, vp9_masked_variance16x8,
vp9_masked_sub_pixel_variance16x8)
MBFP(BLOCK_8X16, vp9_masked_sad8x16, vp9_masked_variance8x16,
vp9_masked_sub_pixel_variance8x16)
MBFP(BLOCK_8X8, vp9_masked_sad8x8, vp9_masked_variance8x8,
vp9_masked_sub_pixel_variance8x8)
MBFP(BLOCK_4X8, vp9_masked_sad4x8, vp9_masked_variance4x8,
vp9_masked_sub_pixel_variance4x8)
MBFP(BLOCK_8X4, vp9_masked_sad8x4, vp9_masked_variance8x4,
vp9_masked_sub_pixel_variance8x4)
MBFP(BLOCK_4X4, vp9_masked_sad4x4, vp9_masked_variance4x4,
vp9_masked_sub_pixel_variance4x4)
#endif
cpi->full_search_sad = vp9_full_search_sad;
cpi->diamond_search_sad = vp9_diamond_search_sad;
cpi->refining_search_sad = vp9_refining_search_sad;
@@ -1275,7 +1282,7 @@ int vp9_get_reference_enc(VP9_COMP *cpi, int index, YV12_BUFFER_CONFIG **fb) {
if (index < 0 || index >= REF_FRAMES)
return -1;
*fb = &cm->buffer_pool->frame_bufs[cm->ref_frame_map[index]].buf;
*fb = &cm->frame_bufs[cm->ref_frame_map[index]].buf;
return 0;
}
@@ -1297,7 +1304,7 @@ int vp9_update_entropy(VP9_COMP * cpi, int update) {
}
#if defined(OUTPUT_YUV_SRC)
#if defined(OUTPUT_YUV_SRC) || defined(OUTPUT_YUV_DENOISED)
void vp9_write_yuv_frame(YV12_BUFFER_CONFIG *s, FILE *f) {
uint8_t *src = s->y_buffer;
int h = s->y_height;
@@ -1325,38 +1332,6 @@ void vp9_write_yuv_frame(YV12_BUFFER_CONFIG *s, FILE *f) {
}
#endif
#if defined(OUTPUT_YUV_DENOISED)
// The denoiser buffer is allocated as a YUV 440 buffer. This function writes it
// as YUV 420. We simply use the top-left pixels of the UV buffers, since we do
// not denoise the UV channels at this time. If ever we implement UV channel
// denoising we will have to modify this.
void vp9_write_yuv_frame_420(YV12_BUFFER_CONFIG *s, FILE *f) {
uint8_t *src = s->y_buffer;
int h = s->y_height;
do {
fwrite(src, s->y_width, 1, f);
src += s->y_stride;
} while (--h);
src = s->u_buffer;
h = s->uv_height / 2;
do {
fwrite(src, s->uv_width / 2, 1, f);
src += s->uv_stride + s->uv_width / 2;
} while (--h);
src = s->v_buffer;
h = s->uv_height / 2;
do {
fwrite(src, s->uv_width / 2, 1, f);
src += s->uv_stride + s->uv_width / 2;
} while (--h);
}
#endif
#ifdef OUTPUT_YUV_REC
void vp9_write_yuv_rec_frame(VP9_COMMON *cm) {
YV12_BUFFER_CONFIG *s = cm->frame_to_show;
@@ -1544,13 +1519,14 @@ static int recode_loop_test(const VP9_COMP *cpi,
void vp9_update_reference_frames(VP9_COMP *cpi) {
VP9_COMMON * const cm = &cpi->common;
RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
// At this point the new frame has been encoded.
// If any buffer copy / swapping is signaled it should be done here.
if (cm->frame_type == KEY_FRAME) {
ref_cnt_fb(frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx], cm->new_fb_idx);
ref_cnt_fb(frame_bufs, &cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx);
ref_cnt_fb(cm->frame_bufs,
&cm->ref_frame_map[cpi->gld_fb_idx], cm->new_fb_idx);
ref_cnt_fb(cm->frame_bufs,
&cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx);
} else if (!cpi->multi_arf_allowed && cpi->refresh_golden_frame &&
cpi->rc.is_src_frame_alt_ref && !cpi->use_svc) {
/* Preserve the previously existing golden frame and update the frame in
@@ -1564,7 +1540,8 @@ void vp9_update_reference_frames(VP9_COMP *cpi) {
*/
int tmp;
ref_cnt_fb(frame_bufs, &cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx);
ref_cnt_fb(cm->frame_bufs,
&cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx);
tmp = cpi->alt_fb_idx;
cpi->alt_fb_idx = cpi->gld_fb_idx;
@@ -1577,17 +1554,19 @@ void vp9_update_reference_frames(VP9_COMP *cpi) {
arf_idx = gf_group->arf_update_idx[gf_group->index];
}
ref_cnt_fb(frame_bufs, &cm->ref_frame_map[arf_idx], cm->new_fb_idx);
ref_cnt_fb(cm->frame_bufs,
&cm->ref_frame_map[arf_idx], cm->new_fb_idx);
}
if (cpi->refresh_golden_frame) {
ref_cnt_fb(frame_bufs,
ref_cnt_fb(cm->frame_bufs,
&cm->ref_frame_map[cpi->gld_fb_idx], cm->new_fb_idx);
}
}
if (cpi->refresh_last_frame) {
ref_cnt_fb(frame_bufs, &cm->ref_frame_map[cpi->lst_fb_idx], cm->new_fb_idx);
ref_cnt_fb(cm->frame_bufs,
&cm->ref_frame_map[cpi->lst_fb_idx], cm->new_fb_idx);
}
#if CONFIG_DENOISING
vp9_denoiser_update_frame_info(&cpi->denoiser,
@@ -1628,36 +1607,34 @@ void vp9_scale_references(VP9_COMP *cpi) {
VP9_COMMON *cm = &cpi->common;
MV_REFERENCE_FRAME ref_frame;
const VP9_REFFRAME ref_mask[3] = {VP9_LAST_FLAG, VP9_GOLD_FLAG, VP9_ALT_FLAG};
RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
const int idx = cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)];
const YV12_BUFFER_CONFIG *const ref = &frame_bufs[idx].buf;
const YV12_BUFFER_CONFIG *const ref = &cm->frame_bufs[idx].buf;
// Need to convert from VP9_REFFRAME to index into ref_mask (subtract 1).
if ((cpi->ref_frame_flags & ref_mask[ref_frame - 1]) &&
(ref->y_crop_width != cm->width || ref->y_crop_height != cm->height)) {
const int new_fb = get_free_fb(cm);
vp9_realloc_frame_buffer(&frame_bufs[new_fb].buf,
vp9_realloc_frame_buffer(&cm->frame_bufs[new_fb].buf,
cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL);
scale_and_extend_frame(ref, &frame_bufs[new_fb].buf);
scale_and_extend_frame(ref, &cm->frame_bufs[new_fb].buf);
cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
} else {
cpi->scaled_ref_idx[ref_frame - 1] = idx;
++frame_bufs[idx].ref_count;
cm->frame_bufs[idx].ref_count++;
}
}
}
static void release_scaled_references(VP9_COMP *cpi) {
VP9_COMMON *cm = &cpi->common;
RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
int i;
for (i = 0; i < 3; ++i)
--frame_bufs[cpi->scaled_ref_idx[i]].ref_count;
for (i = 0; i < 3; i++)
cm->frame_bufs[cpi->scaled_ref_idx[i]].ref_count--;
}
static void full_to_model_count(unsigned int *model_count,
@@ -2025,28 +2002,44 @@ YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm,
}
}
static void configure_skippable_frame(VP9_COMP *cpi) {
// If the current frame has no non-zero motion vectors detected in the
// first pass, and neither do its previous and forward frames, then this
// frame can be skipped during the partition search, and the partition
// size is assigned according to the variance.
SVC *const svc = &cpi->svc;
const int is_spatial_svc = (svc->number_spatial_layers > 1) &&
(svc->number_temporal_layers == 1);
TWO_PASS *const twopass = is_spatial_svc ?
&svc->layer_context[svc->spatial_layer_id].twopass
: &cpi->twopass;
cpi->skippable_frame = (!frame_is_intra_only(&cpi->common) &&
twopass->stats_in - 2 > twopass->stats_in_start &&
twopass->stats_in < twopass->stats_in_end &&
(twopass->stats_in - 1)->pcnt_inter - (twopass->stats_in - 1)->pcnt_motion
== 1 &&
(twopass->stats_in - 2)->pcnt_inter - (twopass->stats_in - 2)->pcnt_motion
== 1 &&
twopass->stats_in->pcnt_inter - twopass->stats_in->pcnt_motion == 1);
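The criterion in the removed configure_skippable_frame() is compact enough to restate: pcnt_inter - pcnt_motion == 1 holds exactly when every first-pass block was inter-coded with zero motion. A sketch:
// A frame is "static" when all of its first-pass blocks were inter-coded
// with zero motion; the full test above also requires the previous and
// next frames' stats to pass before the partition search is skipped.
// (Exact floating-point equality mirrors the original check.)
static int frame_is_static(double pcnt_inter, double pcnt_motion) {
  return pcnt_inter - pcnt_motion == 1.0;
}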
#if CONFIG_MASKED_INTERINTER
static void select_masked_interinter_mode(VP9_COMP *cpi) {
static const double threshold = 1/128.0;
VP9_COMMON *cm = &cpi->common;
int sum = cpi->masked_interinter_select_counts[1] +
cpi->masked_interinter_select_counts[0];
if (sum) {
double fraction = (double) cpi->masked_interinter_select_counts[1] / sum;
cm->use_masked_interinter = (fraction > threshold);
}
}
#endif
#if CONFIG_INTERINTRA
static void select_interintra_mode(VP9_COMP *cpi) {
static const double threshold = 0.007;
VP9_COMMON *cm = &cpi->common;
int sum = cpi->interintra_select_count[1] + cpi->interintra_select_count[0];
if (sum) {
double fraction = (double)cpi->interintra_select_count[1] / (double)sum;
cm->use_interintra = (fraction > threshold);
}
}
#if CONFIG_MASKED_INTERINTRA
static void select_masked_interintra_mode(VP9_COMP *cpi) {
static const double threshold = 1/100.0;
VP9_COMMON *cm = &cpi->common;
int sum = cpi->masked_interintra_select_count[1] +
cpi->masked_interintra_select_count[0];
if (sum) {
double fraction = (double)cpi->masked_interintra_select_count[1] /
(double)sum;
cm->use_masked_interintra = (fraction > threshold);
}
}
#endif
#endif
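All three selectors above share one pattern; a generic sketch (the per-tool thresholds in the code above are 1/128, 0.007, and 1/100):
// Keep a coding tool enabled for subsequent frames only when the observed
// usage fraction exceeds its threshold; with no data, leave the flag as-is.
static void toy_update_tool_flag(int *use_tool, unsigned int used,
                                 unsigned int not_used, double threshold) {
  const unsigned int sum = used + not_used;
  if (sum)
    *use_tool = ((double)used / sum) > threshold;
}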
static void encode_frame_to_data_rate(VP9_COMP *cpi,
size_t *size,
@@ -2143,13 +2136,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
if (cpi->pass == 2 && cpi->sf.static_segmentation)
configure_static_seg_features(cpi);
// Check if the current frame is skippable for the partition search in the
// second pass according to the first pass stats
if (cpi->pass == 2 &&
(!cpi->use_svc || cpi->svc.number_temporal_layers == 1)) {
configure_skippable_frame(cpi);
}
// For 1 pass CBR, check if we are dropping this frame.
// Never drop on key frame.
if (cpi->pass == 0 &&
@@ -2190,7 +2176,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
#endif
#ifdef OUTPUT_YUV_DENOISED
vp9_write_yuv_frame_420(&cpi->denoiser.running_avg_y[INTRA_FRAME],
vp9_write_yuv_frame(&cpi->denoiser.running_avg_y[INTRA_FRAME],
yuv_denoised_file);
#endif
#ifdef OUTPUT_YUV_SRC
@@ -2208,6 +2194,20 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
vp9_set_high_precision_mv(cpi, q < HIGH_PRECISION_MV_QTHRESH);
}
#if CONFIG_MASKED_INTERINTER
if (cm->current_video_frame == 0)
cm->use_masked_interinter = 0;
#endif
#if CONFIG_INTERINTRA
if (cm->current_video_frame == 0) {
cm->use_interintra = 1;
#if CONFIG_MASKED_INTERINTRA
cm->use_masked_interintra = 1;
#endif
}
#endif
if (cpi->sf.recode_loop == DISALLOW_RECODE) {
encode_without_recode_loop(cpi, q);
} else {
@@ -2269,6 +2269,18 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
vp9_adapt_mode_probs(cm);
vp9_adapt_mv_probs(cm, cm->allow_high_precision_mv);
}
#if CONFIG_MASKED_INTERINTER
select_masked_interinter_mode(cpi);
#endif
#if CONFIG_INTERINTRA
select_interintra_mode(cpi);
#if CONFIG_MASKED_INTERINTRA
if (cpi->common.use_interintra)
select_masked_interintra_mode(cpi);
else
cpi->common.use_masked_interintra = 0;
#endif
#endif
}
if (cpi->refresh_golden_frame == 1)
@@ -2518,9 +2530,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
MV_REFERENCE_FRAME ref_frame;
int arf_src_index;
const int is_spatial_svc = cpi->use_svc &&
(cpi->svc.number_temporal_layers == 1) &&
(cpi->svc.number_spatial_layers > 1);
RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
(cpi->svc.number_temporal_layers == 1);
if (!cpi)
return -1;
@@ -2603,8 +2613,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
cm->show_frame = 1;
cm->intra_only = 0;
// Check to see if the frame to be encoded is an overlay for a previous
// arf frame and if so configure it as such.
// Check to see if the frame should be encoded as an arf overlay.
check_src_altref(cpi);
}
}
@@ -2658,7 +2667,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
/* find a free buffer for the new frame, releasing the reference previously
* held.
*/
--frame_bufs[cm->new_fb_idx].ref_count;
cm->frame_bufs[cm->new_fb_idx].ref_count--;
cm->new_fb_idx = get_free_fb(cm);
if (!cpi->use_svc && cpi->multi_arf_allowed) {
@@ -2692,7 +2701,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
const int idx = cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)];
YV12_BUFFER_CONFIG *const buf = &frame_bufs[idx].buf;
YV12_BUFFER_CONFIG *const buf = &cm->frame_bufs[idx].buf;
RefBuffer *const ref_buf = &cm->frame_refs[ref_frame - 1];
ref_buf->buf = buf;
ref_buf->idx = idx;


@@ -64,7 +64,6 @@ typedef struct {
FRAME_CONTEXT fc;
} CODING_CONTEXT;
typedef enum {
// encode_breakout is disabled.
ENCODE_BREAKOUT_DISABLED = 0,
@@ -262,8 +261,6 @@ typedef struct VP9_COMP {
int alt_is_last; // Alt same as last (short-circuit altref search)
int gold_is_alt; // don't do both alt and gold search (just do gold).
int skippable_frame;
int scaled_ref_idx[3];
int lst_fb_idx;
int gld_fb_idx;
@@ -348,10 +345,6 @@ typedef struct VP9_COMP {
uint64_t time_pick_lpf;
uint64_t time_encode_sb_row;
#if CONFIG_FP_MB_STATS
int use_fp_mb_stats;
#endif
TWO_PASS twopass;
YV12_BUFFER_CONFIG alt_ref_buffer;
@@ -405,11 +398,7 @@ typedef struct VP9_COMP {
SVC svc;
// Store frame variance info in SOURCE_VAR_BASED_PARTITION search type.
diff *source_diff_var;
// The threshold used in SOURCE_VAR_BASED_PARTITION search type.
unsigned int source_var_thresh;
int frames_till_next_var_check;
int use_large_partition_rate;
int frame_flags;
@@ -420,6 +409,10 @@ typedef struct VP9_COMP {
int intra_uv_mode_cost[FRAME_TYPES][INTRA_MODES];
int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES];
int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS];
#if CONFIG_COPY_CODING
int copy_mode_cost_l2[COPY_MODE_CONTEXTS][2];
int copy_mode_cost[COPY_MODE_CONTEXTS][COPY_MODE_COUNT - 1];
#endif
PICK_MODE_CONTEXT *leaf_tree;
PC_TREE *pc_tree;
@@ -432,12 +425,21 @@ typedef struct VP9_COMP {
#if CONFIG_DENOISING
VP9_DENOISER denoiser;
#endif
#if CONFIG_MASKED_INTERINTER
unsigned int masked_interinter_select_counts[2];
#endif
#if CONFIG_INTERINTRA
unsigned int interintra_select_count[2];
#if CONFIG_MASKED_INTERINTRA
unsigned int masked_interintra_select_count[2];
#endif
#endif
} VP9_COMP;
void vp9_initialize_enc();
struct VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
BufferPool *const pool);
struct VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf);
void vp9_remove_compressor(VP9_COMP *cpi);
void vp9_change_config(VP9_COMP *cpi, const VP9EncoderConfig *oxcf);
@@ -495,9 +497,8 @@ static INLINE int get_ref_frame_idx(const VP9_COMP *cpi,
static INLINE YV12_BUFFER_CONFIG *get_ref_frame_buffer(
VP9_COMP *cpi, MV_REFERENCE_FRAME ref_frame) {
VP9_COMMON *const cm = &cpi->common;
BufferPool *const pool = cm->buffer_pool;
return &pool->frame_bufs[cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)]]
VP9_COMMON * const cm = &cpi->common;
return &cm->frame_bufs[cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)]]
.buf;
}


@@ -106,34 +106,6 @@ static int read_frame_stats(const TWO_PASS *p,
return 1;
}
#if CONFIG_FP_MB_STATS
static int input_mb_stats(FIRSTPASS_FRAME_MB_STATS *fp_frame_stats,
const VP9_COMMON *const cm) {
FILE *fpfile;
int ret;
fpfile = fopen("firstpass_mb.stt", "r");
fseek(fpfile, cm->current_video_frame * cm->MBs * sizeof(FIRSTPASS_MB_STATS),
SEEK_SET);
ret = fread(fp_frame_stats->mb_stats, sizeof(FIRSTPASS_MB_STATS), cm->MBs,
fpfile);
fclose(fpfile);
if (ret < cm->MBs) {
return EOF;
}
return 1;
}
static void output_mb_stats(FIRSTPASS_FRAME_MB_STATS *fp_frame_stats,
const VP9_COMMON *const cm) {
FILE *fpfile;
fpfile = fopen("firstpass_mb.stt", "a");
fwrite(fp_frame_stats->mb_stats, sizeof(FIRSTPASS_MB_STATS), cm->MBs, fpfile);
fclose(fpfile);
}
#endif
static int input_stats(TWO_PASS *p, FIRSTPASS_STATS *fps) {
if (p->stats_in >= p->stats_in_end)
return EOF;
@@ -480,10 +452,6 @@ void vp9_first_pass(VP9_COMP *cpi) {
const MV zero_mv = {0, 0};
const YV12_BUFFER_CONFIG *first_ref_buf = lst_yv12;
#if CONFIG_FP_MB_STATS
FIRSTPASS_FRAME_MB_STATS *this_frame_mb_stats = &twopass->this_frame_mb_stats;
#endif
vp9_clear_system_state();
set_first_pass_params(cpi);
@@ -611,17 +579,6 @@ void vp9_first_pass(VP9_COMP *cpi) {
// Accumulate the intra error.
intra_error += (int64_t)this_error;
#if CONFIG_FP_MB_STATS
if (cpi->use_fp_mb_stats) {
this_frame_mb_stats->mb_stats[mb_row * cm->mb_cols + mb_col].mode =
DC_PRED;
this_frame_mb_stats->mb_stats[mb_row * cm->mb_cols + mb_col].err =
this_error;
this_frame_mb_stats->mb_stats[mb_row * cm->mb_cols + mb_col].mv.as_int
= 0;
}
#endif
// Set up limit values for motion vectors to prevent them extending
// outside the UMV borders.
x->mv_col_min = -((mb_col * 16) + BORDER_MV_PIXELS_B16);
@@ -747,17 +704,6 @@ void vp9_first_pass(VP9_COMP *cpi) {
best_ref_mv.as_int = mv.as_int;
#if CONFIG_FP_MB_STATS
if (cpi->use_fp_mb_stats) {
this_frame_mb_stats->mb_stats[mb_row * cm->mb_cols + mb_col].mode =
NEWMV;
this_frame_mb_stats->mb_stats[mb_row * cm->mb_cols + mb_col].err =
motion_error;
this_frame_mb_stats->mb_stats[mb_row * cm->mb_cols + mb_col].mv.
as_int = mv.as_int;
}
#endif
if (mv.as_int) {
++mvcount;
@@ -862,12 +808,6 @@ void vp9_first_pass(VP9_COMP *cpi) {
twopass->this_frame_stats = fps;
output_stats(&twopass->this_frame_stats, cpi->output_pkt_list);
accumulate_stats(&twopass->total_stats, &fps);
#if CONFIG_FP_MB_STATS
if (cpi->use_fp_mb_stats) {
output_mb_stats(this_frame_mb_stats, cm);
}
#endif
}
// Copy the previous Last Frame back into gf and arf buffers if
@@ -1420,8 +1360,7 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits,
twopass->gf_group.arf_src_offset[frame_index] =
(unsigned char)(rc->baseline_gf_interval - 1);
twopass->gf_group.arf_update_idx[frame_index] = arf_buffer_indices[0];
twopass->gf_group.arf_ref_idx[frame_index] =
arf_buffer_indices[cpi->multi_arf_enabled && rc->source_alt_ref_active];
twopass->gf_group.arf_ref_idx[frame_index] = arf_buffer_indices[0];
++frame_index;
if (cpi->multi_arf_enabled) {
@@ -2227,12 +2166,6 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
// Update the total stats remaining structure.
subtract_stats(&twopass->total_left_stats, &this_frame);
#if CONFIG_FP_MB_STATS
if (cpi->use_fp_mb_stats) {
input_mb_stats(&twopass->this_frame_mb_stats, cm);
}
#endif
}
void vp9_twopass_postencode_update(VP9_COMP *cpi) {


@@ -18,18 +18,6 @@
extern "C" {
#endif
#if CONFIG_FP_MB_STATS
typedef struct {
PREDICTION_MODE mode;
int err;
int_mv mv;
} FIRSTPASS_MB_STATS;
typedef struct {
FIRSTPASS_MB_STATS *mb_stats;
} FIRSTPASS_FRAME_MB_STATS;
#endif
typedef struct {
double frame;
double intra_error;
@@ -88,10 +76,6 @@ typedef struct {
double kf_intra_err_min;
double gf_intra_err_min;
#if CONFIG_FP_MB_STATS
FIRSTPASS_FRAME_MB_STATS this_frame_mb_stats;
#endif
// Projected total bits available for a key frame group of frames
int64_t kf_group_bits;


@@ -64,6 +64,9 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
xd->mi[0]->mbmi.mode = NEWMV;
xd->mi[0]->mbmi.mv[0].as_mv = *dst_mv;
#if CONFIG_INTERINTRA
xd->mi[0]->mbmi.ref_frame[1] = NONE;
#endif
vp9_build_inter_predictors_sby(xd, mb_row, mb_col, BLOCK_16X16);
/* restore UMV window */
@@ -141,6 +144,9 @@ static int find_best_16x16_intra(VP9_COMP *cpi, PREDICTION_MODE *pbest_mode) {
xd->mi[0]->mbmi.mode = mode;
vp9_predict_intra_block(xd, 0, 2, TX_16X16, mode,
#if CONFIG_FILTERINTRA
0,
#endif
x->plane[0].src.buf, x->plane[0].src.stride,
xd->plane[0].dst.buf, xd->plane[0].dst.stride,
0, 0, 0);


@@ -1640,3 +1640,354 @@ int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x,
return var;
}
#if ((CONFIG_MASKED_INTERINTRA && CONFIG_INTERINTRA) || \
CONFIG_MASKED_INTERINTER)
/* returns subpixel variance error function */
#define DIST(r, c) \
vfp->msvf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \
src_stride, mask, mask_stride, &sse)
/* checks if (r, c) has better score than previous best */
#define MVC(r, c) \
(mvcost ? \
((mvjcost[((r) != rr) * 2 + ((c) != rc)] + \
mvcost[0][((r) - rr)] + mvcost[1][((c) - rc)]) * \
error_per_bit + 4096) >> 13 : 0)
#define CHECK_BETTER(v, r, c) \
if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \
thismse = (DIST(r, c)); \
if ((v = MVC(r, c) + thismse) < besterr) { \
besterr = v; \
br = r; \
bc = c; \
*distortion = thismse; \
*sse1 = sse; \
} \
} else { \
v = INT_MAX; \
}
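In isolation, the MVC macro above is a rounded fixed-point product: the table-scaled rate is multiplied by error_per_bit, then the "+ 4096" followed by ">> 13" performs round-to-nearest division by 2^13, landing the cost in the distortion domain. A sketch:
// Rounded fixed-point conversion of a motion-vector rate into the
// distortion domain, as in the MVC macro above.
static int toy_mv_rate_to_rd(int table_cost, int error_per_bit) {
  return (table_cost * error_per_bit + 4096) >> 13;
}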
int vp9_find_best_masked_sub_pixel_tree(const MACROBLOCK *x,
uint8_t *mask, int mask_stride,
MV *bestmv, const MV *ref_mv,
int allow_hp,
int error_per_bit,
const vp9_variance_fn_ptr_t *vfp,
int forced_stop,
int iters_per_step,
int *mvjcost, int *mvcost[2],
int *distortion,
unsigned int *sse1, int is_second) {
const uint8_t *const z = x->plane[0].src.buf;
const int src_stride = x->plane[0].src.stride;
const MACROBLOCKD *xd = &x->e_mbd;
unsigned int besterr = INT_MAX;
unsigned int sse;
unsigned int whichdir;
int thismse;
unsigned int halfiters = iters_per_step;
unsigned int quarteriters = iters_per_step;
unsigned int eighthiters = iters_per_step;
const int y_stride = xd->plane[0].pre[is_second].stride;
const int offset = bestmv->row * y_stride + bestmv->col;
const uint8_t *const y = xd->plane[0].pre[is_second].buf;
int rr = ref_mv->row;
int rc = ref_mv->col;
int br = bestmv->row * 8;
int bc = bestmv->col * 8;
int hstep = 4;
const int minc = MAX(x->mv_col_min * 8, ref_mv->col - MV_MAX);
const int maxc = MIN(x->mv_col_max * 8, ref_mv->col + MV_MAX);
const int minr = MAX(x->mv_row_min * 8, ref_mv->row - MV_MAX);
const int maxr = MIN(x->mv_row_max * 8, ref_mv->row + MV_MAX);
int tr = br;
int tc = bc;
// central mv
bestmv->row *= 8;
bestmv->col *= 8;
// calculate central point error
besterr = vfp->mvf(y + offset, y_stride, z, src_stride, mask, mask_stride,
sse1);
*distortion = besterr;
besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
// 1/2 pel
FIRST_LEVEL_CHECKS;
if (halfiters > 1) {
SECOND_LEVEL_CHECKS;
}
tr = br;
tc = bc;
// Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
if (forced_stop != 2) {
hstep >>= 1;
FIRST_LEVEL_CHECKS;
if (quarteriters > 1) {
SECOND_LEVEL_CHECKS;
}
tr = br;
tc = bc;
}
if (allow_hp && vp9_use_mv_hp(ref_mv) && forced_stop == 0) {
hstep >>= 1;
FIRST_LEVEL_CHECKS;
if (eighthiters > 1) {
SECOND_LEVEL_CHECKS;
}
tr = br;
tc = bc;
}
// These lines ensure static analysis doesn't warn that
// tr and tc aren't used after the above point.
(void) tr;
(void) tc;
bestmv->row = br;
bestmv->col = bc;
if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
(abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
return INT_MAX;
return besterr;
}
#undef DIST
#undef MVC
#undef CHECK_BETTER
int vp9_get_masked_mvpred_var(const MACROBLOCK *x,
uint8_t *mask, int mask_stride,
const MV *best_mv, const MV *center_mv,
const vp9_variance_fn_ptr_t *vfp,
int use_mvcost, int is_second) {
const MACROBLOCKD *const xd = &x->e_mbd;
const struct buf_2d *const what = &x->plane[0].src;
const struct buf_2d *const in_what = &xd->plane[0].pre[is_second];
const MV mv = {best_mv->row * 8, best_mv->col * 8};
unsigned int unused;
return vfp->mvf(what->buf, what->stride,
get_buf_from_mv(in_what, best_mv), in_what->stride,
mask, mask_stride, &unused) +
(use_mvcost ? mv_err_cost(&mv, center_mv, x->nmvjointcost,
x->mvcost, x->errorperbit) : 0);
}
int vp9_masked_refining_search_sad_c(const MACROBLOCK *x,
uint8_t *mask, int mask_stride,
MV *ref_mv, int error_per_bit,
int search_range,
const vp9_variance_fn_ptr_t *fn_ptr,
const MV *center_mv, int is_second) {
const MV neighbors[4] = {{ -1, 0}, {0, -1}, {0, 1}, {1, 0}};
const MACROBLOCKD *const xd = &x->e_mbd;
const struct buf_2d *const what = &x->plane[0].src;
const struct buf_2d *const in_what = &xd->plane[0].pre[is_second];
const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
unsigned int best_sad = fn_ptr->msdf(what->buf, what->stride,
get_buf_from_mv(in_what, ref_mv),
in_what->stride, mask, mask_stride) +
mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit);
int i, j;
for (i = 0; i < search_range; i++) {
int best_site = -1;
for (j = 0; j < 4; j++) {
const MV mv = {ref_mv->row + neighbors[j].row,
ref_mv->col + neighbors[j].col};
if (is_mv_in(x, &mv)) {
unsigned int sad = fn_ptr->msdf(what->buf, what->stride,
get_buf_from_mv(in_what, &mv), in_what->stride, mask, mask_stride);
if (sad < best_sad) {
sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit);
if (sad < best_sad) {
best_sad = sad;
best_site = j;
}
}
}
}
if (best_site == -1) {
break;
} else {
ref_mv->row += neighbors[best_site].row;
ref_mv->col += neighbors[best_site].col;
}
}
return best_sad;
}
int vp9_masked_diamond_search_sad_c(const MACROBLOCK *x,
const search_site_config *cfg,
uint8_t *mask, int mask_stride,
MV *ref_mv, MV *best_mv,
int search_param, int sad_per_bit, int *num00,
const vp9_variance_fn_ptr_t *fn_ptr,
const MV *center_mv, int is_second) {
const MACROBLOCKD *const xd = &x->e_mbd;
const struct buf_2d *const what = &x->plane[0].src;
const struct buf_2d *const in_what = &xd->plane[0].pre[is_second];
// search_param determines the length of the initial step and hence the
// number of iterations:
// 0 = initial step (MAX_FIRST_STEP) pel, 1 = (MAX_FIRST_STEP/2) pel,
// 2 = (MAX_FIRST_STEP/4) pel, etc.
const search_site *const ss = &cfg->ss[search_param * cfg->searches_per_step];
const int tot_steps = (cfg->ss_count / cfg->searches_per_step) - search_param;
const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
const uint8_t *best_address, *in_what_ref;
int best_sad = INT_MAX;
int best_site = 0;
int last_site = 0;
int i, j, step;
clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
in_what_ref = get_buf_from_mv(in_what, ref_mv);
best_address = in_what_ref;
*num00 = 0;
*best_mv = *ref_mv;
// Check the starting position
best_sad = fn_ptr->msdf(what->buf, what->stride,
best_address, in_what->stride,
mask, mask_stride) +
mvsad_err_cost(x, best_mv, &fcenter_mv, sad_per_bit);
i = 1;
for (step = 0; step < tot_steps; step++) {
for (j = 0; j < cfg->searches_per_step; j++) {
const MV mv = {best_mv->row + ss[i].mv.row,
best_mv->col + ss[i].mv.col};
if (is_mv_in(x, &mv)) {
int sad = fn_ptr->msdf(what->buf, what->stride,
best_address + ss[i].offset, in_what->stride,
mask, mask_stride);
if (sad < best_sad) {
sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
if (sad < best_sad) {
best_sad = sad;
best_site = i;
}
}
}
i++;
}
if (best_site != last_site) {
best_mv->row += ss[best_site].mv.row;
best_mv->col += ss[best_site].mv.col;
best_address += ss[best_site].offset;
last_site = best_site;
#if defined(NEW_DIAMOND_SEARCH)
while (1) {
const MV this_mv = {best_mv->row + ss[best_site].mv.row,
best_mv->col + ss[best_site].mv.col};
if (is_mv_in(x, &this_mv)) {
int sad = fn_ptr->msdf(what->buf, what->stride,
best_address + ss[best_site].offset,
in_what->stride, mask, mask_stride);
if (sad < best_sad) {
sad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
if (sad < best_sad) {
best_sad = sad;
best_mv->row += ss[best_site].mv.row;
best_mv->col += ss[best_site].mv.col;
best_address += ss[best_site].offset;
continue;
}
}
}
break;
}
#endif
} else if (best_address == in_what_ref) {
(*num00)++;
}
}
return best_sad;
}
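
The search_param comment above boils down to a halving schedule for the initial diamond step. A minimal sketch, assuming vp9_mcomp.h's usual definitions (MAX_MVSEARCH_STEPS = 11, MAX_FIRST_STEP = 1 << (MAX_MVSEARCH_STEPS - 1)):

#include <stdio.h>

#define MAX_MVSEARCH_STEPS 11
#define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS - 1))

int main(void) {
  int search_param;
  // Each increment of search_param halves the first step of the diamond.
  for (search_param = 0; search_param < 4; ++search_param)
    printf("search_param %d: initial step %d pel\n",
           search_param, MAX_FIRST_STEP >> search_param);
  return 0;
}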
int vp9_masked_full_pixel_diamond(const VP9_COMP *cpi, MACROBLOCK *x,
uint8_t *mask, int mask_stride,
MV *mvp_full, int step_param,
int sadpb, int further_steps, int do_refine,
const vp9_variance_fn_ptr_t *fn_ptr,
const MV *ref_mv, MV *dst_mv,
int is_second) {
MV temp_mv;
int thissme, n, num00 = 0;
int bestsme = vp9_masked_diamond_search_sad_c(x, &cpi->ss_cfg,
mask, mask_stride,
mvp_full, &temp_mv,
step_param, sadpb, &n,
fn_ptr, ref_mv, is_second);
if (bestsme < INT_MAX)
bestsme = vp9_get_masked_mvpred_var(x, mask, mask_stride, &temp_mv, ref_mv,
fn_ptr, 1, is_second);
*dst_mv = temp_mv;
// If there won't be more n-step searches, check whether a refining search
// is needed.
if (n > further_steps)
do_refine = 0;
while (n < further_steps) {
++n;
if (num00) {
num00--;
} else {
thissme = vp9_masked_diamond_search_sad_c(x, &cpi->ss_cfg,
mask, mask_stride,
mvp_full, &temp_mv,
step_param + n, sadpb, &num00,
fn_ptr, ref_mv, is_second);
if (thissme < INT_MAX)
thissme = vp9_get_masked_mvpred_var(x, mask, mask_stride,
&temp_mv, ref_mv, fn_ptr, 1,
is_second);
// check to see if refining search is needed.
if (num00 > further_steps - n)
do_refine = 0;
if (thissme < bestsme) {
bestsme = thissme;
*dst_mv = temp_mv;
}
}
}
// final 1-away diamond refining search
if (do_refine) {
const int search_range = 8;
MV best_mv = *dst_mv;
thissme = vp9_masked_refining_search_sad_c(x, mask, mask_stride,
&best_mv, sadpb, search_range,
fn_ptr, ref_mv, is_second);
if (thissme < INT_MAX)
thissme = vp9_get_masked_mvpred_var(x, mask, mask_stride,
&best_mv, ref_mv, fn_ptr, 1,
is_second);
if (thissme < bestsme) {
bestsme = thissme;
*dst_mv = best_mv;
}
}
return bestsme;
}
#endif

View File

@@ -153,6 +153,29 @@ int vp9_full_pixel_search(struct VP9_COMP *cpi, MACROBLOCK *x,
int step_param, int error_per_bit,
const MV *ref_mv, MV *tmp_mv,
int var_max, int rd);
#if ((CONFIG_MASKED_INTERINTRA && CONFIG_INTERINTRA) || \
CONFIG_MASKED_INTERINTER)
int vp9_find_best_masked_sub_pixel_tree(const MACROBLOCK *x,
uint8_t *mask, int mask_stride,
MV *bestmv, const MV *ref_mv,
int allow_hp,
int error_per_bit,
const vp9_variance_fn_ptr_t *vfp,
int forced_stop,
int iters_per_step,
int *mvjcost, int *mvcost[2],
int *distortion,
unsigned int *sse1, int is_second);
int vp9_masked_full_pixel_diamond(const struct VP9_COMP *cpi, MACROBLOCK *x,
uint8_t *mask, int mask_stride,
MV *mvp_full, int step_param,
int sadpb, int further_steps, int do_refine,
const vp9_variance_fn_ptr_t *fn_ptr,
const MV *ref_mv, MV *dst_mv,
int is_second);
#endif
#ifdef __cplusplus
} // extern "C"
#endif

View File

@@ -144,9 +144,7 @@ void vp9_pick_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi,
lf->sharpness_level = cm->frame_type == KEY_FRAME ? 0
: cpi->oxcf.sharpness;
if (method == LPF_PICK_MINIMAL_LPF && lf->filter_level) {
lf->filter_level = 0;
} else if (method >= LPF_PICK_FROM_Q) {
if (method == LPF_PICK_FROM_Q) {
const int min_filter_level = 0;
const int max_filter_level = get_max_filter_level(cpi);
const int q = vp9_ac_quant(cm->base_qindex, 0);

View File

@@ -252,17 +252,6 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize,
else
x->skip_txfm = 0;
if (cpi->common.tx_mode == TX_MODE_SELECT) {
if (sse > (var << 2))
xd->mi[0]->mbmi.tx_size = MIN(max_txsize_lookup[bsize],
tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
else
xd->mi[0]->mbmi.tx_size = TX_8X8;
} else {
xd->mi[0]->mbmi.tx_size = MIN(max_txsize_lookup[bsize],
tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
}
vp9_model_rd_from_var_lapndz(sse - var, 1 << num_pels_log2_lookup[bsize],
dc_quant >> 3, &rate, &dist);
*out_rate_sum = rate >> 1;
@@ -290,93 +279,6 @@ static void free_pred_buffer(PRED_BUFFER *p) {
p->in_use = 0;
}
static void encode_breakout_test(VP9_COMP *cpi, MACROBLOCK *x,
BLOCK_SIZE bsize, int mi_row, int mi_col,
MV_REFERENCE_FRAME ref_frame,
PREDICTION_MODE this_mode,
unsigned int var_y, unsigned int sse_y,
struct buf_2d yv12_mb[][MAX_MB_PLANE],
int *rate, int64_t *dist) {
MACROBLOCKD *xd = &x->e_mbd;
MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
const BLOCK_SIZE uv_size = get_plane_block_size(bsize, &xd->plane[1]);
unsigned int var = var_y, sse = sse_y;
// Skipping threshold for ac.
unsigned int thresh_ac;
// Skipping threshold for dc.
unsigned int thresh_dc;
if (x->encode_breakout > 0) {
// Set a maximum for threshold to avoid big PSNR loss in low bit rate
// case. Use an extremely low threshold for static frames to limit
// skipping.
const unsigned int max_thresh = 36000;
// The encode_breakout input
const unsigned int min_thresh =
MIN(((unsigned int)x->encode_breakout << 4), max_thresh);
// Calculate threshold according to dequant value.
thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) / 9;
thresh_ac = clamp(thresh_ac, min_thresh, max_thresh);
// Adjust ac threshold according to partition size.
thresh_ac >>=
8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
thresh_dc = (xd->plane[0].dequant[0] * xd->plane[0].dequant[0] >> 6);
} else {
thresh_ac = 0;
thresh_dc = 0;
}
// Y skipping condition checking for ac and dc.
if (var <= thresh_ac && (sse - var) <= thresh_dc) {
unsigned int sse_u, sse_v;
unsigned int var_u, var_v;
// Skip UV prediction unless breakout is zero (lossless) to save
// computation with low impact on the result
if (x->encode_breakout == 0) {
xd->plane[1].pre[0] = yv12_mb[ref_frame][1];
xd->plane[2].pre[0] = yv12_mb[ref_frame][2];
vp9_build_inter_predictors_sbuv(xd, mi_row, mi_col, bsize);
}
var_u = cpi->fn_ptr[uv_size].vf(x->plane[1].src.buf,
x->plane[1].src.stride,
xd->plane[1].dst.buf,
xd->plane[1].dst.stride, &sse_u);
// U skipping condition checking
if ((var_u * 4 <= thresh_ac) && (sse_u - var_u <= thresh_dc)) {
var_v = cpi->fn_ptr[uv_size].vf(x->plane[2].src.buf,
x->plane[2].src.stride,
xd->plane[2].dst.buf,
xd->plane[2].dst.stride, &sse_v);
// V skipping condition checking
if ((var_v * 4 <= thresh_ac) && (sse_v - var_v <= thresh_dc)) {
x->skip = 1;
// The cost of skip bit needs to be added.
*rate = cpi->inter_mode_cost[mbmi->mode_context[ref_frame]]
[INTER_OFFSET(this_mode)];
// More on this part of rate
// rate += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
// Scaling factor for SSE from spatial domain to frequency
// domain is 16. Adjust distortion accordingly.
// TODO(yunqingwang): In this function, only y-plane dist is
// calculated.
*dist = (sse << 4); // + ((sse_u + sse_v) << 4);
// *disable_skip = 1;
}
}
}
}
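
The thresholds this deleted helper computes (the same logic is re-inlined into vp9_pick_inter_mode further down) reduce to two integer formulas: thresh_ac = dequant[1]^2 / 9, clamped to [encode_breakout << 4, 36000] and then scaled down for blocks smaller than 64x64; thresh_dc = dequant[0]^2 >> 6. A worked sketch with invented dequant values:

#include <stdio.h>

int main(void) {
  const unsigned int encode_breakout = 800;              // assumed setting
  const unsigned int dequant_dc = 80, dequant_ac = 100;  // invented
  const unsigned int max_thresh = 36000;
  const unsigned int min_thresh =
      (encode_breakout << 4) < max_thresh ? encode_breakout << 4 : max_thresh;
  unsigned int thresh_ac = dequant_ac * dequant_ac / 9;  // 1111
  if (thresh_ac < min_thresh) thresh_ac = min_thresh;    // clamp up to 12800
  if (thresh_ac > max_thresh) thresh_ac = max_thresh;
  // b_width_log2 + b_height_log2 is 8 for 64x64 and 4 for 16x16, so a
  // 16x16 block divides the AC threshold by 2^(8 - 4) = 16.
  thresh_ac >>= 8 - 4;
  printf("thresh_ac=%u thresh_dc=%u\n",
         thresh_ac, dequant_dc * dequant_dc >> 6);
  return 0;
}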
// TODO(jingning) placeholder for inter-frame non-RD mode decision.
// this needs various further optimizations. to be continued..
int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
@@ -391,8 +293,6 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
struct macroblockd_plane *const pd = &xd->plane[0];
PREDICTION_MODE this_mode, best_mode = ZEROMV;
MV_REFERENCE_FRAME ref_frame, best_ref_frame = LAST_FRAME;
TX_SIZE best_tx_size = MIN(max_txsize_lookup[bsize],
tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
INTERP_FILTER best_pred_filter = EIGHTTAP;
int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
struct buf_2d yv12_mb[4][MAX_MB_PLANE];
@@ -420,10 +320,10 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
const int *const rd_thresh_freq_fact = cpi->rd.thresh_freq_fact[bsize];
// Mode index conversion from THR_MODES to PREDICTION_MODE for a ref frame.
int mode_idx[MB_MODE_COUNT] = {0};
INTERP_FILTER filter_ref = cm->interp_filter;
INTERP_FILTER filter_ref = SWITCHABLE;
int bsl = mi_width_log2_lookup[bsize];
const int pred_filter_search = cm->interp_filter == SWITCHABLE ?
(((mi_row + mi_col) >> bsl) + get_chessboard_index(cm)) % 2 : 0;
const int pred_filter_search = (((mi_row + mi_col) >> bsl) +
get_chessboard_index(cm)) % 2;
int const_motion[MAX_REF_FRAMES] = { 0 };
// For speed 6, the result of interp filter is reused later in actual encoding
@@ -439,10 +339,6 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
PRED_BUFFER *this_mode_pred = NULL;
int i;
#if CONFIG_DENOISING
vp9_denoiser_reset_frame_stats(&cpi->denoiser);
#endif
if (cpi->sf.reuse_inter_pred_sby) {
for (i = 0; i < 3; i++) {
tmp[i].data = &pred_buf[pixels_in_block * i];
@@ -590,7 +486,6 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
int64_t pf_dist[3];
unsigned int pf_var[3];
unsigned int pf_sse[3];
TX_SIZE pf_tx_size[3];
int64_t best_cost = INT64_MAX;
INTERP_FILTER best_filter = SWITCHABLE, filter;
PRED_BUFFER *current_pred = this_mode_pred;
@@ -604,7 +499,6 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
cost = RDCOST(x->rdmult, x->rddiv,
vp9_get_switchable_rate(cpi) + pf_rate[filter],
pf_dist[filter]);
pf_tx_size[filter] = mbmi->tx_size;
if (cost < best_cost) {
best_filter = filter;
best_cost = cost;
@@ -629,7 +523,6 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
free_pred_buffer(current_pred);
mbmi->interp_filter = best_filter;
mbmi->tx_size = pf_tx_size[mbmi->interp_filter];
rate = pf_rate[mbmi->interp_filter];
dist = pf_dist[mbmi->interp_filter];
var_y = pf_var[mbmi->interp_filter];
@@ -649,16 +542,80 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
// Skip check: test whether this block can be reconstructed by
// prediction only.
if (cpi->allow_encode_breakout) {
encode_breakout_test(cpi, x, bsize, mi_row, mi_col, ref_frame,
this_mode, var_y, sse_y, yv12_mb, &rate, &dist);
if (x->skip) {
rate += rate_mv;
this_rd = RDCOST(x->rdmult, x->rddiv, rate, dist);
const BLOCK_SIZE uv_size = get_plane_block_size(bsize, &xd->plane[1]);
unsigned int var = var_y, sse = sse_y;
// Skipping threshold for ac.
unsigned int thresh_ac;
// Skipping threshold for dc.
unsigned int thresh_dc;
if (x->encode_breakout > 0) {
// Set a maximum for threshold to avoid big PSNR loss in low bit rate
// case. Use an extremely low threshold for static frames to limit
// skipping.
const unsigned int max_thresh = 36000;
// The encode_breakout input
const unsigned int min_thresh =
MIN(((unsigned int)x->encode_breakout << 4), max_thresh);
// Calculate threshold according to dequant value.
thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) / 9;
thresh_ac = clamp(thresh_ac, min_thresh, max_thresh);
// Adjust ac threshold according to partition size.
thresh_ac >>=
8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
thresh_dc = (xd->plane[0].dequant[0] * xd->plane[0].dequant[0] >> 6);
} else {
thresh_ac = 0;
thresh_dc = 0;
}
// Y skipping condition checking for ac and dc.
if (var <= thresh_ac && (sse - var) <= thresh_dc) {
unsigned int sse_u, sse_v;
unsigned int var_u, var_v;
// Skip U/V prediction to reduce computation; this has little effect on
// the result.
var_u = cpi->fn_ptr[uv_size].vf(x->plane[1].src.buf,
x->plane[1].src.stride,
xd->plane[1].dst.buf,
xd->plane[1].dst.stride, &sse_u);
// U skipping condition checking
if ((var_u * 4 <= thresh_ac) && (sse_u - var_u <= thresh_dc)) {
var_v = cpi->fn_ptr[uv_size].vf(x->plane[2].src.buf,
x->plane[2].src.stride,
xd->plane[2].dst.buf,
xd->plane[2].dst.stride, &sse_v);
// V skipping condition checking
if ((var_v * 4 <= thresh_ac) && (sse_v - var_v <= thresh_dc)) {
x->skip = 1;
// The cost of skip bit needs to be added.
rate = rate_mv;
rate += cpi->inter_mode_cost[mbmi->mode_context[ref_frame]]
[INTER_OFFSET(this_mode)];
// More on this part of rate
// rate += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
// Scaling factor for SSE from spatial domain to frequency
// domain is 16. Adjust distortion accordingly.
// TODO(yunqingwang): In this function, only y-plane dist is
// calculated.
dist = (sse << 4); // + ((sse_u + sse_v) << 4);
this_rd = RDCOST(x->rdmult, x->rddiv, rate, dist);
// *disable_skip = 1;
}
}
}
}
#if CONFIG_DENOISING
vp9_denoiser_update_frame_stats(&cpi->denoiser, mbmi, sse_y, this_mode);
vp9_denoiser_update_frame_stats();
#endif
if (this_rd < best_rd || x->skip) {
@@ -667,7 +624,6 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
*returndistortion = dist;
best_mode = this_mode;
best_pred_filter = mbmi->interp_filter;
best_tx_size = mbmi->tx_size;
best_ref_frame = ref_frame;
skip_txfm = x->skip_txfm;
@@ -701,11 +657,10 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
bw, bh);
}
mbmi->mode = best_mode;
mbmi->mode = best_mode;
mbmi->interp_filter = best_pred_filter;
mbmi->tx_size = best_tx_size;
mbmi->ref_frame[0] = best_ref_frame;
mbmi->mv[0].as_int = frame_mv[best_mode][best_ref_frame].as_int;
mbmi->ref_frame[0] = best_ref_frame;
mbmi->mv[0].as_int = frame_mv[best_mode][best_ref_frame].as_int;
xd->mi[0]->bmi[0].as_mv[0].as_int = mbmi->mv[0].as_int;
x->skip_txfm = skip_txfm;
@@ -714,6 +669,7 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
if (!x->skip && best_rd > inter_mode_thresh &&
bsize <= cpi->sf.max_intra_bsize) {
int i, j;
const int step = 1 << mbmi->tx_size;
const int width = num_4x4_blocks_wide_lookup[bsize];
const int height = num_4x4_blocks_high_lookup[bsize];
@@ -723,10 +679,6 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
const int src_stride = p->src.stride;
int block_idx = 0;
TX_SIZE tmp_tx_size = MIN(max_txsize_lookup[bsize],
tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
const int step = 1 << tmp_tx_size;
for (this_mode = DC_PRED; this_mode <= DC_PRED; ++this_mode) {
if (cpi->sf.reuse_inter_pred_sby) {
pd->dst.buf = tmp[0].data;
@@ -736,7 +688,10 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
for (j = 0; j < height; j += step) {
for (i = 0; i < width; i += step) {
vp9_predict_intra_block(xd, block_idx, b_width_log2(bsize),
tmp_tx_size, this_mode,
mbmi->tx_size, this_mode,
#if CONFIG_FILTERINTRA
0,
#endif
&p->src.buf[4 * (j * dst_stride + i)],
src_stride,
&pd->dst.buf[4 * (j * dst_stride + i)],
@@ -747,7 +702,6 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
++block_idx;
}
}
rate = rate2;
dist = dist2;
@@ -763,7 +717,6 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
*returnrate = rate;
*returndistortion = dist;
mbmi->mode = this_mode;
mbmi->tx_size = tmp_tx_size;
mbmi->ref_frame[0] = INTRA_FRAME;
mbmi->uv_mode = this_mode;
mbmi->mv[0].as_int = INVALID_MV;

View File

@@ -42,9 +42,9 @@ void vp9_quantize_dc(const int16_t *coeff_ptr, int skip_block,
}
void vp9_quantize_dc_32x32(const int16_t *coeff_ptr, int skip_block,
const int16_t *round_ptr, const int16_t quant,
int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
const int16_t dequant_ptr, uint16_t *eob_ptr) {
const int16_t *round_ptr, const int16_t quant,
int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
const int16_t dequant_ptr, uint16_t *eob_ptr) {
int eob = -1;
if (!skip_block) {
@@ -63,47 +63,6 @@ void vp9_quantize_dc_32x32(const int16_t *coeff_ptr, int skip_block,
*eob_ptr = eob + 1;
}
void vp9_quantize_fp_c(const int16_t *coeff_ptr, intptr_t count,
int skip_block,
const int16_t *zbin_ptr, const int16_t *round_ptr,
const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
const int16_t *dequant_ptr,
int zbin_oq_value, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
int i, eob = -1;
// TODO(jingning) Decide the need of these arguments after the
// quantization process is completed.
(void)zbin_ptr;
(void)quant_shift_ptr;
(void)zbin_oq_value;
(void)iscan;
vpx_memset(qcoeff_ptr, 0, count * sizeof(int16_t));
vpx_memset(dqcoeff_ptr, 0, count * sizeof(int16_t));
if (!skip_block) {
// Quantization pass: All coefficients with index >= zero_flag are
// skippable. Note: zero_flag can be zero.
for (i = 0; i < count; i++) {
const int rc = scan[i];
const int coeff = coeff_ptr[rc];
const int coeff_sign = (coeff >> 31);
const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
tmp = (tmp * quant_ptr[rc != 0]) >> 16;
qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
if (tmp)
eob = i;
}
}
*eob_ptr = eob + 1;
}
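
The fast-path quantizer removed here is a round-multiply-shift pipeline: add the rounding term, multiply by the Q16 reciprocal of the step (quant_ptr), shift by 16, restore the sign, then dequantize. A standalone sketch with an invented step size (the real loop also clamps the rounded value to the int16 range):

#include <stdio.h>
#include <stdlib.h>

int main(void) {
  const int step = 40;                   // invented quantizer step
  const int quant = (1 << 16) / step;    // Q16 reciprocal, as in quant_fp
  const int round = (42 * step) >> 7;    // AC rounding: 42/128 of the step
  const int coeff = -175;
  int tmp = ((abs(coeff) + round) * quant) >> 16;
  const int qcoeff = coeff < 0 ? -tmp : tmp;   // reapply the sign
  printf("qcoeff=%d dqcoeff=%d\n", qcoeff, qcoeff * step);
  return 0;
}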
void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t count,
int skip_block,
const int16_t *zbin_ptr, const int16_t *round_ptr,
@@ -248,16 +207,11 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
const int qrounding_factor = q == 0 ? 64 : 48;
for (i = 0; i < 2; ++i) {
int qrounding_factor_fp = i == 0 ? 48 : 42;
if (q == 0)
qrounding_factor_fp = 64;
// y
quant = i == 0 ? vp9_dc_quant(q, cm->y_dc_delta_q)
: vp9_ac_quant(q, 0);
invert_quant(&quants->y_quant[q][i], &quants->y_quant_shift[q][i], quant);
quants->y_quant_fp[q][i] = (1 << 16) / quant;
quants->y_round_fp[q][i] = (qrounding_factor_fp * quant) >> 7;
quants->y_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7);
quants->y_round[q][i] = (qrounding_factor * quant) >> 7;
cm->y_dequant[q][i] = quant;
@@ -268,7 +222,6 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
invert_quant(&quants->uv_quant[q][i],
&quants->uv_quant_shift[q][i], quant);
quants->uv_quant_fp[q][i] = (1 << 16) / quant;
quants->uv_round_fp[q][i] = (qrounding_factor_fp * quant) >> 7;
quants->uv_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7);
quants->uv_round[q][i] = (qrounding_factor * quant) >> 7;
cm->uv_dequant[q][i] = quant;
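
The *_fp tables populated above encode the divide-free trick used by that loop: quant_fp is the Q16 reciprocal of the step, so (x * quant_fp) >> 16 approximates x / step, and round_fp is qrounding_factor_fp/128 of the step (48 for the DC entry, 42 for AC, 64 at q == 0). A quick numeric check with an assumed step:

#include <stdio.h>

int main(void) {
  const int step = 52;                      // assumed dequant value
  const int quant_fp = (1 << 16) / step;
  const int round_fp = (48 * step) >> 7;
  const int x = 1000;
  printf("x/step = %d, (x*quant_fp)>>16 = %d, round_fp = %d\n",
         x / step, (x * quant_fp) >> 16, round_fp);
  return 0;
}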
@@ -287,7 +240,6 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
for (i = 2; i < 8; i++) {
quants->y_quant[q][i] = quants->y_quant[q][1];
quants->y_quant_fp[q][i] = quants->y_quant_fp[q][1];
quants->y_round_fp[q][i] = quants->y_round_fp[q][1];
quants->y_quant_shift[q][i] = quants->y_quant_shift[q][1];
quants->y_zbin[q][i] = quants->y_zbin[q][1];
quants->y_round[q][i] = quants->y_round[q][1];
@@ -295,7 +247,6 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
quants->uv_quant[q][i] = quants->uv_quant[q][1];
quants->uv_quant_fp[q][i] = quants->uv_quant_fp[q][1];
quants->uv_round_fp[q][i] = quants->uv_round_fp[q][1];
quants->uv_quant_shift[q][i] = quants->uv_quant_shift[q][1];
quants->uv_zbin[q][i] = quants->uv_zbin[q][1];
quants->uv_round[q][i] = quants->uv_round[q][1];
@@ -325,7 +276,6 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) {
// Y
x->plane[0].quant = quants->y_quant[qindex];
x->plane[0].quant_fp = quants->y_quant_fp[qindex];
x->plane[0].round_fp = quants->y_round_fp[qindex];
x->plane[0].quant_shift = quants->y_quant_shift[qindex];
x->plane[0].zbin = quants->y_zbin[qindex];
x->plane[0].round = quants->y_round[qindex];
@@ -336,7 +286,6 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) {
for (i = 1; i < 3; i++) {
x->plane[i].quant = quants->uv_quant[qindex];
x->plane[i].quant_fp = quants->uv_quant_fp[qindex];
x->plane[i].round_fp = quants->uv_round_fp[qindex];
x->plane[i].quant_shift = quants->uv_quant_shift[qindex];
x->plane[i].zbin = quants->uv_zbin[qindex];
x->plane[i].round = quants->uv_round[qindex];

View File

@@ -28,8 +28,6 @@ typedef struct {
// if we want to deprecate the current use of y_quant.
DECLARE_ALIGNED(16, int16_t, y_quant_fp[QINDEX_RANGE][8]);
DECLARE_ALIGNED(16, int16_t, uv_quant_fp[QINDEX_RANGE][8]);
DECLARE_ALIGNED(16, int16_t, y_round_fp[QINDEX_RANGE][8]);
DECLARE_ALIGNED(16, int16_t, uv_round_fp[QINDEX_RANGE][8]);
DECLARE_ALIGNED(16, int16_t, uv_quant[QINDEX_RANGE][8]);
DECLARE_ALIGNED(16, int16_t, uv_quant_shift[QINDEX_RANGE][8]);

File diff suppressed because it is too large

View File

@@ -33,7 +33,11 @@ extern "C" {
#define INVALID_MV 0x80008000
#if !CONFIG_INTERINTRA
#define MAX_MODES 30
#else
#define MAX_MODES 42
#endif
#define MAX_REFS 6
// This enumerator type needs to be kept aligned with the mode order in
@@ -76,6 +80,23 @@ typedef enum {
THR_D63_PRED,
THR_D117_PRED,
THR_D45_PRED,
#if CONFIG_INTERINTRA
THR_COMP_INTERINTRA_ZEROL,
THR_COMP_INTERINTRA_NEARESTL,
THR_COMP_INTERINTRA_NEARL,
THR_COMP_INTERINTRA_NEWL,
THR_COMP_INTERINTRA_ZEROG,
THR_COMP_INTERINTRA_NEARESTG,
THR_COMP_INTERINTRA_NEARG,
THR_COMP_INTERINTRA_NEWG,
THR_COMP_INTERINTRA_ZEROA,
THR_COMP_INTERINTRA_NEARESTA,
THR_COMP_INTERINTRA_NEARA,
THR_COMP_INTERINTRA_NEWA,
#endif
} THR_MODES;
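
(The twelve THR_COMP_INTERINTRA entries above — ZERO/NEAREST/NEAR/NEW for each of the LAST, GOLDEN and ALTREF references — are what grow MAX_MODES from 30 to 42.)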
typedef enum {
@@ -150,6 +171,9 @@ int64_t vp9_rd_pick_inter_mode_sb(struct VP9_COMP *cpi, struct macroblock *x,
const struct TileInfo *const tile,
int mi_row, int mi_col,
int *returnrate,
#if CONFIG_SUPERTX
int *returnrate_nocoef,
#endif
int64_t *returndistortion,
BLOCK_SIZE bsize,
PICK_MODE_CONTEXT *ctx,
@@ -157,6 +181,8 @@ int64_t vp9_rd_pick_inter_mode_sb(struct VP9_COMP *cpi, struct macroblock *x,
int64_t vp9_rd_pick_inter_mode_sb_seg_skip(struct VP9_COMP *cpi,
struct macroblock *x,
const TileInfo *const tile,
int mi_row, int mi_col,
int *returnrate,
int64_t *returndistortion,
BLOCK_SIZE bsize,
@@ -168,6 +194,9 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(struct VP9_COMP *cpi,
const struct TileInfo *const tile,
int mi_row, int mi_col,
int *returnrate,
#if CONFIG_SUPERTX
int *returnrate_nocoef,
#endif
int64_t *returndistortion,
BLOCK_SIZE bsize,
PICK_MODE_CONTEXT *ctx,
@@ -199,6 +228,21 @@ void vp9_setup_pred_block(const MACROBLOCKD *xd,
int mi_row, int mi_col,
const struct scale_factors *scale,
const struct scale_factors *scale_uv);
#if CONFIG_SUPERTX
void txfm_rd_in_plane_supertx(MACROBLOCK *x,
int *rate, int64_t *distortion,
int *skippable, int64_t *sse,
int64_t ref_best_rd, int plane,
BLOCK_SIZE bsize, TX_SIZE tx_size,
int use_fast_coef_casting);
void txfm_rd_in_plane(MACROBLOCK *x,
int *rate, int64_t *distortion,
int *skippable, int64_t *sse,
int64_t ref_best_rd, int plane,
BLOCK_SIZE bsize, TX_SIZE tx_size,
int use_fast_coef_casting);
#endif
#ifdef __cplusplus
} // extern "C"
#endif

View File

@@ -131,3 +131,47 @@ sadMxN(4, 4)
sadMxNxK(4, 4, 3)
sadMxNxK(4, 4, 8)
sadMxNx4D(4, 4)
#if ((CONFIG_MASKED_INTERINTRA && CONFIG_INTERINTRA) || \
CONFIG_MASKED_INTERINTER)
static INLINE unsigned int masked_sad(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
const uint8_t *m, int m_stride,
int width, int height) {
int y, x;
unsigned int sad = 0;
for (y = 0; y < height; y++) {
for (x = 0; x < width; x++)
sad += m[x] * abs(a[x] - b[x]);
a += a_stride;
b += b_stride;
m += m_stride;
}
sad = (sad + 31) >> 6;
return sad;
}
#define MASKSADMxN(m, n) \
unsigned int vp9_masked_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
const uint8_t *ref, int ref_stride, \
const uint8_t *msk, int msk_stride) { \
return masked_sad(src, src_stride, ref, ref_stride, msk, msk_stride, m, n); \
}
MASKSADMxN(64, 64)
MASKSADMxN(64, 32)
MASKSADMxN(32, 64)
MASKSADMxN(32, 32)
MASKSADMxN(32, 16)
MASKSADMxN(16, 32)
MASKSADMxN(16, 16)
MASKSADMxN(16, 8)
MASKSADMxN(8, 16)
MASKSADMxN(8, 8)
MASKSADMxN(8, 4)
MASKSADMxN(4, 8)
MASKSADMxN(4, 4)
#endif
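
masked_sad() weights every pixel difference by its mask value before accumulating; the final (sad + 31) >> 6 implies the weights live on a 0..64 scale, so an all-64 mask collapses to a plain SAD. A standalone sanity check of one 1x4 row:

#include <stdio.h>
#include <stdlib.h>

int main(void) {
  const unsigned char a[4] = { 10, 20, 30, 40 };
  const unsigned char b[4] = { 12, 18, 30, 45 };
  const unsigned char m[4] = { 64, 64, 64, 64 };  // full-weight mask
  unsigned int sad = 0;
  int x;
  for (x = 0; x < 4; x++)
    sad += m[x] * abs(a[x] - b[x]);
  sad = (sad + 31) >> 6;  // normalize the 0..64 weighting away
  printf("masked sad = %u (plain sad would be 9)\n", sad);
  return 0;
}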

View File

@@ -27,8 +27,6 @@ void vp9_enable_segmentation(struct segmentation *seg) {
void vp9_disable_segmentation(struct segmentation *seg) {
seg->enabled = 0;
seg->update_map = 0;
seg->update_data = 0;
}
void vp9_set_segment_data(struct segmentation *seg,

View File

@@ -270,25 +270,16 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
// Adaptively switch between SOURCE_VAR_BASED_PARTITION and FIXED_PARTITION.
sf->partition_search_type = SOURCE_VAR_BASED_PARTITION;
sf->search_type_check_frequency = 50;
sf->source_var_thresh = 360;
sf->tx_size_search_method = (cm->frame_type == KEY_FRAME) ?
USE_LARGESTALL : USE_TX_8X8;
sf->tx_size_search_method = USE_TX_8X8;
sf->max_intra_bsize = BLOCK_8X8;
// This feature is only enabled when partition search is disabled.
sf->reuse_inter_pred_sby = 1;
}
// Increase mode checking threshold for NEWMV.
sf->elevate_newmv_thresh = 2000;
}
if (speed >= 7) {
sf->use_quant_fp = cm->frame_type == KEY_FRAME ? 0 : 1;
sf->lpf_pick = LPF_PICK_MINIMAL_LPF;
sf->encode_breakout_thresh = (MIN(cm->width, cm->height) >= 720) ?
800 : 300;
sf->elevate_newmv_thresh = 2500;
}
if (speed >= 8) {
int i;
for (i = 0; i < BLOCK_SIZES; ++i)
sf->inter_mode_mask[i] = INTER_NEAREST;
@@ -319,7 +310,6 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
sf->use_lp32x32fdct = 0;
sf->adaptive_motion_search = 0;
sf->adaptive_pred_interp_filter = 0;
sf->use_quant_fp = 0;
sf->reference_masking = 0;
sf->partition_search_type = SEARCH_PARTITION;
sf->less_rectangular_check = 0;
@@ -357,8 +347,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
// to FIXED_PARTITION.
sf->always_this_block_size = BLOCK_16X16;
sf->search_type_check_frequency = 50;
sf->encode_breakout_thresh = 0;
sf->elevate_newmv_thresh = 0;
sf->source_var_thresh = 100;
// Recode loop tolerance %.
sf->recode_tolerance = 25;
@@ -402,8 +391,4 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
if (!cpi->oxcf.frame_periodic_boost) {
sf->max_delta_qindex = 0;
}
if (cpi->encode_breakout && oxcf->mode == REALTIME &&
sf->encode_breakout_thresh > cpi->encode_breakout)
cpi->encode_breakout = sf->encode_breakout_thresh;
}

View File

@@ -73,8 +73,6 @@ typedef enum {
LPF_PICK_FROM_SUBIMAGE,
// Estimate the level based on quantizer and frame type
LPF_PICK_FROM_Q,
// Pick 0 to disable LPF if LPF was enabled last frame
LPF_PICK_MINIMAL_LPF
} LPF_PICK_METHOD;
typedef enum {
@@ -284,9 +282,6 @@ typedef struct SPEED_FEATURES {
// was selected, and 2 means we use 8 tap if no 8x8 filter mode was selected.
int adaptive_pred_interp_filter;
// Fast quantization process path
int use_quant_fp;
// Search through variable block partition types in non-RD mode decision
// encoding process for RTC.
int partition_check;
@@ -356,17 +351,13 @@ typedef struct SPEED_FEATURES {
// FIXED_PARTITION search type should be used.
int search_type_check_frequency;
// The threshold used in SOURCE_VAR_BASED_PARTITION search type.
unsigned int source_var_thresh;
// When partition is pre-set, the inter prediction result from pick_inter_mode
// can be reused in final block encoding process. It is enabled only for real-
// time mode speed 6.
int reuse_inter_pred_sby;
// This variable sets the encode_breakout threshold. Currently, it is only
// enabled in real time mode.
int encode_breakout_thresh;
// In real time encoding, increase the threshold for NEWMV.
int elevate_newmv_thresh;
} SPEED_FEATURES;
struct VP9_COMP;

View File

@@ -334,3 +334,41 @@ void vp9_tokenize_sb(VP9_COMP *cpi, TOKENEXTRA **t, int dry_run,
*t = t_backup;
}
}
#if CONFIG_SUPERTX
void vp9_tokenize_sb_supertx(VP9_COMP *cpi, TOKENEXTRA **t, int dry_run,
BLOCK_SIZE bsize) {
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &cpi->mb.e_mbd;
MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
TOKENEXTRA *t_backup = *t;
const int ctx = vp9_get_skip_context(xd);
const int skip_inc = !vp9_segfeature_active(&cm->seg, mbmi->segment_id,
SEG_LVL_SKIP);
struct tokenize_b_args arg = {cpi, xd, t};
int plane;
if (mbmi->skip) {
if (!dry_run)
cm->counts.skip[ctx][1] += skip_inc;
reset_skip_context(xd, bsize);
if (dry_run)
*t = t_backup;
return;
}
if (!dry_run) {
cm->counts.skip[ctx][0] += skip_inc;
for (plane = 0; plane < MAX_MB_PLANE; plane++) {
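// For the square sizes supertx uses, bsize - 3 is the 4:2:0 chroma
// equivalent (half width, half height), e.g. BLOCK_64X64 -> BLOCK_32X32.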
BLOCK_SIZE plane_size = plane ? (bsize - 3) : bsize;
tokenize_b(plane, 0, plane_size, b_width_log2(plane_size), &arg);
}
} else {
for (plane = 0; plane < MAX_MB_PLANE; plane++) {
BLOCK_SIZE plane_size = plane ? (bsize - 3) : bsize;
set_entropy_context_b(plane, 0, plane_size, b_width_log2(plane_size),
&arg);
}
*t = t_backup;
}
}
#endif

View File

@@ -46,6 +46,10 @@ struct VP9_COMP;
void vp9_tokenize_sb(struct VP9_COMP *cpi, TOKENEXTRA **t, int dry_run,
BLOCK_SIZE bsize);
#if CONFIG_SUPERTX
void vp9_tokenize_sb_supertx(struct VP9_COMP *cpi, TOKENEXTRA **t, int dry_run,
BLOCK_SIZE bsize);
#endif
extern const int16_t *vp9_dct_value_cost_ptr;
/* TODO: The Token field should be broken out into a separate char array to

View File

@@ -266,3 +266,98 @@ void vp9_comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width,
ref += ref_stride;
}
}
#if ((CONFIG_MASKED_INTERINTRA && CONFIG_INTERINTRA) || \
CONFIG_MASKED_INTERINTER)
void masked_variance(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
const uint8_t *m, int m_stride,
int w, int h, unsigned int *sse, int *sum) {
int i, j;
*sum = 0;
*sse = 0;
for (i = 0; i < h; i++) {
for (j = 0; j < w; j++) {
const int diff = (a[j] - b[j]) * (m[j]);
*sum += diff;
*sse += diff * diff;
}
a += a_stride;
b += b_stride;
m += m_stride;
}
*sum = (*sum >= 0) ? ((*sum + 31) >> 6) : -((-*sum + 31) >> 6);
*sse = (*sse + 2047) >> 12;
}
#define MASK_VAR(W, H) \
unsigned int vp9_masked_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
const uint8_t *b, int b_stride, \
const uint8_t *m, int m_stride, \
unsigned int *sse) { \
int sum; \
masked_variance(a, a_stride, b, b_stride, m, m_stride, W, H, sse, &sum); \
return *sse - (((int64_t)sum * sum) / (W * H)); \
}
#define MASK_SUBPIX_VAR(W, H) \
unsigned int vp9_masked_sub_pixel_variance##W##x##H##_c( \
const uint8_t *src, int src_stride, \
int xoffset, int yoffset, \
const uint8_t *dst, int dst_stride, \
const uint8_t *msk, int msk_stride, \
unsigned int *sse) { \
uint16_t fdata3[(H + 1) * W]; \
uint8_t temp2[H * W]; \
\
var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, W, \
BILINEAR_FILTERS_2TAP(xoffset)); \
var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
BILINEAR_FILTERS_2TAP(yoffset)); \
\
return vp9_masked_variance##W##x##H##_c(temp2, W, dst, dst_stride, \
msk, msk_stride, sse); \
}
MASK_VAR(4, 4)
MASK_SUBPIX_VAR(4, 4)
MASK_VAR(4, 8)
MASK_SUBPIX_VAR(4, 8)
MASK_VAR(8, 4)
MASK_SUBPIX_VAR(8, 4)
MASK_VAR(8, 8)
MASK_SUBPIX_VAR(8, 8)
MASK_VAR(8, 16)
MASK_SUBPIX_VAR(8, 16)
MASK_VAR(16, 8)
MASK_SUBPIX_VAR(16, 8)
MASK_VAR(16, 16)
MASK_SUBPIX_VAR(16, 16)
MASK_VAR(16, 32)
MASK_SUBPIX_VAR(16, 32)
MASK_VAR(32, 16)
MASK_SUBPIX_VAR(32, 16)
MASK_VAR(32, 32)
MASK_SUBPIX_VAR(32, 32)
MASK_VAR(32, 64)
MASK_SUBPIX_VAR(32, 64)
MASK_VAR(64, 32)
MASK_SUBPIX_VAR(64, 32)
MASK_VAR(64, 64)
MASK_SUBPIX_VAR(64, 64)
#endif
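
MASK_VAR applies the identity var = E[d^2] - (E[d])^2 to the weighted sums that masked_variance() returns (the >> 6 and >> 12 normalizations cancel the 0..64 mask scale and its square). A standalone check of that identity with the mask factored out, i.e. the all-64 case:

#include <stdio.h>

int main(void) {
  const int a[4] = { 10, 20, 30, 40 };
  const int b[4] = { 12, 18, 30, 45 };
  int i, sum = 0;
  unsigned int sse = 0;
  for (i = 0; i < 4; i++) {
    const int d = a[i] - b[i];
    sum += d;        // sum of differences
    sse += d * d;    // sum of squared differences
  }
  // Same form as MASK_VAR's return: sse - sum^2 / N, here N = 4.
  printf("var = %u\n", sse - (unsigned int)((long long)sum * sum / 4));
  return 0;
}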

View File

@@ -67,6 +67,32 @@ typedef unsigned int (*vp9_subp_avg_variance_fn_t)(const uint8_t *src_ptr,
unsigned int *sse,
const uint8_t *second_pred);
#if ((CONFIG_MASKED_INTERINTRA && CONFIG_INTERINTRA) || \
CONFIG_MASKED_INTERINTER)
typedef unsigned int(*vp9_masked_sad_fn_t)(const uint8_t *src_ptr,
int source_stride,
const uint8_t *ref_ptr,
int ref_stride,
const uint8_t *msk_ptr,
int msk_stride);
typedef unsigned int (*vp9_masked_variance_fn_t)(const uint8_t *src_ptr,
int source_stride,
const uint8_t *ref_ptr,
int ref_stride,
const uint8_t *msk_ptr,
int msk_stride,
unsigned int *sse);
typedef unsigned int (*vp9_masked_subpixvariance_fn_t)(const uint8_t *src_ptr,
int source_stride,
int xoffset,
int yoffset,
const uint8_t *ref_ptr,
int ref_stride,
const uint8_t *msk_ptr,
int msk_stride,
unsigned int *sse);
#endif
typedef struct vp9_variance_vtable {
vp9_sad_fn_t sdf;
vp9_sad_avg_fn_t sdaf;
@@ -76,6 +102,12 @@ typedef struct vp9_variance_vtable {
vp9_sad_multi_fn_t sdx3f;
vp9_sad_multi_fn_t sdx8f;
vp9_sad_multi_d_fn_t sdx4df;
#if ((CONFIG_MASKED_INTERINTRA && CONFIG_INTERINTRA) || \
CONFIG_MASKED_INTERINTER)
vp9_masked_sad_fn_t msdf;
vp9_masked_variance_fn_t mvf;
vp9_masked_subpixvariance_fn_t msvf;
#endif
} vp9_variance_fn_ptr_t;
void vp9_comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width,

View File

@@ -217,185 +217,3 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
INIT_XMM ssse3
QUANTIZE_FN b, 7
QUANTIZE_FN b_32x32, 7
%macro QUANTIZE_FP 2
cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
shift, qcoeff, dqcoeff, dequant, zbin_oq, \
eob, scan, iscan
cmp dword skipm, 0
jne .blank
; actual quantize loop - setup pointers, rounders, etc.
movifnidn coeffq, coeffmp
movifnidn ncoeffq, ncoeffmp
mov r2, dequantmp
movifnidn zbinq, zbinmp
movifnidn roundq, roundmp
movifnidn quantq, quantmp
mova m1, [roundq] ; m1 = round
mova m2, [quantq] ; m2 = quant
%ifidn %1, b_32x32
; TODO(jingning) to be continued with 32x32 quantization process
pcmpeqw m5, m5
psrlw m5, 15
paddw m0, m5
paddw m1, m5
psrlw m0, 1 ; m0 = (m0 + 1) / 2
psrlw m1, 1 ; m1 = (m1 + 1) / 2
%endif
mova m3, [r2q] ; m3 = dequant
mov r3, qcoeffmp
mov r4, dqcoeffmp
mov r5, iscanmp
%ifidn %1, b_32x32
psllw m4, 1
%endif
pxor m5, m5 ; m5 = dedicated zero
DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob
lea coeffq, [ coeffq+ncoeffq*2]
lea iscanq, [ iscanq+ncoeffq*2]
lea qcoeffq, [ qcoeffq+ncoeffq*2]
lea dqcoeffq, [dqcoeffq+ncoeffq*2]
neg ncoeffq
; get DC and first 15 AC coeffs
mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i]
mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]
pabsw m6, m9 ; m6 = abs(m9)
pabsw m11, m10 ; m11 = abs(m10)
pcmpeqw m7, m7
pcmpeqw m12, m12
paddsw m6, m1 ; m6 += round
punpckhqdq m1, m1
paddsw m11, m1 ; m11 += round
pmulhw m8, m6, m2 ; m8 = m6*q>>16
punpckhqdq m2, m2
pmulhw m13, m11, m2 ; m13 = m11*q>>16
psignw m8, m9 ; m8 = reinsert sign
psignw m13, m10 ; m13 = reinsert sign
mova [qcoeffq+ncoeffq*2+ 0], m8
mova [qcoeffq+ncoeffq*2+16], m13
%ifidn %1, b_32x32
pabsw m8, m8
pabsw m13, m13
%endif
pmullw m8, m3 ; dqc[i] = qc[i] * q
punpckhqdq m3, m3
pmullw m13, m3 ; dqc[i] = qc[i] * q
%ifidn %1, b_32x32
psrlw m8, 1
psrlw m13, 1
psignw m8, m9
psignw m13, m10
%endif
mova [dqcoeffq+ncoeffq*2+ 0], m8
mova [dqcoeffq+ncoeffq*2+16], m13
pcmpeqw m8, m5 ; m8 = c[i] == 0
pcmpeqw m13, m5 ; m13 = c[i] == 0
mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i]
psubw m6, m7 ; m6 = scan[i] + 1
psubw m11, m12 ; m11 = scan[i] + 1
pandn m8, m6 ; m8 = max(eob)
pandn m13, m11 ; m13 = max(eob)
pmaxsw m8, m13
add ncoeffq, mmsize
jz .accumulate_eob
.ac_only_loop:
mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i]
mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]
pabsw m6, m9 ; m6 = abs(m9)
pabsw m11, m10 ; m11 = abs(m10)
pcmpeqw m7, m7
pcmpeqw m12, m12
%ifidn %1, b_32x32
pmovmskb r6, m7
pmovmskb r2, m12
or r6, r2
jz .skip_iter
%endif
paddsw m6, m1 ; m6 += round
paddsw m11, m1 ; m11 += round
pmulhw m14, m6, m2 ; m14 = m6*q>>16
pmulhw m13, m11, m2 ; m13 = m11*q>>16
psignw m14, m9 ; m14 = reinsert sign
psignw m13, m10 ; m13 = reinsert sign
mova [qcoeffq+ncoeffq*2+ 0], m14
mova [qcoeffq+ncoeffq*2+16], m13
%ifidn %1, b_32x32
pabsw m14, m14
pabsw m13, m13
%endif
pmullw m14, m3 ; dqc[i] = qc[i] * q
pmullw m13, m3 ; dqc[i] = qc[i] * q
%ifidn %1, b_32x32
psrlw m14, 1
psrlw m13, 1
psignw m14, m9
psignw m13, m10
%endif
mova [dqcoeffq+ncoeffq*2+ 0], m14
mova [dqcoeffq+ncoeffq*2+16], m13
pcmpeqw m14, m5 ; m14 = c[i] == 0
pcmpeqw m13, m5 ; m13 = c[i] == 0
mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i]
psubw m6, m7 ; m6 = scan[i] + 1
psubw m11, m12 ; m11 = scan[i] + 1
pandn m14, m6 ; m14 = max(eob)
pandn m13, m11 ; m13 = max(eob)
pmaxsw m8, m14
pmaxsw m8, m13
add ncoeffq, mmsize
jl .ac_only_loop
%ifidn %1, b_32x32
jmp .accumulate_eob
.skip_iter:
mova [qcoeffq+ncoeffq*2+ 0], m5
mova [qcoeffq+ncoeffq*2+16], m5
mova [dqcoeffq+ncoeffq*2+ 0], m5
mova [dqcoeffq+ncoeffq*2+16], m5
add ncoeffq, mmsize
jl .ac_only_loop
%endif
.accumulate_eob:
; horizontally accumulate/max eobs and write into [eob] memory pointer
mov r2, eobmp
pshufd m7, m8, 0xe
pmaxsw m8, m7
pshuflw m7, m8, 0xe
pmaxsw m8, m7
pshuflw m7, m8, 0x1
pmaxsw m8, m7
pextrw r6, m8, 0
mov [r2], r6
RET
; skip-block, i.e. just write all zeroes
.blank:
mov r0, dqcoeffmp
movifnidn ncoeffq, ncoeffmp
mov r2, qcoeffmp
mov r3, eobmp
DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
lea dqcoeffq, [dqcoeffq+ncoeffq*2]
lea qcoeffq, [ qcoeffq+ncoeffq*2]
neg ncoeffq
pxor m7, m7
.blank_loop:
mova [dqcoeffq+ncoeffq*2+ 0], m7
mova [dqcoeffq+ncoeffq*2+16], m7
mova [qcoeffq+ncoeffq*2+ 0], m7
mova [qcoeffq+ncoeffq*2+16], m7
add ncoeffq, mmsize
jl .blank_loop
mov word [eobq], 0
RET
%endmacro
INIT_XMM ssse3
QUANTIZE_FP fp, 7

View File

@@ -90,8 +90,6 @@ struct vpx_codec_alg_priv {
vp8_postproc_cfg_t preview_ppcfg;
vpx_codec_pkt_list_decl(128) pkt_list;
unsigned int fixed_kf_cntr;
// BufferPool that holds all reference frames.
BufferPool *buffer_pool;
};
static VP9_REFFRAME ref_frame_to_vp9_reframe(vpx_ref_frame_type_t frame) {
@@ -632,10 +630,6 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx,
ctx->priv->alg_priv = priv;
ctx->priv->init_flags = ctx->init_flags;
ctx->priv->enc.total_encoders = 1;
ctx->priv->alg_priv->buffer_pool =
(BufferPool *)vpx_calloc(1, sizeof(BufferPool));
if (ctx->priv->alg_priv->buffer_pool == NULL)
return VPX_CODEC_MEM_ERROR;
if (ctx->config.enc) {
// Update the reference to the config structure to an internal copy.
@@ -673,8 +667,7 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx,
set_encoder_config(&ctx->priv->alg_priv->oxcf,
&ctx->priv->alg_priv->cfg,
&ctx->priv->alg_priv->extra_cfg);
cpi = vp9_create_compressor(&ctx->priv->alg_priv->oxcf,
ctx->priv->alg_priv->buffer_pool);
cpi = vp9_create_compressor(&ctx->priv->alg_priv->oxcf);
if (cpi == NULL)
res = VPX_CODEC_MEM_ERROR;
else
@@ -688,7 +681,6 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx,
static vpx_codec_err_t encoder_destroy(vpx_codec_alg_priv_t *ctx) {
free(ctx->cx_data);
vp9_remove_compressor(ctx->cpi);
vpx_free(ctx->buffer_pool);
free(ctx);
return VPX_CODEC_OK;
}

View File

@@ -46,9 +46,6 @@ struct vpx_codec_alg_priv {
int next_submit_thread_id;
int next_output_thread_id;
// BufferPool that holds all reference frames. Shared by all the FrameWorkers.
BufferPool *buffer_pool;
// External frame buffer info to save for VP9 common.
void *ext_priv; // Private data associated with the external frame buffers.
vpx_get_frame_buffer_cb_fn_t get_ext_fb_cb;
@@ -103,7 +100,6 @@ static vpx_codec_err_t decoder_destroy(vpx_codec_alg_priv_t *ctx) {
}
vpx_free(ctx->frame_workers);
vpx_free(ctx->buffer_pool);
vpx_free(ctx);
return VPX_CODEC_OK;
@@ -222,22 +218,21 @@ static void init_buffer_callbacks(vpx_codec_alg_priv_t *ctx) {
VP9Worker *const worker = &ctx->frame_workers[i];
FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
VP9_COMMON *const cm = &worker_data->pbi->common;
BufferPool *const pool = cm->buffer_pool;
cm->new_fb_idx = -1;
if (ctx->get_ext_fb_cb != NULL && ctx->release_ext_fb_cb != NULL) {
pool->get_fb_cb = ctx->get_ext_fb_cb;
pool->release_fb_cb = ctx->release_ext_fb_cb;
pool->cb_priv = ctx->ext_priv;
cm->get_fb_cb = ctx->get_ext_fb_cb;
cm->release_fb_cb = ctx->release_ext_fb_cb;
cm->cb_priv = ctx->ext_priv;
} else {
pool->get_fb_cb = vp9_get_frame_buffer;
pool->release_fb_cb = vp9_release_frame_buffer;
cm->get_fb_cb = vp9_get_frame_buffer;
cm->release_fb_cb = vp9_release_frame_buffer;
if (vp9_alloc_internal_frame_buffers(&pool->int_frame_buffers))
if (vp9_alloc_internal_frame_buffers(&cm->int_frame_buffers))
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to initialize internal frame buffers");
pool->cb_priv = &pool->int_frame_buffers;
cm->cb_priv = &cm->int_frame_buffers;
}
}
}
@@ -270,16 +265,12 @@ static int frame_worker_hook(void *arg1, void *arg2) {
static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) {
int i;
const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
ctx->last_show_frame = -1;
ctx->next_submit_thread_id = 0;
ctx->next_output_thread_id = 0;
ctx->num_frame_workers =
(ctx->frame_parallel_decode == 1) ? ctx->cfg.threads: 1;
ctx->buffer_pool = (BufferPool *)vpx_calloc(1, sizeof(BufferPool));
if (ctx->buffer_pool == NULL)
return VPX_CODEC_MEM_ERROR;
ctx->frame_workers = (VP9Worker *)
vpx_malloc(ctx->num_frame_workers * sizeof(*ctx->frame_workers));
@@ -291,21 +282,19 @@ static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) {
for (i = 0; i < ctx->num_frame_workers; ++i) {
VP9Worker *const worker = &ctx->frame_workers[i];
FrameWorkerData *worker_data = NULL;
winterface->init(worker);
vp9_worker_init(worker);
worker->data1 = vpx_memalign(32, sizeof(FrameWorkerData));
if (worker->data1 == NULL) {
set_error_detail(ctx, "Failed to allocate worker_data");
return VPX_CODEC_MEM_ERROR;
}
worker_data = (FrameWorkerData *)worker->data1;
worker_data->pbi = vp9_decoder_create(ctx->buffer_pool);
worker_data->pbi = vp9_decoder_create();
if (worker_data->pbi == NULL) {
set_error_detail(ctx, "Failed to allocate worker_data");
return VPX_CODEC_MEM_ERROR;
}
worker_data->pbi->owner_frame_worker = worker;
worker_data->worker_id = i;
worker_data->frame_context_ready = 0;
// If decoding in serial mode, FrameWorker thread could create tile worker
// thread or loopfilter thread.
worker_data->pbi->max_threads =
@@ -331,7 +320,6 @@ static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx,
const uint8_t **data, unsigned int data_sz,
void *user_priv, int64_t deadline) {
vp9_ppflags_t flags = {0};
const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
(void)deadline;
// Determine the stream parameters. Note that we rely on peek_si to
@@ -367,14 +355,12 @@ static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx,
worker_data->pbi->decrypt_cb = ctx->decrypt_cb;
worker_data->pbi->decrypt_state = ctx->decrypt_state;
worker->had_error = 0;
winterface->execute(worker);
vp9_worker_execute(worker);
if (worker->had_error)
return update_error_state(ctx, &worker_data->pbi->common.error);
// Update data pointer after decode.
*data = worker_data->data_end;
if (worker->had_error)
return update_error_state(ctx, &worker_data->pbi->common.error);
} else {
// TODO(hkuang): Implement frame parallel decode.
return VPX_CODEC_INCAPABLE;
@@ -573,25 +559,17 @@ static vpx_image_t *decoder_get_frame(vpx_codec_alg_priv_t *ctx,
FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
if (vp9_get_raw_frame(worker_data->pbi, &sd, &flags) == 0) {
VP9_COMMON *const cm = &worker_data->pbi->common;
BufferPool *const pool = cm->buffer_pool;
RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
yuvconfig2image(&ctx->img, &sd, worker_data->user_priv);
ctx->img.fb_priv = frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv;
ctx->img.fb_priv = cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv;
img = &ctx->img;
*iter = img;
// Decrease reference count of last output frame in frame parallel mode.
if (ctx->frame_parallel_decode && ctx->last_show_frame >= 0) {
#if CONFIG_MULTITHREAD
pthread_mutex_lock(&cm->buffer_pool->pool_mutex);
#endif
--frame_bufs[ctx->last_show_frame].ref_count;
if (frame_bufs[ctx->last_show_frame].ref_count == 0) {
pool->release_fb_cb(pool->cb_priv,
&frame_bufs[ctx->last_show_frame].raw_frame_buffer);
--cm->frame_bufs[ctx->last_show_frame].ref_count;
if (cm->frame_bufs[ctx->last_show_frame].ref_count == 0) {
cm->release_fb_cb(cm->cb_priv,
&cm->frame_bufs[ctx->last_show_frame].raw_frame_buffer);
}
#if CONFIG_MULTITHREAD
pthread_mutex_unlock(&cm->buffer_pool->pool_mutex);
#endif
}
ctx->last_show_frame = worker_data->pbi->common.new_fb_idx;
}
@@ -679,7 +657,7 @@ static vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx,
VP9Worker *const worker = ctx->frame_workers;
FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
vp9_get_reference_dec(worker_data->pbi, data->idx, &fb);
yuvconfig2image(&data->img, fb, NULL);
yuvconfig2image(&data->img, fb, worker_data->user_priv);
return VPX_CODEC_OK;
} else {
return VPX_CODEC_INVALID_PARAM;

View File

@@ -31,7 +31,6 @@ static void yuvconfig2image(vpx_image_t *img, const YV12_BUFFER_CONFIG *yv12,
img->fmt = VPX_IMG_FMT_I420;
bps = 12;
}
img->bit_depth = 8;
img->w = yv12->y_stride;
img->h = ALIGN_POWER_OF_TWO(yv12->y_height + 2 * VP9_ENC_BORDER_IN_PIXELS, 3);
img->d_w = yv12->y_crop_width;

View File

@@ -59,11 +59,14 @@ typedef struct FrameData {
typedef struct SvcInternal {
char options[OPTION_BUFFER_SIZE]; // set by vpx_svc_set_options
char quantizers[OPTION_BUFFER_SIZE]; // set by vpx_svc_set_quantizers
char quantizers_keyframe[OPTION_BUFFER_SIZE]; // set by
// vpx_svc_set_quantizers
char scale_factors[OPTION_BUFFER_SIZE]; // set by vpx_svc_set_scale_factors
// values extracted from option, quantizers
int scaling_factor_num[VPX_SS_MAX_LAYERS];
int scaling_factor_den[VPX_SS_MAX_LAYERS];
int quantizer_keyframe[VPX_SS_MAX_LAYERS];
int quantizer[VPX_SS_MAX_LAYERS];
// accumulated statistics
@@ -194,8 +197,26 @@ static int svc_log(SvcContext *svc_ctx, SVC_LOG_LEVEL level,
return retval;
}
static vpx_codec_err_t set_option_encoding_mode(SvcContext *svc_ctx,
const char *value_str) {
if (strcmp(value_str, "i") == 0) {
svc_ctx->encoding_mode = INTER_LAYER_PREDICTION_I;
} else if (strcmp(value_str, "alt-ip") == 0) {
svc_ctx->encoding_mode = ALT_INTER_LAYER_PREDICTION_IP;
} else if (strcmp(value_str, "ip") == 0) {
svc_ctx->encoding_mode = INTER_LAYER_PREDICTION_IP;
} else if (strcmp(value_str, "gf") == 0) {
svc_ctx->encoding_mode = USE_GOLDEN_FRAME;
} else {
svc_log(svc_ctx, SVC_LOG_ERROR, "invalid encoding mode: %s", value_str);
return VPX_CODEC_INVALID_PARAM;
}
return VPX_CODEC_OK;
}
static vpx_codec_err_t parse_quantizer_values(SvcContext *svc_ctx,
const char *quantizer_values) {
const char *quantizer_values,
const int is_keyframe) {
char *input_string;
char *token;
const char *delim = ",";
@@ -206,6 +227,11 @@ static vpx_codec_err_t parse_quantizer_values(SvcContext *svc_ctx,
SvcInternal *const si = get_svc_internal(svc_ctx);
if (quantizer_values == NULL || strlen(quantizer_values) == 0) {
if (is_keyframe) {
// No key-frame quantizers were given; return an error so the caller
// falls back to the non-key-frame settings.
return VPX_CODEC_INVALID_PARAM;
}
input_string = strdup(DEFAULT_QUANTIZER_VALUES);
} else {
input_string = strdup(quantizer_values);
@@ -226,7 +252,12 @@ static vpx_codec_err_t parse_quantizer_values(SvcContext *svc_ctx,
} else {
q = 0;
}
si->quantizer[i + VPX_SS_MAX_LAYERS - svc_ctx->spatial_layers] = q;
if (is_keyframe) {
si->quantizer_keyframe[i + VPX_SS_MAX_LAYERS - svc_ctx->spatial_layers]
= q;
} else {
si->quantizer[i + VPX_SS_MAX_LAYERS - svc_ctx->spatial_layers] = q;
}
}
if (res == VPX_CODEC_OK && found != svc_ctx->spatial_layers) {
svc_log(svc_ctx, SVC_LOG_ERROR,
@@ -311,6 +342,7 @@ static vpx_codec_err_t parse_options(SvcContext *svc_ctx, const char *options) {
char *option_name;
char *option_value;
char *input_ptr;
int is_keyframe_quant_set = 0;
vpx_codec_err_t res = VPX_CODEC_OK;
if (options == NULL) return VPX_CODEC_OK;
@@ -327,14 +359,26 @@ static vpx_codec_err_t parse_options(SvcContext *svc_ctx, const char *options) {
res = VPX_CODEC_INVALID_PARAM;
break;
}
if (strcmp("layers", option_name) == 0) {
if (strcmp("encoding-mode", option_name) == 0) {
res = set_option_encoding_mode(svc_ctx, option_value);
if (res != VPX_CODEC_OK) break;
} else if (strcmp("layers", option_name) == 0) {
svc_ctx->spatial_layers = atoi(option_value);
} else if (strcmp("scale-factors", option_name) == 0) {
res = parse_scale_factors(svc_ctx, option_value);
if (res != VPX_CODEC_OK) break;
} else if (strcmp("quantizers", option_name) == 0) {
res = parse_quantizer_values(svc_ctx, option_value);
res = parse_quantizer_values(svc_ctx, option_value, 0);
if (res != VPX_CODEC_OK) break;
if (!is_keyframe_quant_set) {
SvcInternal *const si = get_svc_internal(svc_ctx);
memcpy(si->quantizer_keyframe, si->quantizer,
sizeof(si->quantizer));
}
} else if (strcmp("quantizers-keyframe", option_name) == 0) {
res = parse_quantizer_values(svc_ctx, option_value, 1);
if (res != VPX_CODEC_OK) break;
is_keyframe_quant_set = 1;
} else {
svc_log(svc_ctx, SVC_LOG_ERROR, "invalid option: %s\n", option_name);
res = VPX_CODEC_INVALID_PARAM;
@@ -357,13 +401,19 @@ vpx_codec_err_t vpx_svc_set_options(SvcContext *svc_ctx, const char *options) {
}
vpx_codec_err_t vpx_svc_set_quantizers(SvcContext *svc_ctx,
const char *quantizers) {
const char *quantizers,
const int is_for_keyframe) {
SvcInternal *const si = get_svc_internal(svc_ctx);
if (svc_ctx == NULL || quantizers == NULL || si == NULL) {
return VPX_CODEC_INVALID_PARAM;
}
strncpy(si->quantizers, quantizers, sizeof(si->quantizers));
si->quantizers[sizeof(si->quantizers) - 1] = '\0';
if (is_for_keyframe) {
strncpy(si->quantizers_keyframe, quantizers, sizeof(si->quantizers));
si->quantizers_keyframe[sizeof(si->quantizers_keyframe) - 1] = '\0';
} else {
strncpy(si->quantizers, quantizers, sizeof(si->quantizers));
si->quantizers[sizeof(si->quantizers) - 1] = '\0';
}
return VPX_CODEC_OK;
}
@@ -410,9 +460,13 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
return VPX_CODEC_INVALID_PARAM;
}
res = parse_quantizer_values(svc_ctx, si->quantizers);
res = parse_quantizer_values(svc_ctx, si->quantizers, 0);
if (res != VPX_CODEC_OK) return res;
res = parse_quantizer_values(svc_ctx, si->quantizers_keyframe, 1);
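// If no key-frame quantizers were supplied (the parse fails), fall back
// to the regular quantizer set.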
if (res != VPX_CODEC_OK)
memcpy(si->quantizer_keyframe, si->quantizer, sizeof(si->quantizer));
res = parse_scale_factors(svc_ctx, si->scale_factors);
if (res != VPX_CODEC_OK) return res;
@@ -571,14 +625,62 @@ static void calculate_enc_frame_flags(SvcContext *svc_ctx) {
return;
}
if (si->layer == 0) {
flags = map_vp8_flags(USE_LAST | UPDATE_LAST);
} else if (is_keyframe) {
flags = map_vp8_flags(USE_ARF | UPDATE_LAST);
} else {
flags = map_vp8_flags(USE_LAST | USE_ARF | UPDATE_LAST);
switch (svc_ctx->encoding_mode) {
case ALT_INTER_LAYER_PREDICTION_IP:
if (si->layer == 0) {
flags = map_vp8_flags(USE_LAST | UPDATE_LAST);
} else if (is_keyframe) {
if (si->layer == si->layers - 1) {
flags = map_vp8_flags(USE_ARF | UPDATE_LAST);
} else {
flags = map_vp8_flags(USE_ARF | UPDATE_LAST | UPDATE_GF);
}
} else {
flags = map_vp8_flags(USE_LAST | USE_ARF | UPDATE_LAST);
}
break;
case INTER_LAYER_PREDICTION_I:
if (si->layer == 0) {
flags = map_vp8_flags(USE_LAST | UPDATE_LAST);
} else if (is_keyframe) {
flags = map_vp8_flags(USE_ARF | UPDATE_LAST);
} else {
flags = map_vp8_flags(USE_LAST | UPDATE_LAST);
}
break;
case INTER_LAYER_PREDICTION_IP:
if (si->layer == 0) {
flags = map_vp8_flags(USE_LAST | UPDATE_LAST);
} else if (is_keyframe) {
flags = map_vp8_flags(USE_ARF | UPDATE_LAST);
} else {
flags = map_vp8_flags(USE_LAST | USE_ARF | UPDATE_LAST);
}
break;
case USE_GOLDEN_FRAME:
if (2 * si->layers - SVC_REFERENCE_FRAMES <= si->layer) {
if (si->layer == 0) {
flags = map_vp8_flags(USE_LAST | USE_GF | UPDATE_LAST);
} else if (is_keyframe) {
flags = map_vp8_flags(USE_ARF | UPDATE_LAST | UPDATE_GF);
} else {
flags = map_vp8_flags(USE_LAST | USE_ARF | USE_GF | UPDATE_LAST);
}
} else {
if (si->layer == 0) {
flags = map_vp8_flags(USE_LAST | UPDATE_LAST);
} else if (is_keyframe) {
flags = map_vp8_flags(USE_ARF | UPDATE_LAST);
} else {
flags = map_vp8_flags(USE_LAST | UPDATE_LAST);
}
}
break;
default:
svc_log(svc_ctx, SVC_LOG_ERROR, "unexpected encoding mode: %d\n",
svc_ctx->encoding_mode);
break;
}
si->enc_frame_flags = flags;
}
@@ -624,6 +726,13 @@ static void set_svc_parameters(SvcContext *svc_ctx,
svc_params.flags = si->enc_frame_flags;
layer = si->layer;
if (svc_ctx->encoding_mode == ALT_INTER_LAYER_PREDICTION_IP &&
si->frame_within_gop == 0) {
// layers 1 & 3 don't exist in this mode, use the higher one
if (layer == 0 || layer == 2) {
layer += 1;
}
}
if (VPX_CODEC_OK != vpx_svc_get_layer_resolution(svc_ctx, layer,
&svc_params.width,
&svc_params.height)) {
@@ -632,8 +741,13 @@ static void set_svc_parameters(SvcContext *svc_ctx,
layer_index = layer + VPX_SS_MAX_LAYERS - si->layers;
if (codec_ctx->config.enc->g_pass == VPX_RC_ONE_PASS) {
if (vpx_svc_is_keyframe(svc_ctx)) {
svc_params.min_quantizer = si->quantizer_keyframe[layer_index];
svc_params.max_quantizer = si->quantizer_keyframe[layer_index];
} else {
svc_params.min_quantizer = si->quantizer[layer_index];
svc_params.max_quantizer = si->quantizer[layer_index];
}
} else {
svc_params.min_quantizer = codec_ctx->config.enc->rc_min_quantizer;
svc_params.max_quantizer = codec_ctx->config.enc->rc_max_quantizer;
@@ -645,8 +759,21 @@ static void set_svc_parameters(SvcContext *svc_ctx,
svc_params.lst_fb_idx = si->layer;
// Use buffer i-1 for layer i Alt (inter-layer prediction); at a GOP
// boundary in ALT_INTER_LAYER_PREDICTION_IP mode, use i-2 instead.
if (si->layer != 0) {
const int use_higher_layer =
svc_ctx->encoding_mode == ALT_INTER_LAYER_PREDICTION_IP &&
si->frame_within_gop == 0;
svc_params.alt_fb_idx = use_higher_layer ? si->layer - 2 : si->layer - 1;
}
if (svc_ctx->encoding_mode == ALT_INTER_LAYER_PREDICTION_IP) {
svc_params.gld_fb_idx = si->layer + 1;
} else {
if (si->layer < 2 * si->layers - SVC_REFERENCE_FRAMES)
svc_params.gld_fb_idx = svc_params.lst_fb_idx;
else
svc_params.gld_fb_idx = 2 * si->layers - 1 - si->layer;
}
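// Worked example of the buffer layout above (illustrative, taking
// SVC_REFERENCE_FRAMES == 8, the VP9 reference-buffer count) for
// si->layers == 5 in the non-ALT modes, away from a GOP boundary:
//   layer i:      0  1  2  3  4
//   lst_fb_idx:   0  1  2  3  4   (buffer i holds layer i's LAST)
//   alt_fb_idx:   -  0  1  2  3   (inter-layer prediction from i-1)
//   gld_fb_idx:   0  1  7  6  5   (reuse LAST while i < 2*5-8, else 2*5-1-i)
// All eight buffers are used: five for LAST, three for GOLDEN.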
svc_log(svc_ctx, SVC_LOG_DEBUG, "SVC frame: %d, layer: %d, %dx%d, q: %d\n",
si->encode_frame_count, si->layer, svc_params.width,
@@ -705,6 +832,11 @@ vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
if (rawimg != NULL) {
// encode each layer
for (si->layer = 0; si->layer < si->layers; ++si->layer) {
if (svc_ctx->encoding_mode == ALT_INTER_LAYER_PREDICTION_IP &&
si->is_keyframe && (si->layer == 1 || si->layer == 3)) {
svc_log(svc_ctx, SVC_LOG_DEBUG, "Skip encoding layer %d\n", si->layer);
continue;
}
calculate_enc_frame_flags(svc_ctx);
set_svc_parameters(svc_ctx, codec_ctx);
}
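For context, the per-layer loop above runs once for every input frame; a minimal caller sketch, assuming svc_ctx and codec were prepared with vpx_svc_init(), where read_frame() is a hypothetical frame source and error handling is elided:
vpx_codec_pts_t pts = 0;
const int64_t frame_duration = 1;  // in timebase units
while (read_frame(&input, &raw)) {  // hypothetical reader
  // Encodes all spatial layers of this frame, per the loop above.
  if (vpx_svc_encode(&svc_ctx, &codec, &raw, pts, frame_duration,
                     VPX_DL_REALTIME) != VPX_CODEC_OK) break;
  pts += frame_duration;
}
// A NULL image flushes the encoder (note the rawimg != NULL guard above).
vpx_svc_encode(&svc_ctx, &codec, NULL, pts, frame_duration, VPX_DL_REALTIME);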
@@ -843,7 +975,7 @@ static double calc_psnr(double d) {
// dump accumulated statistics and reset accumulated values
const char *vpx_svc_dump_statistics(SvcContext *svc_ctx) {
int number_of_frames, number_of_keyframes, encode_frame_count;
int i, j;
uint32_t bytes_total = 0;
double scale[COMPONENTS];
@@ -860,9 +992,14 @@ const char *vpx_svc_dump_statistics(SvcContext *svc_ctx) {
if (si->encode_frame_count <= 0) return vpx_svc_get_message(svc_ctx);
svc_log(svc_ctx, SVC_LOG_INFO, "\n");
number_of_keyframes = encode_frame_count / si->kf_dist + 1;
for (i = 0; i < si->layers; ++i) {
number_of_frames = encode_frame_count;
if (svc_ctx->encoding_mode == ALT_INTER_LAYER_PREDICTION_IP &&
(i == 1 || i == 3)) {
number_of_frames -= number_of_keyframes;
}
svc_log(svc_ctx, SVC_LOG_INFO,
"Layer %d Average PSNR=[%2.3f, %2.3f, %2.3f, %2.3f], Bytes=[%u]\n",
i, (double)si->psnr_sum[i][0] / number_of_frames,

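As a worked example of the keyframe accounting above: with si->kf_dist == 100 and encode_frame_count == 300, number_of_keyframes = 300 / 100 + 1 = 4, so in ALT_INTER_LAYER_PREDICTION_IP mode layers 1 and 3 (which skip keyframes, as in vpx_svc_encode() above) average their PSNR over 300 - 4 = 296 frames.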

@@ -40,13 +40,13 @@ static void img_buf_free(void *memblk) {
}
}
static vpx_image_t *img_alloc_helper(vpx_image_t *img,
vpx_img_fmt_t fmt,
unsigned int d_w,
unsigned int d_h,
unsigned int buf_align,
unsigned int stride_align,
unsigned char *img_data) {
unsigned int h, w, s, xcs, ycs, bps;
int align;
@@ -94,21 +94,6 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img,
case VPX_IMG_FMT_VPXYV12:
bps = 12;
break;
case VPX_IMG_FMT_I422:
bps = 16;
break;
case VPX_IMG_FMT_I444:
bps = 24;
break;
case VPX_IMG_FMT_I42016:
bps = 24;
break;
case VPX_IMG_FMT_I42216:
bps = 32;
break;
case VPX_IMG_FMT_I44416:
bps = 48;
break;
default:
bps = 16;
break;
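These bits-per-pixel values follow directly from the chroma subsampling: a 4:2:0 format such as VPX_IMG_FMT_I420 carries one 8-bit luma sample per pixel plus two chroma planes subsampled 2x2, i.e. 8 + 8/4 + 8/4 = 12 bps; 4:2:2 gives 8 + 8/2 + 8/2 = 16 and 4:4:4 gives 8 + 8 + 8 = 24, with the 16-bit I*16 variants doubling each figure (24, 32, 48), matching the cases removed above.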
@@ -120,9 +105,6 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img,
case VPX_IMG_FMT_YV12:
case VPX_IMG_FMT_VPXI420:
case VPX_IMG_FMT_VPXYV12:
case VPX_IMG_FMT_I422:
case VPX_IMG_FMT_I42016:
case VPX_IMG_FMT_I42216:
xcs = 1;
break;
default:
@@ -174,7 +156,6 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img,
goto fail;
img->fmt = fmt;
img->bit_depth = (fmt & VPX_IMG_FMT_HIGH) ? 16 : 8;
img->w = w;
img->h = h;
img->x_chroma_shift = xcs;


@@ -23,6 +23,13 @@
extern "C" {
#endif
typedef enum SVC_ENCODING_MODE {
INTER_LAYER_PREDICTION_I,
ALT_INTER_LAYER_PREDICTION_IP,
INTER_LAYER_PREDICTION_IP,
USE_GOLDEN_FRAME
} SVC_ENCODING_MODE;
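// Example (illustrative): callers pick a strategy by setting the
// encoding_mode field of SvcContext (declared below) before vpx_svc_init():
//   SvcContext svc_ctx;
//   memset(&svc_ctx, 0, sizeof(svc_ctx));
//   svc_ctx.spatial_layers = 3;
//   svc_ctx.encoding_mode = ALT_INTER_LAYER_PREDICTION_IP;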
typedef enum SVC_LOG_LEVEL {
SVC_LOG_ERROR,
SVC_LOG_INFO,
@@ -32,6 +39,7 @@ typedef enum SVC_LOG_LEVEL {
typedef struct {
// public interface to svc_command options
int spatial_layers; // number of layers
SVC_ENCODING_MODE encoding_mode; // svc encoding strategy
SVC_LOG_LEVEL log_level; // amount of information to display
int log_print; // when set, printf log messages instead of returning the
// message with svc_get_message
@@ -56,7 +64,8 @@ vpx_codec_err_t vpx_svc_set_options(SvcContext *svc_ctx, const char *options);
* e.g., "60,53,39,33,27"
*/
vpx_codec_err_t vpx_svc_set_quantizers(SvcContext *svc_ctx,
const char *quantizer_values,
const int is_for_keyframe);
/**
* Set SVC scale factors


@@ -212,15 +212,6 @@ extern "C" {
vpx_codec_priv_t *priv; /**< Algorithm private storage */
} vpx_codec_ctx_t;
/*!\brief Bit depth for codec
*
* This enumeration determines the bit depth of the codec.
*/
typedef enum vpx_bit_depth {
VPX_BITS_8, /**< 8 bits */
VPX_BITS_10, /**< 10 bits */
VPX_BITS_12 /**< 12 bits */
} vpx_bit_depth_t;
/*
* Library Version Number Interface


@@ -103,9 +103,8 @@ extern "C" {
vpx_img_fmt_t fmt; /**< Image Format */
/* Image storage dimensions */
unsigned int w; /**< Stored image width */
unsigned int h; /**< Stored image height */
/* Image display dimensions */
unsigned int d_w; /**< Displayed image width */


@@ -895,8 +895,7 @@ int main_loop(int argc, const char **argv_) {
len = y4m_write_file_header(buf, sizeof(buf),
vpx_input_ctx.width,
vpx_input_ctx.height,
&vpx_input_ctx.framerate, img->fmt);
if (do_md5) {
MD5Update(&md5_ctx, (md5byte *)buf, (unsigned int)len);
} else {
@@ -934,6 +933,8 @@ int main_loop(int argc, const char **argv_) {
}
}
if (stop_after && frame_in >= stop_after)
break;
}
if (summary || progress) {


@@ -756,7 +756,6 @@ void open_input_file(struct VpxInputContext *input) {
input->framerate.numerator = input->y4m.fps_n;
input->framerate.denominator = input->y4m.fps_d;
input->fmt = input->y4m.vpx_fmt;
input->bit_depth = input->y4m.bit_depth;
} else
fatal("Unsupported Y4M stream.");
} else if (input->detect.buf_read == 4 && fourcc_is_ivf(input->detect.buf)) {
@@ -1534,7 +1533,6 @@ int main(int argc, const char **argv_) {
input.framerate.numerator = 30;
input.framerate.denominator = 1;
input.only_i420 = 1;
input.bit_depth = 0;
/* First parse the global configuration values, because we want to apply
* other parameters on top of the default configuration provided by the


@@ -8,48 +8,16 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./y4menc.h"
int y4m_write_file_header(char *buf, size_t len, int width, int height,
const struct VpxRational *framerate,
vpx_img_fmt_t fmt) {
const char *const color = fmt == VPX_IMG_FMT_444A ? "C444alpha\n" :
fmt == VPX_IMG_FMT_I444 ? "C444\n" :
fmt == VPX_IMG_FMT_I422 ? "C422\n" :
"C420jpeg\n";
return snprintf(buf, len, "YUV4MPEG2 W%u H%u F%u:%u I%c %s", width, height,
framerate->numerator, framerate->denominator, 'p', color);
}
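A quick usage sketch of the simplified writer (struct VpxRational as used by the callers above; the buffer size and frame dimensions are arbitrary):
char buf[128];
const struct VpxRational fps = { 30, 1 };  // numerator, denominator
const int len = y4m_write_file_header(buf, sizeof(buf), 352, 288, &fps,
                                      VPX_IMG_FMT_I420);
// buf now holds "YUV4MPEG2 W352 H288 F30:1 Ip C420jpeg\n"; len excludes
// the terminating NUL, per snprintf semantics.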

Some files were not shown because too many files have changed in this diff.