Remove unnecessary check for frame parallel decode.

The check will make the code break out from the loop earlier. But without the check. it works the same in non frame-parallel decode. In frame parallel decode, this check is wrong as there may be left frames inside the decoder even if there are no more frames to read. So the loop should continue until got_data = 0. Change-Id: I42937cec5d80d1d921a008d78dafce0928c727e0
Add more functions for FrameWorker.
2014-07-08 15:43:51 -07:00 · 2014-07-08 11:50:57 -07:00 · 2014-07-02 19:28:38 -07:00 · 2014-07-02 15:34:20 -07:00 · 2014-07-02 14:57:39 -07:00 · 2014-07-02 09:49:34 -07:00
109 changed files with 5939 additions and 2717 deletions
--- a/1
+++ b/1
@@ -55,6 +55,7 @@ COMPILING THE APPLICATIONS/LIBRARIES:
    armv6-linux-rvct
    armv6-linux-gcc
    armv6-none-rvct
+    arm64-darwin-gcc
    armv7-android-gcc
    armv7-darwin-gcc
    armv7-linux-rvct
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -799,7 +799,7 @@ process_common_toolchain() {
    arm*)
        # on arm, isa versions are supersets
        case ${tgt_isa} in
-        armv8)
+        arm64|armv8)
            soft_enable neon
            ;;
        armv7|armv7s)
@@ -1048,14 +1048,6 @@ EOF
        esac
    ;;
    x86*)
-        bits=32
-        enabled x86_64 && bits=64
-        check_cpp <<EOF && bits=x32
-#ifndef __ILP32__
-#error "not x32"
-#endif
-EOF
-
        case  ${tgt_os} in
            win*)
                enabled gcc && add_cflags -fno-common
@@ -1094,8 +1086,6 @@ EOF
                esac
            ;;
            gcc*)
-                add_cflags -m${bits}
-                add_ldflags -m${bits}
                link_with_cc=gcc
                tune_cflags="-march="
                setup_gnu_toolchain
@@ -1120,6 +1110,20 @@ EOF
            ;;
        esac

+        bits=32
+        enabled x86_64 && bits=64
+        check_cpp <<EOF && bits=x32
+#ifndef __ILP32__
+#error "not x32"
+#endif
+EOF
+        case ${tgt_cc} in
+            gcc*)
+                add_cflags -m${bits}
+                add_ldflags -m${bits}
+            ;;
+        esac
+
        soft_enable runtime_cpu_detect
        # We can't use 'check_cflags' until the compiler is configured and CC is
        # populated.
@@ -1222,10 +1226,12 @@ EOF
        fi
    fi

-    # default use_x86inc to yes if pic is no or 64bit or we are not on darwin
-    if [ ${tgt_isa} = x86_64 -o ! "$pic" = "yes" -o \
-         "${tgt_os#darwin}" = "${tgt_os}"  ]; then
-      soft_enable use_x86inc
+    tgt_os_no_version=$(echo "${tgt_os}" | tr -d "[0-9]")
+    # Default use_x86inc to yes when we are 64 bit, non-pic, or on any
+    # non-Darwin target.
+    if [ "${tgt_isa}" = "x86_64" ] || [ "${pic}" != "yes" ] || \
+            [ "${tgt_os_no_version}" != "darwin" ]; then
+        soft_enable use_x86inc
    fi

    # Position Independent Code (PIC) support, for building relocatable
--- a/build/make/gen_msvs_proj.sh
+++ b/build/make/gen_msvs_proj.sh
@@ -137,7 +137,9 @@ for opt in "$@"; do
        ;;
        --lib) proj_kind="lib"
        ;;
-        --src-path-bare=*) src_path_bare=$(fix_path "$optval")
+        --src-path-bare=*)
+            src_path_bare=$(fix_path "$optval")
+            src_path_bare=${src_path_bare%/}
        ;;
        --static-crt) use_static_runtime=true
        ;;
@@ -151,9 +153,9 @@ for opt in "$@"; do
            esac
        ;;
        -I*)
-            opt="${opt%/}"
            opt=${opt##-I}
            opt=$(fix_path "$opt")
+            opt="${opt%/}"
            incs="${incs}${incs:+;}&quot;${opt}&quot;"
            yasmincs="${yasmincs} -I&quot;${opt}&quot;"
        ;;
@@ -414,7 +416,7 @@ generate_vcproj() {
                    vpx)
                        tag Tool \
                            Name="VCPreBuildEventTool" \
-                            CommandLine="call obj_int_extract.bat $src_path_bare $plat_no_ws\\\$(ConfigurationName)" \
+                            CommandLine="call obj_int_extract.bat &quot;$src_path_bare&quot; $plat_no_ws\\\$(ConfigurationName)" \

                        tag Tool \
                            Name="VCCLCompilerTool" \
--- a/build/make/gen_msvs_vcxproj.sh
+++ b/build/make/gen_msvs_vcxproj.sh
@@ -157,7 +157,9 @@ for opt in "$@"; do
        ;;
        --lib) proj_kind="lib"
        ;;
-        --src-path-bare=*) src_path_bare=$(fix_path "$optval")
+        --src-path-bare=*)
+            src_path_bare=$(fix_path "$optval")
+            src_path_bare=${src_path_bare%/}
        ;;
        --static-crt) use_static_runtime=true
        ;;
@@ -173,9 +175,9 @@ for opt in "$@"; do
            esac
        ;;
        -I*)
-            opt="${opt%/}"
            opt=${opt##-I}
            opt=$(fix_path "$opt")
+            opt="${opt%/}"
            incs="${incs}${incs:+;}&quot;${opt}&quot;"
            yasmincs="${yasmincs} -I&quot;${opt}&quot;"
        ;;
--- a/build/make/iosbuild.sh
+++ b/build/make/iosbuild.sh
@@ -25,7 +25,8 @@ MAKE_JOBS=1
 LIBVPX_SOURCE_DIR=$(dirname "$0" | sed -e s,/build/make,,)
 LIPO=$(xcrun -sdk iphoneos${SDK} -find lipo)
 ORIG_PWD="$(pwd)"
-TARGETS="armv6-darwin-gcc
+TARGETS="arm64-darwin-gcc
+         armv6-darwin-gcc
         armv7-darwin-gcc
         armv7s-darwin-gcc
         x86-iphonesimulator-gcc
@@ -50,9 +51,82 @@ build_target() {
  vlog "***Done building target: ${target}***"
 }

+# Returns the preprocessor symbol for the target specified by $1.
+target_to_preproc_symbol() {
+  target="$1"
+  case "${target}" in
+    arm64-*)
+      echo "__aarch64__"
+      ;;
+    armv6-*)
+      echo "__ARM_ARCH_6__"
+      ;;
+    armv7-*)
+      echo "__ARM_ARCH_7__"
+      ;;
+    armv7s-*)
+      echo "__ARM_ARCH_7S__"
+      ;;
+    x86-*)
+      echo "__i386__"
+      ;;
+    x86_64-*)
+      echo "__x86_64__"
+      ;;
+    *)
+      echo "#error ${target} unknown/unsupported"
+      return 1
+      ;;
+  esac
+}
+
+# Create a vpx_config.h shim that, based on preprocessor settings for the
+# current target CPU, includes the real vpx_config.h for the current target.
+# $1 is the list of targets.
+create_vpx_framework_config_shim() {
+  local targets="$1"
+  local config_file="${HEADER_DIR}/vpx_config.h"
+  local preproc_symbol=""
+  local target=""
+  local include_guard="VPX_FRAMEWORK_HEADERS_VPX_VPX_CONFIG_H_"
+
+  local file_header="/*
+ *  Copyright (c) $(date +%Y) The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+/* GENERATED FILE: DO NOT EDIT! */
+
+#ifndef ${include_guard}
+#define ${include_guard}
+
+#if defined"
+
+  printf "%s" "${file_header}" > "${config_file}"
+  for target in ${targets}; do
+    preproc_symbol=$(target_to_preproc_symbol "${target}")
+    printf " ${preproc_symbol}\n" >> "${config_file}"
+    printf "#include \"VPX/vpx/${target}/vpx_config.h\"\n" >> "${config_file}"
+    printf "#elif defined" >> "${config_file}"
+    mkdir "${HEADER_DIR}/${target}"
+    cp -p "${BUILD_ROOT}/${target}/vpx_config.h" "${HEADER_DIR}/${target}"
+  done
+
+  # Consume the last line of output from the loop: We don't want it.
+  sed -i '' -e '$d' "${config_file}"
+
+  printf "#endif\n\n" >> "${config_file}"
+  printf "#endif  // ${include_guard}" >> "${config_file}"
+}
+
 # Configures and builds each target specified by $1, and then builds
 # VPX.framework.
-build_targets() {
+build_framework() {
  local lib_list=""
  local targets="$1"
  local target=""
@@ -75,15 +149,20 @@ build_targets() {

  cd "${ORIG_PWD}"

-  # Includes are identical for all platforms, and according to dist target
-  # behavior vpx_config.h and vpx_version.h aren't actually necessary for user
-  # apps built with libvpx. So, just copy the includes from the last target
-  # built.
-  # TODO(tomfinegan): The above is a lame excuse. Build common config/version
-  # includes that use the preprocessor to include the correct file.
+  # The basic libvpx API includes are all the same; just grab the most recent
+  # set.
  cp -p "${target_dist_dir}"/include/vpx/* "${HEADER_DIR}"
+
+  # Build the fat library.
  ${LIPO} -create ${lib_list} -output ${FRAMEWORK_DIR}/VPX

+  # Create the vpx_config.h shim that allows usage of vpx_config.h from
+  # within VPX.framework.
+  create_vpx_framework_config_shim "${targets}"
+
+  # Copy in vpx_version.h.
+  cp -p "${BUILD_ROOT}/${target}/vpx_version.h" "${HEADER_DIR}"
+
  vlog "Created fat library ${FRAMEWORK_DIR}/VPX containing:"
  for lib in ${lib_list}; do
    vlog "  $(echo ${lib} | awk -F / '{print $2, $NF}')"
@@ -166,4 +245,4 @@ cat << EOF
 EOF
 fi

-build_targets "${TARGETS}"
+build_framework "${TARGETS}"
--- a/2
+++ b/2
@@ -96,6 +96,7 @@ all_platforms="${all_platforms} armv6-darwin-gcc"
 all_platforms="${all_platforms} armv6-linux-rvct"
 all_platforms="${all_platforms} armv6-linux-gcc"
 all_platforms="${all_platforms} armv6-none-rvct"
+all_platforms="${all_platforms} arm64-darwin-gcc"
 all_platforms="${all_platforms} armv7-android-gcc"   #neon Cortex-A8
 all_platforms="${all_platforms} armv7-darwin-gcc"    #neon Cortex-A8
 all_platforms="${all_platforms} armv7-linux-rvct"    #neon Cortex-A8
@@ -273,6 +274,7 @@ EXPERIMENT_LIST="
    multiple_arf
    spatial_svc
    denoising
+    fp_mb_stats
 "
 CONFIG_LIST="
    external_build
--- a/examples/vp9_spatial_svc_encoder.c
+++ b/examples/vp9_spatial_svc_encoder.c
@@ -28,16 +28,6 @@
 #include "vpx/vpx_encoder.h"
 #include "./vpxstats.h"

-static const struct arg_enum_list encoding_mode_enum[] = {
-  {"i", INTER_LAYER_PREDICTION_I},
-  {"alt-ip", ALT_INTER_LAYER_PREDICTION_IP},
-  {"ip", INTER_LAYER_PREDICTION_IP},
-  {"gf", USE_GOLDEN_FRAME},
-  {NULL, 0}
-};
-
-static const arg_def_t encoding_mode_arg = ARG_DEF_ENUM(
-    "m", "encoding-mode", 1, "Encoding mode algorithm", encoding_mode_enum);
 static const arg_def_t skip_frames_arg =
    ARG_DEF("s", "skip-frames", 1, "input frames to skip");
 static const arg_def_t frames_arg =
@@ -58,9 +48,6 @@ static const arg_def_t quantizers_arg =
    ARG_DEF("q", "quantizers", 1, "quantizers for non key frames, also will "
            "be applied to key frames if -qn is not specified (lowest to "
            "highest layer)");
-static const arg_def_t quantizers_keyframe_arg =
-    ARG_DEF("qn", "quantizers-keyframe", 1, "quantizers for key frames (lowest "
-        "to highest layer)");
 static const arg_def_t passes_arg =
    ARG_DEF("p", "passes", 1, "Number of passes (1/2)");
 static const arg_def_t pass_arg =
@@ -77,16 +64,13 @@ static const arg_def_t max_bitrate_arg =
    ARG_DEF(NULL, "max-bitrate", 1, "Maximum bitrate");

 static const arg_def_t *svc_args[] = {
-  &encoding_mode_arg, &frames_arg,        &width_arg,       &height_arg,
+  &frames_arg,        &width_arg,         &height_arg,
  &timebase_arg,      &bitrate_arg,       &skip_frames_arg, &layers_arg,
-  &kf_dist_arg,       &scale_factors_arg, &quantizers_arg,
-  &quantizers_keyframe_arg,               &passes_arg,      &pass_arg,
-  &fpf_name_arg,      &min_q_arg,         &max_q_arg,       &min_bitrate_arg,
-  &max_bitrate_arg,   NULL
+  &kf_dist_arg,       &scale_factors_arg, &quantizers_arg,  &passes_arg,
+  &pass_arg,          &fpf_name_arg,      &min_q_arg,       &max_q_arg,
+  &min_bitrate_arg,   &max_bitrate_arg,   NULL
 };

-static const SVC_ENCODING_MODE default_encoding_mode =
-    INTER_LAYER_PREDICTION_IP;
 static const uint32_t default_frames_to_skip = 0;
 static const uint32_t default_frames_to_code = 60 * 60;
 static const uint32_t default_width = 1920;
@@ -135,7 +119,6 @@ static void parse_command_line(int argc, const char **argv_,
  // initialize SvcContext with parameters that will be passed to vpx_svc_init
  svc_ctx->log_level = SVC_LOG_DEBUG;
  svc_ctx->spatial_layers = default_spatial_layers;
-  svc_ctx->encoding_mode = default_encoding_mode;

  // start with default encoder configuration
  res = vpx_codec_enc_config_default(vpx_codec_vp9_cx(), enc_cfg, 0);
@@ -161,9 +144,7 @@ static void parse_command_line(int argc, const char **argv_,
  for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) {
    arg.argv_step = 1;

-    if (arg_match(&arg, &encoding_mode_arg, argi)) {
-      svc_ctx->encoding_mode = arg_parse_enum_or_int(&arg);
-    } else if (arg_match(&arg, &frames_arg, argi)) {
+    if (arg_match(&arg, &frames_arg, argi)) {
      app_input->frames_to_code = arg_parse_uint(&arg);
    } else if (arg_match(&arg, &width_arg, argi)) {
      enc_cfg->g_w = arg_parse_uint(&arg);
@@ -183,9 +164,7 @@ static void parse_command_line(int argc, const char **argv_,
    } else if (arg_match(&arg, &scale_factors_arg, argi)) {
      vpx_svc_set_scale_factors(svc_ctx, arg.val);
    } else if (arg_match(&arg, &quantizers_arg, argi)) {
-      vpx_svc_set_quantizers(svc_ctx, arg.val, 0);
-    } else if (arg_match(&arg, &quantizers_keyframe_arg, argi)) {
-      vpx_svc_set_quantizers(svc_ctx, arg.val, 1);
+      vpx_svc_set_quantizers(svc_ctx, arg.val);
    } else if (arg_match(&arg, &passes_arg, argi)) {
      passes = arg_parse_uint(&arg);
      if (passes < 1 || passes > 2) {
@@ -270,12 +249,12 @@ static void parse_command_line(int argc, const char **argv_,

  printf(
      "Codec %s\nframes: %d, skip: %d\n"
-      "mode: %d, layers: %d\n"
+      "layers: %d\n"
      "width %d, height: %d,\n"
      "num: %d, den: %d, bitrate: %d,\n"
      "gop size: %d\n",
      vpx_codec_iface_name(vpx_codec_vp9_cx()), app_input->frames_to_code,
-      app_input->frames_to_skip, svc_ctx->encoding_mode,
+      app_input->frames_to_skip,
      svc_ctx->spatial_layers, enc_cfg->g_w, enc_cfg->g_h,
      enc_cfg->g_timebase.num, enc_cfg->g_timebase.den,
      enc_cfg->rc_target_bitrate, enc_cfg->kf_max_dist);
@@ -296,6 +275,7 @@ int main(int argc, const char **argv) {
  int frame_duration = 1; /* 1 timebase tick per frame */
  FILE *infile = NULL;
  int end_of_stream = 0;
+  int frame_size;

  memset(&svc_ctx, 0, sizeof(svc_ctx));
  svc_ctx.log_print = 1;
@@ -351,11 +331,10 @@ int main(int argc, const char **argv) {
      die_codec(&codec, "Failed to encode frame");
    }
    if (!(app_input.passes == 2 && app_input.pass == 1)) {
-      if (vpx_svc_get_frame_size(&svc_ctx) > 0) {
+      while ((frame_size = vpx_svc_get_frame_size(&svc_ctx)) > 0) {
        vpx_video_writer_write_frame(writer,
                                     vpx_svc_get_buffer(&svc_ctx),
-                                     vpx_svc_get_frame_size(&svc_ctx),
-                                     pts);
+                                     frame_size, pts);
      }
    }
    if (vpx_svc_get_rc_stats_buffer_size(&svc_ctx) > 0) {
--- a/libs.mk
+++ b/libs.mk
@@ -170,7 +170,7 @@ CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/vpx_once.h
 CODEC_SRCS-$(BUILD_LIBVPX) += $(BUILD_PFX)vpx_config.c
 INSTALL-SRCS-no += $(BUILD_PFX)vpx_config.c
 ifeq ($(ARCH_X86)$(ARCH_X86_64),yes)
-CODEC_SRCS-$(BUILD_LIBVPX) += third_party/x86inc/x86inc.asm
+INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += third_party/x86inc/x86inc.asm
 endif
 CODEC_EXPORTS-$(BUILD_LIBVPX) += vpx/exports_com
 CODEC_EXPORTS-$(CONFIG_ENCODERS) += vpx/exports_enc
--- a/test/cpu_speed_test.cc
+++ b/test/cpu_speed_test.cc
@@ -14,30 +14,49 @@
 #include "test/encode_test_driver.h"
 #include "test/i420_video_source.h"
 #include "test/util.h"
+#include "test/y4m_video_source.h"

 namespace {

+const int kMaxPSNR = 100;
+
 class CpuSpeedTest : public ::libvpx_test::EncoderTest,
    public ::libvpx_test::CodecTestWith2Params<
        libvpx_test::TestMode, int> {
 protected:
-  CpuSpeedTest() : EncoderTest(GET_PARAM(0)) {}
+  CpuSpeedTest()
+      : EncoderTest(GET_PARAM(0)),
+        encoding_mode_(GET_PARAM(1)),
+        set_cpu_used_(GET_PARAM(2)),
+        min_psnr_(kMaxPSNR) {}
  virtual ~CpuSpeedTest() {}

  virtual void SetUp() {
    InitializeConfig();
-    SetMode(GET_PARAM(1));
-    set_cpu_used_ = GET_PARAM(2);
+    SetMode(encoding_mode_);
+    if (encoding_mode_ != ::libvpx_test::kRealTime) {
+      cfg_.g_lag_in_frames = 25;
+      cfg_.rc_end_usage = VPX_VBR;
+    } else {
+      cfg_.g_lag_in_frames = 0;
+      cfg_.rc_end_usage = VPX_CBR;
+    }
+  }
+
+  virtual void BeginPassHook(unsigned int /*pass*/) {
+    min_psnr_ = kMaxPSNR;
  }

  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
                                  ::libvpx_test::Encoder *encoder) {
    if (video->frame() == 1) {
      encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
-      encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
-      encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
-      encoder->Control(VP8E_SET_ARNR_STRENGTH, 5);
-      encoder->Control(VP8E_SET_ARNR_TYPE, 3);
+      if (encoding_mode_ != ::libvpx_test::kRealTime) {
+        encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
+        encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
+        encoder->Control(VP8E_SET_ARNR_STRENGTH, 5);
+        encoder->Control(VP8E_SET_ARNR_TYPE, 3);
+      }
    }
  }

@@ -45,7 +64,15 @@ class CpuSpeedTest : public ::libvpx_test::EncoderTest,
    if (pkt->data.frame.flags & VPX_FRAME_IS_KEY) {
    }
  }
+
+  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
+    if (pkt->data.psnr.psnr[0] < min_psnr_)
+      min_psnr_ = pkt->data.psnr.psnr[0];
+  }
+
+  ::libvpx_test::TestMode encoding_mode_;
  int set_cpu_used_;
+  double min_psnr_;
 };

 TEST_P(CpuSpeedTest, TestQ0) {
@@ -53,7 +80,6 @@ TEST_P(CpuSpeedTest, TestQ0) {
  // without a mismatch when passing in a very low max q.  This pushes
  // the encoder to producing lots of big partitions which will likely
  // extend into the border and test the border condition.
-  cfg_.g_lag_in_frames = 25;
  cfg_.rc_2pass_vbr_minsection_pct = 5;
  cfg_.rc_2pass_vbr_minsection_pct = 2000;
  cfg_.rc_target_bitrate = 400;
@@ -63,16 +89,32 @@ TEST_P(CpuSpeedTest, TestQ0) {
  ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
                                       20);

+  init_flags_ = VPX_CODEC_USE_PSNR;
+
  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  EXPECT_GE(min_psnr_, kMaxPSNR);
 }

+TEST_P(CpuSpeedTest, TestScreencastQ0) {
+  ::libvpx_test::Y4mVideoSource video("screendata.y4m", 0, 25);
+  cfg_.g_timebase = video.timebase();
+  cfg_.rc_2pass_vbr_minsection_pct = 5;
+  cfg_.rc_2pass_vbr_minsection_pct = 2000;
+  cfg_.rc_target_bitrate = 400;
+  cfg_.rc_max_quantizer = 0;
+  cfg_.rc_min_quantizer = 0;
+
+  init_flags_ = VPX_CODEC_USE_PSNR;
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  EXPECT_GE(min_psnr_, kMaxPSNR);
+}

 TEST_P(CpuSpeedTest, TestEncodeHighBitrate) {
  // Validate that this non multiple of 64 wide clip encodes and decodes
  // without a mismatch when passing in a very low max q.  This pushes
  // the encoder to producing lots of big partitions which will likely
  // extend into the border and test the border condition.
-  cfg_.g_lag_in_frames = 25;
  cfg_.rc_2pass_vbr_minsection_pct = 5;
  cfg_.rc_2pass_vbr_minsection_pct = 2000;
  cfg_.rc_target_bitrate = 12000;
@@ -89,7 +131,6 @@ TEST_P(CpuSpeedTest, TestLowBitrate) {
  // when passing in a very high min q.  This pushes the encoder to producing
  // lots of small partitions which might will test the other condition.

-  cfg_.g_lag_in_frames = 25;
  cfg_.rc_2pass_vbr_minsection_pct = 5;
  cfg_.rc_2pass_vbr_minsection_pct = 2000;
  cfg_.rc_target_bitrate = 200;
@@ -108,6 +149,7 @@ using std::tr1::make_tuple;

 VP9_INSTANTIATE_TEST_CASE(
    CpuSpeedTest,
-    ::testing::Values(::libvpx_test::kTwoPassGood, ::libvpx_test::kOnePassGood),
+    ::testing::Values(::libvpx_test::kTwoPassGood, ::libvpx_test::kOnePassGood,
+                      ::libvpx_test::kRealTime),
    ::testing::Range(0, 8));
 }  // namespace
--- a/test/datarate_test.cc
+++ b/test/datarate_test.cc
@@ -576,7 +576,7 @@ TEST_P(DatarateTestVP9Large, BasicRateTargeting3TemporalLayersFrameDropping) {
    // Expect some frame drops in this test: for this 200 frames test,
    // expect at least 10% and not more than 60% drops.
    ASSERT_GE(num_drops_, 20);
-    ASSERT_LE(num_drops_, 120);
+    ASSERT_LE(num_drops_, 130);
  }
 }

--- a/test/decode_test_driver.cc
+++ b/test/decode_test_driver.cc
@@ -15,13 +15,27 @@

 namespace libvpx_test {

+const char kVP8Name[] = "WebM Project VP8";
+
+vpx_codec_err_t Decoder::PeekStream(const uint8_t *cxdata, size_t size,
+                                    vpx_codec_stream_info_t *stream_info) {
+  return vpx_codec_peek_stream_info(CodecInterface(),
+                                    cxdata, static_cast<unsigned int>(size),
+                                    stream_info);
+}
+
 vpx_codec_err_t Decoder::DecodeFrame(const uint8_t *cxdata, size_t size) {
+  return DecodeFrame(cxdata, size, NULL);
+}
+
+vpx_codec_err_t Decoder::DecodeFrame(const uint8_t *cxdata, size_t size,
+                                     void *user_priv) {
  vpx_codec_err_t res_dec;
  InitOnce();
  REGISTER_STATE_CHECK(
      res_dec = vpx_codec_decode(&decoder_,
                                 cxdata, static_cast<unsigned int>(size),
-                                 NULL, 0));
+                                 user_priv, 0));
  return res_dec;
 }

@@ -29,13 +43,37 @@ void DecoderTest::RunLoop(CompressedVideoSource *video) {
  vpx_codec_dec_cfg_t dec_cfg = {0};
  Decoder* const decoder = codec_->CreateDecoder(dec_cfg, 0);
  ASSERT_TRUE(decoder != NULL);
+  const char *codec_name = decoder->GetDecoderName();
+  const bool is_vp8 = strncmp(kVP8Name, codec_name, sizeof(kVP8Name) - 1) == 0;

  // Decode frames.
-  for (video->Begin(); video->cxdata(); video->Next()) {
+  for (video->Begin(); !::testing::Test::HasFailure() && video->cxdata();
+       video->Next()) {
    PreDecodeFrameHook(*video, decoder);
+
+    vpx_codec_stream_info_t stream_info;
+    stream_info.sz = sizeof(stream_info);
+    const vpx_codec_err_t res_peek = decoder->PeekStream(video->cxdata(),
+                                                         video->frame_size(),
+                                                         &stream_info);
+    if (is_vp8) {
+      /* Vp8's implementation of PeekStream returns an error if the frame you
+       * pass it is not a keyframe, so we only expect VPX_CODEC_OK on the first
+       * frame, which must be a keyframe. */
+      if (video->frame_number() == 0)
+        ASSERT_EQ(VPX_CODEC_OK, res_peek) << "Peek return failed: "
+            << vpx_codec_err_to_string(res_peek);
+    } else {
+      /* The Vp9 implementation of PeekStream returns an error only if the
+       * data passed to it isn't a valid Vp9 chunk. */
+      ASSERT_EQ(VPX_CODEC_OK, res_peek) << "Peek return failed: "
+          << vpx_codec_err_to_string(res_peek);
+    }
+
    vpx_codec_err_t res_dec = decoder->DecodeFrame(video->cxdata(),
                                                   video->frame_size());
-    ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError();
+    if (!HandleDecodeResult(res_dec, *video, decoder))
+      break;

    DxDataIterator dec_iter = decoder->GetDxData();
    const vpx_image_t *img = NULL;
--- a/test/decode_test_driver.h
+++ b/test/decode_test_driver.h
@@ -49,8 +49,14 @@ class Decoder {
    vpx_codec_destroy(&decoder_);
  }

+  vpx_codec_err_t PeekStream(const uint8_t *cxdata, size_t size,
+                             vpx_codec_stream_info_t *stream_info);
+
  vpx_codec_err_t DecodeFrame(const uint8_t *cxdata, size_t size);

+  vpx_codec_err_t DecodeFrame(const uint8_t *cxdata, size_t size,
+                              void *user_priv);
+
  DxDataIterator GetDxData() {
    return DxDataIterator(&decoder_);
  }
@@ -85,6 +91,10 @@ class Decoder {
        &decoder_, cb_get, cb_release, user_priv);
  }

+  const char* GetDecoderName() {
+    return vpx_codec_iface_name(CodecInterface());
+  }
+
 protected:
  virtual vpx_codec_iface_t* CodecInterface() const = 0;

@@ -114,6 +124,14 @@ class DecoderTest {
  virtual void PreDecodeFrameHook(const CompressedVideoSource& video,
                                  Decoder *decoder) {}

+  // Hook to be called to handle decode result. Return true to continue.
+  virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec,
+                                  const CompressedVideoSource& /* video */,
+                                  Decoder *decoder) {
+    EXPECT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError();
+    return VPX_CODEC_OK == res_dec;
+  }
+
  // Hook to be called on every decompressed frame.
  virtual void DecompressedFrameHook(const vpx_image_t& img,
                                     const unsigned int frame_number) {}
--- a/test/invalid_file_test.cc
+++ b/test/invalid_file_test.cc
@@ -0,0 +1,112 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+#include <vector>
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "./vpx_config.h"
+#include "test/codec_factory.h"
+#include "test/decode_test_driver.h"
+#include "test/ivf_video_source.h"
+#include "test/util.h"
+#if CONFIG_WEBM_IO
+#include "test/webm_video_source.h"
+#endif
+#include "vpx_mem/vpx_mem.h"
+
+namespace {
+
+class InvalidFileTest
+    : public ::libvpx_test::DecoderTest,
+      public ::libvpx_test::CodecTestWithParam<const char*> {
+ protected:
+  InvalidFileTest() : DecoderTest(GET_PARAM(0)), res_file_(NULL) {}
+
+  virtual ~InvalidFileTest() {
+    if (res_file_ != NULL)
+      fclose(res_file_);
+  }
+
+  void OpenResFile(const std::string &res_file_name_) {
+    res_file_ = libvpx_test::OpenTestDataFile(res_file_name_);
+    ASSERT_TRUE(res_file_ != NULL) << "Result file open failed. Filename: "
+        << res_file_name_;
+  }
+
+  virtual bool HandleDecodeResult(
+      const vpx_codec_err_t res_dec,
+      const libvpx_test::CompressedVideoSource &video,
+      libvpx_test::Decoder *decoder) {
+    EXPECT_TRUE(res_file_ != NULL);
+    int expected_res_dec;
+
+    // Read integer result.
+    const int res = fscanf(res_file_, "%d", &expected_res_dec);
+    EXPECT_NE(res, EOF) << "Read result data failed";
+
+    // Check results match.
+    EXPECT_EQ(expected_res_dec, res_dec)
+        << "Results don't match: frame number = " << video.frame_number();
+
+    return !HasFailure();
+  }
+
+ private:
+  FILE *res_file_;
+};
+
+TEST_P(InvalidFileTest, ReturnCode) {
+  const std::string filename = GET_PARAM(1);
+  libvpx_test::CompressedVideoSource *video = NULL;
+
+  // Open compressed video file.
+  if (filename.substr(filename.length() - 3, 3) == "ivf") {
+    video = new libvpx_test::IVFVideoSource(filename);
+  } else if (filename.substr(filename.length() - 4, 4) == "webm") {
+#if CONFIG_WEBM_IO
+    video = new libvpx_test::WebMVideoSource(filename);
+#else
+    fprintf(stderr, "WebM IO is disabled, skipping test vector %s\n",
+            filename.c_str());
+    return;
+#endif
+  }
+  video->Init();
+
+  // Construct result file name. The file holds a list of expected integer
+  // results, one for each decoded frame.  Any result that doesn't match
+  // the files list will cause a test failure.
+  const std::string res_filename = filename + ".res";
+  OpenResFile(res_filename);
+
+  // Decode frame, and check the md5 matching.
+  ASSERT_NO_FATAL_FAILURE(RunLoop(video));
+  delete video;
+}
+
+const char *const kVP9InvalidFileTests[] = {
+  "invalid-vp90-01.webm",
+  "invalid-vp90-02.webm",
+  "invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.ivf",
+  "invalid-vp90-03.webm",
+  "invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf",
+  "invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf",
+};
+
+#define NELEMENTS(x) static_cast<int>(sizeof(x) / sizeof(x[0]))
+
+VP9_INSTANTIATE_TEST_CASE(InvalidFileTest,
+                          ::testing::ValuesIn(kVP9InvalidFileTests,
+                                              kVP9InvalidFileTests +
+                                              NELEMENTS(kVP9InvalidFileTests)));
+
+}  // namespace
--- a/test/md5_helper.h
+++ b/test/md5_helper.h
@@ -28,10 +28,11 @@ class MD5 {
      // plane, we never want to round down and thus skip a pixel so if
      // we are shifting by 1 (chroma_shift) we add 1 before doing the shift.
      // This works only for chroma_shift of 0 and 1.
+      const int bytes_per_sample = (img->fmt & VPX_IMG_FMT_HIGH) ? 2 : 1;
      const int h = plane ? (img->d_h + img->y_chroma_shift) >>
                    img->y_chroma_shift : img->d_h;
-      const int w = plane ? (img->d_w + img->x_chroma_shift) >>
-                    img->x_chroma_shift : img->d_w;
+      const int w = (plane ? (img->d_w + img->x_chroma_shift) >>
+                     img->x_chroma_shift : img->d_w) * bytes_per_sample;

      for (int y = 0; y < h; ++y) {
        MD5Update(&md5_, buf, w);
--- a/test/svc_test.cc
+++ b/test/svc_test.cc
@@ -41,7 +41,6 @@ class SvcTest : public ::testing::Test {
  virtual ~SvcTest() {}

  virtual void SetUp() {
-    svc_.encoding_mode = INTER_LAYER_PREDICTION_IP;
    svc_.log_level = SVC_LOG_DEBUG;
    svc_.log_print = 0;

@@ -131,22 +130,13 @@ TEST_F(SvcTest, SetLayersOption) {
  EXPECT_EQ(3, svc_.spatial_layers);
 }

-TEST_F(SvcTest, SetEncodingMode) {
-  vpx_codec_err_t res = vpx_svc_set_options(&svc_, "encoding-mode=alt-ip");
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  codec_initialized_ = true;
-  EXPECT_EQ(ALT_INTER_LAYER_PREDICTION_IP, svc_.encoding_mode);
-}
-
 TEST_F(SvcTest, SetMultipleOptions) {
-  vpx_codec_err_t res = vpx_svc_set_options(&svc_, "layers=2 encoding-mode=ip");
+  vpx_codec_err_t res =
+      vpx_svc_set_options(&svc_, "layers=2 scale-factors=1/3,2/3");
  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
  EXPECT_EQ(VPX_CODEC_OK, res);
  codec_initialized_ = true;
  EXPECT_EQ(2, svc_.spatial_layers);
-  EXPECT_EQ(INTER_LAYER_PREDICTION_IP, svc_.encoding_mode);
 }

 TEST_F(SvcTest, SetScaleFactorsOption) {
@@ -177,48 +167,20 @@ TEST_F(SvcTest, SetQuantizersOption) {
  codec_initialized_ = true;
 }

-TEST_F(SvcTest, SetKeyFrameQuantizersOption) {
-  svc_.spatial_layers = 2;
-  vpx_codec_err_t res = vpx_svc_set_options(&svc_,
-                                       "quantizers-keyframe=not-quantizers");
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
-  vpx_svc_set_options(&svc_, "quantizers-keyframe=40,45");
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  codec_initialized_ = true;
-}
-
 TEST_F(SvcTest, SetQuantizers) {
-  vpx_codec_err_t res = vpx_svc_set_quantizers(NULL, "40,30", 0);
+  vpx_codec_err_t res = vpx_svc_set_quantizers(NULL, "40,30");
  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);

-  res = vpx_svc_set_quantizers(&svc_, NULL, 0);
+  res = vpx_svc_set_quantizers(&svc_, NULL);
  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);

  svc_.spatial_layers = 2;
-  res = vpx_svc_set_quantizers(&svc_, "40", 0);
+  res = vpx_svc_set_quantizers(&svc_, "40");
  EXPECT_EQ(VPX_CODEC_OK, res);
  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);

-  res = vpx_svc_set_quantizers(&svc_, "40,30", 0);
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  codec_initialized_ = true;
-}
-
-TEST_F(SvcTest, SetKeyFrameQuantizers) {
-  vpx_codec_err_t res = vpx_svc_set_quantizers(NULL, "40,31", 1);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
-  res = vpx_svc_set_quantizers(&svc_, NULL, 1);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
-  res = vpx_svc_set_quantizers(&svc_, "40,30", 1);
+  res = vpx_svc_set_quantizers(&svc_, "40,30");
  EXPECT_EQ(VPX_CODEC_OK, res);
  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
  EXPECT_EQ(VPX_CODEC_OK, res);
@@ -249,7 +211,7 @@ TEST_F(SvcTest, SetScaleFactors) {
 TEST_F(SvcTest, FirstFrameHasLayers) {
  svc_.spatial_layers = 2;
  vpx_svc_set_scale_factors(&svc_, "4/16,16/16");
-  vpx_svc_set_quantizers(&svc_, "40,30", 0);
+  vpx_svc_set_quantizers(&svc_, "40,30");

  vpx_codec_err_t res =
      vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
@@ -265,9 +227,17 @@ TEST_F(SvcTest, FirstFrameHasLayers) {
                       video.duration(), VPX_DL_GOOD_QUALITY);
  EXPECT_EQ(VPX_CODEC_OK, res);

+  if (vpx_svc_get_frame_size(&svc_) == 0) {
+    // Flush encoder
+    res = vpx_svc_encode(&svc_, &codec_, NULL, 0,
+                         video.duration(), VPX_DL_GOOD_QUALITY);
+    EXPECT_EQ(VPX_CODEC_OK, res);
+  }
+
+  int frame_size = vpx_svc_get_frame_size(&svc_);
+  EXPECT_GT(frame_size, 0);
  const vpx_codec_err_t res_dec = decoder_->DecodeFrame(
-      static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)),
-      vpx_svc_get_frame_size(&svc_));
+      static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);

  // this test fails with a decoder error
  ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
@@ -276,7 +246,10 @@ TEST_F(SvcTest, FirstFrameHasLayers) {
 TEST_F(SvcTest, EncodeThreeFrames) {
  svc_.spatial_layers = 2;
  vpx_svc_set_scale_factors(&svc_, "4/16,16/16");
-  vpx_svc_set_quantizers(&svc_, "40,30", 0);
+  vpx_svc_set_quantizers(&svc_, "40,30");
+  int decoded_frames = 0;
+  vpx_codec_err_t res_dec;
+  int frame_size;

  vpx_codec_err_t res =
      vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
@@ -291,13 +264,14 @@ TEST_F(SvcTest, EncodeThreeFrames) {
  // This frame is a keyframe.
  res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
                       video.duration(), VPX_DL_GOOD_QUALITY);
-  ASSERT_EQ(VPX_CODEC_OK, res);
-  EXPECT_EQ(1, vpx_svc_is_keyframe(&svc_));

-  vpx_codec_err_t res_dec = decoder_->DecodeFrame(
-      static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)),
-      vpx_svc_get_frame_size(&svc_));
-  ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
+  if ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
+    EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
+    res_dec = decoder_->DecodeFrame(
+        static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
+    ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
+    ++decoded_frames;
+  }

  // FRAME 1
  video.Next();
@@ -305,12 +279,14 @@ TEST_F(SvcTest, EncodeThreeFrames) {
  res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
                       video.duration(), VPX_DL_GOOD_QUALITY);
  ASSERT_EQ(VPX_CODEC_OK, res);
-  EXPECT_EQ(0, vpx_svc_is_keyframe(&svc_));

-  res_dec = decoder_->DecodeFrame(
-      static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)),
-      vpx_svc_get_frame_size(&svc_));
-  ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
+  if ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
+    EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
+    res_dec = decoder_->DecodeFrame(
+        static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
+    ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
+    ++decoded_frames;
+  }

  // FRAME 2
  video.Next();
@@ -318,18 +294,35 @@ TEST_F(SvcTest, EncodeThreeFrames) {
  res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
                       video.duration(), VPX_DL_GOOD_QUALITY);
  ASSERT_EQ(VPX_CODEC_OK, res);
-  EXPECT_EQ(0, vpx_svc_is_keyframe(&svc_));

-  res_dec = decoder_->DecodeFrame(
-      static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)),
-      vpx_svc_get_frame_size(&svc_));
-  ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
+  if ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
+    EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
+    res_dec = decoder_->DecodeFrame(
+        static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
+    ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
+    ++decoded_frames;
+  }
+
+  // Flush encoder
+  res = vpx_svc_encode(&svc_, &codec_, NULL, 0,
+                       video.duration(), VPX_DL_GOOD_QUALITY);
+  EXPECT_EQ(VPX_CODEC_OK, res);
+
+  while ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
+    EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
+    res_dec = decoder_->DecodeFrame(
+        static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
+    ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
+    ++decoded_frames;
+  }
+
+  EXPECT_EQ(decoded_frames, 3);
 }

 TEST_F(SvcTest, GetLayerResolution) {
  svc_.spatial_layers = 2;
  vpx_svc_set_scale_factors(&svc_, "4/16,8/16");
-  vpx_svc_set_quantizers(&svc_, "40,30", 0);
+  vpx_svc_set_quantizers(&svc_, "40,30");

  vpx_codec_err_t res =
      vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
@@ -368,7 +361,7 @@ TEST_F(SvcTest, TwoPassEncode) {
  svc_.spatial_layers = 2;
  codec_enc_.g_pass = VPX_RC_FIRST_PASS;
  vpx_svc_set_scale_factors(&svc_, "4/16,16/16");
-  vpx_svc_set_quantizers(&svc_, "40,30", 0);
+  vpx_svc_set_quantizers(&svc_, "40,30");

  vpx_codec_err_t res =
      vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
@@ -413,6 +406,9 @@ TEST_F(SvcTest, TwoPassEncode) {
  vpx_codec_destroy(&codec_);

  // Second pass encode
+  int decoded_frames = 0;
+  vpx_codec_err_t res_dec;
+  int frame_size;
  codec_enc_.g_pass = VPX_RC_LAST_PASS;
  codec_enc_.rc_twopass_stats_in.buf = &stats_buf[0];
  codec_enc_.rc_twopass_stats_in.sz = stats_buf.size();
@@ -427,12 +423,14 @@ TEST_F(SvcTest, TwoPassEncode) {
  res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
                       video.duration(), VPX_DL_GOOD_QUALITY);
  ASSERT_EQ(VPX_CODEC_OK, res);
-  EXPECT_EQ(1, vpx_svc_is_keyframe(&svc_));

-  vpx_codec_err_t res_dec = decoder_->DecodeFrame(
-      static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)),
-      vpx_svc_get_frame_size(&svc_));
-  ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
+  if ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
+    EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
+    res_dec = decoder_->DecodeFrame(
+        static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
+    ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
+    ++decoded_frames;
+  }

  // FRAME 1
  video.Next();
@@ -440,12 +438,14 @@ TEST_F(SvcTest, TwoPassEncode) {
  res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
                       video.duration(), VPX_DL_GOOD_QUALITY);
  ASSERT_EQ(VPX_CODEC_OK, res);
-  EXPECT_EQ(0, vpx_svc_is_keyframe(&svc_));

-  res_dec = decoder_->DecodeFrame(
-      static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)),
-      vpx_svc_get_frame_size(&svc_));
-  ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
+  if ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
+    EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
+    res_dec = decoder_->DecodeFrame(
+        static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
+    ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
+    ++decoded_frames;
+  }

  // FRAME 2
  video.Next();
@@ -453,12 +453,29 @@ TEST_F(SvcTest, TwoPassEncode) {
  res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
                       video.duration(), VPX_DL_GOOD_QUALITY);
  ASSERT_EQ(VPX_CODEC_OK, res);
-  EXPECT_EQ(0, vpx_svc_is_keyframe(&svc_));

-  res_dec = decoder_->DecodeFrame(
-      static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)),
-      vpx_svc_get_frame_size(&svc_));
-  ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
+  if ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
+    EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
+    res_dec = decoder_->DecodeFrame(
+        static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
+    ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
+    ++decoded_frames;
+  }
+
+  // Flush encoder
+  res = vpx_svc_encode(&svc_, &codec_, NULL, 0,
+                       video.duration(), VPX_DL_GOOD_QUALITY);
+  EXPECT_EQ(VPX_CODEC_OK, res);
+
+  while ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
+    EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
+    res_dec = decoder_->DecodeFrame(
+        static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
+    ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
+    ++decoded_frames;
+  }
+
+  EXPECT_EQ(decoded_frames, 3);
 }

 }  // namespace
--- a/test/test-data.sha1
+++ b/test/test-data.sha1
@@ -1,5 +1,20 @@
 d5dfb0151c9051f8c85999255645d7a23916d3c0  hantro_collage_w352h288.yuv
 b87815bf86020c592ccc7a846ba2e28ec8043902  hantro_odd.yuv
+fe346136b9b8c1e6f6084cc106485706915795e4  invalid-vp90-01.webm
+25751f5d3b05ff03f0719ad42cd625348eb8961e  invalid-vp90-01.webm.res
+d78e2fceba5ac942246503ec8366f879c4775ca5  invalid-vp90-02.webm
+2dadee5306245fa5eeb0f99652d0e17afbcba96d  invalid-vp90-02.webm.res
+df1a1453feb3c00d7d89746c7003b4163523bff3  invalid-vp90-03.webm
+8fe6fd82bf537340f586f97a7ae31fb37ccda302  invalid-vp90-03.webm.res
+a432f96ff0a787268e2f94a8092ab161a18d1b06  park_joy_90p_10_420.y4m
+0b194cc312c3a2e84d156a221b0a5eb615dfddc5  park_joy_90p_10_422.y4m
+ff0e0a21dc2adc95b8c1b37902713700655ced17  park_joy_90p_10_444.y4m
+614c32ae1eca391e867c70d19974f0d62664dd99  park_joy_90p_12_420.y4m
+c92825f1ea25c5c37855083a69faac6ac4641a9e  park_joy_90p_12_422.y4m
+b592189b885b6cc85db55cc98512a197d73d3b34  park_joy_90p_12_444.y4m
+4e0eb61e76f0684188d9bc9f3ce61f6b6b77bb2c  park_joy_90p_8_420.y4m
+7a193ff7dfeb96ba5f82b2afd7afa9e1fe83d947  park_joy_90p_8_422.y4m
+bdb7856e6bc93599bdda05c2e773a9f22b6c6d03  park_joy_90p_8_444.y4m
 b1f1c3ec79114b9a0651af24ce634afb44a9a419  rush_hour_444.y4m
 5184c46ddca8b1fadd16742e8500115bc8f749da  vp80-00-comprehensive-001.ivf
 65bf1bbbced81b97bd030f376d1b7f61a224793f  vp80-00-comprehensive-002.ivf
@@ -529,8 +544,6 @@ b6524e4084d15b5d0caaa3d3d1368db30cbee69c  vp90-2-03-deltaq.webm
 7f6d8879336239a43dbb6c9f13178cb11cf7ed09  vp90-2-05-resize.ivf.md5
 bf61ddc1f716eba58d4c9837d4e91031d9ce4ffe  vp90-2-06-bilinear.webm
 f6235f937552e11d8eb331ec55da6b3aa596b9ac  vp90-2-06-bilinear.webm.md5
-495256cfd123fe777b2c0406862ed8468a1f4677  vp91-2-04-yv444.webm
-65e3a7ffef61ab340d9140f335ecc49125970c2c  vp91-2-04-yv444.webm.md5
 0c83a1e414fde3bccd6dc451bbaee68e59974c76  vp90-2-07-frame_parallel.webm
 e5c2c9fb383e5bf3b563480adaeba5b7e3475ecd  vp90-2-07-frame_parallel.webm.md5
 086c7edcffd699ae7d99d710fd7e53b18910ca5b  vp90-2-08-tile_1x2_frame_parallel.webm
@@ -576,6 +589,8 @@ d48c5db1b0f8e60521a7c749696b8067886033a3  vp90-2-09-aq2.webm
 54638c38009198c38c8f3b25c182b709b6c1fd2e  vp90-2-09-lf_deltas.webm.md5
 510d95f3beb3b51c572611fdaeeece12277dac30  vp90-2-10-show-existing-frame.webm
 14d631096f4bfa2d71f7f739aec1448fb3c33bad  vp90-2-10-show-existing-frame.webm.md5
+d2feea7728e8d2c615981d0f47427a4a5a45d881  vp90-2-10-show-existing-frame2.webm
+5f7c7811baa3e4f03be1dd78c33971b727846821  vp90-2-10-show-existing-frame2.webm.md5
 b4318e75f73a6a08992c7326de2fb589c2a794c7  vp90-2-11-size-351x287.webm
 b3c48382cf7d0454e83a02497c229d27720f9e20  vp90-2-11-size-351x287.webm.md5
 8e0096475ea2535bac71d3e2fc09e0c451c444df  vp90-2-11-size-351x288.webm
@@ -638,5 +653,11 @@ e615575ded499ea1d992f3b38e3baa434509cdcd  vp90-2-15-segkey.webm
 e3ab35d4316c5e81325c50f5236ceca4bc0d35df  vp90-2-15-segkey.webm.md5
 9b7ca2cac09d34c4a5d296c1900f93b1e2f69d0d  vp90-2-15-segkey_adpq.webm
 8f46ba5f785d0c2170591a153e0d0d146a7c8090  vp90-2-15-segkey_adpq.webm.md5
-d78e2fceba5ac942246503ec8366f879c4775ca5  vp90-2-15-fuzz-flicker.webm
-bbd7dd15f43a703ff0a332fee4959e7b23bf77dc  vp90-2-15-fuzz-flicker.webm.md5
+0321d507ce62dedc8a51b4e9011f7a19aed9c3dc  vp91-2-04-yuv444.webm
+367e423dd41fdb49aa028574a2cfec5c2f325c5c  vp91-2-04-yuv444.webm.md5
+76024eb753cdac6a5e5703aaea189d35c3c30ac7  invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.ivf
+d3964f9dad9f60363c81b688324d95b4ec7c8038  invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.ivf.res
+83f50908c8dc0ef8760595447a2ff7727489542e  invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf
+456d1493e52d32a5c30edf44a27debc1fa6b253a  invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf.res
+c123d1f9f02fb4143abb5e271916e3a3080de8f6  invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf
+456d1493e52d32a5c30edf44a27debc1fa6b253a  invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf.res
--- a/test/test.mk
+++ b/test/test.mk
@@ -15,7 +15,7 @@ LIBVPX_TEST_SRCS-yes += video_source.h
 ##
 ## Black box tests only use the public API.
 ##
-LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += ../md5_utils.h ../md5_utils.c
+LIBVPX_TEST_SRCS-yes                   += ../md5_utils.h ../md5_utils.c
 LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += ivf_video_source.h
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += ../y4minput.h ../y4minput.c
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += aq_segment_test.cc
@@ -30,6 +30,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += cq_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += keyframe_test.cc

 LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += external_frame_buffer_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += user_priv_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += active_map_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += borders_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += cpu_speed_test.cc
@@ -41,6 +42,9 @@ LIBVPX_TEST_SRCS-yes                   += decode_test_driver.h
 LIBVPX_TEST_SRCS-yes                   += encode_test_driver.cc
 LIBVPX_TEST_SRCS-yes                   += encode_test_driver.h

+## Y4m parsing.
+LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += y4m_test.cc ../y4menc.c ../y4menc.h
+
 ## WebM Parsing
 ifeq ($(CONFIG_WEBM_IO), yes)
 LIBWEBM_PARSER_SRCS                    += ../third_party/libwebm/mkvparser.cpp
@@ -54,6 +58,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += ../webmdec.h
 LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += webm_video_source.h
 endif

+LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += invalid_file_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += test_vector_test.cc

 # Currently we only support decoder perf tests for vp9. Also they read from WebM
@@ -132,7 +137,19 @@ endif # CONFIG_SHARED
 ##
 LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += hantro_collage_w352h288.yuv
 LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += hantro_odd.yuv
+
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_420.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_422.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_444.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_420.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_422.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_444.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_420.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_422.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_444.y4m
+
 LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rush_hour_444.y4m
+LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += screendata.y4m

 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-001.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-001.ivf.md5
@@ -690,6 +707,8 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-subpixel-00.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-subpixel-00.ivf.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-10-show-existing-frame.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-10-show-existing-frame.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-10-show-existing-frame2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-10-show-existing-frame2.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-11-size-351x287.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-11-size-351x287.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-11-size-351x288.webm
@@ -704,8 +723,6 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-12-droppable_3.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-12-droppable_3.ivf.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-13-largescaling.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-13-largescaling.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yv444.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yv444.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-2.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-2.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-4.webm
@@ -754,8 +771,22 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey_adpq.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey_adpq.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-fuzz-flicker.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-fuzz-flicker.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yuv444.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yuv444.webm.md5
+
+# Invalid files for testing libvpx error checking.
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01.webm.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-02.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-02.webm.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-03.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-03.webm.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf.res

 ifeq ($(CONFIG_DECODE_PERF_TESTS),yes)
 # BBB VP9 streams
--- a/test/test_vectors.cc
+++ b/test/test_vectors.cc
@@ -161,10 +161,11 @@ const char *const kVP9TestVectors[] = {
  "vp90-2-08-tile-4x1.webm", "vp90-2-09-subpixel-00.ivf",
  "vp90-2-02-size-lf-1920x1080.webm", "vp90-2-09-aq2.webm",
  "vp90-2-09-lf_deltas.webm", "vp90-2-10-show-existing-frame.webm",
+  "vp90-2-10-show-existing-frame2.webm",
  "vp90-2-11-size-351x287.webm", "vp90-2-11-size-351x288.webm",
  "vp90-2-11-size-352x287.webm", "vp90-2-12-droppable_1.ivf",
  "vp90-2-12-droppable_2.ivf", "vp90-2-12-droppable_3.ivf",
-  "vp90-2-13-largescaling.webm", "vp91-2-04-yv444.webm",
+  "vp90-2-13-largescaling.webm",
  "vp90-2-14-resize-fp-tiles-1-16.webm",
  "vp90-2-14-resize-fp-tiles-1-2-4-8-16.webm",
  "vp90-2-14-resize-fp-tiles-1-2.webm", "vp90-2-14-resize-fp-tiles-1-4.webm",
@@ -179,7 +180,7 @@ const char *const kVP9TestVectors[] = {
  "vp90-2-14-resize-fp-tiles-8-16.webm", "vp90-2-14-resize-fp-tiles-8-1.webm",
  "vp90-2-14-resize-fp-tiles-8-2.webm", "vp90-2-14-resize-fp-tiles-8-4.webm",
  "vp90-2-15-segkey.webm", "vp90-2-15-segkey_adpq.webm",
-  "vp90-2-15-fuzz-flicker.webm"
+  "vp91-2-04-yuv444.webm",
 };
 const int kNumVP9TestVectors = NELEMENTS(kVP9TestVectors);
 #endif  // CONFIG_VP9_DECODER
--- a/test/user_priv_test.cc
+++ b/test/user_priv_test.cc
@@ -0,0 +1,100 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "./vpx_config.h"
+#include "test/acm_random.h"
+#include "test/codec_factory.h"
+#include "test/decode_test_driver.h"
+#include "test/ivf_video_source.h"
+#include "test/md5_helper.h"
+#include "test/util.h"
+#if CONFIG_WEBM_IO
+#include "test/webm_video_source.h"
+#endif
+#include "vpx_mem/vpx_mem.h"
+#include "vpx/vp8.h"
+
+namespace {
+
+using std::string;
+using libvpx_test::ACMRandom;
+
+#if CONFIG_WEBM_IO
+
+void CheckUserPrivateData(void *user_priv, int *target) {
+  // actual pointer value should be the same as expected.
+  EXPECT_EQ(reinterpret_cast<void *>(target), user_priv) <<
+      "user_priv pointer value does not match.";
+}
+
+// Decodes |filename|. Passes in user_priv data when calling DecodeFrame and
+// compares the user_priv from return img with the original user_priv to see if
+// they match. Both the pointer values and the values inside the addresses
+// should match.
+string DecodeFile(const string &filename) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  libvpx_test::WebMVideoSource video(filename);
+  video.Init();
+
+  vpx_codec_dec_cfg_t cfg = {0};
+  libvpx_test::VP9Decoder decoder(cfg, 0);
+
+  libvpx_test::MD5 md5;
+  int frame_num = 0;
+  for (video.Begin(); !::testing::Test::HasFailure() && video.cxdata();
+       video.Next()) {
+    void *user_priv = reinterpret_cast<void *>(&frame_num);
+    const vpx_codec_err_t res =
+        decoder.DecodeFrame(video.cxdata(), video.frame_size(),
+                            (frame_num == 0) ? NULL : user_priv);
+    if (res != VPX_CODEC_OK) {
+      EXPECT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError();
+      break;
+    }
+    libvpx_test::DxDataIterator dec_iter = decoder.GetDxData();
+    const vpx_image_t *img = NULL;
+
+    // Get decompressed data.
+    while ((img = dec_iter.Next())) {
+      if (frame_num == 0) {
+        CheckUserPrivateData(img->user_priv, NULL);
+      } else {
+        CheckUserPrivateData(img->user_priv, &frame_num);
+
+        // Also test ctrl_get_reference api.
+        struct vp9_ref_frame ref;
+        // Randomly fetch a reference frame.
+        ref.idx = rnd.Rand8() % 3;
+        decoder.Control(VP9_GET_REFERENCE, &ref);
+
+        CheckUserPrivateData(ref.img.user_priv, NULL);
+      }
+      md5.Add(img);
+    }
+
+    frame_num++;
+  }
+  return string(md5.Get());
+}
+
+TEST(UserPrivTest, VideoDecode) {
+  // no tiles or frame parallel; this exercises the decoding to test the
+  // user_priv.
+  EXPECT_STREQ("b35a1b707b28e82be025d960aba039bc",
+               DecodeFile("vp90-2-03-size-226x226.webm").c_str());
+}
+
+#endif  // CONFIG_WEBM_IO
+
+}  // namespace
--- a/test/video_source.h
+++ b/test/video_source.h
@@ -50,6 +50,15 @@ static FILE *OpenTestDataFile(const std::string& file_name) {
  return fopen(path_to_source.c_str(), "rb");
 }

+static FILE *OpenTestOutFile(const std::string& file_name) {
+  const std::string path_to_source = GetDataPath() + "/" + file_name;
+  return fopen(path_to_source.c_str(), "wb");
+}
+
+static FILE *OpenTempOutFile() {
+  return tmpfile();
+}
+
 // Abstract base class for test video sources, which provide a stream of
 // vpx_image_t images with associated timestamps and duration.
 class VideoSource {
--- a/test/vp9_lossless_test.cc
+++ b/test/vp9_lossless_test.cc
@@ -36,6 +36,17 @@ class LosslessTestLarge : public ::libvpx_test::EncoderTest,
    SetMode(encoding_mode_);
  }

+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    if (video->frame() == 1) {
+      // Only call Control if quantizer > 0 to verify that using quantizer
+      // alone will activate lossless
+      if (cfg_.rc_max_quantizer > 0 || cfg_.rc_min_quantizer > 0) {
+        encoder->Control(VP9E_SET_LOSSLESS, 1);
+      }
+    }
+  }
+
  virtual void BeginPassHook(unsigned int /*pass*/) {
    psnr_ = kMaxPsnr;
    nframes_ = 0;
@@ -91,5 +102,24 @@ TEST_P(LosslessTestLarge, TestLossLessEncoding444) {
  EXPECT_GE(psnr_lossless, kMaxPsnr);
 }

+TEST_P(LosslessTestLarge, TestLossLessEncodingCtrl) {
+  const vpx_rational timebase = { 33333333, 1000000000 };
+  cfg_.g_timebase = timebase;
+  cfg_.rc_target_bitrate = 2000;
+  cfg_.g_lag_in_frames = 25;
+  // Intentionally set Q > 0, to make sure control can be used to activate
+  // lossless
+  cfg_.rc_min_quantizer = 10;
+  cfg_.rc_max_quantizer = 20;
+
+  init_flags_ = VPX_CODEC_USE_PSNR;
+
+  libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                     timebase.den, timebase.num, 0, 10);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  const double psnr_lossless = GetMinPsnr();
+  EXPECT_GE(psnr_lossless, kMaxPsnr);
+}
+
 VP9_INSTANTIATE_TEST_CASE(LosslessTestLarge, ALL_TEST_MODES);
 }  // namespace
--- a/test/vp9_thread_test.cc
+++ b/test/vp9_thread_test.cc
@@ -28,11 +28,11 @@ class VP9WorkerThreadTest : public ::testing::TestWithParam<bool> {
 protected:
  virtual ~VP9WorkerThreadTest() {}
  virtual void SetUp() {
-    vp9_worker_init(&worker_);
+    vp9_get_worker_interface()->init(&worker_);
  }

  virtual void TearDown() {
-    vp9_worker_end(&worker_);
+    vp9_get_worker_interface()->end(&worker_);
  }

  VP9Worker worker_;
@@ -45,10 +45,11 @@ int ThreadHook(void* data, void* return_value) {
 }

 TEST_P(VP9WorkerThreadTest, HookSuccess) {
-  EXPECT_NE(vp9_worker_sync(&worker_), 0);  // should be a no-op.
+  // should be a no-op.
+  EXPECT_NE(vp9_get_worker_interface()->sync(&worker_), 0);

  for (int i = 0; i < 2; ++i) {
-    EXPECT_NE(vp9_worker_reset(&worker_), 0);
+    EXPECT_NE(vp9_get_worker_interface()->reset(&worker_), 0);

    int hook_data = 0;
    int return_value = 1;  // return successfully from the hook
@@ -58,20 +59,21 @@ TEST_P(VP9WorkerThreadTest, HookSuccess) {

    const bool synchronous = GetParam();
    if (synchronous) {
-      vp9_worker_execute(&worker_);
+      vp9_get_worker_interface()->execute(&worker_);
    } else {
-      vp9_worker_launch(&worker_);
+      vp9_get_worker_interface()->launch(&worker_);
    }
-    EXPECT_NE(vp9_worker_sync(&worker_), 0);
+    EXPECT_NE(vp9_get_worker_interface()->sync(&worker_), 0);
    EXPECT_FALSE(worker_.had_error);
    EXPECT_EQ(5, hook_data);

-    EXPECT_NE(vp9_worker_sync(&worker_), 0);  // should be a no-op.
+    // should be a no-op.
+    EXPECT_NE(vp9_get_worker_interface()->sync(&worker_), 0);
  }
 }

 TEST_P(VP9WorkerThreadTest, HookFailure) {
-  EXPECT_NE(vp9_worker_reset(&worker_), 0);
+  EXPECT_NE(vp9_get_worker_interface()->reset(&worker_), 0);

  int hook_data = 0;
  int return_value = 0;  // return failure from the hook
@@ -81,26 +83,49 @@ TEST_P(VP9WorkerThreadTest, HookFailure) {

  const bool synchronous = GetParam();
  if (synchronous) {
-    vp9_worker_execute(&worker_);
+    vp9_get_worker_interface()->execute(&worker_);
  } else {
-    vp9_worker_launch(&worker_);
+    vp9_get_worker_interface()->launch(&worker_);
  }
-  EXPECT_FALSE(vp9_worker_sync(&worker_));
+  EXPECT_FALSE(vp9_get_worker_interface()->sync(&worker_));
  EXPECT_EQ(1, worker_.had_error);

  // Ensure _reset() clears the error and _launch() can be called again.
  return_value = 1;
-  EXPECT_NE(vp9_worker_reset(&worker_), 0);
+  EXPECT_NE(vp9_get_worker_interface()->reset(&worker_), 0);
  EXPECT_FALSE(worker_.had_error);
-  vp9_worker_launch(&worker_);
-  EXPECT_NE(vp9_worker_sync(&worker_), 0);
+  vp9_get_worker_interface()->launch(&worker_);
+  EXPECT_NE(vp9_get_worker_interface()->sync(&worker_), 0);
  EXPECT_FALSE(worker_.had_error);
 }

+TEST(VP9WorkerThreadTest, TestInterfaceAPI) {
+  EXPECT_EQ(0, vp9_set_worker_interface(NULL));
+  EXPECT_TRUE(vp9_get_worker_interface() != NULL);
+  for (int i = 0; i < 6; ++i) {
+    VP9WorkerInterface winterface = *vp9_get_worker_interface();
+    switch (i) {
+      default:
+      case 0: winterface.init = NULL; break;
+      case 1: winterface.reset = NULL; break;
+      case 2: winterface.sync = NULL; break;
+      case 3: winterface.launch = NULL; break;
+      case 4: winterface.execute = NULL; break;
+      case 5: winterface.end = NULL; break;
+    }
+    EXPECT_EQ(0, vp9_set_worker_interface(&winterface));
+  }
+}
+
 // -----------------------------------------------------------------------------
 // Multi-threaded decode tests

 #if CONFIG_WEBM_IO
+struct FileList {
+  const char *name;
+  const char *expected_md5;
+};
+
 // Decodes |filename| with |num_threads|. Returns the md5 of the decoded frames.
 string DecodeFile(const string& filename, int num_threads) {
  libvpx_test::WebMVideoSource video(filename);
@@ -130,39 +155,77 @@ string DecodeFile(const string& filename, int num_threads) {
  return string(md5.Get());
 }

-TEST(VP9DecodeMTTest, MTDecode) {
-  // no tiles or frame parallel; this exercises loop filter threading.
-  EXPECT_STREQ("b35a1b707b28e82be025d960aba039bc",
-               DecodeFile("vp90-2-03-size-226x226.webm", 2).c_str());
+void DecodeFiles(const FileList files[]) {
+  for (const FileList *iter = files; iter->name != NULL; ++iter) {
+    SCOPED_TRACE(iter->name);
+    for (int t = 2; t <= 8; ++t) {
+      EXPECT_EQ(iter->expected_md5, DecodeFile(iter->name, t))
+          << "threads = " << t;
+    }
+  }
 }

-TEST(VP9DecodeMTTest, MTDecode2) {
-  static const struct {
-    const char *name;
-    const char *expected_md5;
-  } files[] = {
+// Trivial serialized thread worker interface implementation.
+// Note any worker that requires synchronization between other workers will
+// hang.
+namespace impl {
+
+void Init(VP9Worker *const worker) { memset(worker, 0, sizeof(*worker)); }
+int Reset(VP9Worker *const /*worker*/) { return 1; }
+int Sync(VP9Worker *const worker) { return !worker->had_error; }
+
+void Execute(VP9Worker *const worker) {
+  worker->had_error |= worker->hook(worker->data1, worker->data2);
+}
+
+void Launch(VP9Worker *const worker) { Execute(worker); }
+void End(VP9Worker *const /*worker*/) {}
+
+}  // namespace impl
+
+TEST(VP9WorkerThreadTest, TestSerialInterface) {
+  static const VP9WorkerInterface serial_interface = {
+    impl::Init, impl::Reset, impl::Sync, impl::Launch, impl::Execute, impl::End
+  };
+  // TODO(jzern): Avoid using a file that will use the row-based thread
+  // loopfilter, with the simple serialized implementation it will hang. This is
+  // due to its expectation that rows will be run in parallel as they wait on
+  // progress in the row above before proceeding.
+  static const char expected_md5[] = "b35a1b707b28e82be025d960aba039bc";
+  static const char filename[] = "vp90-2-03-size-226x226.webm";
+  VP9WorkerInterface default_interface = *vp9_get_worker_interface();
+
+  EXPECT_NE(vp9_set_worker_interface(&serial_interface), 0);
+  EXPECT_EQ(expected_md5, DecodeFile(filename, 2));
+
+  // Reset the interface.
+  EXPECT_NE(vp9_set_worker_interface(&default_interface), 0);
+  EXPECT_EQ(expected_md5, DecodeFile(filename, 2));
+}
+
+TEST(VP9DecodeMultiThreadedTest, Decode) {
+  // no tiles or frame parallel; this exercises loop filter threading.
+  EXPECT_EQ("b35a1b707b28e82be025d960aba039bc",
+            DecodeFile("vp90-2-03-size-226x226.webm", 2));
+}
+
+TEST(VP9DecodeMultiThreadedTest, Decode2) {
+  static const FileList files[] = {
    { "vp90-2-08-tile_1x2_frame_parallel.webm",
      "68ede6abd66bae0a2edf2eb9232241b6" },
    { "vp90-2-08-tile_1x4_frame_parallel.webm",
      "368ebc6ebf3a5e478d85b2c3149b2848" },
    { "vp90-2-08-tile_1x8_frame_parallel.webm",
      "17e439da2388aff3a0f69cb22579c6c1" },
+    { NULL, NULL }
  };

-  for (int i = 0; i < static_cast<int>(sizeof(files) / sizeof(files[0])); ++i) {
-    for (int t = 2; t <= 8; ++t) {
-      EXPECT_STREQ(files[i].expected_md5, DecodeFile(files[i].name, t).c_str())
-          << "threads = " << t;
-    }
-  }
+  DecodeFiles(files);
 }

 // Test tile quantity changes within one file.
-TEST(VP9DecodeMTTest, MTDecode3) {
-  static const struct {
-    const char *name;
-    const char *expected_md5;
-  } files[] = {
+TEST(VP9DecodeMultiThreadedTest, Decode3) {
+  static const FileList files[] = {
    { "vp90-2-14-resize-fp-tiles-1-16.webm",
      "0cd5e632c326297e975f38949c31ea94" },
    { "vp90-2-14-resize-fp-tiles-1-2-4-8-16.webm",
@@ -207,14 +270,10 @@ TEST(VP9DecodeMTTest, MTDecode3) {
      "ae96f21f21b6370cc0125621b441fc52" },
    { "vp90-2-14-resize-fp-tiles-8-4.webm",
      "3eb4f24f10640d42218f7fd7b9fd30d4" },
+    { NULL, NULL }
  };

-  for (int i = 0; i < static_cast<int>(sizeof(files) / sizeof(files[0])); ++i) {
-    for (int t = 2; t <= 8; ++t) {
-      EXPECT_STREQ(files[i].expected_md5, DecodeFile(files[i].name, t).c_str())
-          << "threads = " << t;
-    }
-  }
+  DecodeFiles(files);
 }
 #endif  // CONFIG_WEBM_IO

--- a/test/y4m_test.cc
+++ b/test/y4m_test.cc
@@ -0,0 +1,193 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string>
+#include "test/md5_helper.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "./vpx_config.h"
+#include "./y4menc.h"
+
+namespace {
+
+using std::string;
+using std::tr1::make_tuple;
+
+static const unsigned int kWidth  = 160;
+static const unsigned int kHeight = 90;
+static const unsigned int kFrames = 10;
+
+typedef std::tr1::tuple<const char *, const unsigned int,
+        const vpx_img_fmt, const char *> test_entry_type;
+
+static const test_entry_type kY4mTestVectors[] = {
+  make_tuple("park_joy_90p_8_420.y4m", 8, VPX_IMG_FMT_I420,
+             "e5406275b9fc6bb3436c31d4a05c1cab"),
+  make_tuple("park_joy_90p_8_422.y4m", 8, VPX_IMG_FMT_I422,
+             "284a47a47133b12884ec3a14e959a0b6"),
+  make_tuple("park_joy_90p_8_444.y4m", 8, VPX_IMG_FMT_I444,
+             "90517ff33843d85de712fd4fe60dbed0"),
+  make_tuple("park_joy_90p_10_420.y4m", 10, VPX_IMG_FMT_I42016,
+             "63f21f9f717d8b8631bd2288ee87137b"),
+  make_tuple("park_joy_90p_10_422.y4m", 10, VPX_IMG_FMT_I42216,
+             "48ab51fb540aed07f7ff5af130c9b605"),
+  make_tuple("park_joy_90p_10_444.y4m", 10, VPX_IMG_FMT_I44416,
+             "067bfd75aa85ff9bae91fa3e0edd1e3e"),
+  make_tuple("park_joy_90p_12_420.y4m", 12, VPX_IMG_FMT_I42016,
+             "9e6d8f6508c6e55625f6b697bc461cef"),
+  make_tuple("park_joy_90p_12_422.y4m", 12, VPX_IMG_FMT_I42216,
+             "b239c6b301c0b835485be349ca83a7e3"),
+  make_tuple("park_joy_90p_12_444.y4m", 12, VPX_IMG_FMT_I44416,
+             "5a6481a550821dab6d0192f5c63845e9")
+};
+
+static void write_image_file(const vpx_image_t *img, FILE *file) {
+  int plane, y;
+  for (plane = 0; plane < 3; ++plane) {
+    const unsigned char *buf = img->planes[plane];
+    const int stride = img->stride[plane];
+    const int bytes_per_sample = (img->fmt & VPX_IMG_FMT_HIGH) ? 2 : 1;
+    const int h = (plane ? (img->d_h + img->y_chroma_shift) >>
+                   img->y_chroma_shift : img->d_h);
+    const int w = (plane ? (img->d_w + img->x_chroma_shift) >>
+                   img->x_chroma_shift : img->d_w);
+    for (y = 0; y < h; ++y) {
+      fwrite(buf, bytes_per_sample, w, file);
+      buf += stride;
+    }
+  }
+}
+
+class Y4mVideoSourceTest
+    : public ::testing::TestWithParam<test_entry_type>,
+      public ::libvpx_test::Y4mVideoSource {
+ protected:
+  Y4mVideoSourceTest() : Y4mVideoSource("", 0, 0) {}
+
+  virtual ~Y4mVideoSourceTest() {
+    CloseSource();
+  }
+
+  virtual void Init(const std::string &file_name, int limit) {
+    file_name_ = file_name;
+    start_ = 0;
+    limit_ = limit;
+    frame_ = 0;
+    Begin();
+  }
+
+  // Checks y4m header information
+  void HeaderChecks(unsigned int bit_depth, vpx_img_fmt_t fmt) {
+    ASSERT_TRUE(input_file_ != NULL);
+    ASSERT_EQ(y4m_.pic_w, (int)kWidth);
+    ASSERT_EQ(y4m_.pic_h, (int)kHeight);
+    ASSERT_EQ(img()->d_w, kWidth);
+    ASSERT_EQ(img()->d_h, kHeight);
+    ASSERT_EQ(y4m_.bit_depth, bit_depth);
+    ASSERT_EQ(y4m_.vpx_fmt, fmt);
+    if (fmt == VPX_IMG_FMT_I420 || fmt == VPX_IMG_FMT_I42016) {
+      ASSERT_EQ(y4m_.bps, (int)y4m_.bit_depth * 3 / 2);
+      ASSERT_EQ(img()->x_chroma_shift, 1U);
+      ASSERT_EQ(img()->y_chroma_shift, 1U);
+    }
+    if (fmt == VPX_IMG_FMT_I422 || fmt == VPX_IMG_FMT_I42216) {
+      ASSERT_EQ(y4m_.bps, (int)y4m_.bit_depth * 2);
+      ASSERT_EQ(img()->x_chroma_shift, 1U);
+      ASSERT_EQ(img()->y_chroma_shift, 0U);
+    }
+    if (fmt == VPX_IMG_FMT_I444 || fmt == VPX_IMG_FMT_I44416) {
+      ASSERT_EQ(y4m_.bps, (int)y4m_.bit_depth * 3);
+      ASSERT_EQ(img()->x_chroma_shift, 0U);
+      ASSERT_EQ(img()->y_chroma_shift, 0U);
+    }
+  }
+
+  // Checks MD5 of the raw frame data
+  void Md5Check(const string &expected_md5) {
+    ASSERT_TRUE(input_file_ != NULL);
+    libvpx_test::MD5 md5;
+    for (unsigned int i = start_; i < limit_; i++) {
+      md5.Add(img());
+      Next();
+    }
+    ASSERT_EQ(string(md5.Get()), expected_md5);
+  }
+};
+
+TEST_P(Y4mVideoSourceTest, SourceTest) {
+  const char *filename = GET_PARAM(0);
+  const unsigned int bit_depth = GET_PARAM(1);
+  const vpx_img_fmt format = GET_PARAM(2);
+  const char *md5raw = GET_PARAM(3);
+
+  Init(filename, kFrames);
+  HeaderChecks(bit_depth, format);
+  Md5Check(md5raw);
+}
+
+INSTANTIATE_TEST_CASE_P(C, Y4mVideoSourceTest,
+                        ::testing::ValuesIn(kY4mTestVectors));
+
+class Y4mVideoWriteTest
+    : public Y4mVideoSourceTest {
+ protected:
+  Y4mVideoWriteTest() : Y4mVideoSourceTest() {}
+
+  virtual void ReplaceInputFp(FILE *input_file) {
+    CloseSource();
+    frame_ = 0;
+    input_file_ = input_file;
+    rewind(input_file_);
+    ReadSourceToStart();
+  }
+
+  // Writes out a y4m file and then reads it back
+  void WriteY4mAndReadBack() {
+    ASSERT_TRUE(input_file_ != NULL);
+    char buf[Y4M_BUFFER_SIZE] = {0};
+    const struct VpxRational framerate = {y4m_.fps_n, y4m_.fps_d};
+    FILE *out_file = libvpx_test::OpenTempOutFile();
+    ASSERT_TRUE(out_file != NULL);
+    y4m_write_file_header(buf, sizeof(buf),
+                          kWidth, kHeight,
+                          &framerate, y4m_.vpx_fmt,
+                          y4m_.bit_depth);
+    fputs(buf, out_file);
+    for (unsigned int i = start_; i < limit_; i++) {
+      y4m_write_frame_header(buf, sizeof(buf));
+      fputs(buf, out_file);
+      write_image_file(img(), out_file);
+      Next();
+    }
+    ReplaceInputFp(out_file);
+  }
+
+  virtual void Init(const std::string &file_name, int limit) {
+    Y4mVideoSourceTest::Init(file_name, limit);
+    WriteY4mAndReadBack();
+  }
+};
+
+TEST_P(Y4mVideoWriteTest, WriteTest) {
+  const char *filename = GET_PARAM(0);
+  const unsigned int bit_depth = GET_PARAM(1);
+  const vpx_img_fmt format = GET_PARAM(2);
+  const char *md5raw = GET_PARAM(3);
+
+  Init(filename, kFrames);
+  HeaderChecks(bit_depth, format);
+  Md5Check(md5raw);
+}
+
+INSTANTIATE_TEST_CASE_P(C, Y4mVideoWriteTest,
+                        ::testing::ValuesIn(kY4mTestVectors));
+
+}  // namespace
--- a/test/y4m_video_source.h
+++ b/test/y4m_video_source.h
@@ -38,24 +38,30 @@ class Y4mVideoSource : public VideoSource {
    CloseSource();
  }

-  virtual void Begin() {
+  virtual void OpenSource() {
    CloseSource();
    input_file_ = OpenTestDataFile(file_name_);
    ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: "
-        << file_name_;
+                                     << file_name_;
+  }

-    y4m_input_open(&y4m_, input_file_, NULL, 0, 0);
+  virtual void ReadSourceToStart() {
+    ASSERT_TRUE(input_file_ != NULL);
+    ASSERT_FALSE(y4m_input_open(&y4m_, input_file_, NULL, 0, 0));
    framerate_numerator_ = y4m_.fps_n;
    framerate_denominator_ = y4m_.fps_d;
-
    frame_ = 0;
    for (unsigned int i = 0; i < start_; i++) {
-        Next();
+      Next();
    }
-
    FillFrame();
  }

+  virtual void Begin() {
+    OpenSource();
+    ReadSourceToStart();
+  }
+
  virtual void Next() {
    ++frame_;
    FillFrame();
--- a/third_party/libmkv/EbmlIDs.h
+++ b/third_party/libmkv/EbmlIDs.h
@@ -0,0 +1,231 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef MKV_DEFS_HPP
+#define MKV_DEFS_HPP 1
+
+/* Commenting out values not available in webm, but available in matroska */
+
+enum mkv {
+  EBML = 0x1A45DFA3,
+  EBMLVersion = 0x4286,
+  EBMLReadVersion = 0x42F7,
+  EBMLMaxIDLength = 0x42F2,
+  EBMLMaxSizeLength = 0x42F3,
+  DocType = 0x4282,
+  DocTypeVersion = 0x4287,
+  DocTypeReadVersion = 0x4285,
+/* CRC_32 = 0xBF, */
+  Void = 0xEC,
+  SignatureSlot = 0x1B538667,
+  SignatureAlgo = 0x7E8A,
+  SignatureHash = 0x7E9A,
+  SignaturePublicKey = 0x7EA5,
+  Signature = 0x7EB5,
+  SignatureElements = 0x7E5B,
+  SignatureElementList = 0x7E7B,
+  SignedElement = 0x6532,
+  /* segment */
+  Segment = 0x18538067,
+  /* Meta Seek Information */
+  SeekHead = 0x114D9B74,
+  Seek = 0x4DBB,
+  SeekID = 0x53AB,
+  SeekPosition = 0x53AC,
+  /* Segment Information */
+  Info = 0x1549A966,
+/* SegmentUID = 0x73A4, */
+/* SegmentFilename = 0x7384, */
+/* PrevUID = 0x3CB923, */
+/* PrevFilename = 0x3C83AB, */
+/* NextUID = 0x3EB923, */
+/* NextFilename = 0x3E83BB, */
+/* SegmentFamily = 0x4444, */
+/* ChapterTranslate = 0x6924, */
+/* ChapterTranslateEditionUID = 0x69FC, */
+/* ChapterTranslateCodec = 0x69BF, */
+/* ChapterTranslateID = 0x69A5, */
+  TimecodeScale = 0x2AD7B1,
+  Segment_Duration = 0x4489,
+  DateUTC = 0x4461,
+/* Title = 0x7BA9, */
+  MuxingApp = 0x4D80,
+  WritingApp = 0x5741,
+  /* Cluster */
+  Cluster = 0x1F43B675,
+  Timecode = 0xE7,
+/* SilentTracks = 0x5854, */
+/* SilentTrackNumber = 0x58D7, */
+/* Position = 0xA7, */
+  PrevSize = 0xAB,
+  BlockGroup = 0xA0,
+  Block = 0xA1,
+/* BlockVirtual = 0xA2, */
+  BlockAdditions = 0x75A1,
+  BlockMore = 0xA6,
+  BlockAddID = 0xEE,
+  BlockAdditional = 0xA5,
+  BlockDuration = 0x9B,
+/* ReferencePriority = 0xFA, */
+  ReferenceBlock = 0xFB,
+/* ReferenceVirtual = 0xFD, */
+/* CodecState = 0xA4, */
+/* Slices = 0x8E, */
+/* TimeSlice = 0xE8, */
+  LaceNumber = 0xCC,
+/* FrameNumber = 0xCD, */
+/* BlockAdditionID = 0xCB, */
+/* MkvDelay = 0xCE, */
+/* Cluster_Duration = 0xCF, */
+  SimpleBlock = 0xA3,
+/* EncryptedBlock = 0xAF, */
+  /* Track */
+  Tracks = 0x1654AE6B,
+  TrackEntry = 0xAE,
+  TrackNumber = 0xD7,
+  TrackUID = 0x73C5,
+  TrackType = 0x83,
+  FlagEnabled = 0xB9,
+  FlagDefault = 0x88,
+  FlagForced = 0x55AA,
+  FlagLacing = 0x9C,
+/* MinCache = 0x6DE7, */
+/* MaxCache = 0x6DF8, */
+  DefaultDuration = 0x23E383,
+/* TrackTimecodeScale = 0x23314F, */
+/* TrackOffset = 0x537F, */
+  MaxBlockAdditionID = 0x55EE,
+  Name = 0x536E,
+  Language = 0x22B59C,
+  CodecID = 0x86,
+  CodecPrivate = 0x63A2,
+  CodecName = 0x258688,
+/* AttachmentLink = 0x7446, */
+/* CodecSettings = 0x3A9697, */
+/* CodecInfoURL = 0x3B4040, */
+/* CodecDownloadURL = 0x26B240, */
+/* CodecDecodeAll = 0xAA, */
+/* TrackOverlay = 0x6FAB, */
+/* TrackTranslate = 0x6624, */
+/* TrackTranslateEditionUID = 0x66FC, */
+/* TrackTranslateCodec = 0x66BF, */
+/* TrackTranslateTrackID = 0x66A5, */
+  /* video */
+  Video = 0xE0,
+  FlagInterlaced = 0x9A,
+  StereoMode = 0x53B8,
+  AlphaMode = 0x53C0,
+  PixelWidth = 0xB0,
+  PixelHeight = 0xBA,
+  PixelCropBottom = 0x54AA,
+  PixelCropTop = 0x54BB,
+  PixelCropLeft = 0x54CC,
+  PixelCropRight = 0x54DD,
+  DisplayWidth = 0x54B0,
+  DisplayHeight = 0x54BA,
+  DisplayUnit = 0x54B2,
+  AspectRatioType = 0x54B3,
+/* ColourSpace = 0x2EB524, */
+/* GammaValue = 0x2FB523, */
+  FrameRate = 0x2383E3,
+  /* end video */
+  /* audio */
+  Audio = 0xE1,
+  SamplingFrequency = 0xB5,
+  OutputSamplingFrequency = 0x78B5,
+  Channels = 0x9F,
+/* ChannelPositions = 0x7D7B, */
+  BitDepth = 0x6264,
+  /* end audio */
+  /* content encoding */
+/* ContentEncodings = 0x6d80, */
+/* ContentEncoding = 0x6240, */
+/* ContentEncodingOrder = 0x5031, */
+/* ContentEncodingScope = 0x5032, */
+/* ContentEncodingType = 0x5033, */
+/* ContentCompression = 0x5034, */
+/* ContentCompAlgo = 0x4254, */
+/* ContentCompSettings = 0x4255, */
+/* ContentEncryption = 0x5035, */
+/* ContentEncAlgo = 0x47e1, */
+/* ContentEncKeyID = 0x47e2, */
+/* ContentSignature = 0x47e3, */
+/* ContentSigKeyID = 0x47e4, */
+/* ContentSigAlgo = 0x47e5, */
+/* ContentSigHashAlgo = 0x47e6, */
+  /* end content encoding */
+  /* Cueing Data */
+  Cues = 0x1C53BB6B,
+  CuePoint = 0xBB,
+  CueTime = 0xB3,
+  CueTrackPositions = 0xB7,
+  CueTrack = 0xF7,
+  CueClusterPosition = 0xF1,
+  CueBlockNumber = 0x5378
+/* CueCodecState = 0xEA, */
+/* CueReference = 0xDB, */
+/* CueRefTime = 0x96, */
+/* CueRefCluster = 0x97, */
+/* CueRefNumber = 0x535F, */
+/* CueRefCodecState = 0xEB, */
+  /* Attachment */
+/* Attachments = 0x1941A469, */
+/* AttachedFile = 0x61A7, */
+/* FileDescription = 0x467E, */
+/* FileName = 0x466E, */
+/* FileMimeType = 0x4660, */
+/* FileData = 0x465C, */
+/* FileUID = 0x46AE, */
+/* FileReferral = 0x4675, */
+  /* Chapters */
+/* Chapters = 0x1043A770, */
+/* EditionEntry = 0x45B9, */
+/* EditionUID = 0x45BC, */
+/* EditionFlagHidden = 0x45BD, */
+/* EditionFlagDefault = 0x45DB, */
+/* EditionFlagOrdered = 0x45DD, */
+/* ChapterAtom = 0xB6, */
+/* ChapterUID = 0x73C4, */
+/* ChapterTimeStart = 0x91, */
+/* ChapterTimeEnd = 0x92, */
+/* ChapterFlagHidden = 0x98, */
+/* ChapterFlagEnabled = 0x4598, */
+/* ChapterSegmentUID = 0x6E67, */
+/* ChapterSegmentEditionUID = 0x6EBC, */
+/* ChapterPhysicalEquiv = 0x63C3, */
+/* ChapterTrack = 0x8F, */
+/* ChapterTrackNumber = 0x89, */
+/* ChapterDisplay = 0x80, */
+/* ChapString = 0x85, */
+/* ChapLanguage = 0x437C, */
+/* ChapCountry = 0x437E, */
+/* ChapProcess = 0x6944, */
+/* ChapProcessCodecID = 0x6955, */
+/* ChapProcessPrivate = 0x450D, */
+/* ChapProcessCommand = 0x6911, */
+/* ChapProcessTime = 0x6922, */
+/* ChapProcessData = 0x6933, */
+  /* Tagging */
+/* Tags = 0x1254C367, */
+/* Tag = 0x7373, */
+/* Targets = 0x63C0, */
+/* TargetTypeValue = 0x68CA, */
+/* TargetType = 0x63CA, */
+/* Tagging_TrackUID = 0x63C5, */
+/* Tagging_EditionUID = 0x63C9, */
+/* Tagging_ChapterUID = 0x63C4, */
+/* AttachmentUID = 0x63C6, */
+/* SimpleTag = 0x67C8, */
+/* TagName = 0x45A3, */
+/* TagLanguage = 0x447A, */
+/* TagDefault = 0x4484, */
+/* TagString = 0x4487, */
+/* TagBinary = 0x4485, */
+};
+#endif
--- a/third_party/libmkv/EbmlWriter.c
+++ b/third_party/libmkv/EbmlWriter.c
@@ -0,0 +1,157 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "EbmlWriter.h"
+#include <stdlib.h>
+#include <wchar.h>
+#include <string.h>
+#include <limits.h>
+#if defined(_MSC_VER)
+#define LITERALU64(n) n
+#else
+#define LITERALU64(n) n##LLU
+#endif
+
+void Ebml_WriteLen(EbmlGlobal *glob, int64_t val) {
+  /* TODO check and make sure we are not > than 0x0100000000000000LLU */
+  unsigned char size = 8; /* size in bytes to output */
+
+  /* mask to compare for byte size */
+  int64_t minVal = 0xff;
+
+  for (size = 1; size < 8; size ++) {
+    if (val < minVal)
+      break;
+
+    minVal = (minVal << 7);
+  }
+
+  val |= (((uint64_t)0x80) << ((size - 1) * 7));
+
+  Ebml_Serialize(glob, (void *) &val, sizeof(val), size);
+}
+
+void Ebml_WriteString(EbmlGlobal *glob, const char *str) {
+  const size_t size_ = strlen(str);
+  const uint64_t  size = size_;
+  Ebml_WriteLen(glob, size);
+  /* TODO: it's not clear from the spec whether the nul terminator
+   * should be serialized too.  For now we omit the null terminator.
+   */
+  Ebml_Write(glob, str, (unsigned long)size);
+}
+
+void Ebml_WriteUTF8(EbmlGlobal *glob, const wchar_t *wstr) {
+  const size_t strlen = wcslen(wstr);
+
+  /* TODO: it's not clear from the spec whether the nul terminator
+   * should be serialized too.  For now we include it.
+   */
+  const uint64_t  size = strlen;
+
+  Ebml_WriteLen(glob, size);
+  Ebml_Write(glob, wstr, (unsigned long)size);
+}
+
+void Ebml_WriteID(EbmlGlobal *glob, unsigned long class_id) {
+  int len;
+
+  if (class_id >= 0x01000000)
+    len = 4;
+  else if (class_id >= 0x00010000)
+    len = 3;
+  else if (class_id >= 0x00000100)
+    len = 2;
+  else
+    len = 1;
+
+  Ebml_Serialize(glob, (void *)&class_id, sizeof(class_id), len);
+}
+
+void Ebml_SerializeUnsigned64(EbmlGlobal *glob, unsigned long class_id, uint64_t ui) {
+  unsigned char sizeSerialized = 8 | 0x80;
+  Ebml_WriteID(glob, class_id);
+  Ebml_Serialize(glob, &sizeSerialized, sizeof(sizeSerialized), 1);
+  Ebml_Serialize(glob, &ui, sizeof(ui), 8);
+}
+
+void Ebml_SerializeUnsigned(EbmlGlobal *glob, unsigned long class_id, unsigned long ui) {
+  unsigned char size = 8; /* size in bytes to output */
+  unsigned char sizeSerialized = 0;
+  unsigned long minVal;
+
+  Ebml_WriteID(glob, class_id);
+  minVal = 0x7fLU; /* mask to compare for byte size */
+
+  for (size = 1; size < 4; size ++) {
+    if (ui < minVal) {
+      break;
+    }
+
+    minVal <<= 7;
+  }
+
+  sizeSerialized = 0x80 | size;
+  Ebml_Serialize(glob, &sizeSerialized, sizeof(sizeSerialized), 1);
+  Ebml_Serialize(glob, &ui, sizeof(ui), size);
+}
+/* TODO: perhaps this is a poor name for this id serializer helper function */
+void Ebml_SerializeBinary(EbmlGlobal *glob, unsigned long class_id, unsigned long bin) {
+  int size;
+  for (size = 4; size > 1; size--) {
+    if (bin & (unsigned int)0x000000ff << ((size - 1) * 8))
+      break;
+  }
+  Ebml_WriteID(glob, class_id);
+  Ebml_WriteLen(glob, size);
+  Ebml_WriteID(glob, bin);
+}
+
+void Ebml_SerializeFloat(EbmlGlobal *glob, unsigned long class_id, double d) {
+  unsigned char len = 0x88;
+
+  Ebml_WriteID(glob, class_id);
+  Ebml_Serialize(glob, &len, sizeof(len), 1);
+  Ebml_Serialize(glob,  &d, sizeof(d), 8);
+}
+
+void Ebml_WriteSigned16(EbmlGlobal *glob, short val) {
+  signed long out = ((val & 0x003FFFFF) | 0x00200000) << 8;
+  Ebml_Serialize(glob, &out, sizeof(out), 3);
+}
+
+void Ebml_SerializeString(EbmlGlobal *glob, unsigned long class_id, const char *s) {
+  Ebml_WriteID(glob, class_id);
+  Ebml_WriteString(glob, s);
+}
+
+void Ebml_SerializeUTF8(EbmlGlobal *glob, unsigned long class_id, wchar_t *s) {
+  Ebml_WriteID(glob,  class_id);
+  Ebml_WriteUTF8(glob,  s);
+}
+
+void Ebml_SerializeData(EbmlGlobal *glob, unsigned long class_id, unsigned char *data, unsigned long data_length) {
+  Ebml_WriteID(glob, class_id);
+  Ebml_WriteLen(glob, data_length);
+  Ebml_Write(glob,  data, data_length);
+}
+
+void Ebml_WriteVoid(EbmlGlobal *glob, unsigned long vSize) {
+  unsigned char tmp = 0;
+  unsigned long i = 0;
+
+  Ebml_WriteID(glob, 0xEC);
+  Ebml_WriteLen(glob, vSize);
+
+  for (i = 0; i < vSize; i++) {
+    Ebml_Write(glob, &tmp, 1);
+  }
+}
+
+/* TODO Serialize Date */
--- a/third_party/libmkv/EbmlWriter.h
+++ b/third_party/libmkv/EbmlWriter.h
@@ -0,0 +1,42 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef EBMLWRITER_HPP
+#define EBMLWRITER_HPP
+#include <stddef.h>
+#include "vpx/vpx_integer.h"
+
+/* note: you must define write and serialize functions as well as your own
+ * EBML_GLOBAL
+ *
+ * These functions MUST be implemented
+ */
+
+typedef struct EbmlGlobal EbmlGlobal;
+void  Ebml_Serialize(EbmlGlobal *glob, const void *, int, unsigned long);
+void  Ebml_Write(EbmlGlobal *glob, const void *, unsigned long);
+
+/*****/
+
+void Ebml_WriteLen(EbmlGlobal *glob, int64_t val);
+void Ebml_WriteString(EbmlGlobal *glob, const char *str);
+void Ebml_WriteUTF8(EbmlGlobal *glob, const wchar_t *wstr);
+void Ebml_WriteID(EbmlGlobal *glob, unsigned long class_id);
+void Ebml_SerializeUnsigned64(EbmlGlobal *glob, unsigned long class_id, uint64_t ui);
+void Ebml_SerializeUnsigned(EbmlGlobal *glob, unsigned long class_id, unsigned long ui);
+void Ebml_SerializeBinary(EbmlGlobal *glob, unsigned long class_id, unsigned long ui);
+void Ebml_SerializeFloat(EbmlGlobal *glob, unsigned long class_id, double d);
+/* TODO make this more generic to signed */
+void Ebml_WriteSigned16(EbmlGlobal *glob, short val);
+void Ebml_SerializeString(EbmlGlobal *glob, unsigned long class_id, const char *s);
+void Ebml_SerializeUTF8(EbmlGlobal *glob, unsigned long class_id, wchar_t *s);
+void Ebml_SerializeData(EbmlGlobal *glob, unsigned long class_id, unsigned char *data, unsigned long data_length);
+void Ebml_WriteVoid(EbmlGlobal *glob, unsigned long vSize);
+/* TODO need date function */
+#endif
--- a/tools_common.h
+++ b/tools_common.h
@@ -90,6 +90,7 @@ struct VpxInputContext {
  uint32_t width;
  uint32_t height;
  vpx_img_fmt_t fmt;
+  vpx_bit_depth_t bit_depth;
  int only_i420;
  uint32_t fourcc;
  struct VpxRational framerate;
--- a/vp8/common/postproc.c
+++ b/vp8/common/postproc.c
@@ -393,12 +393,12 @@ void vp8_de_noise(VP8_COMMON                 *cm,
                  int                         low_var_thresh,
                  int                         flag)
 {
+    int mbr;
    double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
    int ppl = (int)(level + .5);
-    int mb_rows = source->y_width >> 4;
-    int mb_cols = source->y_height >> 4;
+    int mb_rows = cm->mb_rows;
+    int mb_cols = cm->mb_cols;
    unsigned char *limits = cm->pp_limits_buffer;;
-    int mbr, mbc;
    (void) post;
    (void) low_var_thresh;
    (void) flag;
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl
@@ -463,9 +463,7 @@ $vp8_short_walsh4x4_neon_asm=vp8_short_walsh4x4_neon;
 # Quantizer
 #
 add_proto qw/void vp8_regular_quantize_b/, "struct block *, struct blockd *";
-specialize qw/vp8_regular_quantize_b sse2/;
-# TODO(johann) Update sse4 implementation and re-enable
-#$vp8_regular_quantize_b_sse4_1=vp8_regular_quantize_b_sse4;
+specialize qw/vp8_regular_quantize_b sse2 sse4_1/;

 add_proto qw/void vp8_fast_quantize_b/, "struct block *, struct blockd *";
 specialize qw/vp8_fast_quantize_b sse2 ssse3 media neon_asm/;
@@ -554,6 +552,9 @@ $vp8_yv12_copy_partial_frame_neon_asm=vp8_yv12_copy_partial_frame_neon;
 if (vpx_config("CONFIG_TEMPORAL_DENOISING") eq "yes") {
    add_proto qw/int vp8_denoiser_filter/, "unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising";
    specialize qw/vp8_denoiser_filter sse2 neon/;
+    add_proto qw/int vp8_denoiser_filter_uv/, "unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising";
+    specialize qw/vp8_denoiser_filter_uv sse2/;
+
 }

 # End of encoder only functions
--- a/vp8/encoder/bitstream.h
+++ b/vp8/encoder/bitstream.h
@@ -18,18 +18,18 @@ extern "C" {

 #if HAVE_EDSP
 void vp8cx_pack_tokens_armv5(vp8_writer *w, const TOKENEXTRA *p, int xcount,
-                             const vp8_token *,
+                             vp8_token *,
                             const vp8_extra_bit_struct *,
                             const vp8_tree_index *);
 void vp8cx_pack_tokens_into_partitions_armv5(VP8_COMP *,
                                             unsigned char * cx_data,
                                             const unsigned char *cx_data_end,
                                             int num_parts,
-                                             const vp8_token *,
+                                             vp8_token *,
                                             const vp8_extra_bit_struct *,
                                             const vp8_tree_index *);
 void vp8cx_pack_mb_row_tokens_armv5(VP8_COMP *cpi, vp8_writer *w,
-                                    const vp8_token *,
+                                    vp8_token *,
                                    const vp8_extra_bit_struct *,
                                    const vp8_tree_index *);
 # define pack_tokens(a,b,c)                  \
--- a/vp8/encoder/denoising.c
+++ b/vp8/encoder/denoising.c
@@ -191,6 +191,148 @@ int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride,
    return FILTER_BLOCK;
 }

+int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv,
+                             int mc_avg_uv_stride,
+                             unsigned char *running_avg_uv,
+                             int avg_uv_stride,
+                             unsigned char *sig,
+                             int sig_stride,
+                             unsigned int motion_magnitude,
+                             int increase_denoising) {
+    unsigned char *running_avg_uv_start = running_avg_uv;
+    unsigned char *sig_start = sig;
+    int sum_diff_thresh;
+    int r, c;
+    int sum_diff = 0;
+    int sum_block = 0;
+    int adj_val[3] = {3, 4, 6};
+    int shift_inc1 = 0;
+    int shift_inc2 = 1;
+    /* If motion_magnitude is small, making the denoiser more aggressive by
+     * increasing the adjustment for each level. Add another increment for
+     * blocks that are labeled for increase denoising. */
+    if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV) {
+      if (increase_denoising) {
+        shift_inc1 = 1;
+        shift_inc2 = 2;
+      }
+      adj_val[0] += shift_inc2;
+      adj_val[1] += shift_inc2;
+      adj_val[2] += shift_inc2;
+    }
+
+    // Avoid denoising color signal if its close to average level.
+    for (r = 0; r < 8; ++r) {
+      for (c = 0; c < 8; ++c) {
+        sum_block += sig[c];
+      }
+      sig += sig_stride;
+    }
+    if (abs(sum_block - (128 * 8 * 8)) < SUM_DIFF_FROM_AVG_THRESH_UV) {
+      return COPY_BLOCK;
+    }
+
+    sig -= sig_stride * 8;
+    for (r = 0; r < 8; ++r) {
+      for (c = 0; c < 8; ++c) {
+        int diff = 0;
+        int adjustment = 0;
+        int absdiff = 0;
+
+        diff = mc_running_avg_uv[c] - sig[c];
+        absdiff = abs(diff);
+
+        // When |diff| <= |3 + shift_inc1|, use pixel value from
+        // last denoised raw.
+        if (absdiff <= 3 + shift_inc1) {
+          running_avg_uv[c] = mc_running_avg_uv[c];
+          sum_diff += diff;
+        } else {
+          if (absdiff >= 4 && absdiff <= 7)
+            adjustment = adj_val[0];
+          else if (absdiff >= 8 && absdiff <= 15)
+            adjustment = adj_val[1];
+          else
+            adjustment = adj_val[2];
+          if (diff > 0) {
+            if ((sig[c] + adjustment) > 255)
+              running_avg_uv[c] = 255;
+            else
+              running_avg_uv[c] = sig[c] + adjustment;
+            sum_diff += adjustment;
+          } else {
+            if ((sig[c] - adjustment) < 0)
+              running_avg_uv[c] = 0;
+            else
+              running_avg_uv[c] = sig[c] - adjustment;
+            sum_diff -= adjustment;
+          }
+        }
+      }
+      /* Update pointers for next iteration. */
+      sig += sig_stride;
+      mc_running_avg_uv += mc_avg_uv_stride;
+      running_avg_uv += avg_uv_stride;
+    }
+
+    sum_diff_thresh= SUM_DIFF_THRESHOLD_UV;
+    if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH_UV;
+    if (abs(sum_diff) > sum_diff_thresh) {
+      // Before returning to copy the block (i.e., apply no denoising), check
+      // if we can still apply some (weaker) temporal filtering to this block,
+      // that would otherwise not be denoised at all. Simplest is to apply
+      // an additional adjustment to running_avg_y to bring it closer to sig.
+      // The adjustment is capped by a maximum delta, and chosen such that
+      // in most cases the resulting sum_diff will be within the
+      // accceptable range given by sum_diff_thresh.
+
+      // The delta is set by the excess of absolute pixel diff over threshold.
+      int delta = ((abs(sum_diff) - sum_diff_thresh) >> 8) + 1;
+      // Only apply the adjustment for max delta up to 3.
+      if (delta < 4) {
+        sig -= sig_stride * 8;
+        mc_running_avg_uv -= mc_avg_uv_stride * 8;
+        running_avg_uv -= avg_uv_stride * 8;
+        for (r = 0; r < 8; ++r) {
+          for (c = 0; c < 8; ++c) {
+            int diff = mc_running_avg_uv[c] - sig[c];
+            int adjustment = abs(diff);
+            if (adjustment > delta)
+              adjustment = delta;
+            if (diff > 0) {
+              // Bring denoised signal down.
+              if (running_avg_uv[c] - adjustment < 0)
+                running_avg_uv[c] = 0;
+              else
+                running_avg_uv[c] = running_avg_uv[c] - adjustment;
+              sum_diff -= adjustment;
+            } else if (diff < 0) {
+              // Bring denoised signal up.
+              if (running_avg_uv[c] + adjustment > 255)
+                running_avg_uv[c] = 255;
+              else
+                running_avg_uv[c] = running_avg_uv[c] + adjustment;
+              sum_diff += adjustment;
+            }
+          }
+          // TODO(marpan): Check here if abs(sum_diff) has gone below the
+          // threshold sum_diff_thresh, and if so, we can exit the row loop.
+          sig += sig_stride;
+          mc_running_avg_uv += mc_avg_uv_stride;
+          running_avg_uv += avg_uv_stride;
+        }
+        if (abs(sum_diff) > sum_diff_thresh)
+          return COPY_BLOCK;
+      } else {
+        return COPY_BLOCK;
+      }
+    }
+
+    vp8_copy_mem8x8(running_avg_uv_start, avg_uv_stride, sig_start,
+                    sig_stride);
+    return FILTER_BLOCK;
+}
+
 int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height,
                          int num_mb_rows, int num_mb_cols)
 {
@@ -241,6 +383,7 @@ void vp8_denoiser_free(VP8_DENOISER *denoiser)
        vp8_yv12_de_alloc_frame_buffer(&denoiser->yv12_running_avg[i]);
    }
    vp8_yv12_de_alloc_frame_buffer(&denoiser->yv12_mc_running_avg);
+    vpx_free(denoiser->denoise_state);
 }


@@ -260,6 +403,8 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
    unsigned int motion_magnitude2;
    unsigned int sse_thresh;
    int sse_diff_thresh = 0;
+    // Denoise the UV channel.
+    int apply_color_denoise = 0;
    // Spatial loop filter: only applied selectively based on
    // temporal filter state of block relative to top/left neighbors.
    int apply_spatial_loop_filter = 1;
@@ -267,6 +412,8 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
    MV_REFERENCE_FRAME zero_frame = x->best_zeromv_reference_frame;

    enum vp8_denoiser_decision decision = FILTER_BLOCK;
+    enum vp8_denoiser_decision decision_u = FILTER_BLOCK;
+    enum vp8_denoiser_decision decision_v = FILTER_BLOCK;

    if (zero_frame)
    {
@@ -376,11 +523,37 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,

        /* Filter. */
        decision = vp8_denoiser_filter(mc_running_avg_y, mc_avg_y_stride,
-                                         running_avg_y, avg_y_stride,
-                                         x->thismb, 16, motion_magnitude2,
-                                         x->increase_denoising);
+                                       running_avg_y, avg_y_stride,
+                                       x->thismb, 16, motion_magnitude2,
+                                       x->increase_denoising);
        denoiser->denoise_state[block_index] = motion_magnitude2 > 0 ?
            kFilterNonZeroMV : kFilterZeroMV;
+        // Only denoise UV for zero motion, and if y channel was denoised.
+        if (apply_color_denoise &&
+            motion_magnitude2 == 0 &&
+            decision == FILTER_BLOCK) {
+          unsigned char *mc_running_avg_u =
+              denoiser->yv12_mc_running_avg.u_buffer + recon_uvoffset;
+          unsigned char *running_avg_u =
+              denoiser->yv12_running_avg[INTRA_FRAME].u_buffer + recon_uvoffset;
+          unsigned char *mc_running_avg_v =
+              denoiser->yv12_mc_running_avg.v_buffer + recon_uvoffset;
+          unsigned char *running_avg_v =
+              denoiser->yv12_running_avg[INTRA_FRAME].v_buffer + recon_uvoffset;
+          int mc_avg_uv_stride = denoiser->yv12_mc_running_avg.uv_stride;
+          int avg_uv_stride = denoiser->yv12_running_avg[INTRA_FRAME].uv_stride;
+          int signal_stride = x->block[16].src_stride;
+          decision_u =
+              vp8_denoiser_filter_uv(mc_running_avg_u, mc_avg_uv_stride,
+                                      running_avg_u, avg_uv_stride,
+                                      x->block[16].src + *x->block[16].base_src,
+                                      signal_stride, motion_magnitude2, 0);
+          decision_v =
+              vp8_denoiser_filter_uv(mc_running_avg_v, mc_avg_uv_stride,
+                                      running_avg_v, avg_uv_stride,
+                                      x->block[20].src + *x->block[20].base_src,
+                                      signal_stride, motion_magnitude2, 0);
+        }
    }
    if (decision == COPY_BLOCK)
    {
@@ -393,7 +566,21 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
                denoiser->yv12_running_avg[INTRA_FRAME].y_stride);
        denoiser->denoise_state[block_index] = kNoFilter;
    }
-    // Option to selectively deblock the denoised signal.
+    if (apply_color_denoise) {
+      if (decision_u == COPY_BLOCK) {
+        vp8_copy_mem8x8(
+            x->block[16].src + *x->block[16].base_src, x->block[16].src_stride,
+            denoiser->yv12_running_avg[INTRA_FRAME].u_buffer + recon_uvoffset,
+            denoiser->yv12_running_avg[INTRA_FRAME].uv_stride);
+      }
+      if (decision_v == COPY_BLOCK) {
+        vp8_copy_mem8x8(
+            x->block[20].src + *x->block[20].base_src, x->block[16].src_stride,
+            denoiser->yv12_running_avg[INTRA_FRAME].v_buffer + recon_uvoffset,
+            denoiser->yv12_running_avg[INTRA_FRAME].uv_stride);
+      }
+    }
+    // Option to selectively deblock the denoised signal, for y channel only.
    if (apply_spatial_loop_filter) {
      loop_filter_info lfi;
      int apply_filter_col = 0;
--- a/vp8/encoder/denoising.h
+++ b/vp8/encoder/denoising.h
@@ -22,6 +22,11 @@ extern "C" {
 #define SUM_DIFF_THRESHOLD_HIGH (16 * 16 * 3)
 #define MOTION_MAGNITUDE_THRESHOLD (8*3)

+#define SUM_DIFF_THRESHOLD_UV (96)   // (8 * 8 * 1.5)
+#define SUM_DIFF_THRESHOLD_HIGH_UV (8 * 8 * 2)
+#define SUM_DIFF_FROM_AVG_THRESH_UV (8 * 8 * 4)
+#define MOTION_MAGNITUDE_THRESHOLD_UV (8*3)
+
 enum vp8_denoiser_decision
 {
  COPY_BLOCK,
--- a/vp8/encoder/x86/denoising_sse2.c
+++ b/vp8/encoder/x86/denoising_sse2.c
@@ -17,10 +17,23 @@
 #include <emmintrin.h>
 #include "vpx_ports/emmintrin_compat.h"

-union sum_union {
-    __m128i v;
-    signed char e[16];
-};
+/* Compute the sum of all pixel differences of this MB. */
+static INLINE unsigned int abs_sum_diff_16x1(__m128i acc_diff) {
+  const __m128i k_1 = _mm_set1_epi16(1);
+  const __m128i acc_diff_lo = _mm_srai_epi16(
+      _mm_unpacklo_epi8(acc_diff, acc_diff), 8);
+  const __m128i acc_diff_hi = _mm_srai_epi16(
+      _mm_unpackhi_epi8(acc_diff, acc_diff), 8);
+  const __m128i acc_diff_16 = _mm_add_epi16(acc_diff_lo, acc_diff_hi);
+  const __m128i hg_fe_dc_ba = _mm_madd_epi16(acc_diff_16, k_1);
+  const __m128i hgfe_dcba = _mm_add_epi32(hg_fe_dc_ba,
+                                          _mm_srli_si128(hg_fe_dc_ba, 8));
+  const __m128i hgfedcba = _mm_add_epi32(hgfe_dcba,
+                                         _mm_srli_si128(hgfe_dcba, 4));
+  unsigned int sum_diff = _mm_cvtsi128_si32(hgfedcba);
+
+  return abs(sum_diff);
+}

 int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y,
                             int mc_avg_y_stride,
@@ -31,7 +44,7 @@ int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y,
 {
    unsigned char *running_avg_y_start = running_avg_y;
    unsigned char *sig_start = sig;
-    int sum_diff_thresh;
+    unsigned int sum_diff_thresh;
    int r;
    int shift_inc  = (increase_denoising &&
        motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 1 : 0;
@@ -103,16 +116,10 @@ int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y,

    {
        /* Compute the sum of all pixel differences of this MB. */
-        union sum_union s;
-        int sum_diff = 0;
-        s.v = acc_diff;
-        sum_diff = s.e[0] + s.e[1] + s.e[2] + s.e[3] + s.e[4] + s.e[5]
-                 + s.e[6] + s.e[7] + s.e[8] + s.e[9] + s.e[10] + s.e[11]
-                 + s.e[12] + s.e[13] + s.e[14] + s.e[15];
-
+        unsigned int abs_sum_diff = abs_sum_diff_16x1(acc_diff);
        sum_diff_thresh = SUM_DIFF_THRESHOLD;
        if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH;
-        if (abs(sum_diff) > sum_diff_thresh) {
+        if (abs_sum_diff > sum_diff_thresh) {
          // Before returning to copy the block (i.e., apply no denoising),
          // checK if we can still apply some (weaker) temporal filtering to
          // this block, that would otherwise not be denoised at all. Simplest
@@ -123,7 +130,7 @@ int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y,

          // The delta is set by the excess of absolute pixel diff over the
          // threshold.
-          int delta = ((abs(sum_diff) - sum_diff_thresh) >> 8) + 1;
+          int delta = ((abs_sum_diff - sum_diff_thresh) >> 8) + 1;
          // Only apply the adjustment for max delta up to 3.
          if (delta < 4) {
            const __m128i k_delta = _mm_set1_epi8(delta);
@@ -162,16 +169,9 @@ int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y,
             mc_running_avg_y += mc_avg_y_stride;
             running_avg_y += avg_y_stride;
            }
-            {
-              // Update the sum of all pixel differences of this MB.
-              union sum_union s;
-              s.v = acc_diff;
-              sum_diff = s.e[0] + s.e[1] + s.e[2] + s.e[3] + s.e[4] + s.e[5]
-                       + s.e[6] + s.e[7] + s.e[8] + s.e[9] + s.e[10] + s.e[11]
-                       + s.e[12] + s.e[13] + s.e[14] + s.e[15];
-              if (abs(sum_diff) > sum_diff_thresh) {
-                return COPY_BLOCK;
-              }
+            abs_sum_diff = abs_sum_diff_16x1(acc_diff);
+            if (abs_sum_diff > sum_diff_thresh) {
+              return COPY_BLOCK;
            }
          } else {
            return COPY_BLOCK;
@@ -182,3 +182,198 @@ int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y,
    vp8_copy_mem16x16(running_avg_y_start, avg_y_stride, sig_start, sig_stride);
    return FILTER_BLOCK;
 }
+
+int vp8_denoiser_filter_uv_sse2(unsigned char *mc_running_avg,
+                             int mc_avg_stride,
+                             unsigned char *running_avg, int avg_stride,
+                             unsigned char *sig, int sig_stride,
+                             unsigned int motion_magnitude,
+                             int increase_denoising) {
+    unsigned char *running_avg_start = running_avg;
+    unsigned char *sig_start = sig;
+    unsigned int sum_diff_thresh;
+    int r;
+    int shift_inc  = (increase_denoising &&
+        motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV) ? 1 : 0;
+    __m128i acc_diff = _mm_setzero_si128();
+    const __m128i k_0 = _mm_setzero_si128();
+    const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);
+    const __m128i k_8 = _mm_set1_epi8(8);
+    const __m128i k_16 = _mm_set1_epi8(16);
+    /* Modify each level's adjustment according to motion_magnitude. */
+    const __m128i l3 = _mm_set1_epi8(
+                       (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV) ?
+                        7 + shift_inc : 6);
+    /* Difference between level 3 and level 2 is 2. */
+    const __m128i l32 = _mm_set1_epi8(2);
+    /* Difference between level 2 and level 1 is 1. */
+    const __m128i l21 = _mm_set1_epi8(1);
+
+    {
+      const __m128i k_1 = _mm_set1_epi16(1);
+      __m128i vec_sum_block = _mm_setzero_si128();
+
+      // Avoid denoising color signal if its close to average level.
+      for (r = 0; r < 8; ++r) {
+        const __m128i v_sig = _mm_loadl_epi64((__m128i *)(&sig[0]));
+        const __m128i v_sig_unpack = _mm_unpacklo_epi8(v_sig, k_0);
+        vec_sum_block = _mm_add_epi16(vec_sum_block, v_sig_unpack);
+        sig += sig_stride;
+      }
+      sig -= sig_stride * 8;
+      {
+        const __m128i hg_fe_dc_ba = _mm_madd_epi16(vec_sum_block, k_1);
+        const __m128i hgfe_dcba = _mm_add_epi32(hg_fe_dc_ba,
+                                                _mm_srli_si128(hg_fe_dc_ba, 8));
+        const __m128i hgfedcba = _mm_add_epi32(hgfe_dcba,
+                                               _mm_srli_si128(hgfe_dcba, 4));
+        const int sum_block = _mm_cvtsi128_si32(hgfedcba);
+        if (abs(sum_block - (128 * 8 * 8)) < SUM_DIFF_FROM_AVG_THRESH_UV) {
+          return COPY_BLOCK;
+        }
+      }
+    }
+
+    for (r = 0; r < 4; ++r) {
+        /* Calculate differences */
+        const __m128i v_sig_low = _mm_castpd_si128(
+            _mm_load_sd((double *)(&sig[0])));
+        const __m128i v_sig = _mm_castpd_si128(
+            _mm_loadh_pd(_mm_castsi128_pd(v_sig_low),
+                         (double *)(&sig[sig_stride])));
+        const __m128i v_mc_running_avg_low = _mm_castpd_si128(
+            _mm_load_sd((double *)(&mc_running_avg[0])));
+        const __m128i v_mc_running_avg = _mm_castpd_si128(
+            _mm_loadh_pd(_mm_castsi128_pd(v_mc_running_avg_low),
+                         (double *)(&mc_running_avg[mc_avg_stride])));
+        const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg, v_sig);
+        const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg);
+        /* Obtain the sign. FF if diff is negative. */
+        const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
+        /* Clamp absolute difference to 16 to be used to get mask. Doing this
+         * allows us to use _mm_cmpgt_epi8, which operates on signed byte. */
+        const __m128i clamped_absdiff = _mm_min_epu8(
+                                        _mm_or_si128(pdiff, ndiff), k_16);
+        /* Get masks for l2 l1 and l0 adjustments */
+        const __m128i mask2 = _mm_cmpgt_epi8(k_16, clamped_absdiff);
+        const __m128i mask1 = _mm_cmpgt_epi8(k_8, clamped_absdiff);
+        const __m128i mask0 = _mm_cmpgt_epi8(k_4, clamped_absdiff);
+        /* Get adjustments for l2, l1, and l0 */
+        __m128i adj2 = _mm_and_si128(mask2, l32);
+        const __m128i adj1 = _mm_and_si128(mask1, l21);
+        const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff);
+        __m128i adj,  padj, nadj;
+        __m128i v_running_avg;
+
+        /* Combine the adjustments and get absolute adjustments. */
+        adj2 = _mm_add_epi8(adj2, adj1);
+        adj = _mm_sub_epi8(l3, adj2);
+        adj = _mm_andnot_si128(mask0, adj);
+        adj = _mm_or_si128(adj, adj0);
+
+        /* Restore the sign and get positive and negative adjustments. */
+        padj = _mm_andnot_si128(diff_sign, adj);
+        nadj = _mm_and_si128(diff_sign, adj);
+
+        /* Calculate filtered value. */
+        v_running_avg = _mm_adds_epu8(v_sig, padj);
+        v_running_avg = _mm_subs_epu8(v_running_avg, nadj);
+
+        _mm_storel_pd((double *)&running_avg[0],
+                      _mm_castsi128_pd(v_running_avg));
+        _mm_storeh_pd((double *)&running_avg[avg_stride],
+                      _mm_castsi128_pd(v_running_avg));
+
+        /* Adjustments <=7, and each element in acc_diff can fit in signed
+         * char.
+         */
+        acc_diff = _mm_adds_epi8(acc_diff, padj);
+        acc_diff = _mm_subs_epi8(acc_diff, nadj);
+
+        /* Update pointers for next iteration. */
+        sig += sig_stride * 2;
+        mc_running_avg += mc_avg_stride * 2;
+        running_avg += avg_stride * 2;
+    }
+
+    {
+        unsigned int abs_sum_diff = abs_sum_diff_16x1(acc_diff);
+        sum_diff_thresh = SUM_DIFF_THRESHOLD_UV;
+        if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH_UV;
+        if (abs_sum_diff > sum_diff_thresh) {
+          // Before returning to copy the block (i.e., apply no denoising),
+          // checK if we can still apply some (weaker) temporal filtering to
+          // this block, that would otherwise not be denoised at all. Simplest
+          // is to apply an additional adjustment to running_avg_y to bring it
+          // closer to sig. The adjustment is capped by a maximum delta, and
+          // chosen such that in most cases the resulting sum_diff will be
+          // within the accceptable range given by sum_diff_thresh.
+
+          // The delta is set by the excess of absolute pixel diff over the
+          // threshold.
+          int delta = ((abs_sum_diff - sum_diff_thresh) >> 8) + 1;
+          // Only apply the adjustment for max delta up to 3.
+          if (delta < 4) {
+            const __m128i k_delta = _mm_set1_epi8(delta);
+            sig -= sig_stride * 8;
+            mc_running_avg -= mc_avg_stride * 8;
+            running_avg -= avg_stride * 8;
+            for (r = 0; r < 4; ++r) {
+              // Calculate differences.
+              const __m128i v_sig_low = _mm_castpd_si128(
+                  _mm_load_sd((double *)(&sig[0])));
+              const __m128i v_sig = _mm_castpd_si128(
+                  _mm_loadh_pd(_mm_castsi128_pd(v_sig_low),
+                               (double *)(&sig[sig_stride])));
+              const __m128i v_mc_running_avg_low = _mm_castpd_si128(
+                  _mm_load_sd((double *)(&mc_running_avg[0])));
+              const __m128i v_mc_running_avg = _mm_castpd_si128(
+                  _mm_loadh_pd(_mm_castsi128_pd(v_mc_running_avg_low),
+                               (double *)(&mc_running_avg[mc_avg_stride])));
+              const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg, v_sig);
+              const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg);
+              // Obtain the sign. FF if diff is negative.
+              const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
+              // Clamp absolute difference to delta to get the adjustment.
+              const __m128i adj =
+                  _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta);
+              // Restore the sign and get positive and negative adjustments.
+              __m128i padj, nadj;
+              const __m128i v_running_avg_low = _mm_castpd_si128(
+                  _mm_load_sd((double *)(&running_avg[0])));
+              __m128i v_running_avg = _mm_castpd_si128(
+                  _mm_loadh_pd(_mm_castsi128_pd(v_running_avg_low),
+                               (double *)(&running_avg[avg_stride])));
+              padj = _mm_andnot_si128(diff_sign, adj);
+              nadj = _mm_and_si128(diff_sign, adj);
+              // Calculate filtered value.
+              v_running_avg = _mm_subs_epu8(v_running_avg, padj);
+              v_running_avg = _mm_adds_epu8(v_running_avg, nadj);
+
+              _mm_storel_pd((double *)&running_avg[0],
+                            _mm_castsi128_pd(v_running_avg));
+              _mm_storeh_pd((double *)&running_avg[avg_stride],
+                            _mm_castsi128_pd(v_running_avg));
+
+             // Accumulate the adjustments.
+             acc_diff = _mm_subs_epi8(acc_diff, padj);
+             acc_diff = _mm_adds_epi8(acc_diff, nadj);
+
+             // Update pointers for next iteration.
+             sig += sig_stride * 2;
+             mc_running_avg += mc_avg_stride * 2;
+             running_avg += avg_stride * 2;
+            }
+            abs_sum_diff = abs_sum_diff_16x1(acc_diff);
+            if (abs_sum_diff > sum_diff_thresh) {
+              return COPY_BLOCK;
+            }
+          } else {
+            return COPY_BLOCK;
+          }
+        }
+    }
+
+    vp8_copy_mem8x8(running_avg_start, avg_stride, sig_start, sig_stride);
+    return FILTER_BLOCK;
+}
--- a/vp8/encoder/x86/quantize_sse2.c
+++ b/vp8/encoder/x86/quantize_sse2.c
@@ -26,11 +26,10 @@
        int cmp = (x[z] < boost) | (y[z] == 0); \
        zbin_boost_ptr++; \
        if (cmp) \
-            goto select_eob_end_##i; \
+            break; \
        qcoeff_ptr[z] = y[z]; \
        eob = i; \
        zbin_boost_ptr = b->zrun_zbin_boost; \
-        select_eob_end_##i:; \
    } while (0)

 void vp8_regular_quantize_b_sse2(BLOCK *b, BLOCKD *d)
--- a/vp8/encoder/x86/quantize_sse4.asm
+++ b/vp8/encoder/x86/quantize_sse4.asm
@@ -1,256 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-%include "vp8_asm_enc_offsets.asm"
-
-
-; void vp8_regular_quantize_b_sse4 | arg
-;  (BLOCK  *b,                     |  0
-;   BLOCKD *d)                     |  1
-
-global sym(vp8_regular_quantize_b_sse4) PRIVATE
-sym(vp8_regular_quantize_b_sse4):
-
-%if ABI_IS_32BIT
-    push        rbp
-    mov         rbp, rsp
-    GET_GOT     rbx
-    push        rdi
-    push        rsi
-
-    ALIGN_STACK 16, rax
-    %define qcoeff      0 ; 32
-    %define stack_size 32
-    sub         rsp, stack_size
-%else
-  %if LIBVPX_YASM_WIN64
-    SAVE_XMM 8, u
-    push        rdi
-    push        rsi
-  %endif
-%endif
-    ; end prolog
-
-%if ABI_IS_32BIT
-    mov         rdi, arg(0)                 ; BLOCK *b
-    mov         rsi, arg(1)                 ; BLOCKD *d
-%else
-  %if LIBVPX_YASM_WIN64
-    mov         rdi, rcx                    ; BLOCK *b
-    mov         rsi, rdx                    ; BLOCKD *d
-  %else
-    ;mov         rdi, rdi                    ; BLOCK *b
-    ;mov         rsi, rsi                    ; BLOCKD *d
-  %endif
-%endif
-
-    mov         rax, [rdi + vp8_block_coeff]
-    mov         rcx, [rdi + vp8_block_zbin]
-    mov         rdx, [rdi + vp8_block_round]
-    movd        xmm7, [rdi + vp8_block_zbin_extra]
-
-    ; z
-    movdqa      xmm0, [rax]
-    movdqa      xmm1, [rax + 16]
-
-    ; duplicate zbin_oq_value
-    pshuflw     xmm7, xmm7, 0
-    punpcklwd   xmm7, xmm7
-
-    movdqa      xmm2, xmm0
-    movdqa      xmm3, xmm1
-
-    ; sz
-    psraw       xmm0, 15
-    psraw       xmm1, 15
-
-    ; (z ^ sz)
-    pxor        xmm2, xmm0
-    pxor        xmm3, xmm1
-
-    ; x = abs(z)
-    psubw       xmm2, xmm0
-    psubw       xmm3, xmm1
-
-    ; zbin
-    movdqa      xmm4, [rcx]
-    movdqa      xmm5, [rcx + 16]
-
-    ; *zbin_ptr + zbin_oq_value
-    paddw       xmm4, xmm7
-    paddw       xmm5, xmm7
-
-    movdqa      xmm6, xmm2
-    movdqa      xmm7, xmm3
-
-    ; x - (*zbin_ptr + zbin_oq_value)
-    psubw       xmm6, xmm4
-    psubw       xmm7, xmm5
-
-    ; round
-    movdqa      xmm4, [rdx]
-    movdqa      xmm5, [rdx + 16]
-
-    mov         rax, [rdi + vp8_block_quant_shift]
-    mov         rcx, [rdi + vp8_block_quant]
-    mov         rdx, [rdi + vp8_block_zrun_zbin_boost]
-
-    ; x + round
-    paddw       xmm2, xmm4
-    paddw       xmm3, xmm5
-
-    ; quant
-    movdqa      xmm4, [rcx]
-    movdqa      xmm5, [rcx + 16]
-
-    ; y = x * quant_ptr >> 16
-    pmulhw      xmm4, xmm2
-    pmulhw      xmm5, xmm3
-
-    ; y += x
-    paddw       xmm2, xmm4
-    paddw       xmm3, xmm5
-
-    pxor        xmm4, xmm4
-%if ABI_IS_32BIT
-    movdqa      [rsp + qcoeff], xmm4
-    movdqa      [rsp + qcoeff + 16], xmm4
-%else
-    pxor        xmm8, xmm8
-%endif
-
-    ; quant_shift
-    movdqa      xmm5, [rax]
-
-    ; zrun_zbin_boost
-    mov         rax, rdx
-
-%macro ZIGZAG_LOOP 5
-    ; x
-    pextrw      ecx, %4, %2
-
-    ; if (x >= zbin)
-    sub         cx, WORD PTR[rdx]           ; x - zbin
-    lea         rdx, [rdx + 2]              ; zbin_boost_ptr++
-    jl          .rq_zigzag_loop_%1          ; x < zbin
-
-    pextrw      edi, %3, %2                 ; y
-
-    ; downshift by quant_shift[rc]
-    pextrb      ecx, xmm5, %1               ; quant_shift[rc]
-    sar         edi, cl                     ; also sets Z bit
-    je          .rq_zigzag_loop_%1          ; !y
-%if ABI_IS_32BIT
-    mov         WORD PTR[rsp + qcoeff + %1 *2], di
-%else
-    pinsrw      %5, edi, %2                 ; qcoeff[rc]
-%endif
-    mov         rdx, rax                    ; reset to b->zrun_zbin_boost
-.rq_zigzag_loop_%1:
-%endmacro
-; in vp8_default_zig_zag1d order: see vp8/common/entropy.c
-ZIGZAG_LOOP  0, 0, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  1, 1, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  4, 4, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  8, 0, xmm3, xmm7, xmm8
-ZIGZAG_LOOP  5, 5, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  2, 2, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  3, 3, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  6, 6, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  9, 1, xmm3, xmm7, xmm8
-ZIGZAG_LOOP 12, 4, xmm3, xmm7, xmm8
-ZIGZAG_LOOP 13, 5, xmm3, xmm7, xmm8
-ZIGZAG_LOOP 10, 2, xmm3, xmm7, xmm8
-ZIGZAG_LOOP  7, 7, xmm2, xmm6, xmm4
-ZIGZAG_LOOP 11, 3, xmm3, xmm7, xmm8
-ZIGZAG_LOOP 14, 6, xmm3, xmm7, xmm8
-ZIGZAG_LOOP 15, 7, xmm3, xmm7, xmm8
-
-    mov         rcx, [rsi + vp8_blockd_dequant]
-    mov         rdi, [rsi + vp8_blockd_dqcoeff]
-
-%if ABI_IS_32BIT
-    movdqa      xmm4, [rsp + qcoeff]
-    movdqa      xmm5, [rsp + qcoeff + 16]
-%else
-    %define     xmm5 xmm8
-%endif
-
-    ; y ^ sz
-    pxor        xmm4, xmm0
-    pxor        xmm5, xmm1
-    ; x = (y ^ sz) - sz
-    psubw       xmm4, xmm0
-    psubw       xmm5, xmm1
-
-    ; dequant
-    movdqa      xmm0, [rcx]
-    movdqa      xmm1, [rcx + 16]
-
-    mov         rcx, [rsi + vp8_blockd_qcoeff]
-
-    pmullw      xmm0, xmm4
-    pmullw      xmm1, xmm5
-
-    ; store qcoeff
-    movdqa      [rcx], xmm4
-    movdqa      [rcx + 16], xmm5
-
-    ; store dqcoeff
-    movdqa      [rdi], xmm0
-    movdqa      [rdi + 16], xmm1
-
-    mov         rcx, [rsi + vp8_blockd_eob]
-
-    ; select the last value (in zig_zag order) for EOB
-    pxor        xmm6, xmm6
-    pcmpeqw     xmm4, xmm6
-    pcmpeqw     xmm5, xmm6
-
-    packsswb    xmm4, xmm5
-    pshufb      xmm4, [GLOBAL(zig_zag1d)]
-    pmovmskb    edx, xmm4
-    xor         rdi, rdi
-    mov         eax, -1
-    xor         dx, ax
-    bsr         eax, edx
-    sub         edi, edx
-    sar         edi, 31
-    add         eax, 1
-    and         eax, edi
-
-    mov         BYTE PTR [rcx], al          ; store eob
-
-    ; begin epilog
-%if ABI_IS_32BIT
-    add         rsp, stack_size
-    pop         rsp
-
-    pop         rsi
-    pop         rdi
-    RESTORE_GOT
-    pop         rbp
-%else
-  %undef xmm5
-  %if LIBVPX_YASM_WIN64
-    pop         rsi
-    pop         rdi
-    RESTORE_XMM
-  %endif
-%endif
-
-    ret
-
-SECTION_RODATA
-align 16
-; vp8/common/entropy.c: vp8_default_zig_zag1d
-zig_zag1d:
-    db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
--- a/vp8/encoder/x86/quantize_sse4.c
+++ b/vp8/encoder/x86/quantize_sse4.c
@@ -0,0 +1,128 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <smmintrin.h> /* SSE4.1 */
+
+#include "./vp8_rtcd.h"
+#include "vp8/encoder/block.h"
+#include "vp8/common/entropy.h" /* vp8_default_inv_zig_zag */
+
+#define SELECT_EOB(i, z, x, y, q) \
+    do { \
+        short boost = *zbin_boost_ptr; \
+        short x_z = _mm_extract_epi16(x, z); \
+        short y_z = _mm_extract_epi16(y, z); \
+        int cmp = (x_z < boost) | (y_z == 0); \
+        zbin_boost_ptr++; \
+        if (cmp) \
+            break; \
+        q = _mm_insert_epi16(q, y_z, z); \
+        eob = i; \
+        zbin_boost_ptr = b->zrun_zbin_boost; \
+    } while (0)
+
+void vp8_regular_quantize_b_sse4_1(BLOCK *b, BLOCKD *d) {
+    char eob = 0;
+    short *zbin_boost_ptr  = b->zrun_zbin_boost;
+
+    __m128i sz0, x0, sz1, x1, y0, y1, x_minus_zbin0, x_minus_zbin1,
+            dqcoeff0, dqcoeff1;
+    __m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift));
+    __m128i quant_shift1 = _mm_load_si128((__m128i *)(b->quant_shift + 8));
+    __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));
+    __m128i z1 = _mm_load_si128((__m128i *)(b->coeff+8));
+    __m128i zbin_extra = _mm_cvtsi32_si128(b->zbin_extra);
+    __m128i zbin0 = _mm_load_si128((__m128i *)(b->zbin));
+    __m128i zbin1 = _mm_load_si128((__m128i *)(b->zbin + 8));
+    __m128i round0 = _mm_load_si128((__m128i *)(b->round));
+    __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8));
+    __m128i quant0 = _mm_load_si128((__m128i *)(b->quant));
+    __m128i quant1 = _mm_load_si128((__m128i *)(b->quant + 8));
+    __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));
+    __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));
+    __m128i qcoeff0 = _mm_setzero_si128();
+    __m128i qcoeff1 = _mm_setzero_si128();
+
+    /* Duplicate to all lanes. */
+    zbin_extra = _mm_shufflelo_epi16(zbin_extra, 0);
+    zbin_extra = _mm_unpacklo_epi16(zbin_extra, zbin_extra);
+
+    /* Sign of z: z >> 15 */
+    sz0 = _mm_srai_epi16(z0, 15);
+    sz1 = _mm_srai_epi16(z1, 15);
+
+    /* x = abs(z): (z ^ sz) - sz */
+    x0 = _mm_xor_si128(z0, sz0);
+    x1 = _mm_xor_si128(z1, sz1);
+    x0 = _mm_sub_epi16(x0, sz0);
+    x1 = _mm_sub_epi16(x1, sz1);
+
+    /* zbin[] + zbin_extra */
+    zbin0 = _mm_add_epi16(zbin0, zbin_extra);
+    zbin1 = _mm_add_epi16(zbin1, zbin_extra);
+
+    /* In C x is compared to zbin where zbin = zbin[] + boost + extra. Rebalance
+     * the equation because boost is the only value which can change:
+     * x - (zbin[] + extra) >= boost */
+    x_minus_zbin0 = _mm_sub_epi16(x0, zbin0);
+    x_minus_zbin1 = _mm_sub_epi16(x1, zbin1);
+
+    /* All the remaining calculations are valid whether they are done now with
+     * simd or later inside the loop one at a time. */
+    x0 = _mm_add_epi16(x0, round0);
+    x1 = _mm_add_epi16(x1, round1);
+
+    y0 = _mm_mulhi_epi16(x0, quant0);
+    y1 = _mm_mulhi_epi16(x1, quant1);
+
+    y0 = _mm_add_epi16(y0, x0);
+    y1 = _mm_add_epi16(y1, x1);
+
+    /* Instead of shifting each value independently we convert the scaling
+     * factor with 1 << (16 - shift) so we can use multiply/return high half. */
+    y0 = _mm_mulhi_epi16(y0, quant_shift0);
+    y1 = _mm_mulhi_epi16(y1, quant_shift1);
+
+    /* Return the sign: (y ^ sz) - sz */
+    y0 = _mm_xor_si128(y0, sz0);
+    y1 = _mm_xor_si128(y1, sz1);
+    y0 = _mm_sub_epi16(y0, sz0);
+    y1 = _mm_sub_epi16(y1, sz1);
+
+    /* The loop gets unrolled anyway. Avoid the vp8_default_zig_zag1d lookup. */
+    SELECT_EOB(1, 0, x_minus_zbin0, y0, qcoeff0);
+    SELECT_EOB(2, 1, x_minus_zbin0, y0, qcoeff0);
+    SELECT_EOB(3, 4, x_minus_zbin0, y0, qcoeff0);
+    SELECT_EOB(4, 0, x_minus_zbin1, y1, qcoeff1);
+    SELECT_EOB(5, 5, x_minus_zbin0, y0, qcoeff0);
+    SELECT_EOB(6, 2, x_minus_zbin0, y0, qcoeff0);
+    SELECT_EOB(7, 3, x_minus_zbin0, y0, qcoeff0);
+    SELECT_EOB(8, 6, x_minus_zbin0, y0, qcoeff0);
+    SELECT_EOB(9, 1, x_minus_zbin1, y1, qcoeff1);
+    SELECT_EOB(10, 4, x_minus_zbin1, y1, qcoeff1);
+    SELECT_EOB(11, 5, x_minus_zbin1, y1, qcoeff1);
+    SELECT_EOB(12, 2, x_minus_zbin1, y1, qcoeff1);
+    SELECT_EOB(13, 7, x_minus_zbin0, y0, qcoeff0);
+    SELECT_EOB(14, 3, x_minus_zbin1, y1, qcoeff1);
+    SELECT_EOB(15, 6, x_minus_zbin1, y1, qcoeff1);
+    SELECT_EOB(16, 7, x_minus_zbin1, y1, qcoeff1);
+
+    _mm_store_si128((__m128i *)(d->qcoeff), qcoeff0);
+    _mm_store_si128((__m128i *)(d->qcoeff + 8), qcoeff1);
+
+    dqcoeff0 = _mm_mullo_epi16(qcoeff0, dequant0);
+    dqcoeff1 = _mm_mullo_epi16(qcoeff1, dequant1);
+
+    _mm_store_si128((__m128i *)(d->dqcoeff), dqcoeff0);
+    _mm_store_si128((__m128i *)(d->dqcoeff + 8), dqcoeff1);
+
+    *d->eob = eob;
+}
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -89,6 +89,7 @@ VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.c
 VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.c
+VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/quantize_sse4.c

 ifeq ($(CONFIG_TEMPORAL_DENOISING),yes)
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/denoising_sse2.c
@@ -97,7 +98,6 @@ endif
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/subtract_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp8_enc_stubs_sse2.c
-VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/quantize_sse4.asm
 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm
 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm
 VP8_CX_SRCS-$(ARCH_X86_64) += encoder/x86/ssim_opt_x86_64.asm
--- a/vp9/common/arm/neon/vp9_convolve_neon.c
+++ b/vp9/common/arm/neon/vp9_convolve_neon.c
@@ -25,12 +25,14 @@ void vp9_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride,
  // Account for the vertical phase needing 3 lines prior and 4 lines post
  int intermediate_height = h + 7;

-  if (x_step_q4 != 16 || y_step_q4 != 16)
-    return vp9_convolve8_c(src, src_stride,
-                           dst, dst_stride,
-                           filter_x, x_step_q4,
-                           filter_y, y_step_q4,
-                           w, h);
+  if (x_step_q4 != 16 || y_step_q4 != 16) {
+    vp9_convolve8_c(src, src_stride,
+                    dst, dst_stride,
+                    filter_x, x_step_q4,
+                    filter_y, y_step_q4,
+                    w, h);
+    return;
+  }

  /* Filter starting 3 lines back. The neon implementation will ignore the
   * given height and filter a multiple of 4 lines. Since this goes in to
@@ -57,12 +59,14 @@ void vp9_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
  DECLARE_ALIGNED_ARRAY(8, uint8_t, temp, 64 * 72);
  int intermediate_height = h + 7;

-  if (x_step_q4 != 16 || y_step_q4 != 16)
-    return vp9_convolve8_avg_c(src, src_stride,
-                               dst, dst_stride,
-                               filter_x, x_step_q4,
-                               filter_y, y_step_q4,
-                               w, h);
+  if (x_step_q4 != 16 || y_step_q4 != 16) {
+    vp9_convolve8_avg_c(src, src_stride,
+                        dst, dst_stride,
+                        filter_x, x_step_q4,
+                        filter_y, y_step_q4,
+                        w, h);
+    return;
+  }

  /* This implementation has the same issues as above. In addition, we only want
   * to average the values after both passes.
--- a/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
+++ b/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
@@ -9,6 +9,7 @@
 */

 #include "./vp9_rtcd.h"
+#include "vpx/vpx_integer.h"

 void vp9_lpf_horizontal_8_dual_neon(uint8_t *s, int p /* pitch */,
                                    const uint8_t *blimit0,
--- a/vp9/common/vp9_alloccommon.c
+++ b/vp9/common/vp9_alloccommon.c
@@ -97,19 +97,22 @@ static void free_mi(VP9_COMMON *cm) {

 void vp9_free_frame_buffers(VP9_COMMON *cm) {
  int i;
+  BufferPool *const pool = cm->buffer_pool;

  for (i = 0; i < FRAME_BUFFERS; ++i) {
-    vp9_free_frame_buffer(&cm->frame_bufs[i].buf);
+    vp9_free_frame_buffer(&pool->frame_bufs[i].buf);

-    if (cm->frame_bufs[i].ref_count > 0 &&
-        cm->frame_bufs[i].raw_frame_buffer.data != NULL) {
-      cm->release_fb_cb(cm->cb_priv, &cm->frame_bufs[i].raw_frame_buffer);
-      cm->frame_bufs[i].ref_count = 0;
+    if (pool->frame_bufs[i].ref_count > 0 &&
+        pool->frame_bufs[i].raw_frame_buffer.data != NULL) {
+      pool->release_fb_cb(pool->cb_priv, &pool->frame_bufs[i].raw_frame_buffer);
+      pool->frame_bufs[i].ref_count = 0;
    }
  }

  vp9_free_frame_buffer(&cm->post_proc_buffer);
+}

+void vp9_free_context_buffers(VP9_COMMON *cm) {
  free_mi(cm);

  vpx_free(cm->last_frame_seg_map);
@@ -125,12 +128,15 @@ void vp9_free_frame_buffers(VP9_COMMON *cm) {
 int vp9_resize_frame_buffers(VP9_COMMON *cm, int width, int height) {
  const int aligned_width = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2);
  const int aligned_height = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2);
+#if CONFIG_INTERNAL_STATS || CONFIG_VP9_POSTPROC
  const int ss_x = cm->subsampling_x;
  const int ss_y = cm->subsampling_y;

+  // TODO(agrange): this should be conditionally allocated.
  if (vp9_realloc_frame_buffer(&cm->post_proc_buffer, width, height, ss_x, ss_y,
                               VP9_DEC_BORDER_IN_PIXELS, NULL, NULL, NULL) < 0)
    goto fail;
+#endif

  set_mb_mi(cm, aligned_width, aligned_height);

@@ -165,36 +171,58 @@ int vp9_resize_frame_buffers(VP9_COMMON *cm, int width, int height) {

 fail:
  vp9_free_frame_buffers(cm);
+  vp9_free_context_buffers(cm);
  return 1;
 }

+static void init_frame_bufs(VP9_COMMON *cm) {
+  int i;
+  RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+
+  cm->new_fb_idx = FRAME_BUFFERS - 1;
+  frame_bufs[cm->new_fb_idx].ref_count = 1;
+
+  for (i = 0; i < REF_FRAMES; ++i) {
+    cm->ref_frame_map[i] = i;
+    frame_bufs[i].ref_count = 1;
+  }
+}
+
 int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) {
-  const int aligned_width = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2);
-  const int aligned_height = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2);
+  int i;
  const int ss_x = cm->subsampling_x;
  const int ss_y = cm->subsampling_y;
-  int i;
+  RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;

  vp9_free_frame_buffers(cm);

-  for (i = 0; i < FRAME_BUFFERS; i++) {
-    cm->frame_bufs[i].ref_count = 0;
-    if (vp9_alloc_frame_buffer(&cm->frame_bufs[i].buf, width, height,
+  for (i = 0; i < FRAME_BUFFERS; ++i) {
+    frame_bufs[i].ref_count = 0;
+    if (vp9_alloc_frame_buffer(&frame_bufs[i].buf, width, height,
                               ss_x, ss_y, VP9_ENC_BORDER_IN_PIXELS) < 0)
      goto fail;
  }

-  cm->new_fb_idx = FRAME_BUFFERS - 1;
-  cm->frame_bufs[cm->new_fb_idx].ref_count = 1;
-
-  for (i = 0; i < REF_FRAMES; i++) {
-    cm->ref_frame_map[i] = i;
-    cm->frame_bufs[i].ref_count = 1;
-  }
+  init_frame_bufs(cm);

+#if CONFIG_INTERNAL_STATS || CONFIG_VP9_POSTPROC
  if (vp9_alloc_frame_buffer(&cm->post_proc_buffer, width, height, ss_x, ss_y,
                             VP9_ENC_BORDER_IN_PIXELS) < 0)
    goto fail;
+#endif
+
+  return 0;
+
+ fail:
+  vp9_free_frame_buffers(cm);
+  return 1;
+}
+
+int vp9_alloc_context_buffers(VP9_COMMON *cm, int width, int height) {
+  const int aligned_width = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2);
+  const int aligned_height = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2);
+
+  vp9_free_context_buffers(cm);

  set_mb_mi(cm, aligned_width, aligned_height);

@@ -224,13 +252,14 @@ int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) {
  return 0;

 fail:
-  vp9_free_frame_buffers(cm);
+  vp9_free_context_buffers(cm);
  return 1;
 }

 void vp9_remove_common(VP9_COMMON *cm) {
  vp9_free_frame_buffers(cm);
-  vp9_free_internal_frame_buffers(&cm->int_frame_buffers);
+  vp9_free_context_buffers(cm);
+  vp9_free_internal_frame_buffers(&cm->buffer_pool->int_frame_buffers);
 }

 void vp9_update_frame_size(VP9_COMMON *cm) {
--- a/vp9/common/vp9_alloccommon.h
+++ b/vp9/common/vp9_alloccommon.h
@@ -23,8 +23,12 @@ void vp9_remove_common(struct VP9Common *cm);
 int vp9_resize_frame_buffers(struct VP9Common *cm, int width, int height);

 int vp9_alloc_frame_buffers(struct VP9Common *cm, int width, int height);
+int vp9_alloc_state_buffers(struct VP9Common *cm, int width, int height);
+int vp9_alloc_context_buffers(struct VP9Common *cm, int width, int height);

 void vp9_free_frame_buffers(struct VP9Common *cm);
+void vp9_free_state_buffers(struct VP9Common *cm);
+void vp9_free_context_buffers(struct VP9Common *cm);

 void vp9_update_frame_size(struct VP9Common *cm);

--- a/vp9/common/vp9_blockd.c
+++ b/vp9/common/vp9_blockd.c
@@ -44,7 +44,7 @@ void vp9_foreach_transformed_block_in_plane(
  // block and transform sizes, in number of 4x4 blocks log 2 ("*_b")
  // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8
  // transform size varies per plane, look it up in a common way.
-  const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi)
+  const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi, pd)
                                : mbmi->tx_size;
  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
  const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -270,18 +270,20 @@ static INLINE TX_TYPE get_tx_type_4x4(PLANE_TYPE plane_type,

 void vp9_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y);

-static INLINE TX_SIZE get_uv_tx_size_impl(TX_SIZE y_tx_size, BLOCK_SIZE bsize) {
+static INLINE TX_SIZE get_uv_tx_size_impl(TX_SIZE y_tx_size, BLOCK_SIZE bsize,
+                                          int xss, int yss) {
  if (bsize < BLOCK_8X8) {
    return TX_4X4;
  } else {
-    // TODO(dkovalev): Assuming YUV420 (ss_x == 1, ss_y == 1)
-    const BLOCK_SIZE plane_bsize = ss_size_lookup[bsize][1][1];
+    const BLOCK_SIZE plane_bsize = ss_size_lookup[bsize][xss][yss];
    return MIN(y_tx_size, max_txsize_lookup[plane_bsize]);
  }
 }

-static INLINE TX_SIZE get_uv_tx_size(const MB_MODE_INFO *mbmi) {
-  return get_uv_tx_size_impl(mbmi->tx_size, mbmi->sb_type);
+static INLINE TX_SIZE get_uv_tx_size(const MB_MODE_INFO *mbmi,
+                                     const struct macroblockd_plane *pd) {
+  return get_uv_tx_size_impl(mbmi->tx_size, mbmi->sb_type, pd->subsampling_x,
+                             pd->subsampling_y);
 }

 static INLINE BLOCK_SIZE get_plane_block_size(BLOCK_SIZE bsize,
--- a/vp9/common/vp9_convolve.c
+++ b/vp9/common/vp9_convolve.c
@@ -117,17 +117,25 @@ static void convolve(const uint8_t *src, ptrdiff_t src_stride,
                     const InterpKernel *const y_filters,
                     int y0_q4, int y_step_q4,
                     int w, int h) {
-  // Fixed size intermediate buffer places limits on parameters.
-  // Maximum intermediate_height is 324, for y_step_q4 == 80,
-  // h == 64, taps == 8.
-  // y_step_q4 of 80 allows for 1/10 scale for 5 layer svc
-  uint8_t temp[64 * 324];
+  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
+  // 2d filtering proceeds in 2 steps:
+  //   (1) Interpolate horizontally into an intermediate buffer, temp.
+  //   (2) Interpolate temp vertically to derive the sub-pixel result.
+  // Deriving the maximum number of rows in the temp buffer (135):
+  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+  // --Largest block size is 64x64 pixels.
+  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+  //   original frame (in 1/16th pixel units).
+  // --Must round-up because block may be located at sub-pixel position.
+  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
+  uint8_t temp[135 * 64];
  int intermediate_height = (((h - 1) * y_step_q4 + 15) >> 4) + SUBPEL_TAPS;

  assert(w <= 64);
  assert(h <= 64);
-  assert(y_step_q4 <= 80);
-  assert(x_step_q4 <= 80);
+  assert(y_step_q4 <= 32);
+  assert(x_step_q4 <= 32);

  if (intermediate_height < h)
    intermediate_height = h;
--- a/vp9/common/vp9_frame_buffers.c
+++ b/vp9/common/vp9_frame_buffers.c
@@ -76,6 +76,7 @@ int vp9_get_frame_buffer(void *cb_priv, size_t min_size,
 int vp9_release_frame_buffer(void *cb_priv, vpx_codec_frame_buffer_t *fb) {
  InternalFrameBuffer *const int_fb = (InternalFrameBuffer *)fb->priv;
  (void)cb_priv;
-  int_fb->in_use = 0;
+  if (int_fb)
+    int_fb->in_use = 0;
  return 0;
 }
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -502,7 +502,7 @@ static void build_masks(const loop_filter_info_n *const lfi_n,
  const MB_MODE_INFO *mbmi = &mi->mbmi;
  const BLOCK_SIZE block_size = mbmi->sb_type;
  const TX_SIZE tx_size_y = mbmi->tx_size;
-  const TX_SIZE tx_size_uv = get_uv_tx_size(mbmi);
+  const TX_SIZE tx_size_uv = get_uv_tx_size_impl(tx_size_y, block_size, 1, 1);
  const int filter_level = get_filter_level(lfi_n, mbmi);
  uint64_t *const left_y = &lfm->left_y[tx_size_y];
  uint64_t *const above_y = &lfm->above_y[tx_size_y];
@@ -939,7 +939,7 @@ static void filter_block_plane_non420(VP9_COMMON *cm,
          !(r & (num_8x8_blocks_high_lookup[sb_type] - 1)) : 1;
      const int skip_this_r = skip_this && !block_edge_above;
      const TX_SIZE tx_size = (plane->plane_type == PLANE_TYPE_UV)
-                            ? get_uv_tx_size(&mi[0].mbmi)
+                            ? get_uv_tx_size(&mi[0].mbmi, plane)
                            : mi[0].mbmi.tx_size;
      const int skip_border_4x4_c = ss_x && mi_col + c == cm->mi_cols - 1;
      const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;
--- a/vp9/common/vp9_mvref_common.c
+++ b/vp9/common/vp9_mvref_common.c
@@ -11,181 +11,6 @@

 #include "vp9/common/vp9_mvref_common.h"

-#define MVREF_NEIGHBOURS 8
-
-typedef struct position {
-  int row;
-  int col;
-} POSITION;
-
-typedef enum {
-  BOTH_ZERO = 0,
-  ZERO_PLUS_PREDICTED = 1,
-  BOTH_PREDICTED = 2,
-  NEW_PLUS_NON_INTRA = 3,
-  BOTH_NEW = 4,
-  INTRA_PLUS_NON_INTRA = 5,
-  BOTH_INTRA = 6,
-  INVALID_CASE = 9
-} motion_vector_context;
-
-// This is used to figure out a context for the ref blocks. The code flattens
-// an array that would have 3 possible counts (0, 1 & 2) for 3 choices by
-// adding 9 for each intra block, 3 for each zero mv and 1 for each new
-// motion vector. This single number is then converted into a context
-// with a single lookup ( counter_to_context ).
-static const int mode_2_counter[MB_MODE_COUNT] = {
-  9,  // DC_PRED
-  9,  // V_PRED
-  9,  // H_PRED
-  9,  // D45_PRED
-  9,  // D135_PRED
-  9,  // D117_PRED
-  9,  // D153_PRED
-  9,  // D207_PRED
-  9,  // D63_PRED
-  9,  // TM_PRED
-  0,  // NEARESTMV
-  0,  // NEARMV
-  3,  // ZEROMV
-  1,  // NEWMV
-};
-
-// There are 3^3 different combinations of 3 counts that can be either 0,1 or
-// 2. However the actual count can never be greater than 2 so the highest
-// counter we need is 18. 9 is an invalid counter that's never used.
-static const int counter_to_context[19] = {
-  BOTH_PREDICTED,  // 0
-  NEW_PLUS_NON_INTRA,  // 1
-  BOTH_NEW,  // 2
-  ZERO_PLUS_PREDICTED,  // 3
-  NEW_PLUS_NON_INTRA,  // 4
-  INVALID_CASE,  // 5
-  BOTH_ZERO,  // 6
-  INVALID_CASE,  // 7
-  INVALID_CASE,  // 8
-  INTRA_PLUS_NON_INTRA,  // 9
-  INTRA_PLUS_NON_INTRA,  // 10
-  INVALID_CASE,  // 11
-  INTRA_PLUS_NON_INTRA,  // 12
-  INVALID_CASE,  // 13
-  INVALID_CASE,  // 14
-  INVALID_CASE,  // 15
-  INVALID_CASE,  // 16
-  INVALID_CASE,  // 17
-  BOTH_INTRA  // 18
-};
-
-static const POSITION mv_ref_blocks[BLOCK_SIZES][MVREF_NEIGHBOURS] = {
-  // 4X4
-  {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}},
-  // 4X8
-  {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}},
-  // 8X4
-  {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}},
-  // 8X8
-  {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}},
-  // 8X16
-  {{0, -1}, {-1, 0}, {1, -1}, {-1, -1}, {0, -2}, {-2, 0}, {-2, -1}, {-1, -2}},
-  // 16X8
-  {{-1, 0}, {0, -1}, {-1, 1}, {-1, -1}, {-2, 0}, {0, -2}, {-1, -2}, {-2, -1}},
-  // 16X16
-  {{-1, 0}, {0, -1}, {-1, 1}, {1, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-3, -3}},
-  // 16X32
-  {{0, -1}, {-1, 0}, {2, -1}, {-1, -1}, {-1, 1}, {0, -3}, {-3, 0}, {-3, -3}},
-  // 32X16
-  {{-1, 0}, {0, -1}, {-1, 2}, {-1, -1}, {1, -1}, {-3, 0}, {0, -3}, {-3, -3}},
-  // 32X32
-  {{-1, 1}, {1, -1}, {-1, 2}, {2, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-3, -3}},
-  // 32X64
-  {{0, -1}, {-1, 0}, {4, -1}, {-1, 2}, {-1, -1}, {0, -3}, {-3, 0}, {2, -1}},
-  // 64X32
-  {{-1, 0}, {0, -1}, {-1, 4}, {2, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-1, 2}},
-  // 64X64
-  {{-1, 3}, {3, -1}, {-1, 4}, {4, -1}, {-1, -1}, {-1, 0}, {0, -1}, {-1, 6}}
-};
-
-static const int idx_n_column_to_subblock[4][2] = {
-  {1, 2},
-  {1, 3},
-  {3, 2},
-  {3, 3}
-};
-
-// clamp_mv_ref
-#define MV_BORDER (16 << 3)  // Allow 16 pels in 1/8th pel units
-
-static void clamp_mv_ref(MV *mv, const MACROBLOCKD *xd) {
-  clamp_mv(mv, xd->mb_to_left_edge - MV_BORDER,
-               xd->mb_to_right_edge + MV_BORDER,
-               xd->mb_to_top_edge - MV_BORDER,
-               xd->mb_to_bottom_edge + MV_BORDER);
-}
-
-// This function returns either the appropriate sub block or block's mv
-// on whether the block_size < 8x8 and we have check_sub_blocks set.
-static INLINE int_mv get_sub_block_mv(const MODE_INFO *candidate, int which_mv,
-                                      int search_col, int block_idx) {
-  return block_idx >= 0 && candidate->mbmi.sb_type < BLOCK_8X8
-          ? candidate->bmi[idx_n_column_to_subblock[block_idx][search_col == 0]]
-              .as_mv[which_mv]
-          : candidate->mbmi.mv[which_mv];
-}
-
-
-// Performs mv sign inversion if indicated by the reference frame combination.
-static INLINE int_mv scale_mv(const MB_MODE_INFO *mbmi, int ref,
-                              const MV_REFERENCE_FRAME this_ref_frame,
-                              const int *ref_sign_bias) {
-  int_mv mv = mbmi->mv[ref];
-  if (ref_sign_bias[mbmi->ref_frame[ref]] != ref_sign_bias[this_ref_frame]) {
-    mv.as_mv.row *= -1;
-    mv.as_mv.col *= -1;
-  }
-  return mv;
-}
-
-// This macro is used to add a motion vector mv_ref list if it isn't
-// already in the list.  If it's the second motion vector it will also
-// skip all additional processing and jump to done!
-#define ADD_MV_REF_LIST(mv) \
-  do { \
-    if (refmv_count) { \
-      if ((mv).as_int != mv_ref_list[0].as_int) { \
-        mv_ref_list[refmv_count] = (mv); \
-        goto Done; \
-      } \
-    } else { \
-      mv_ref_list[refmv_count++] = (mv); \
-    } \
-  } while (0)
-
-// If either reference frame is different, not INTRA, and they
-// are different from each other scale and add the mv to our list.
-#define IF_DIFF_REF_FRAME_ADD_MV(mbmi) \
-  do { \
-    if (is_inter_block(mbmi)) { \
-      if ((mbmi)->ref_frame[0] != ref_frame) \
-        ADD_MV_REF_LIST(scale_mv((mbmi), 0, ref_frame, ref_sign_bias)); \
-      if (has_second_ref(mbmi) && \
-          (mbmi)->ref_frame[1] != ref_frame && \
-          (mbmi)->mv[1].as_int != (mbmi)->mv[0].as_int) \
-        ADD_MV_REF_LIST(scale_mv((mbmi), 1, ref_frame, ref_sign_bias)); \
-    } \
-  } while (0)
-
-
-// Checks that the given mi_row, mi_col and search point
-// are inside the borders of the tile.
-static INLINE int is_inside(const TileInfo *const tile,
-                            int mi_col, int mi_row, int mi_rows,
-                            const POSITION *mi_pos) {
-  return !(mi_row + mi_pos->row < 0 ||
-           mi_col + mi_pos->col < tile->mi_col_start ||
-           mi_row + mi_pos->row >= mi_rows ||
-           mi_col + mi_pos->col >= tile->mi_col_end);
-}
-
 // This function searches the neighbourhood of a given MB/SB
 // to try and find candidate reference vectors.
 static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
--- a/vp9/common/vp9_mvref_common.h
+++ b/vp9/common/vp9_mvref_common.h
@@ -21,6 +21,181 @@ extern "C" {
 #define RIGHT_BOTTOM_MARGIN ((VP9_ENC_BORDER_IN_PIXELS -\
                                VP9_INTERP_EXTEND) << 3)

+#define MVREF_NEIGHBOURS 8
+
+typedef struct position {
+  int row;
+  int col;
+} POSITION;
+
+typedef enum {
+  BOTH_ZERO = 0,
+  ZERO_PLUS_PREDICTED = 1,
+  BOTH_PREDICTED = 2,
+  NEW_PLUS_NON_INTRA = 3,
+  BOTH_NEW = 4,
+  INTRA_PLUS_NON_INTRA = 5,
+  BOTH_INTRA = 6,
+  INVALID_CASE = 9
+} motion_vector_context;
+
+// This is used to figure out a context for the ref blocks. The code flattens
+// an array that would have 3 possible counts (0, 1 & 2) for 3 choices by
+// adding 9 for each intra block, 3 for each zero mv and 1 for each new
+// motion vector. This single number is then converted into a context
+// with a single lookup ( counter_to_context ).
+static const int mode_2_counter[MB_MODE_COUNT] = {
+  9,  // DC_PRED
+  9,  // V_PRED
+  9,  // H_PRED
+  9,  // D45_PRED
+  9,  // D135_PRED
+  9,  // D117_PRED
+  9,  // D153_PRED
+  9,  // D207_PRED
+  9,  // D63_PRED
+  9,  // TM_PRED
+  0,  // NEARESTMV
+  0,  // NEARMV
+  3,  // ZEROMV
+  1,  // NEWMV
+};
+
+// There are 3^3 different combinations of 3 counts that can be either 0,1 or
+// 2. However the actual count can never be greater than 2 so the highest
+// counter we need is 18. 9 is an invalid counter that's never used.
+static const int counter_to_context[19] = {
+  BOTH_PREDICTED,  // 0
+  NEW_PLUS_NON_INTRA,  // 1
+  BOTH_NEW,  // 2
+  ZERO_PLUS_PREDICTED,  // 3
+  NEW_PLUS_NON_INTRA,  // 4
+  INVALID_CASE,  // 5
+  BOTH_ZERO,  // 6
+  INVALID_CASE,  // 7
+  INVALID_CASE,  // 8
+  INTRA_PLUS_NON_INTRA,  // 9
+  INTRA_PLUS_NON_INTRA,  // 10
+  INVALID_CASE,  // 11
+  INTRA_PLUS_NON_INTRA,  // 12
+  INVALID_CASE,  // 13
+  INVALID_CASE,  // 14
+  INVALID_CASE,  // 15
+  INVALID_CASE,  // 16
+  INVALID_CASE,  // 17
+  BOTH_INTRA  // 18
+};
+
+static const POSITION mv_ref_blocks[BLOCK_SIZES][MVREF_NEIGHBOURS] = {
+  // 4X4
+  {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}},
+  // 4X8
+  {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}},
+  // 8X4
+  {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}},
+  // 8X8
+  {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}},
+  // 8X16
+  {{0, -1}, {-1, 0}, {1, -1}, {-1, -1}, {0, -2}, {-2, 0}, {-2, -1}, {-1, -2}},
+  // 16X8
+  {{-1, 0}, {0, -1}, {-1, 1}, {-1, -1}, {-2, 0}, {0, -2}, {-1, -2}, {-2, -1}},
+  // 16X16
+  {{-1, 0}, {0, -1}, {-1, 1}, {1, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-3, -3}},
+  // 16X32
+  {{0, -1}, {-1, 0}, {2, -1}, {-1, -1}, {-1, 1}, {0, -3}, {-3, 0}, {-3, -3}},
+  // 32X16
+  {{-1, 0}, {0, -1}, {-1, 2}, {-1, -1}, {1, -1}, {-3, 0}, {0, -3}, {-3, -3}},
+  // 32X32
+  {{-1, 1}, {1, -1}, {-1, 2}, {2, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-3, -3}},
+  // 32X64
+  {{0, -1}, {-1, 0}, {4, -1}, {-1, 2}, {-1, -1}, {0, -3}, {-3, 0}, {2, -1}},
+  // 64X32
+  {{-1, 0}, {0, -1}, {-1, 4}, {2, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-1, 2}},
+  // 64X64
+  {{-1, 3}, {3, -1}, {-1, 4}, {4, -1}, {-1, -1}, {-1, 0}, {0, -1}, {-1, 6}}
+};
+
+static const int idx_n_column_to_subblock[4][2] = {
+  {1, 2},
+  {1, 3},
+  {3, 2},
+  {3, 3}
+};
+
+// clamp_mv_ref
+#define MV_BORDER (16 << 3)  // Allow 16 pels in 1/8th pel units
+
+static void clamp_mv_ref(MV *mv, const MACROBLOCKD *xd) {
+  clamp_mv(mv, xd->mb_to_left_edge - MV_BORDER,
+               xd->mb_to_right_edge + MV_BORDER,
+               xd->mb_to_top_edge - MV_BORDER,
+               xd->mb_to_bottom_edge + MV_BORDER);
+}
+
+// This function returns either the appropriate sub block or block's mv
+// on whether the block_size < 8x8 and we have check_sub_blocks set.
+static INLINE int_mv get_sub_block_mv(const MODE_INFO *candidate, int which_mv,
+                                      int search_col, int block_idx) {
+  return block_idx >= 0 && candidate->mbmi.sb_type < BLOCK_8X8
+          ? candidate->bmi[idx_n_column_to_subblock[block_idx][search_col == 0]]
+              .as_mv[which_mv]
+          : candidate->mbmi.mv[which_mv];
+}
+
+
+// Performs mv sign inversion if indicated by the reference frame combination.
+static INLINE int_mv scale_mv(const MB_MODE_INFO *mbmi, int ref,
+                              const MV_REFERENCE_FRAME this_ref_frame,
+                              const int *ref_sign_bias) {
+  int_mv mv = mbmi->mv[ref];
+  if (ref_sign_bias[mbmi->ref_frame[ref]] != ref_sign_bias[this_ref_frame]) {
+    mv.as_mv.row *= -1;
+    mv.as_mv.col *= -1;
+  }
+  return mv;
+}
+
+// This macro is used to add a motion vector mv_ref list if it isn't
+// already in the list.  If it's the second motion vector it will also
+// skip all additional processing and jump to done!
+#define ADD_MV_REF_LIST(mv) \
+  do { \
+    if (refmv_count) { \
+      if ((mv).as_int != mv_ref_list[0].as_int) { \
+        mv_ref_list[refmv_count] = (mv); \
+        goto Done; \
+      } \
+    } else { \
+      mv_ref_list[refmv_count++] = (mv); \
+    } \
+  } while (0)
+
+// If either reference frame is different, not INTRA, and they
+// are different from each other scale and add the mv to our list.
+#define IF_DIFF_REF_FRAME_ADD_MV(mbmi) \
+  do { \
+    if (is_inter_block(mbmi)) { \
+      if ((mbmi)->ref_frame[0] != ref_frame) \
+        ADD_MV_REF_LIST(scale_mv((mbmi), 0, ref_frame, ref_sign_bias)); \
+      if (has_second_ref(mbmi) && \
+          (mbmi)->ref_frame[1] != ref_frame && \
+          (mbmi)->mv[1].as_int != (mbmi)->mv[0].as_int) \
+        ADD_MV_REF_LIST(scale_mv((mbmi), 1, ref_frame, ref_sign_bias)); \
+    } \
+  } while (0)
+
+
+// Checks that the given mi_row, mi_col and search point
+// are inside the borders of the tile.
+static INLINE int is_inside(const TileInfo *const tile,
+                            int mi_col, int mi_row, int mi_rows,
+                            const POSITION *mi_pos) {
+  return !(mi_row + mi_pos->row < 0 ||
+           mi_col + mi_pos->col < tile->mi_col_start ||
+           mi_row + mi_pos->row >= mi_rows ||
+           mi_col + mi_pos->col >= tile->mi_col_end);
+}
+
 // TODO(jingning): this mv clamping function should be block size dependent.
 static INLINE void clamp_mv2(MV *mv, const MACROBLOCKD *xd) {
  clamp_mv(mv, xd->mb_to_left_edge - LEFT_TOP_MARGIN,
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -11,6 +11,7 @@
 #ifndef VP9_COMMON_VP9_ONYXC_INT_H_
 #define VP9_COMMON_VP9_ONYXC_INT_H_

+#include <pthread.h>
 #include "./vpx_config.h"
 #include "vpx/internal/vpx_codec_internal.h"
 #include "./vp9_rtcd.h"
@@ -61,8 +62,40 @@ typedef struct {
  int ref_count;
  vpx_codec_frame_buffer_t raw_frame_buffer;
  YV12_BUFFER_CONFIG buf;
+
+  // The Following variables will only be used in frame parallel decode.
+
+  // owner_thread_id indicates which FrameWorker owns this buffer. -1 means
+  // that no FrameWorker owns, or is decoding, this buffer.
+  int owner_worker_id;
+
+  // Buffer has been decoded to (row, col) position. When first start decoding,
+  // they are reset to -1. If a frame has been fully decoded, row and col will
+  // be set to INT_MAX.
+  int row;
+  int col;
 } RefCntBuffer;

+typedef struct {
+  // Protect BufferPool from being accessed by several FrameWorkers at
+  // the same time during frame parallel decode.
+  // TODO(hkuang): Try to use atomic variable instead of locking the whole pool.
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t pool_mutex;
+#endif
+
+  // Private data associated with the frame buffer callbacks.
+  void *cb_priv;
+
+  vpx_get_frame_buffer_cb_fn_t get_fb_cb;
+  vpx_release_frame_buffer_cb_fn_t release_fb_cb;
+
+  RefCntBuffer frame_bufs[FRAME_BUFFERS];
+
+  // Handles memory for the codec.
+  InternalFrameBufferList int_frame_buffers;
+} BufferPool;
+
 typedef struct VP9Common {
  struct vpx_internal_error_info  error;

@@ -89,10 +122,11 @@ typedef struct VP9Common {

  YV12_BUFFER_CONFIG *frame_to_show;

-  RefCntBuffer frame_bufs[FRAME_BUFFERS];
-
  int ref_frame_map[REF_FRAMES]; /* maps fb_idx to reference slot */

+  // Prepare ref_frame_map for next frame. Only used in frame parallel decode.
+  int next_ref_frame_map[REF_FRAMES];
+
  // TODO(jkoleszar): could expand active_ref_idx to 4, with 0 as intra, and
  // roll new_fb_idx into it.

@@ -202,30 +236,33 @@ typedef struct VP9Common {

  int log2_tile_cols, log2_tile_rows;

-  // Private data associated with the frame buffer callbacks.
-  void *cb_priv;
-  vpx_get_frame_buffer_cb_fn_t get_fb_cb;
-  vpx_release_frame_buffer_cb_fn_t release_fb_cb;
-
-  // Handles memory for the codec.
-  InternalFrameBufferList int_frame_buffers;
+  // External BufferPool passed from outside.
+  BufferPool *buffer_pool;

  PARTITION_CONTEXT *above_seg_context;
  ENTROPY_CONTEXT *above_context;
 } VP9_COMMON;

 static INLINE YV12_BUFFER_CONFIG *get_frame_new_buffer(VP9_COMMON *cm) {
-  return &cm->frame_bufs[cm->new_fb_idx].buf;
+  return &cm->buffer_pool->frame_bufs[cm->new_fb_idx].buf;
 }

 static INLINE int get_free_fb(VP9_COMMON *cm) {
+  RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
  int i;
+
+#if CONFIG_MULTITHREAD
+  pthread_mutex_lock(&cm->buffer_pool->pool_mutex);
+#endif
  for (i = 0; i < FRAME_BUFFERS; i++)
-    if (cm->frame_bufs[i].ref_count == 0)
+    if (frame_bufs[i].ref_count == 0)
      break;

  assert(i < FRAME_BUFFERS);
-  cm->frame_bufs[i].ref_count = 1;
+  frame_bufs[i].ref_count = 1;
+#if CONFIG_MULTITHREAD
+  pthread_mutex_unlock(&cm->buffer_pool->pool_mutex);
+#endif
  return i;
 }

--- a/vp9/common/vp9_quant_common.c
+++ b/vp9/common/vp9_quant_common.c
@@ -12,7 +12,6 @@
 #include "vp9/common/vp9_quant_common.h"
 #include "vp9/common/vp9_seg_common.h"

-#if 1
 static const int16_t dc_qlookup[QINDEX_RANGE] = {
  4,       8,    8,    9,   10,   11,   12,   12,
  13,     14,   15,   16,   17,   18,   19,   19,
@@ -83,44 +82,6 @@ static const int16_t ac_qlookup[QINDEX_RANGE] = {
  1597, 1628, 1660, 1692, 1725, 1759, 1793, 1828,
 };

-void vp9_init_quant_tables(void) { }
-#else
-static int16_t dc_qlookup[QINDEX_RANGE];
-static int16_t ac_qlookup[QINDEX_RANGE];
-
-#define ACDC_MIN 8
-
-// TODO(dkovalev) move to common and reuse
-static double poly3(double a, double b, double c, double d, double x) {
-  return a*x*x*x + b*x*x + c*x + d;
-}
-
-void vp9_init_quant_tables() {
-  int i, val = 4;
-
-  // A "real" q of 1.0 forces lossless mode.
-  // In practice non lossless Q's between 1.0 and 2.0 (represented here by
-  // integer values from 5-7 give poor rd results (lower psnr and often
-  // larger size than the lossless encode. To block out those "not very useful"
-  // values we increment the ac and dc q lookup values by 4 after position 0.
-  ac_qlookup[0] = val;
-  dc_qlookup[0] = val;
-  val += 4;
-
-  for (i = 1; i < QINDEX_RANGE; i++) {
-    const int ac_val = val;
-
-    val = (int)(val * 1.01975);
-    if (val == ac_val)
-      ++val;
-
-    ac_qlookup[i] = (int16_t)ac_val;
-    dc_qlookup[i] = (int16_t)MAX(ACDC_MIN, poly3(0.000000305, -0.00065, 0.9,
-                                                 0.5, ac_val));
-  }
-}
-#endif
-
 int16_t vp9_dc_quant(int qindex, int delta) {
  return dc_qlookup[clamp(qindex + delta, 0, MAXQ)];
 }
--- a/vp9/common/vp9_quant_common.h
+++ b/vp9/common/vp9_quant_common.h
@@ -22,8 +22,6 @@ extern "C" {
 #define QINDEX_RANGE (MAXQ - MINQ + 1)
 #define QINDEX_BITS 8

-void vp9_init_quant_tables();
-
 int16_t vp9_dc_quant(int qindex, int delta);
 int16_t vp9_ac_quant(int qindex, int delta);

--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -113,6 +113,18 @@ static MV mi_mv_pred_q4(const MODE_INFO *mi, int idx) {
  return res;
 }

+static INLINE int round_mv_comp_q2(int value) {
+  return (value < 0 ? value - 1 : value + 1) / 2;
+}
+
+static MV mi_mv_pred_q2(const MODE_INFO *mi, int idx, int block0, int block1) {
+  MV res = { round_mv_comp_q2(mi->bmi[block0].as_mv[idx].as_mv.row +
+                              mi->bmi[block1].as_mv[idx].as_mv.row),
+             round_mv_comp_q2(mi->bmi[block0].as_mv[idx].as_mv.col +
+                              mi->bmi[block1].as_mv[idx].as_mv.col) };
+  return res;
+}
+
 // TODO(jkoleszar): yet another mv clamping function :-(
 MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, const MV *src_mv,
                             int bw, int bh, int ss_x, int ss_y) {
@@ -139,6 +151,29 @@ MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, const MV *src_mv,
  return clamped_mv;
 }

+static MV average_split_mvs(const struct macroblockd_plane *pd, int plane,
+                            const MODE_INFO *mi, int ref, int block) {
+  const int ss_idx = ((pd->subsampling_x > 0) << 1) | (pd->subsampling_y > 0);
+  MV res = {0, 0};
+  switch (ss_idx) {
+    case 0:
+      res = mi->bmi[block].as_mv[ref].as_mv;
+      break;
+    case 1:
+      res = mi_mv_pred_q2(mi, ref, block, block + 2);
+      break;
+    case 2:
+      res = mi_mv_pred_q2(mi, ref, block, block + 1);
+      break;
+    case 3:
+      res = mi_mv_pred_q4(mi, ref);
+      break;
+    default:
+      assert(ss_idx <= 3 || ss_idx >= 0);
+  }
+  return res;
+}
+
 static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
                                   int bw, int bh,
                                   int x, int y, int w, int h,
@@ -154,14 +189,8 @@ static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
    struct buf_2d *const pre_buf = &pd->pre[ref];
    struct buf_2d *const dst_buf = &pd->dst;
    uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
-
-    // TODO(jkoleszar): All chroma MVs in SPLITMV mode are taken as the
-    // same MV (the average of the 4 luma MVs) but we could do something
-    // smarter for non-4:2:0. Just punt for now, pending the changes to get
-    // rid of SPLITMV mode entirely.
    const MV mv = mi->mbmi.sb_type < BLOCK_8X8
-               ? (plane == 0 ? mi->bmi[block].as_mv[ref].as_mv
-                             : mi_mv_pred_q4(mi, ref))
+               ? average_split_mvs(pd, plane, mi, ref, block)
               : mi->mbmi.mv[ref].as_mv;

    // TODO(jkoleszar): This clamping is done in the incorrect place for the
@@ -258,16 +287,11 @@ static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
    struct buf_2d *const pre_buf = &pd->pre[ref];
    struct buf_2d *const dst_buf = &pd->dst;
    uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
-
-    // TODO(jkoleszar): All chroma MVs in SPLITMV mode are taken as the
-    // same MV (the average of the 4 luma MVs) but we could do something
-    // smarter for non-4:2:0. Just punt for now, pending the changes to get
-    // rid of SPLITMV mode entirely.
    const MV mv = mi->mbmi.sb_type < BLOCK_8X8
-               ? (plane == 0 ? mi->bmi[block].as_mv[ref].as_mv
-                             : mi_mv_pred_q4(mi, ref))
+               ? average_split_mvs(pd, plane, mi, ref, block)
               : mi->mbmi.mv[ref].as_mv;

+
    // TODO(jkoleszar): This clamping is done in the incorrect place for the
    // scaling case. It needs to be done on the scaled MV, not the pre-scaling
    // MV. Note however that it performs the subsampling aware scaling so
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -714,6 +714,9 @@ specialize qw/vp9_block_error avx2/, "$sse2_x86inc";
 add_proto qw/void vp9_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
 specialize qw/vp9_subtract_block/, "$sse2_x86inc";

+add_proto qw/void vp9_quantize_fp/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+specialize qw/vp9_quantize_fp/, "$ssse3_x86_64";
+
 add_proto qw/void vp9_quantize_b/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
 specialize qw/vp9_quantize_b/, "$ssse3_x86_64";

--- a/vp9/common/vp9_scale.c
+++ b/vp9/common/vp9_scale.c
@@ -33,14 +33,6 @@ static int get_fixed_point_scale_factor(int other_size, int this_size) {
  return (other_size << REF_SCALE_SHIFT) / this_size;
 }

-static int check_scale_factors(int other_w, int other_h,
-                               int this_w, int this_h) {
-  return 2 * this_w >= other_w &&
-         2 * this_h >= other_h &&
-         this_w <= 16 * other_w &&
-         this_h <= 16 * other_h;
-}
-
 MV32 vp9_scale_mv(const MV *mv, int x, int y, const struct scale_factors *sf) {
  const int x_off_q4 = scaled_x(x << SUBPEL_BITS, sf) & SUBPEL_MASK;
  const int y_off_q4 = scaled_y(y << SUBPEL_BITS, sf) & SUBPEL_MASK;
@@ -54,7 +46,7 @@ MV32 vp9_scale_mv(const MV *mv, int x, int y, const struct scale_factors *sf) {
 void vp9_setup_scale_factors_for_frame(struct scale_factors *sf,
                                       int other_w, int other_h,
                                       int this_w, int this_h) {
-  if (!check_scale_factors(other_w, other_h, this_w, this_h)) {
+  if (!valid_ref_frame_size(other_w, other_h, this_w, this_h)) {
    sf->x_scale_fp = REF_INVALID_SCALE;
    sf->y_scale_fp = REF_INVALID_SCALE;
    return;
--- a/vp9/common/vp9_scale.h
+++ b/vp9/common/vp9_scale.h
@@ -46,8 +46,16 @@ static INLINE int vp9_is_valid_scale(const struct scale_factors *sf) {
 }

 static INLINE int vp9_is_scaled(const struct scale_factors *sf) {
-  return sf->x_scale_fp != REF_NO_SCALE ||
-         sf->y_scale_fp != REF_NO_SCALE;
+  return vp9_is_valid_scale(sf) &&
+         (sf->x_scale_fp != REF_NO_SCALE || sf->y_scale_fp != REF_NO_SCALE);
+}
+
+static INLINE int valid_ref_frame_size(int ref_width, int ref_height,
+                                      int this_width, int this_height) {
+  return 2 * this_width >= ref_width &&
+         2 * this_height >= ref_height &&
+         this_width <= 16 * ref_width &&
+         this_height <= 16 * ref_height;
 }

 #ifdef __cplusplus
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -410,13 +410,17 @@ static void decode_partition(VP9_COMMON *const cm, MACROBLOCKD *const xd,
                             vp9_reader* r, BLOCK_SIZE bsize) {
  const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
  PARTITION_TYPE partition;
-  BLOCK_SIZE subsize;
+  BLOCK_SIZE subsize, uv_subsize;

  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
    return;

  partition = read_partition(cm, xd, hbs, mi_row, mi_col, bsize, r);
  subsize = get_subsize(bsize, partition);
+  uv_subsize = ss_size_lookup[subsize][cm->subsampling_x][cm->subsampling_y];
+  if (subsize >= BLOCK_8X8 && uv_subsize == BLOCK_INVALID)
+    vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                       "Invalid block size.");
  if (subsize < BLOCK_8X8) {
    decode_block(cm, xd, tile, mi_row, mi_col, r, subsize);
  } else {
@@ -617,6 +621,7 @@ static void setup_display_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
 }

 static void apply_frame_size(VP9_COMMON *cm, int width, int height) {
+  BufferPool *const pool = cm->buffer_pool;
  if (cm->width != width || cm->height != height) {
    // Change in frame size.
    // TODO(agrange) Don't test width/height, check overall size.
@@ -636,8 +641,8 @@ static void apply_frame_size(VP9_COMMON *cm, int width, int height) {
  if (vp9_realloc_frame_buffer(
          get_frame_new_buffer(cm), cm->width, cm->height,
          cm->subsampling_x, cm->subsampling_y, VP9_DEC_BORDER_IN_PIXELS,
-          &cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer, cm->get_fb_cb,
-          cm->cb_priv)) {
+          &pool->frame_bufs[cm->new_fb_idx].raw_frame_buffer, pool->get_fb_cb,
+          pool->cb_priv)) {
    vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                       "Failed to allocate frame buffer");
  }
@@ -667,9 +672,17 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm,
  if (!found)
    read_frame_size(rb, &width, &height);

-  if (width <= 0 || height <= 0)
-    vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
-                       "Referenced frame with invalid size");
+  // Check that each of the frames that this frame references has valid
+  // dimensions.
+  for (i = 0; i < REFS_PER_FRAME; ++i) {
+    RefBuffer *const ref_frame = &cm->frame_refs[i];
+    const int ref_width = ref_frame->buf->y_width;
+    const int ref_height = ref_frame->buf->y_height;
+
+    if (!valid_ref_frame_size(ref_width, ref_height, width, height))
+      vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                         "Referenced frame has invalid size");
+  }

  apply_frame_size(cm, width, height);
  setup_display_size(cm, rb);
@@ -685,6 +698,10 @@ static void setup_tile_info(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
  while (max_ones-- && vp9_rb_read_bit(rb))
    cm->log2_tile_cols++;

+  if (cm->log2_tile_cols > 6)
+    vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                       "Invalid number of tile columns");
+
  // rows
  cm->log2_tile_rows = vp9_rb_read_bit(rb);
  if (cm->log2_tile_rows)
@@ -755,6 +772,7 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
                                   const uint8_t *data,
                                   const uint8_t *data_end) {
  VP9_COMMON *const cm = &pbi->common;
+  const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
  const int aligned_cols = mi_cols_aligned_to_sb(cm->mi_cols);
  const int tile_cols = 1 << cm->log2_tile_cols;
  const int tile_rows = 1 << cm->log2_tile_rows;
@@ -767,7 +785,7 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
    CHECK_MEM_ERROR(cm, pbi->lf_worker.data1,
                    vpx_memalign(32, sizeof(LFWorkerData)));
    pbi->lf_worker.hook = (VP9WorkerHook)vp9_loop_filter_worker;
-    if (pbi->max_threads > 1 && !vp9_worker_reset(&pbi->lf_worker)) {
+    if (pbi->max_threads > 1 && !winterface->reset(&pbi->lf_worker)) {
      vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
                         "Loop filter thread creation failed");
    }
@@ -853,13 +871,13 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
        // decoding has completed: finish up the loop filter in this thread.
        if (mi_row + MI_BLOCK_SIZE >= cm->mi_rows) continue;

-        vp9_worker_sync(&pbi->lf_worker);
+        winterface->sync(&pbi->lf_worker);
        lf_data->start = lf_start;
        lf_data->stop = mi_row;
        if (pbi->max_threads > 1) {
-          vp9_worker_launch(&pbi->lf_worker);
+          winterface->launch(&pbi->lf_worker);
        } else {
-          vp9_worker_execute(&pbi->lf_worker);
+          winterface->execute(&pbi->lf_worker);
        }
      }
    }
@@ -868,10 +886,10 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
  // Loopfilter remaining rows in the frame.
  if (cm->lf.filter_level) {
    LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
-    vp9_worker_sync(&pbi->lf_worker);
+    winterface->sync(&pbi->lf_worker);
    lf_data->start = lf_data->stop;
    lf_data->stop = cm->mi_rows;
-    vp9_worker_execute(&pbi->lf_worker);
+    winterface->execute(&pbi->lf_worker);
  }

  // Get last tile data.
@@ -915,6 +933,7 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi,
                                      const uint8_t *data,
                                      const uint8_t *data_end) {
  VP9_COMMON *const cm = &pbi->common;
+  const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
  const uint8_t *bit_reader_end = NULL;
  const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
  const int tile_cols = 1 << cm->log2_tile_cols;
@@ -941,11 +960,11 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi,
      VP9Worker *const worker = &pbi->tile_workers[i];
      ++pbi->num_tile_workers;

-      vp9_worker_init(worker);
+      winterface->init(worker);
      CHECK_MEM_ERROR(cm, worker->data1,
                      vpx_memalign(32, sizeof(TileWorkerData)));
      CHECK_MEM_ERROR(cm, worker->data2, vpx_malloc(sizeof(TileInfo)));
-      if (i < num_threads - 1 && !vp9_worker_reset(worker)) {
+      if (i < num_threads - 1 && !winterface->reset(worker)) {
        vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
                           "Tile decoder thread creation failed");
      }
@@ -1008,9 +1027,9 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi,

      worker->had_error = 0;
      if (i == num_workers - 1 || n == tile_cols - 1) {
-        vp9_worker_execute(worker);
+        winterface->execute(worker);
      } else {
-        vp9_worker_launch(worker);
+        winterface->launch(worker);
      }

      if (buf->col == tile_cols - 1) {
@@ -1022,7 +1041,7 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi,

    for (; i > 0; --i) {
      VP9Worker *const worker = &pbi->tile_workers[i - 1];
-      pbi->mb.corrupted |= !vp9_worker_sync(worker);
+      pbi->mb.corrupted |= !winterface->sync(worker);
    }
    if (final_worker > -1) {
      TileWorkerData *const tile_data =
@@ -1058,8 +1077,9 @@ static BITSTREAM_PROFILE read_profile(struct vp9_read_bit_buffer *rb) {
 static size_t read_uncompressed_header(VP9Decoder *pbi,
                                       struct vp9_read_bit_buffer *rb) {
  VP9_COMMON *const cm = &pbi->common;
+  RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+  int mask, i, ref_index = 0;
  size_t sz;
-  int i;

  cm->last_frame_type = cm->frame_type;

@@ -1077,12 +1097,18 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
    // Show an existing frame directly.
    const int frame_to_show = cm->ref_frame_map[vp9_rb_read_literal(rb, 3)];

-    if (cm->frame_bufs[frame_to_show].ref_count < 1)
+#if CONFIG_MULTITHREAD
+    pthread_mutex_lock(&cm->buffer_pool->pool_mutex);
+#endif
+    if (frame_to_show < 0 || frame_bufs[frame_to_show].ref_count < 1)
      vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
                         "Buffer %d does not contain a decoded frame",
                         frame_to_show);

-    ref_cnt_fb(cm->frame_bufs, &cm->new_fb_idx, frame_to_show);
+    ref_cnt_fb(frame_bufs, &cm->new_fb_idx, frame_to_show);
+#if CONFIG_MULTITHREAD
+    pthread_mutex_unlock(&cm->buffer_pool->pool_mutex);
+#endif
    pbi->refresh_frame_flags = 0;
    cm->lf.filter_level = 0;
    cm->show_frame = 1;
@@ -1138,12 +1164,12 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
      setup_frame_size(cm, rb);
    } else {
      pbi->refresh_frame_flags = vp9_rb_read_literal(rb, REF_FRAMES);
-
      for (i = 0; i < REFS_PER_FRAME; ++i) {
        const int ref = vp9_rb_read_literal(rb, REF_FRAMES_LOG2);
        const int idx = cm->ref_frame_map[ref];
-        cm->frame_refs[i].idx = idx;
-        cm->frame_refs[i].buf = &cm->frame_bufs[idx].buf;
+        RefBuffer *const ref_frame = &cm->frame_refs[i];
+        ref_frame->idx = idx;
+        ref_frame->buf = &frame_bufs[idx].buf;
        cm->ref_frame_sign_bias[LAST_FRAME + i] = vp9_rb_read_bit(rb);
      }

@@ -1178,6 +1204,28 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
  // below, forcing the use of context 0 for those frame types.
  cm->frame_context_idx = vp9_rb_read_literal(rb, FRAME_CONTEXTS_LOG2);

+  // Update next_ref_frame_map in frame parallel decode.
+  if (pbi->frame_parallel_decode) {
+    for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) {
+      if (mask & 1) {
+        cm->next_ref_frame_map[ref_index] = cm->new_fb_idx;
+#if CONFIG_MULTITHREAD
+        pthread_mutex_lock(&cm->buffer_pool->pool_mutex);
+#endif
+        ++cm->buffer_pool->frame_bufs[cm->new_fb_idx].ref_count;
+#if CONFIG_MULTITHREAD
+        pthread_mutex_unlock(&cm->buffer_pool->pool_mutex);
+#endif
+      } else {
+        cm->next_ref_frame_map[ref_index] = cm->ref_frame_map[ref_index];
+      }
+      ++ref_index;
+    }
+
+    for (; ref_index < REF_FRAMES; ++ref_index)
+      cm->next_ref_frame_map[ref_index] = cm->ref_frame_map[ref_index];
+  }
+
  if (frame_is_intra_only(cm) || cm->error_resilient_mode)
    vp9_setup_past_independence(cm);

@@ -1322,7 +1370,8 @@ void vp9_decode_frame(VP9Decoder *pbi,
                      const uint8_t **p_data_end) {
  VP9_COMMON *const cm = &pbi->common;
  MACROBLOCKD *const xd = &pbi->mb;
-  struct vp9_read_bit_buffer rb = { 0 };
+  struct vp9_read_bit_buffer rb = { NULL, NULL, 0, NULL, 0};
+
  uint8_t clear_data[MAX_VP9_HEADER_SIZE];
  const size_t first_partition_size = read_uncompressed_header(pbi,
      init_read_bit_buffer(pbi, &rb, data, data_end, clear_data));
@@ -1373,6 +1422,17 @@ void vp9_decode_frame(VP9Decoder *pbi,

  new_fb->corrupted |= xd->corrupted;

+  // Update progress in frame parallel decode.
+  if (pbi->frame_parallel_decode) {
+    VP9Worker *worker = pbi->owner_frame_worker;
+    FrameWorkerData *const worker_data = worker->data1;
+    pthread_mutex_lock(&worker_data->stats_mutex);
+    pbi->cur_buf->row = INT_MAX;
+    pbi->cur_buf->col = INT_MAX;
+    pthread_cond_signal(&worker_data->stats_cond);
+    pthread_mutex_unlock(&worker_data->stats_mutex);
+  }
+
  if (!new_fb->corrupted) {
    if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode) {
      vp9_adapt_coef_probs(cm);
--- a/vp9/decoder/vp9_decoder.c
+++ b/vp9/decoder/vp9_decoder.c
@@ -37,12 +37,11 @@ static void initialize_dec() {

  if (!init_done) {
    vp9_init_neighbors();
-    vp9_init_quant_tables();
    init_done = 1;
  }
 }

-VP9Decoder *vp9_decoder_create() {
+VP9Decoder *vp9_decoder_create(BufferPool *const pool) {
  VP9Decoder *const pbi = vpx_memalign(32, sizeof(*pbi));
  VP9_COMMON *const cm = pbi ? &pbi->common : NULL;

@@ -64,9 +63,11 @@ VP9Decoder *vp9_decoder_create() {

  // Initialize the references to not point to any frame buffers.
  vpx_memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
+  vpx_memset(&cm->next_ref_frame_map, -1, sizeof(cm->next_ref_frame_map));

  cm->current_video_frame = 0;
  pbi->ready_for_new_data = 1;
+  pbi->common.buffer_pool = pool;

  // vp9_init_dequantizer() is first called here. Add check in
  // frame_init_dequantizer() to avoid unnecessary calling of
@@ -77,7 +78,7 @@ VP9Decoder *vp9_decoder_create() {

  cm->error.setjmp = 0;

-  vp9_worker_init(&pbi->lf_worker);
+  vp9_get_worker_interface()->init(&pbi->lf_worker);

  return pbi;
 }
@@ -87,12 +88,12 @@ void vp9_decoder_remove(VP9Decoder *pbi) {
  int i;

  vp9_remove_common(cm);
-  vp9_worker_end(&pbi->lf_worker);
+  vp9_get_worker_interface()->end(&pbi->lf_worker);
  vpx_free(pbi->lf_worker.data1);
  vpx_free(pbi->tile_data);
  for (i = 0; i < pbi->num_tile_workers; ++i) {
    VP9Worker *const worker = &pbi->tile_workers[i];
-    vp9_worker_end(worker);
+    vp9_get_worker_interface()->end(worker);
    vpx_free(worker->data1);
    vpx_free(worker->data2);
  }
@@ -125,7 +126,7 @@ vpx_codec_err_t vp9_copy_reference_dec(VP9Decoder *pbi,
   */
  if (ref_frame_flag == VP9_LAST_FLAG) {
    const YV12_BUFFER_CONFIG *const cfg =
-        &cm->frame_bufs[cm->ref_frame_map[0]].buf;
+        &cm->buffer_pool->frame_bufs[cm->ref_frame_map[0]].buf;
    if (!equal_dimensions(cfg, sd))
      vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
                         "Incorrect buffer dimensions");
@@ -144,6 +145,7 @@ vpx_codec_err_t vp9_set_reference_dec(VP9_COMMON *cm,
                                      VP9_REFFRAME ref_frame_flag,
                                      YV12_BUFFER_CONFIG *sd) {
  RefBuffer *ref_buf = NULL;
+  RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;

  // TODO(jkoleszar): The decoder doesn't have any real knowledge of what the
  // encoder is using the frame buffers for. This is just a stub to keep the
@@ -171,11 +173,11 @@ vpx_codec_err_t vp9_set_reference_dec(VP9_COMMON *cm,
    const int free_fb = get_free_fb(cm);
    // Decrease ref_count since it will be increased again in
    // ref_cnt_fb() below.
-    cm->frame_bufs[free_fb].ref_count--;
+    --frame_bufs[free_fb].ref_count;

    // Manage the reference counters and copy image.
-    ref_cnt_fb(cm->frame_bufs, ref_fb_ptr, free_fb);
-    ref_buf->buf = &cm->frame_bufs[*ref_fb_ptr].buf;
+    ref_cnt_fb(frame_bufs, ref_fb_ptr, free_fb);
+    ref_buf->buf = &frame_bufs[*ref_fb_ptr].buf;
    vp8_yv12_copy_frame(sd, ref_buf->buf);
  }

@@ -185,35 +187,44 @@ vpx_codec_err_t vp9_set_reference_dec(VP9_COMMON *cm,

 int vp9_get_reference_dec(VP9Decoder *pbi, int index, YV12_BUFFER_CONFIG **fb) {
  VP9_COMMON *cm = &pbi->common;
+  RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;

  if (index < 0 || index >= REF_FRAMES)
    return -1;

-  *fb = &cm->frame_bufs[cm->ref_frame_map[index]].buf;
+  *fb = &frame_bufs[cm->ref_frame_map[index]].buf;
  return 0;
 }

 /* If any buffer updating is signaled it should be done here. */
 static void swap_frame_buffers(VP9Decoder *pbi) {
  int ref_index = 0, mask;
-  VP9_COMMON *const cm = &pbi->common;
+  VP9_COMMON * const cm = &pbi->common;
+  BufferPool * const pool = cm->buffer_pool;
+  RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;

  for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) {
    if (mask & 1) {
      const int old_idx = cm->ref_frame_map[ref_index];
-      ref_cnt_fb(cm->frame_bufs, &cm->ref_frame_map[ref_index],
+#if CONFIG_MULTITHREAD
+      pthread_mutex_lock(&cm->buffer_pool->pool_mutex);
+#endif
+      ref_cnt_fb(frame_bufs, &cm->ref_frame_map[ref_index],
                 cm->new_fb_idx);
-      if (old_idx >= 0 && cm->frame_bufs[old_idx].ref_count == 0)
-        cm->release_fb_cb(cm->cb_priv,
-                          &cm->frame_bufs[old_idx].raw_frame_buffer);
+      if (old_idx >= 0 && frame_bufs[old_idx].ref_count == 0)
+        pool->release_fb_cb(pool->cb_priv,
+                            &frame_bufs[old_idx].raw_frame_buffer);
    }
+#if CONFIG_MULTITHREAD
+      pthread_mutex_unlock(&cm->buffer_pool->pool_mutex);
+#endif
    ++ref_index;
  }

  cm->frame_to_show = get_frame_new_buffer(cm);

  if (!pbi->frame_parallel_decode || !cm->show_frame) {
-    --cm->frame_bufs[cm->new_fb_idx].ref_count;
+    --frame_bufs[cm->new_fb_idx].ref_count;
  }

  // Invalidate these references until the next frame starts.
@@ -224,6 +235,8 @@ static void swap_frame_buffers(VP9Decoder *pbi) {
 int vp9_receive_compressed_data(VP9Decoder *pbi,
                                size_t size, const uint8_t **psource) {
  VP9_COMMON *const cm = &pbi->common;
+  BufferPool *const pool = cm->buffer_pool;
+  RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
  const uint8_t *source = *psource;
  int retcode = 0;

@@ -244,12 +257,23 @@ int vp9_receive_compressed_data(VP9Decoder *pbi,

  // Check if the previous frame was a frame without any references to it.
  // Release frame buffer if not decoding in frame parallel mode.
-  if (!pbi->frame_parallel_decode && cm->new_fb_idx >= 0 &&
-         cm->frame_bufs[cm->new_fb_idx].ref_count == 0)
-    cm->release_fb_cb(cm->cb_priv,
-                      &cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer);
+  if (!pbi->frame_parallel_decode && cm->new_fb_idx >= 0
+      && frame_bufs[cm->new_fb_idx].ref_count == 0)
+    pool->release_fb_cb(pool->cb_priv,
+                        &frame_bufs[cm->new_fb_idx].raw_frame_buffer);
  cm->new_fb_idx = get_free_fb(cm);

+  if (pbi->frame_parallel_decode) {
+    VP9Worker *worker = pbi->owner_frame_worker;
+    FrameWorkerData *const worker_data = worker->data1;
+    pbi->cur_buf = &pool->frame_bufs[cm->new_fb_idx];
+    pool->frame_bufs[cm->new_fb_idx].owner_worker_id = worker_data->worker_id;
+
+    // Reset the decoding progress.
+    pbi->cur_buf->row = -1;
+    pbi->cur_buf->col = -1;
+  }
+
  if (setjmp(cm->error.jmp)) {
    cm->error.setjmp = 0;

@@ -260,11 +284,11 @@ int vp9_receive_compressed_data(VP9Decoder *pbi,
    // TODO(jkoleszar): Error concealment is undefined and non-normative
    // at this point, but if it becomes so, [0] may not always be the correct
    // thing to do here.
-    if (cm->frame_refs[0].idx != INT_MAX)
+    if (cm->frame_refs[0].idx != INT_MAX && cm->frame_refs[0].buf != NULL)
      cm->frame_refs[0].buf->corrupted = 1;

-    if (cm->frame_bufs[cm->new_fb_idx].ref_count > 0)
-      cm->frame_bufs[cm->new_fb_idx].ref_count--;
+    if (frame_bufs[cm->new_fb_idx].ref_count > 0)
+      --frame_bufs[cm->new_fb_idx].ref_count;

    return -1;
  }
--- a/vp9/decoder/vp9_decoder.h
+++ b/vp9/decoder/vp9_decoder.h
@@ -45,8 +45,14 @@ typedef struct VP9Decoder {

  int frame_parallel_decode;  // frame-based threading.

+  // TODO(hkuang): Combine this with cur_buf in macroblockd as they are
+  // the same.
+  RefCntBuffer *cur_buf;  //  current decoding reference buffer.
+
  VP9Worker lf_worker;
  VP9Worker *tile_workers;
+  VP9Worker *owner_frame_worker;   // frame_worker that owns this pbi;
+
  int num_tile_workers;

  TileData *tile_data;
@@ -78,7 +84,7 @@ vpx_codec_err_t vp9_set_reference_dec(VP9_COMMON *cm,
 int vp9_get_reference_dec(struct VP9Decoder *pbi,
                          int index, YV12_BUFFER_CONFIG **fb);

-struct VP9Decoder *vp9_decoder_create();
+struct VP9Decoder *vp9_decoder_create(BufferPool *const pool);

 void vp9_decoder_remove(struct VP9Decoder *pbi);

--- a/vp9/decoder/vp9_dthread.c
+++ b/vp9/decoder/vp9_dthread.c
@@ -124,7 +124,7 @@ static void loop_filter_rows_mt(const YV12_BUFFER_CONFIG *const frame_buffer,
 static int loop_filter_row_worker(void *arg1, void *arg2) {
  TileWorkerData *const tile_data = (TileWorkerData*)arg1;
  LFWorkerData *const lf_data = &tile_data->lfdata;
-
+  (void) arg2;
  loop_filter_rows_mt(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
                      lf_data->start, lf_data->stop, lf_data->y_only,
                      lf_data->lf_sync, lf_data->num_lf_workers);
@@ -138,6 +138,7 @@ void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
                              int frame_filter_level,
                              int y_only) {
  VP9LfSync *const lf_sync = &pbi->lf_row_sync;
+  const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
  // Number of superblock rows and cols
  const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
  const int tile_cols = 1 << cm->log2_tile_cols;
@@ -197,15 +198,15 @@ void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,

    // Start loopfiltering
    if (i == num_workers - 1) {
-      vp9_worker_execute(worker);
+      winterface->execute(worker);
    } else {
-      vp9_worker_launch(worker);
+      winterface->launch(worker);
    }
  }

  // Wait till all rows are finished
  for (i = 0; i < num_workers; ++i) {
-    vp9_worker_sync(&pbi->tile_workers[i]);
+    winterface->sync(&pbi->tile_workers[i]);
  }
 }

@@ -278,3 +279,78 @@ void vp9_loop_filter_dealloc(VP9LfSync *lf_sync, int rows) {
    vp9_zero(*lf_sync);
  }
 }
+
+void vp9_frameworker_wait(VP9Worker* const worker, int row, int col,
+                          RefCntBuffer *ref_buf) {
+  FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
+  const VP9Decoder *const pbi = worker_data->pbi;
+  const RefCntBuffer *const cur_buf = pbi->cur_buf;
+
+  // Check if worker already release the ref_buf.
+  if (!worker || ref_buf->owner_worker_id == -1) return;
+
+  pthread_mutex_lock(&worker_data->stats_mutex);
+  while (!(cur_buf->row >= row && cur_buf->col >= col)
+         && pbi->cur_buf == ref_buf && ref_buf->owner_worker_id != -1) {
+    pthread_cond_wait(&worker_data->stats_cond, &worker_data->stats_mutex);
+  }
+  pthread_mutex_unlock(&worker_data->stats_mutex);
+}
+
+void vp9_frameworker_broadcast(VP9Worker* const worker, int row, int col) {
+  FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
+  const VP9Decoder *const pbi = worker_data->pbi;
+  RefCntBuffer *const cur_buf = pbi->cur_buf;
+
+  pthread_mutex_lock(&worker_data->stats_mutex);
+  cur_buf->row = row;
+  cur_buf->col = col;
+  pthread_cond_signal(&worker_data->stats_cond);
+  pthread_mutex_unlock(&worker_data->stats_mutex);
+}
+
+void vp9_frameworker_copy_context(VP9Worker *const dst_worker,
+                                  const VP9Worker *const src_worker) {
+  FrameWorkerData *const src_worker_data =
+      (FrameWorkerData *)dst_worker->data1;
+  FrameWorkerData *const dst_worker_data =
+      (FrameWorkerData *)src_worker->data1;
+  const VP9_COMMON *const src_cm = &src_worker_data->pbi->common;
+  VP9_COMMON *const dst_cm = &dst_worker_data->pbi->common;
+  int i;
+
+  // Wait until source frame's context is ready.
+  pthread_mutex_lock(&src_worker_data->stats_mutex);
+  while (!src_worker_data->frame_context_ready) {
+    pthread_cond_wait(&src_worker_data->stats_cond,
+                      &src_worker_data->stats_mutex);
+  }
+  pthread_mutex_unlock(&src_worker_data->stats_mutex);
+
+  dst_cm->last_width = src_cm->width;
+  dst_cm->last_height = src_cm->height;
+  dst_cm->subsampling_x = src_cm->subsampling_x;
+  dst_cm->subsampling_y = src_cm->subsampling_y;
+
+  for (i = 0; i < REF_FRAMES; ++i)
+    dst_cm->ref_frame_map[i] = src_cm->next_ref_frame_map[i];
+
+  dst_cm->last_show_frame = src_cm->show_frame;
+
+  dst_cm->prev_mip = src_cm->mip;
+  dst_cm->prev_mi = src_cm->mi;
+  dst_cm->prev_mi_grid_base = src_cm->mi_grid_base;
+  dst_cm->prev_mi_grid_visible = src_cm->mi_grid_visible;
+  dst_cm->lf.last_sharpness_level = src_cm->lf.sharpness_level;
+
+  for (i = 0; i < MAX_REF_LF_DELTAS; ++i) {
+    dst_cm->lf.last_ref_deltas[i] = src_cm->lf.ref_deltas[i];
+    dst_cm->lf.ref_deltas[i] = src_cm->lf.ref_deltas[i];
+  }
+
+  for (i = 0; i < MAX_MODE_LF_DELTAS; ++i)
+    dst_cm-> lf.last_mode_deltas[i] = src_cm->lf.mode_deltas[i];
+
+  for (i = 0; i < FRAME_CONTEXTS; ++i)
+    dst_cm-> frame_contexts[i] = src_cm->frame_contexts[i];
+}
--- a/vp9/decoder/vp9_dthread.h
+++ b/vp9/decoder/vp9_dthread.h
@@ -40,6 +40,28 @@ typedef struct VP9LfSyncData {
  int sync_range;
 } VP9LfSync;

+// WorkerData for the FrameWorker thread. It contains all the information of
+// the worker and decode structures for decoding a frame.
+typedef struct FrameWorkerData {
+  struct VP9Decoder *pbi;
+  const uint8_t *data;
+  const uint8_t *data_end;
+  size_t data_size;
+  void *user_priv;
+  int result;
+  int worker_id;
+
+  // scratch_buffer is used in frame parallel mode only.
+  // It is used to make a copy of the compressed data.
+  uint8_t *scratch_buffer;
+  size_t scratch_buffer_size;
+
+  pthread_mutex_t stats_mutex;
+  pthread_cond_t stats_cond;
+
+  int frame_context_ready;  // Current frame's context is ready to read.
+} FrameWorkerData;
+
 // Allocate memory for loopfilter row synchronization.
 void vp9_loop_filter_alloc(struct VP9Common *cm, VP9LfSync *lf_sync,
                           int rows, int width);
@@ -54,4 +76,19 @@ void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
                              int frame_filter_level,
                              int y_only);

+// Wait for FrameWorker to finish decoding ref_buf to (r,c) position.
+// Note: worker may already finish decoding ref_buf and release it in order to
+// start decoding next frame. So need to check whether worker is still decoding
+// ref_buf.
+void vp9_frameworker_wait(VP9Worker* const worker, int row, int col,
+                          RefCntBuffer *ref_buf);
+
+// FrameWorker broadcasts its decoding progress so other workers that are
+// waiting it could resume decoding.
+void vp9_frameworker_broadcast(VP9Worker* const worker, int row, int col);
+
+// Copy necessary decoding context from src worker to dst worker.
+void vp9_frameworker_copy_context(VP9Worker *const dst_worker,
+                                 const VP9Worker *const src_worker);
+
 #endif  // VP9_DECODER_VP9_DTHREAD_H_
--- a/vp9/decoder/vp9_read_bit_buffer.c
+++ b/vp9/decoder/vp9_read_bit_buffer.c
@@ -10,7 +10,7 @@
 #include "vp9/decoder/vp9_read_bit_buffer.h"

 size_t vp9_rb_bytes_read(struct vp9_read_bit_buffer *rb) {
-  return rb->bit_offset / CHAR_BIT + (rb->bit_offset % CHAR_BIT > 0);
+  return (rb->bit_offset + CHAR_BIT - 1) / CHAR_BIT;
 }

 int vp9_rb_read_bit(struct vp9_read_bit_buffer *rb) {
--- a/vp9/decoder/vp9_thread.c
+++ b/vp9/decoder/vp9_thread.c
@@ -11,71 +11,79 @@
 //
 // Original source:
 //  http://git.chromium.org/webm/libwebp.git
-//  100644 blob eff8f2a8c20095aade3c292b0e9292dac6cb3587  src/utils/thread.c
-
+//  100644 blob 08ad4e1fecba302bf1247645e84a7d2779956bc3  src/utils/thread.c

 #include <assert.h>
 #include <string.h>   // for memset()
 #include "./vp9_thread.h"
-
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
+#include "vpx_mem/vpx_mem.h"

 #if CONFIG_MULTITHREAD

+struct VP9WorkerImpl {
+  pthread_mutex_t mutex_;
+  pthread_cond_t  condition_;
+  pthread_t       thread_;
+};
+
 //------------------------------------------------------------------------------

-static THREADFN thread_loop(void *ptr) {    // thread loop
-  VP9Worker* const worker = (VP9Worker*)ptr;
+static void execute(VP9Worker *const worker);  // Forward declaration.
+
+static THREADFN thread_loop(void *ptr) {
+  VP9Worker *const worker = (VP9Worker*)ptr;
  int done = 0;
  while (!done) {
-    pthread_mutex_lock(&worker->mutex_);
+    pthread_mutex_lock(&worker->impl_->mutex_);
    while (worker->status_ == OK) {   // wait in idling mode
-      pthread_cond_wait(&worker->condition_, &worker->mutex_);
+      pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_);
    }
    if (worker->status_ == WORK) {
-      vp9_worker_execute(worker);
+      execute(worker);
      worker->status_ = OK;
    } else if (worker->status_ == NOT_OK) {   // finish the worker
      done = 1;
    }
-    // signal to the main thread that we're done (for Sync())
-    pthread_cond_signal(&worker->condition_);
-    pthread_mutex_unlock(&worker->mutex_);
+    // signal to the main thread that we're done (for sync())
+    pthread_cond_signal(&worker->impl_->condition_);
+    pthread_mutex_unlock(&worker->impl_->mutex_);
  }
  return THREAD_RETURN(NULL);    // Thread is finished
 }

 // main thread state control
-static void change_state(VP9Worker* const worker,
+static void change_state(VP9Worker *const worker,
                         VP9WorkerStatus new_status) {
-  // no-op when attempting to change state on a thread that didn't come up
-  if (worker->status_ < OK) return;
+  // No-op when attempting to change state on a thread that didn't come up.
+  // Checking status_ without acquiring the lock first would result in a data
+  // race.
+  if (worker->impl_ == NULL) return;

-  pthread_mutex_lock(&worker->mutex_);
-  // wait for the worker to finish
-  while (worker->status_ != OK) {
-    pthread_cond_wait(&worker->condition_, &worker->mutex_);
+  pthread_mutex_lock(&worker->impl_->mutex_);
+  if (worker->status_ >= OK) {
+    // wait for the worker to finish
+    while (worker->status_ != OK) {
+      pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_);
+    }
+    // assign new status and release the working thread if needed
+    if (new_status != OK) {
+      worker->status_ = new_status;
+      pthread_cond_signal(&worker->impl_->condition_);
+    }
  }
-  // assign new status and release the working thread if needed
-  if (new_status != OK) {
-    worker->status_ = new_status;
-    pthread_cond_signal(&worker->condition_);
-  }
-  pthread_mutex_unlock(&worker->mutex_);
+  pthread_mutex_unlock(&worker->impl_->mutex_);
 }

 #endif  // CONFIG_MULTITHREAD

 //------------------------------------------------------------------------------

-void vp9_worker_init(VP9Worker* const worker) {
+static void init(VP9Worker *const worker) {
  memset(worker, 0, sizeof(*worker));
  worker->status_ = NOT_OK;
 }

-int vp9_worker_sync(VP9Worker* const worker) {
+static int sync(VP9Worker *const worker) {
 #if CONFIG_MULTITHREAD
  change_state(worker, OK);
 #endif
@@ -83,59 +91,93 @@ int vp9_worker_sync(VP9Worker* const worker) {
  return !worker->had_error;
 }

-int vp9_worker_reset(VP9Worker* const worker) {
+static int reset(VP9Worker *const worker) {
  int ok = 1;
  worker->had_error = 0;
  if (worker->status_ < OK) {
 #if CONFIG_MULTITHREAD
-    if (pthread_mutex_init(&worker->mutex_, NULL) ||
-        pthread_cond_init(&worker->condition_, NULL)) {
+    worker->impl_ = (VP9WorkerImpl*)vpx_calloc(1, sizeof(*worker->impl_));
+    if (worker->impl_ == NULL) {
      return 0;
    }
-    pthread_mutex_lock(&worker->mutex_);
-    ok = !pthread_create(&worker->thread_, NULL, thread_loop, worker);
+    if (pthread_mutex_init(&worker->impl_->mutex_, NULL)) {
+      goto Error;
+    }
+    if (pthread_cond_init(&worker->impl_->condition_, NULL)) {
+      pthread_mutex_destroy(&worker->impl_->mutex_);
+      goto Error;
+    }
+    pthread_mutex_lock(&worker->impl_->mutex_);
+    ok = !pthread_create(&worker->impl_->thread_, NULL, thread_loop, worker);
    if (ok) worker->status_ = OK;
-    pthread_mutex_unlock(&worker->mutex_);
+    pthread_mutex_unlock(&worker->impl_->mutex_);
+    if (!ok) {
+      pthread_mutex_destroy(&worker->impl_->mutex_);
+      pthread_cond_destroy(&worker->impl_->condition_);
+ Error:
+      vpx_free(worker->impl_);
+      worker->impl_ = NULL;
+      return 0;
+    }
 #else
    worker->status_ = OK;
 #endif
  } else if (worker->status_ > OK) {
-    ok = vp9_worker_sync(worker);
+    ok = sync(worker);
  }
  assert(!ok || (worker->status_ == OK));
  return ok;
 }

-void vp9_worker_execute(VP9Worker* const worker) {
+static void execute(VP9Worker *const worker) {
  if (worker->hook != NULL) {
    worker->had_error |= !worker->hook(worker->data1, worker->data2);
  }
 }

-void vp9_worker_launch(VP9Worker* const worker) {
+static void launch(VP9Worker *const worker) {
 #if CONFIG_MULTITHREAD
  change_state(worker, WORK);
 #else
-  vp9_worker_execute(worker);
+  execute(worker);
 #endif
 }

-void vp9_worker_end(VP9Worker* const worker) {
+static void end(VP9Worker *const worker) {
  if (worker->status_ >= OK) {
 #if CONFIG_MULTITHREAD
    change_state(worker, NOT_OK);
-    pthread_join(worker->thread_, NULL);
-    pthread_mutex_destroy(&worker->mutex_);
-    pthread_cond_destroy(&worker->condition_);
+    pthread_join(worker->impl_->thread_, NULL);
+    pthread_mutex_destroy(&worker->impl_->mutex_);
+    pthread_cond_destroy(&worker->impl_->condition_);
 #else
    worker->status_ = NOT_OK;
 #endif
  }
+  vpx_free(worker->impl_);
+  worker->impl_ = NULL;
  assert(worker->status_ == NOT_OK);
 }

 //------------------------------------------------------------------------------

-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
+static VP9WorkerInterface g_worker_interface = {
+  init, reset, sync, launch, execute, end
+};
+
+int vp9_set_worker_interface(const VP9WorkerInterface* const winterface) {
+  if (winterface == NULL ||
+      winterface->init == NULL || winterface->reset == NULL ||
+      winterface->sync == NULL || winterface->launch == NULL ||
+      winterface->execute == NULL || winterface->end == NULL) {
+    return 0;
+  }
+  g_worker_interface = *winterface;
+  return 1;
+}
+
+const VP9WorkerInterface *vp9_get_worker_interface(void) {
+  return &g_worker_interface;
+}
+
+//------------------------------------------------------------------------------
--- a/vp9/decoder/vp9_thread.h
+++ b/vp9/decoder/vp9_thread.h
@@ -11,8 +11,7 @@
 //
 // Original source:
 //  http://git.chromium.org/webm/libwebp.git
-//  100644 blob 13a61a4c84194c3374080cbf03d881d3cd6af40d  src/utils/thread.h
-
+//  100644 blob 7bd451b124ae3b81596abfbcc823e3cb129d3a38  src/utils/thread.h

 #ifndef VP9_DECODER_VP9_THREAD_H_
 #define VP9_DECODER_VP9_THREAD_H_
@@ -163,40 +162,53 @@ typedef enum {
 // arguments (data1 and data2), and should return false in case of error.
 typedef int (*VP9WorkerHook)(void*, void*);

-// Synchronize object used to launch job in the worker thread
+// Platform-dependent implementation details for the worker.
+typedef struct VP9WorkerImpl VP9WorkerImpl;
+
+// Synchronization object used to launch job in the worker thread
 typedef struct {
-#if CONFIG_MULTITHREAD
-  pthread_mutex_t mutex_;
-  pthread_cond_t  condition_;
-  pthread_t       thread_;
-#endif
+  VP9WorkerImpl *impl_;
  VP9WorkerStatus status_;
  VP9WorkerHook hook;     // hook to call
-  void* data1;            // first argument passed to 'hook'
-  void* data2;            // second argument passed to 'hook'
+  void *data1;            // first argument passed to 'hook'
+  void *data2;            // second argument passed to 'hook'
  int had_error;          // return value of the last call to 'hook'
 } VP9Worker;

-// Must be called first, before any other method.
-void vp9_worker_init(VP9Worker* const worker);
-// Must be called to initialize the object and spawn the thread. Re-entrant.
-// Will potentially launch the thread. Returns false in case of error.
-int vp9_worker_reset(VP9Worker* const worker);
-// Makes sure the previous work is finished. Returns true if worker->had_error
-// was not set and no error condition was triggered by the working thread.
-int vp9_worker_sync(VP9Worker* const worker);
-// Triggers the thread to call hook() with data1 and data2 argument. These
-// hook/data1/data2 can be changed at any time before calling this function,
-// but not be changed afterward until the next call to vp9_worker_sync().
-void vp9_worker_launch(VP9Worker* const worker);
-// This function is similar to vp9_worker_launch() except that it calls the
-// hook directly instead of using a thread. Convenient to bypass the thread
-// mechanism while still using the VP9Worker structs. vp9_worker_sync() must
-// still be called afterward (for error reporting).
-void vp9_worker_execute(VP9Worker* const worker);
-// Kill the thread and terminate the object. To use the object again, one
-// must call vp9_worker_reset() again.
-void vp9_worker_end(VP9Worker* const worker);
+// The interface for all thread-worker related functions. All these functions
+// must be implemented.
+typedef struct {
+  // Must be called first, before any other method.
+  void (*init)(VP9Worker *const worker);
+  // Must be called to initialize the object and spawn the thread. Re-entrant.
+  // Will potentially launch the thread. Returns false in case of error.
+  int (*reset)(VP9Worker *const worker);
+  // Makes sure the previous work is finished. Returns true if worker->had_error
+  // was not set and no error condition was triggered by the working thread.
+  int (*sync)(VP9Worker *const worker);
+  // Triggers the thread to call hook() with data1 and data2 arguments. These
+  // hook/data1/data2 values can be changed at any time before calling this
+  // function, but not be changed afterward until the next call to Sync().
+  void (*launch)(VP9Worker *const worker);
+  // This function is similar to launch() except that it calls the
+  // hook directly instead of using a thread. Convenient to bypass the thread
+  // mechanism while still using the VP9Worker structs. sync() must
+  // still be called afterward (for error reporting).
+  void (*execute)(VP9Worker *const worker);
+  // Kill the thread and terminate the object. To use the object again, one
+  // must call reset() again.
+  void (*end)(VP9Worker *const worker);
+} VP9WorkerInterface;
+
+// Install a new set of threading functions, overriding the defaults. This
+// should be done before any workers are started, i.e., before any encoding or
+// decoding takes place. The contents of the interface struct are copied, it
+// is safe to free the corresponding memory after this call. This function is
+// not thread-safe. Return false in case of invalid pointer or methods.
+int vp9_set_worker_interface(const VP9WorkerInterface *const winterface);
+
+// Retrieve the currently set thread worker interface.
+const VP9WorkerInterface *vp9_get_worker_interface(void);

 //------------------------------------------------------------------------------

--- a/vp9/encoder/vp9_aq_complexity.c
+++ b/vp9/encoder/vp9_aq_complexity.c
@@ -15,8 +15,19 @@

 #include "vp9/encoder/vp9_segmentation.h"

-static const double in_frame_q_adj_ratio[MAX_SEGMENTS] =
-  {1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
+#define AQ_C_SEGMENTS  3
+#define AQ_C_STRENGTHS  3
+static const int aq_c_active_segments[AQ_C_STRENGTHS] = {1, 2, 3};
+static const double aq_c_q_adj_factor[AQ_C_STRENGTHS][AQ_C_SEGMENTS] =
+  {{1.0, 1.0, 1.0}, {1.0, 2.0, 1.0}, {1.0, 1.5, 2.5}};
+static const double aq_c_transitions[AQ_C_STRENGTHS][AQ_C_SEGMENTS] =
+  {{1.0, 1.0, 1.0}, {1.0, 0.25, 0.0}, {1.0, 0.5, 0.25}};
+
+static int get_aq_c_strength(int q_index) {
+  // Approximate base quatizer (truncated to int)
+  int base_quant = vp9_ac_quant(q_index, 0) / 4;
+  return (base_quant > 20) + (base_quant > 45);
+}

 void vp9_setup_in_frame_q_adj(VP9_COMP *cpi) {
  VP9_COMMON *const cm = &cpi->common;
@@ -29,6 +40,8 @@ void vp9_setup_in_frame_q_adj(VP9_COMP *cpi) {
      cpi->refresh_alt_ref_frame ||
      (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
    int segment;
+    const int aq_strength = get_aq_c_strength(cm->base_qindex);
+    const int active_segments = aq_c_active_segments[aq_strength];

    // Clear down the segment map.
    vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
@@ -36,9 +49,17 @@ void vp9_setup_in_frame_q_adj(VP9_COMP *cpi) {
    // Clear down the complexity map used for rd.
    vpx_memset(cpi->complexity_map, 0, cm->mi_rows * cm->mi_cols);

-    vp9_enable_segmentation(seg);
    vp9_clearall_segfeatures(seg);

+    // Segmentation only makes sense if the target bits per SB is above a
+    // threshold. Below this the overheads will usually outweigh any benefit.
+    if (cpi->rc.sb64_target_rate < 256) {
+      vp9_disable_segmentation(seg);
+      return;
+    }
+
+    vp9_enable_segmentation(seg);
+
    // Select delta coding method.
    seg->abs_delta = SEGMENT_DELTADATA;

@@ -46,14 +67,14 @@ void vp9_setup_in_frame_q_adj(VP9_COMP *cpi) {
    vp9_disable_segfeature(seg, 0, SEG_LVL_ALT_Q);

    // Use some of the segments for in frame Q adjustment.
-    for (segment = 1; segment < 2; segment++) {
+    for (segment = 1; segment < active_segments; ++segment) {
      int qindex_delta =
          vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, cm->base_qindex,
-                                     in_frame_q_adj_ratio[segment]);
+                                     aq_c_q_adj_factor[aq_strength][segment]);

-      // For AQ mode 2, we dont allow Q0 in a segment if the base Q is not 0.
-      // Q0 (lossless) implies 4x4 only and in AQ mode 2 a segment Q delta
-      // is sometimes applied without going back around the rd loop.
+      // For AQ complexity mode, we dont allow Q0 in a segment if the base
+      // Q is not 0. Q0 (lossless) implies 4x4 only and in AQ mode 2 a segment
+      // Q delta is sometimes applied without going back around the rd loop.
      // This could lead to an illegal combination of partition size and q.
      if ((cm->base_qindex != 0) && ((cm->base_qindex + qindex_delta) == 0)) {
        qindex_delta = -cm->base_qindex + 1;
@@ -66,10 +87,15 @@ void vp9_setup_in_frame_q_adj(VP9_COMP *cpi) {
  }
 }

-// Select a segment for the current SB64
+// Select a segment for the current SB64 block.
+// The choice of segment for a block depends on the ratio of the projected
+// bits for the block vs a target average.
+// An "aq_strength" value determines how many segments are supported,
+// the set of transition points to use and the extent of the quantizer
+// adjustment for each segment (configured in vp9_setup_in_frame_q_adj()).
 void vp9_select_in_frame_q_segment(VP9_COMP *cpi,
-                                      int mi_row, int mi_col,
-                                      int output_enabled, int projected_rate) {
+                                   int mi_row, int mi_col,
+                                   int output_enabled, int projected_rate) {
  VP9_COMMON *const cm = &cpi->common;

  const int mi_offset = mi_row * cm->mi_cols + mi_col;
@@ -89,11 +115,22 @@ void vp9_select_in_frame_q_segment(VP9_COMP *cpi,
    // It is converted to bits * 256 units.
    const int target_rate = (cpi->rc.sb64_target_rate * xmis * ymis * 256) /
                            (bw * bh);
+    const int aq_strength = get_aq_c_strength(cm->base_qindex);
+    const int active_segments = aq_c_active_segments[aq_strength];

-    if (projected_rate < (target_rate / 4)) {
-      segment = 1;
-    } else {
-      segment = 0;
+    // The number of segments considered and the transition points used to
+    // select them is determined by the "aq_strength" value.
+    // Currently this loop only supports segments that reduce Q (i.e. where
+    // there is undershoot.
+    // The loop counts down towards segment 0 which is the default segment
+    // with no Q adjustment.
+    segment = active_segments - 1;
+    while (segment > 0) {
+      if (projected_rate <
+          (target_rate * aq_c_transitions[aq_strength][segment])) {
+        break;
+      }
+      --segment;
    }

    if (target_rate > 0) {
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -890,14 +890,8 @@ static void write_tile_info(VP9_COMMON *cm, struct vp9_write_bit_buffer *wb) {
 }

 static int get_refresh_mask(VP9_COMP *cpi) {
-    // Should the GF or ARF be updated using the transmitted frame or buffer
-#if CONFIG_MULTIPLE_ARF
-    if (!cpi->multi_arf_enabled && cpi->refresh_golden_frame &&
-        !cpi->refresh_alt_ref_frame) {
-#else
-    if (cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame &&
-        !cpi->use_svc) {
-#endif
+    if (!cpi->multi_arf_allowed && cpi->refresh_golden_frame &&
+        cpi->rc.is_src_frame_alt_ref && !cpi->use_svc) {
      // Preserve the previously existing golden frame and update the frame in
      // the alt ref slot instead. This is highly specific to the use of
      // alt-ref as a forward reference, and this needs to be generalized as
@@ -910,15 +904,10 @@ static int get_refresh_mask(VP9_COMP *cpi) {
             (cpi->refresh_golden_frame << cpi->alt_fb_idx);
    } else {
      int arf_idx = cpi->alt_fb_idx;
-#if CONFIG_MULTIPLE_ARF
-      // Determine which ARF buffer to use to encode this ARF frame.
-      if (cpi->multi_arf_enabled) {
-        int sn = cpi->sequence_number;
-        arf_idx = (cpi->frame_coding_order[sn] < 0) ?
-            cpi->arf_buffer_idx[sn + 1] :
-            cpi->arf_buffer_idx[sn];
+      if ((cpi->pass == 2) && cpi->multi_arf_allowed) {
+        const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+        arf_idx = gf_group->arf_update_idx[gf_group->index];
      }
-#endif
      return (cpi->refresh_last_frame << cpi->lst_fb_idx) |
             (cpi->refresh_golden_frame << cpi->gld_fb_idx) |
             (cpi->refresh_alt_ref_frame << arf_idx);
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -20,6 +20,12 @@
 extern "C" {
 #endif

+typedef struct {
+  unsigned int sse;
+  int sum;
+  unsigned int var;
+} diff;
+
 struct macroblock_plane {
  DECLARE_ALIGNED(16, int16_t, src_diff[64 * 64]);
  int16_t *qcoeff;
@@ -29,6 +35,7 @@ struct macroblock_plane {

  // Quantizer setings
  int16_t *quant_fp;
+  int16_t *round_fp;
  int16_t *quant;
  int16_t *quant_shift;
  int16_t *zbin;
@@ -93,8 +100,6 @@ struct macroblock {

  int encode_breakout;

-  int in_active_map;
-
  // note that token_costs is the cost when eob node is skipped
  vp9_coeff_cost token_costs[TX_SIZES];

@@ -106,6 +111,9 @@ struct macroblock {
  int use_lp32x32fdct;
  int skip_encode;

+  // use fast quantization process
+  int quant_fp;
+
  // skip forward transform and quantization
  int skip_txfm;

--- a/vp9/encoder/vp9_denoiser.c
+++ b/vp9/encoder/vp9_denoiser.c
@@ -8,42 +8,378 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */

-#include <stdio.h>
-#include <stdint.h>
-#include "vp9/encoder/vp9_denoiser.h"
+#include <assert.h>
+#include <limits.h>
 #include "vpx_scale/yv12config.h"
+#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/encoder/vp9_denoiser.h"
+
+#ifdef OUTPUT_YUV_DENOISED
+static void make_grayscale(YV12_BUFFER_CONFIG *yuv);
+#endif

 static const int widths[]  = {4, 4, 8, 8,  8, 16, 16, 16, 32, 32, 32, 64, 64};
 static const int heights[] = {4, 8, 4, 8, 16,  8, 16, 32, 16, 32, 64, 32, 64};

-int vp9_denoiser_filter() {
-  return 0;
+static VP9_DENOISER_DECISION update_running_avg(const uint8_t *mc_avg,
+                                                int mc_avg_stride,
+                                                uint8_t *avg, int avg_stride,
+                                                const uint8_t *sig,
+                                                int sig_stride,
+                                                int increase_denoising,
+                                                BLOCK_SIZE bs) {
+  int r, c;
+  int diff, adj, absdiff;
+  int shift_inc1 = 0, shift_inc2 = 1;
+  int adj_val[] = {3, 4, 6};
+  int total_adj = 0;
+
+  if (increase_denoising) {
+    shift_inc1 = 1;
+    shift_inc2 = 2;
+  }
+
+  for (r = 0; r < heights[bs]; ++r) {
+    for (c = 0; c < widths[bs]; ++c) {
+      diff = mc_avg[c] - sig[c];
+      absdiff = abs(diff);
+
+      if (absdiff <= 3 + shift_inc1) {
+        avg[c] = mc_avg[c];
+        total_adj += diff;
+      } else {
+        switch (absdiff) {
+          case 4: case 5: case 6: case 7:
+            adj = adj_val[0];
+            break;
+          case 8: case 9: case 10: case 11:
+          case 12: case 13: case 14: case 15:
+            adj = adj_val[1];
+            break;
+          default:
+            adj = adj_val[2];
+        }
+        if (diff > 0) {
+          avg[c] = MIN(UINT8_MAX, sig[c] + adj);
+          total_adj += adj;
+        } else {
+          avg[c] = MAX(0, sig[c] - adj);
+          total_adj -= adj;
+        }
+      }
+    }
+    sig += sig_stride;
+    avg += avg_stride;
+    mc_avg += mc_avg_stride;
+  }
+  return total_adj;
 }

-void vp9_denoiser_denoise(VP9_DENOISER *denoiser,
-                          MACROBLOCK *mb, MODE_INFO **grid,
+static uint8_t *block_start(uint8_t *framebuf, int stride,
+                            int mi_row, int mi_col) {
+  return framebuf + (stride * mi_row * 8) + (mi_col * 8);
+}
+
+void copy_block(uint8_t *dest, int dest_stride,
+                uint8_t *src, int src_stride, BLOCK_SIZE bs) {
+  int r, c;
+  for (r = 0; r < heights[bs]; ++r) {
+    for (c = 0; c < widths[bs]; ++c) {
+      dest[c] = src[c];
+    }
+    dest += dest_stride;
+    src += src_stride;
+  }
+}
+
+static VP9_DENOISER_DECISION perform_motion_compensation(VP9_DENOISER *denoiser,
+                                                         MACROBLOCK *mb,
+                                                         BLOCK_SIZE bs,
+                                                         int increase_denoising,
+                                                         int mi_row,
+                                                         int mi_col) {
+  // constants
+  // TODO(tkopp): empirically determine good constants, or functions of block
+  // size.
+  int NOISE_MOTION_THRESHOLD = 25 * 25;
+  int SSE_DIFF_THRESHOLD = heights[bs] * widths[bs] * 20;
+  unsigned int SSE_THRESH = heights[bs] * widths[bs] * 40;
+  unsigned int SSE_THRESH_HI = heights[bs] * widths[bs] * 60;
+
+  int mv_col, mv_row;
+  int sse_diff = denoiser->zero_mv_sse - denoiser->best_sse;
+  int sse_diff_thresh;
+  int sse_thresh;
+  MV_REFERENCE_FRAME frame;
+  MACROBLOCKD *filter_mbd = &mb->e_mbd;
+  MB_MODE_INFO *mbmi = &filter_mbd->mi[0]->mbmi;
+
+  // We will restore these after motion compensation.
+  MB_MODE_INFO saved_mbmi = *mbmi;
+  struct buf_2d saved_dst = filter_mbd->plane[0].dst;
+  struct buf_2d saved_pre[2];
+  saved_pre[0] = filter_mbd->plane[0].pre[0];
+  saved_pre[1] = filter_mbd->plane[0].pre[1];
+
+  // Decide the threshold for sum squared error.
+  mv_col = denoiser->best_sse_mv.as_mv.col;
+  mv_row = denoiser->best_sse_mv.as_mv.row;
+  if (mv_row * mv_row + mv_col * mv_col > NOISE_MOTION_THRESHOLD) {
+    sse_diff_thresh = 0;
+  } else {
+    sse_diff_thresh = SSE_DIFF_THRESHOLD;
+  }
+
+  frame = denoiser->best_reference_frame;
+
+  // If the best reference frame uses inter-prediction and there is enough of a
+  // difference in sum-squared-error, use it.
+  if (frame != INTRA_FRAME && sse_diff > sse_diff_thresh) {
+    mbmi->ref_frame[0] = denoiser->best_reference_frame;
+    mbmi->mode = denoiser->best_sse_inter_mode;
+    mbmi->mv[0] = denoiser->best_sse_mv;
+  } else {
+    // Otherwise, use the zero reference frame.
+    frame = denoiser->best_zeromv_reference_frame;
+
+    mbmi->ref_frame[0] = denoiser->best_zeromv_reference_frame;
+    mbmi->mode = ZEROMV;
+    mbmi->mv[0].as_int = 0;
+
+    denoiser->best_sse_inter_mode = ZEROMV;
+    denoiser->best_sse_mv.as_int = 0;
+    denoiser->best_sse = denoiser->zero_mv_sse;
+  }
+
+  // Set the pointers in the MACROBLOCKD to point to the buffers in the denoiser
+  // struct.
+  filter_mbd->plane[0].pre[0].buf =
+      block_start(denoiser->running_avg_y[frame].y_buffer,
+                  denoiser->running_avg_y[frame].y_stride,
+                  mi_row, mi_col);
+  filter_mbd->plane[0].pre[0].stride = denoiser->running_avg_y[frame].y_stride;
+
+  filter_mbd->plane[1].pre[0].buf =
+      block_start(denoiser->running_avg_y[frame].u_buffer,
+                  denoiser->running_avg_y[frame].uv_stride,
+                  mi_row, mi_col);
+  filter_mbd->plane[1].pre[0].stride = denoiser->running_avg_y[frame].uv_stride;
+
+  filter_mbd->plane[2].pre[0].buf =
+      block_start(denoiser->running_avg_y[frame].v_buffer,
+                  denoiser->running_avg_y[frame].uv_stride,
+                  mi_row, mi_col);
+  filter_mbd->plane[2].pre[0].stride = denoiser->running_avg_y[frame].uv_stride;
+
+  filter_mbd->plane[0].pre[1].buf =
+      block_start(denoiser->running_avg_y[frame].y_buffer,
+                  denoiser->running_avg_y[frame].y_stride,
+                  mi_row, mi_col);
+  filter_mbd->plane[0].pre[1].stride = denoiser->running_avg_y[frame].y_stride;
+
+  filter_mbd->plane[1].pre[1].buf =
+      block_start(denoiser->running_avg_y[frame].u_buffer,
+                  denoiser->running_avg_y[frame].uv_stride,
+                  mi_row, mi_col);
+  filter_mbd->plane[1].pre[1].stride = denoiser->running_avg_y[frame].uv_stride;
+
+  filter_mbd->plane[2].pre[1].buf =
+      block_start(denoiser->running_avg_y[frame].v_buffer,
+                  denoiser->running_avg_y[frame].uv_stride,
+                  mi_row, mi_col);
+  filter_mbd->plane[2].pre[1].stride = denoiser->running_avg_y[frame].uv_stride;
+
+  filter_mbd->plane[0].dst.buf =
+      block_start(denoiser->mc_running_avg_y.y_buffer,
+                  denoiser->mc_running_avg_y.y_stride,
+                  mi_row, mi_col);
+  filter_mbd->plane[0].dst.stride = denoiser->mc_running_avg_y.y_stride;
+
+  filter_mbd->plane[1].dst.buf =
+      block_start(denoiser->mc_running_avg_y.u_buffer,
+                  denoiser->mc_running_avg_y.uv_stride,
+                  mi_row, mi_col);
+  filter_mbd->plane[1].dst.stride = denoiser->mc_running_avg_y.y_stride;
+
+  filter_mbd->plane[2].dst.buf =
+      block_start(denoiser->mc_running_avg_y.v_buffer,
+                  denoiser->mc_running_avg_y.uv_stride,
+                  mi_row, mi_col);
+  filter_mbd->plane[2].dst.stride = denoiser->mc_running_avg_y.y_stride;
+
+  vp9_build_inter_predictors_sby(filter_mbd, mv_row, mv_col, bs);
+
+  // Restore everything to its original state
+  filter_mbd->plane[0].pre[0] = saved_pre[0];
+  filter_mbd->plane[0].pre[1] = saved_pre[1];
+  filter_mbd->plane[0].dst = saved_dst;
+  *mbmi = saved_mbmi;
+
+  mv_row = denoiser->best_sse_mv.as_mv.row;
+  mv_col = denoiser->best_sse_mv.as_mv.col;
+  sse_thresh = denoiser->increase_denoising ? SSE_THRESH_HI : SSE_THRESH;
+
+  // TODO(tkopp) why 8?
+  if (denoiser->best_sse > sse_thresh ||
+    mv_row * mv_row + mv_col * mv_col > 8 * NOISE_MOTION_THRESHOLD) {
+    return COPY_BLOCK;
+  }
+  return FILTER_BLOCK;
+}
+
+void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb,
                          int mi_row, int mi_col, BLOCK_SIZE bs) {
-  return;
+  VP9_DENOISER_DECISION decision = FILTER_BLOCK;
+
+  YV12_BUFFER_CONFIG avg = denoiser->running_avg_y[INTRA_FRAME];
+  YV12_BUFFER_CONFIG mc_avg = denoiser->mc_running_avg_y;
+  uint8_t *avg_start = block_start(avg.y_buffer, avg.y_stride, mi_row, mi_col);
+  uint8_t *mc_avg_start = block_start(mc_avg.y_buffer, mc_avg.y_stride,
+                                          mi_row, mi_col);
+  struct buf_2d src = mb->plane[0].src;
+
+  decision = perform_motion_compensation(denoiser, mb, bs,
+                                         denoiser->increase_denoising,
+                                         mi_row, mi_col);
+  update_running_avg(mc_avg_start, mc_avg.y_stride, avg_start, avg.y_stride,
+                     mb->plane[0].src.buf, mb->plane[0].src.stride, 0, bs);
+
+  if (decision == FILTER_BLOCK) {
+    // TODO(tkopp)
+  }
+  if (decision == COPY_BLOCK) {
+    copy_block(avg_start, avg.y_stride, src.buf, src.stride, bs);
+  }
+}
+
+static void copy_frame(YV12_BUFFER_CONFIG dest, const YV12_BUFFER_CONFIG src) {
+  int r, c;
+  const uint8_t *srcbuf = src.y_buffer;
+  uint8_t *destbuf = dest.y_buffer;
+  assert(dest.y_width == src.y_width);
+  assert(dest.y_height == src.y_height);
+
+  for (r = 0; r < dest.y_height; ++r) {
+    for (c = 0; c < dest.y_width; ++c) {
+      destbuf[c] = srcbuf[c];
+    }
+    destbuf += dest.y_stride;
+    srcbuf += src.y_stride;
+  }
 }

 void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser,
+                                    YV12_BUFFER_CONFIG src,
                                    FRAME_TYPE frame_type,
                                    int refresh_alt_ref_frame,
                                    int refresh_golden_frame,
                                    int refresh_last_frame) {
-  return;
+  if (frame_type == KEY_FRAME) {
+    int i;
+    copy_frame(denoiser->running_avg_y[LAST_FRAME], src);
+    for (i = 2; i < MAX_REF_FRAMES - 1; i++) {
+      copy_frame(denoiser->running_avg_y[i],
+                 denoiser->running_avg_y[LAST_FRAME]);
+    }
+  } else {  /* For non key frames */
+    if (refresh_alt_ref_frame) {
+      copy_frame(denoiser->running_avg_y[ALTREF_FRAME],
+                 denoiser->running_avg_y[INTRA_FRAME]);
+    }
+    if (refresh_golden_frame) {
+      copy_frame(denoiser->running_avg_y[GOLDEN_FRAME],
+                 denoiser->running_avg_y[INTRA_FRAME]);
+    }
+    if (refresh_last_frame) {
+      copy_frame(denoiser->running_avg_y[LAST_FRAME],
+                 denoiser->running_avg_y[INTRA_FRAME]);
+    }
+  }
 }

-void vp9_denoiser_update_frame_stats() {
-  return;
+void vp9_denoiser_reset_frame_stats(VP9_DENOISER *denoiser) {
+  denoiser->zero_mv_sse = UINT_MAX;
+  denoiser->best_sse = UINT_MAX;
+}
+
+void vp9_denoiser_update_frame_stats(VP9_DENOISER *denoiser, MB_MODE_INFO *mbmi,
+                                     unsigned int sse, PREDICTION_MODE mode) {
+  // TODO(tkopp): Use both MVs if possible
+  if (mbmi->mv[0].as_int == 0 && sse < denoiser->zero_mv_sse) {
+    denoiser->zero_mv_sse = sse;
+    denoiser->best_zeromv_reference_frame = mbmi->ref_frame[0];
+  }
+
+  if (mbmi->mv[0].as_int != 0 && sse < denoiser->best_sse) {
+    denoiser->best_sse = sse;
+    denoiser->best_sse_inter_mode = mode;
+    denoiser->best_sse_mv = mbmi->mv[0];
+    denoiser->best_reference_frame = mbmi->ref_frame[0];
+  }
 }

 int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height,
-                       int border) {
+                       int ssx, int ssy, int border) {
+  int i, fail;
+  assert(denoiser != NULL);
+
+  for (i = 0; i < MAX_REF_FRAMES; ++i) {
+    fail = vp9_alloc_frame_buffer(&denoiser->running_avg_y[i], width, height,
+                                  ssx, ssy, border);
+    if (fail) {
+      vp9_denoiser_free(denoiser);
+      return 1;
+    }
+#ifdef OUTPUT_YUV_DENOISED
+    make_grayscale(&denoiser->running_avg_y[i]);
+#endif
+  }
+
+  fail = vp9_alloc_frame_buffer(&denoiser->mc_running_avg_y, width, height,
+                                ssx, ssy, border);
+  if (fail) {
+    vp9_denoiser_free(denoiser);
+    return 1;
+  }
+#ifdef OUTPUT_YUV_DENOISED
+  make_grayscale(&denoiser->running_avg_y[i]);
+#endif
+  denoiser->increase_denoising = 0;
+
  return 0;
 }

 void vp9_denoiser_free(VP9_DENOISER *denoiser) {
-  return;
+  int i;
+  if (denoiser == NULL) {
+    return;
+  }
+  for (i = 0; i < MAX_REF_FRAMES; ++i) {
+    if (&denoiser->running_avg_y[i] != NULL) {
+      vp9_free_frame_buffer(&denoiser->running_avg_y[i]);
+    }
+  }
+  if (&denoiser->mc_running_avg_y != NULL) {
+    vp9_free_frame_buffer(&denoiser->mc_running_avg_y);
+  }
 }

+#ifdef OUTPUT_YUV_DENOISED
+static void make_grayscale(YV12_BUFFER_CONFIG *yuv) {
+  int r, c;
+  uint8_t *u = yuv->u_buffer;
+  uint8_t *v = yuv->v_buffer;
+
+  // The '/2's are there because we have a 440 buffer, but we want to output
+  // 420.
+  for (r = 0; r < yuv->uv_height / 2; ++r) {
+    for (c = 0; c < yuv->uv_width / 2; ++c) {
+      u[c] = UINT8_MAX / 2;
+      v[c] = UINT8_MAX / 2;
+    }
+    u += yuv->uv_stride + yuv->uv_width / 2;
+    v += yuv->uv_stride + yuv->uv_width / 2;
+  }
+}
+#endif
--- a/vp9/encoder/vp9_denoiser.h
+++ b/vp9/encoder/vp9_denoiser.h
@@ -12,35 +12,47 @@
 #define VP9_ENCODER_DENOISER_H_

 #include "vp9/encoder/vp9_block.h"
+#include "vpx_scale/yv12config.h"

 #ifdef __cplusplus
 extern "C" {
 #endif

-enum vp9_denoiser_decision {
+typedef enum vp9_denoiser_decision {
  COPY_BLOCK,
  FILTER_BLOCK
-};
+} VP9_DENOISER_DECISION;

 typedef struct vp9_denoiser {
-  struct buf_2d running_avg_y;
-  struct buf_2d mc_running_avg_y;
+  YV12_BUFFER_CONFIG running_avg_y[MAX_REF_FRAMES];
+  YV12_BUFFER_CONFIG mc_running_avg_y;
+
+  unsigned int zero_mv_sse;
+  unsigned int best_sse;
+  int increase_denoising;
+  PREDICTION_MODE best_sse_inter_mode;
+  int_mv best_sse_mv;
+  MV_REFERENCE_FRAME best_reference_frame;
+  MV_REFERENCE_FRAME best_zeromv_reference_frame;
 } VP9_DENOISER;

 void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser,
+                                    YV12_BUFFER_CONFIG src,
                                    FRAME_TYPE frame_type,
                                    int refresh_alt_ref_frame,
                                    int refresh_golden_frame,
                                    int refresh_last_frame);

-void vp9_denoiser_denoise(VP9_DENOISER *denoiser,
-                          MACROBLOCK *mb, MODE_INFO **grid,
+void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb,
                          int mi_row, int mi_col, BLOCK_SIZE bs);

-void vp9_denoiser_update_frame_stats();
+void vp9_denoiser_reset_frame_stats(VP9_DENOISER *denoiser);
+
+void vp9_denoiser_update_frame_stats(VP9_DENOISER *denoiser, MB_MODE_INFO *mbmi,
+                                     unsigned int sse, PREDICTION_MODE mode);

 int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height,
-                       int border);
+                       int ssx, int ssy, int border);

 void vp9_denoiser_free(VP9_DENOISER *denoiser);

--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -70,12 +70,6 @@ static const uint8_t VP9_VAR_OFFS[64] = {
  128, 128, 128, 128, 128, 128, 128, 128
 };

-typedef struct {
-  unsigned int sse;
-  int sum;
-  unsigned int var;
-} diff;
-
 static unsigned int get_sby_perpixel_variance(VP9_COMP *cpi,
                                              const struct buf_2d *ref,
                                              BLOCK_SIZE bs) {
@@ -139,42 +133,6 @@ static INLINE void set_modeinfo_offsets(VP9_COMMON *const cm,
  xd->mi[0] = cm->mi + idx_str;
 }

-static int is_block_in_mb_map(const VP9_COMP *cpi, int mi_row, int mi_col,
-                              BLOCK_SIZE bsize) {
-  const VP9_COMMON *const cm = &cpi->common;
-  const int mb_rows = cm->mb_rows;
-  const int mb_cols = cm->mb_cols;
-  const int mb_row = mi_row >> 1;
-  const int mb_col = mi_col >> 1;
-  const int mb_width = num_8x8_blocks_wide_lookup[bsize] >> 1;
-  const int mb_height = num_8x8_blocks_high_lookup[bsize] >> 1;
-  int r, c;
-  if (bsize <= BLOCK_16X16) {
-    return cpi->active_map[mb_row * mb_cols + mb_col];
-  }
-  for (r = 0; r < mb_height; ++r) {
-    for (c = 0; c < mb_width; ++c) {
-      int row = mb_row + r;
-      int col = mb_col + c;
-      if (row >= mb_rows || col >= mb_cols)
-        continue;
-      if (cpi->active_map[row * mb_cols + col])
-        return 1;
-    }
-  }
-  return 0;
-}
-
-static int check_active_map(const VP9_COMP *cpi, const MACROBLOCK *x,
-                            int mi_row, int mi_col,
-                            BLOCK_SIZE bsize) {
-  if (cpi->active_map_enabled && !x->e_mbd.lossless) {
-    return is_block_in_mb_map(cpi, mi_row, mi_col, bsize);
-  } else {
-    return 1;
-  }
-}
-
 static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile,
                        int mi_row, int mi_col, BLOCK_SIZE bsize) {
  MACROBLOCK *const x = &cpi->mb;
@@ -187,9 +145,6 @@ static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile,

  set_skip_context(xd, mi_row, mi_col);

-  // Activity map pointer
-  x->in_active_map = check_active_map(cpi, x, mi_row, mi_col, bsize);
-
  set_modeinfo_offsets(cm, xd, mi_row, mi_col);

  mbmi = &xd->mi[0]->mbmi;
@@ -723,7 +678,6 @@ static void set_mode_info_seg_skip(MACROBLOCK *x, TX_MODE tx_mode, int *rate,

  xd->mi[0]->bmi[0].as_mv[0].as_int = 0;
  x->skip = 1;
-  x->skip_encode = 1;

  *rate = 0;
  *dist = 0;
@@ -822,12 +776,17 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
    vp9_rd_pick_intra_mode_sb(cpi, x, totalrate, totaldist, bsize, ctx,
                              best_rd);
  } else {
-    if (bsize >= BLOCK_8X8)
-      vp9_rd_pick_inter_mode_sb(cpi, x, tile, mi_row, mi_col,
-                                totalrate, totaldist, bsize, ctx, best_rd);
-    else
+    if (bsize >= BLOCK_8X8) {
+      if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP))
+        vp9_rd_pick_inter_mode_sb_seg_skip(cpi, x, totalrate, totaldist, bsize,
+                                           ctx, best_rd);
+      else
+        vp9_rd_pick_inter_mode_sb(cpi, x, tile, mi_row, mi_col,
+                                  totalrate, totaldist, bsize, ctx, best_rd);
+    } else {
      vp9_rd_pick_inter_mode_sub8x8(cpi, x, tile, mi_row, mi_col, totalrate,
                                    totaldist, bsize, ctx, best_rd);
+    }
  }

  x->rdmult = orig_rdmult;
@@ -1175,7 +1134,6 @@ static void constrain_copy_partitioning(VP9_COMP *const cpi,
  }
 }

-
 const struct {
  int row;
  int col;
@@ -1208,34 +1166,26 @@ static void set_source_var_based_partition(VP9_COMP *cpi,
  // In-image SB64
  if ((col8x8_remaining >= MI_BLOCK_SIZE) &&
      (row8x8_remaining >= MI_BLOCK_SIZE)) {
-    const int src_stride = x->plane[0].src.stride;
-    const int pre_stride = cpi->Last_Source->y_stride;
-    const uint8_t *src = x->plane[0].src.buf;
-    const int pre_offset = (mi_row * MI_SIZE) * pre_stride +
-                           (mi_col * MI_SIZE);
-    const uint8_t *pre_src = cpi->Last_Source->y_buffer + pre_offset;
-    const unsigned int thr_32x32 = cpi->sf.source_var_thresh;
-    const unsigned int thr_64x64 = thr_32x32 << 1;
    int i, j;
    int index;
    diff d32[4];
-    int use16x16 = 0;
+    const int offset = (mi_row >> 1) * cm->mb_cols + (mi_col >> 1);
+    int is_larger_better = 0;
+    int use32x32 = 0;
+    unsigned int thr = cpi->source_var_thresh;
+
+    vpx_memset(d32, 0, 4 * sizeof(diff));

    for (i = 0; i < 4; i++) {
-      diff d16[4];
+      diff *d16[4];

      for (j = 0; j < 4; j++) {
        int b_mi_row = coord_lookup[i * 4 + j].row;
        int b_mi_col = coord_lookup[i * 4 + j].col;
-        int b_offset = b_mi_row * MI_SIZE * src_stride +
-                       b_mi_col * MI_SIZE;
+        int boffset = b_mi_row / 2 * cm->mb_cols +
+                      b_mi_col / 2;

-        vp9_get16x16var(src + b_offset, src_stride,
-                        pre_src + b_offset, pre_stride,
-                        &d16[j].sse, &d16[j].sum);
-
-        d16[j].var = d16[j].sse -
-            (((uint32_t)d16[j].sum * d16[j].sum) >> 8);
+        d16[j] = cpi->source_diff_var + offset + boffset;

        index = b_mi_row * mis + b_mi_col;
        mi_8x8[index] = mi_upper_left + index;
@@ -1245,14 +1195,16 @@ static void set_source_var_based_partition(VP9_COMP *cpi,
        // size to further improve quality.
      }

-      if (d16[0].var < thr_32x32 && d16[1].var < thr_32x32 &&
-          d16[2].var < thr_32x32 && d16[3].var < thr_32x32) {
-        d32[i].sse = d16[0].sse;
-        d32[i].sum = d16[0].sum;
+      is_larger_better = (d16[0]->var < thr) && (d16[1]->var < thr) &&
+          (d16[2]->var < thr) && (d16[3]->var < thr);

-        for (j = 1; j < 4; j++) {
-          d32[i].sse += d16[j].sse;
-          d32[i].sum += d16[j].sum;
+      // Use 32x32 partition
+      if (is_larger_better) {
+        use32x32 += 1;
+
+        for (j = 0; j < 4; j++) {
+          d32[i].sse += d16[j]->sse;
+          d32[i].sum += d16[j]->sum;
        }

        d32[i].var = d32[i].sse - (((int64_t)d32[i].sum * d32[i].sum) >> 10);
@@ -1260,18 +1212,16 @@ static void set_source_var_based_partition(VP9_COMP *cpi,
        index = coord_lookup[i*4].row * mis + coord_lookup[i*4].col;
        mi_8x8[index] = mi_upper_left + index;
        mi_8x8[index]->mbmi.sb_type = BLOCK_32X32;
-
-        if (!((cm->current_video_frame - 1) %
-            cpi->sf.search_type_check_frequency))
-          cpi->use_large_partition_rate += 1;
-      } else {
-        use16x16 = 1;
      }
    }

-    if (!use16x16) {
-      if (d32[0].var < thr_64x64 && d32[1].var < thr_64x64 &&
-          d32[2].var < thr_64x64 && d32[3].var < thr_64x64)  {
+    if (use32x32 == 4) {
+      thr <<= 1;
+      is_larger_better = (d32[0].var < thr) && (d32[1].var < thr) &&
+          (d32[2].var < thr) && (d32[3].var < thr);
+
+      // Use 64x64 partition
+      if (is_larger_better) {
        mi_8x8[0] = mi_upper_left;
        mi_8x8[0]->mbmi.sb_type = BLOCK_64X64;
      }
@@ -1508,20 +1458,8 @@ static void rd_use_partition(VP9_COMP *cpi,
  if (bsize == BLOCK_16X16) {
    set_offsets(cpi, tile, mi_row, mi_col, bsize);
    x->mb_energy = vp9_block_energy(cpi, x, bsize);
-  } else {
-    x->in_active_map = check_active_map(cpi, x, mi_row, mi_col, bsize);
  }

-  if (!x->in_active_map) {
-    do_partition_search = 0;
-    if (mi_row + (mi_step >> 1) < cm->mi_rows &&
-        mi_col + (mi_step >> 1) < cm->mi_cols) {
-      pc_tree->partitioning = PARTITION_NONE;
-      bs_type = mi_8x8[0]->mbmi.sb_type = bsize;
-      subsize = bsize;
-      partition = PARTITION_NONE;
-    }
-  }
  if (do_partition_search &&
      cpi->sf.partition_search_type == SEARCH_PARTITION &&
      cpi->sf.adjust_partitioning_from_last_frame) {
@@ -1984,8 +1922,6 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
  if (bsize == BLOCK_16X16) {
    set_offsets(cpi, tile, mi_row, mi_col, bsize);
    x->mb_energy = vp9_block_energy(cpi, x, bsize);
-  } else {
-    x->in_active_map = check_active_map(cpi, x, mi_row, mi_col, bsize);
  }
  // Determine partition types in search according to the speed features.
  // The threshold set here has to be of square block size.
@@ -2018,8 +1954,6 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
    }
  }

-  if (!x->in_active_map && (partition_horz_allowed || partition_vert_allowed))
-    do_split = 0;
  // PARTITION_NONE
  if (partition_none_allowed) {
    rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &this_rate, &this_dist, bsize,
@@ -2053,10 +1987,6 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
        }
      }
    }
-    if (!x->in_active_map) {
-      do_split = 0;
-      do_rect = 0;
-    }
    restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
  }

@@ -2310,7 +2240,8 @@ static void encode_rd_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
                               sf->always_this_block_size);
        rd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
                         &dummy_rate, &dummy_dist, 1, cpi->pc_root);
-      } else if (sf->partition_search_type == VAR_BASED_FIXED_PARTITION) {
+      } else if (cpi->skippable_frame ||
+                 sf->partition_search_type == VAR_BASED_FIXED_PARTITION) {
        BLOCK_SIZE bsize;
        set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
        bsize = get_rd_var_based_fixed_partition(cpi, mi_row, mi_col);
@@ -2322,8 +2253,15 @@ static void encode_rd_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
        rd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
                         &dummy_rate, &dummy_dist, 1, cpi->pc_root);
      } else {
+        GF_GROUP * gf_grp = &cpi->twopass.gf_group;
+        int last_was_mid_sequence_overlay = 0;
+        if ((cpi->pass == 2) && (gf_grp->index)) {
+          if (gf_grp->update_type[gf_grp->index - 1] == OVERLAY_UPDATE)
+            last_was_mid_sequence_overlay = 1;
+        }
        if ((cm->current_video_frame
            % sf->last_partitioning_redo_frequency) == 0
+            || last_was_mid_sequence_overlay
            || cm->prev_mi == 0
            || cm->show_frame == 0
            || cm->frame_type == KEY_FRAME
@@ -2436,7 +2374,7 @@ static TX_MODE select_tx_mode(const VP9_COMP *cpi) {
                 rd_opt->tx_select_threshes[frame_type][TX_MODE_SELECT] ?
                     ALLOW_32X32 : TX_MODE_SELECT;
    } else if (cpi->sf.tx_size_search_method == USE_TX_8X8) {
-      return ALLOW_8X8;
+      return TX_MODE_SELECT;
    } else {
      unsigned int total = 0;
      int i;
@@ -2453,20 +2391,6 @@ static TX_MODE select_tx_mode(const VP9_COMP *cpi) {
  }
 }

-static void set_mode_info(MB_MODE_INFO *mbmi, BLOCK_SIZE bsize,
-                          PREDICTION_MODE mode) {
-  mbmi->mode = mode;
-  mbmi->uv_mode = mode;
-  mbmi->mv[0].as_int = 0;
-  mbmi->mv[1].as_int = 0;
-  mbmi->ref_frame[0] = INTRA_FRAME;
-  mbmi->ref_frame[1] = NONE;
-  mbmi->tx_size = max_txsize_lookup[bsize];
-  mbmi->skip = 0;
-  mbmi->sb_type = bsize;
-  mbmi->segment_id = 0;
-}
-
 static void nonrd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
                                int mi_row, int mi_col,
                                int *rate, int64_t *dist,
@@ -2479,19 +2403,15 @@ static void nonrd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
  mbmi = &xd->mi[0]->mbmi;
  mbmi->sb_type = bsize;

-  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) {
+  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled)
    if (mbmi->segment_id && x->in_static_area)
      x->rdmult = vp9_cyclic_refresh_get_rdmult(cpi->cyclic_refresh);
-  }

-  if (!frame_is_intra_only(cm)) {
-    if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP))
-      set_mode_info_seg_skip(x, cm->tx_mode, rate, dist, bsize);
-    else
-      vp9_pick_inter_mode(cpi, x, tile, mi_row, mi_col, rate, dist, bsize);
-  } else {
-    set_mode_info(&xd->mi[0]->mbmi, bsize, DC_PRED);
-  }
+  if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP))
+    set_mode_info_seg_skip(x, cm->tx_mode, rate, dist, bsize);
+  else
+    vp9_pick_inter_mode(cpi, x, tile, mi_row, mi_col, rate, dist, bsize);
+
  duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize);
 }

@@ -2586,8 +2506,6 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
  assert(num_8x8_blocks_wide_lookup[bsize] ==
             num_8x8_blocks_high_lookup[bsize]);

-  x->in_active_map = check_active_map(cpi, x, mi_row, mi_col, bsize);
-
  // Determine partition types in search according to the speed features.
  // The threshold set here has to be of square block size.
  if (cpi->sf.auto_min_max_partition_size) {
@@ -2606,15 +2524,13 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
    partition_vert_allowed &= force_vert_split;
  }

-  if (!x->in_active_map && (partition_horz_allowed || partition_vert_allowed))
-    do_split = 0;
-
  // PARTITION_NONE
  if (partition_none_allowed) {
    nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col,
                        &this_rate, &this_dist, bsize);
    ctx->mic.mbmi = xd->mi[0]->mbmi;
    ctx->skip_txfm = x->skip_txfm;
+    ctx->skip = x->skip;

    if (this_rate != INT_MAX) {
      int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
@@ -2643,10 +2559,6 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
        }
      }
    }
-    if (!x->in_active_map) {
-      do_split = 0;
-      do_rect = 0;
-    }
  }

  // store estimated motion vector
@@ -2702,6 +2614,7 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,

    pc_tree->horizontal[0].mic.mbmi = xd->mi[0]->mbmi;
    pc_tree->horizontal[0].skip_txfm = x->skip_txfm;
+    pc_tree->horizontal[0].skip = x->skip;

    sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);

@@ -2712,6 +2625,7 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,

      pc_tree->horizontal[1].mic.mbmi = xd->mi[0]->mbmi;
      pc_tree->horizontal[1].skip_txfm = x->skip_txfm;
+      pc_tree->horizontal[1].skip = x->skip;

      if (this_rate == INT_MAX) {
        sum_rd = INT64_MAX;
@@ -2742,6 +2656,7 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
                        &this_rate, &this_dist, subsize);
    pc_tree->vertical[0].mic.mbmi = xd->mi[0]->mbmi;
    pc_tree->vertical[0].skip_txfm = x->skip_txfm;
+    pc_tree->vertical[0].skip = x->skip;
    sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
    if (sum_rd < best_rd && mi_col + ms < cm->mi_cols) {
      load_pred_mv(x, ctx);
@@ -2749,6 +2664,7 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
                          &this_rate, &this_dist, subsize);
      pc_tree->vertical[1].mic.mbmi = xd->mi[0]->mbmi;
      pc_tree->vertical[1].skip_txfm = x->skip_txfm;
+      pc_tree->vertical[1].skip = x->skip;
      if (this_rate == INT_MAX) {
        sum_rd = INT64_MAX;
      } else {
@@ -2838,16 +2754,19 @@ static void nonrd_use_partition(VP9_COMP *cpi,
      nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, totrate, totdist, subsize);
      pc_tree->none.mic.mbmi = xd->mi[0]->mbmi;
      pc_tree->none.skip_txfm = x->skip_txfm;
+      pc_tree->none.skip = x->skip;
      break;
    case PARTITION_VERT:
      nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, totrate, totdist, subsize);
      pc_tree->vertical[0].mic.mbmi = xd->mi[0]->mbmi;
      pc_tree->vertical[0].skip_txfm = x->skip_txfm;
+      pc_tree->vertical[0].skip = x->skip;
      if (mi_col + hbs < cm->mi_cols) {
        nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col + hbs,
                            &rate, &dist, subsize);
        pc_tree->vertical[1].mic.mbmi = xd->mi[0]->mbmi;
        pc_tree->vertical[1].skip_txfm = x->skip_txfm;
+        pc_tree->vertical[1].skip = x->skip;
        if (rate != INT_MAX && dist != INT64_MAX &&
            *totrate != INT_MAX && *totdist != INT64_MAX) {
          *totrate += rate;
@@ -2859,11 +2778,13 @@ static void nonrd_use_partition(VP9_COMP *cpi,
      nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, totrate, totdist, subsize);
      pc_tree->horizontal[0].mic.mbmi = xd->mi[0]->mbmi;
      pc_tree->horizontal[0].skip_txfm = x->skip_txfm;
+      pc_tree->horizontal[0].skip = x->skip;
      if (mi_row + hbs < cm->mi_rows) {
        nonrd_pick_sb_modes(cpi, tile, mi_row + hbs, mi_col,
                            &rate, &dist, subsize);
        pc_tree->horizontal[1].mic.mbmi = xd->mi[0]->mbmi;
        pc_tree->horizontal[1].skip_txfm = x->skip_txfm;
+        pc_tree->horizontal[1].skip = x->skip;
        if (rate != INT_MAX && dist != INT64_MAX &&
            *totrate != INT_MAX && *totdist != INT64_MAX) {
          *totrate += rate;
@@ -2984,6 +2905,93 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
 }
 // end RTC play code

+static int set_var_thresh_from_histogram(VP9_COMP *cpi) {
+  SPEED_FEATURES *const sf = &cpi->sf;
+  VP9_COMMON *const cm = &cpi->common;
+
+  const uint8_t *src = cpi->Source->y_buffer;
+  const uint8_t *last_src = cpi->Last_Source->y_buffer;
+  const int src_stride = cpi->Source->y_stride;
+  const int last_stride = cpi->Last_Source->y_stride;
+
+  // Pick cutoff threshold
+  const int cutoff = (MIN(cm->width, cm->height) >= 720) ?
+      (cm->MBs * VAR_HIST_LARGE_CUT_OFF / 100) :
+      (cm->MBs * VAR_HIST_SMALL_CUT_OFF / 100);
+  DECLARE_ALIGNED_ARRAY(16, int, hist, VAR_HIST_BINS);
+  diff *var16 = cpi->source_diff_var;
+
+  int sum = 0;
+  int i, j;
+
+  vpx_memset(hist, 0, VAR_HIST_BINS * sizeof(hist[0]));
+
+  for (i = 0; i < cm->mb_rows; i++) {
+    for (j = 0; j < cm->mb_cols; j++) {
+      vp9_get16x16var(src, src_stride, last_src, last_stride,
+                      &var16->sse, &var16->sum);
+
+      var16->var = var16->sse -
+          (((uint32_t)var16->sum * var16->sum) >> 8);
+
+      if (var16->var >= VAR_HIST_MAX_BG_VAR)
+        hist[VAR_HIST_BINS - 1]++;
+      else
+        hist[var16->var / VAR_HIST_FACTOR]++;
+
+      src += 16;
+      last_src += 16;
+      var16++;
+    }
+
+    src = src - cm->mb_cols * 16 + 16 * src_stride;
+    last_src = last_src - cm->mb_cols * 16 + 16 * last_stride;
+  }
+
+  cpi->source_var_thresh = 0;
+
+  if (hist[VAR_HIST_BINS - 1] < cutoff) {
+    for (i = 0; i < VAR_HIST_BINS - 1; i++) {
+      sum += hist[i];
+
+      if (sum > cutoff) {
+        cpi->source_var_thresh = (i + 1) * VAR_HIST_FACTOR;
+        return 0;
+      }
+    }
+  }
+
+  return sf->search_type_check_frequency;
+}
+
+static void source_var_based_partition_search_method(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  SPEED_FEATURES *const sf = &cpi->sf;
+
+  if (cm->frame_type == KEY_FRAME) {
+    // For key frame, use SEARCH_PARTITION.
+    sf->partition_search_type = SEARCH_PARTITION;
+  } else if (cm->intra_only) {
+    sf->partition_search_type = FIXED_PARTITION;
+  } else {
+    if (cm->last_width != cm->width || cm->last_height != cm->height) {
+      if (cpi->source_diff_var)
+        vpx_free(cpi->source_diff_var);
+
+        CHECK_MEM_ERROR(cm, cpi->source_diff_var,
+                        vpx_calloc(cm->MBs, sizeof(diff)));
+      }
+
+    if (!cpi->frames_till_next_var_check)
+      cpi->frames_till_next_var_check = set_var_thresh_from_histogram(cpi);
+
+    if (cpi->frames_till_next_var_check > 0) {
+      sf->partition_search_type = FIXED_PARTITION;
+      cpi->frames_till_next_var_check--;
+    }
+  }
+}
+
 static int get_skip_encode_frame(const VP9_COMMON *cm) {
  unsigned int intra_count = 0, inter_count = 0;
  int j;
@@ -3014,7 +3022,7 @@ static void encode_tiles(VP9_COMP *cpi) {
      vp9_tile_init(&tile, cm, tile_row, tile_col);
      for (mi_row = tile.mi_row_start; mi_row < tile.mi_row_end;
           mi_row += MI_BLOCK_SIZE) {
-        if (cpi->sf.use_nonrd_pick_mode && cm->frame_type != KEY_FRAME)
+        if (cpi->sf.use_nonrd_pick_mode && !frame_is_intra_only(cm))
          encode_nonrd_sb_row(cpi, &tile, mi_row, &tok);
        else
          encode_rd_sb_row(cpi, &tile, mi_row, &tok);
@@ -3066,6 +3074,7 @@ static void encode_frame_internal(VP9_COMP *cpi) {
  init_encode_frame_mb_context(cpi);
  set_prev_mi(cm);

+  x->quant_fp = cpi->sf.use_quant_fp;
  x->skip_txfm = 0;
  if (sf->use_nonrd_pick_mode) {
    // Initialize internal buffer pointers for rtc coding, where non-RD
@@ -3083,28 +3092,8 @@ static void encode_frame_internal(VP9_COMP *cpi) {
    }
    vp9_zero(x->zcoeff_blk);

-    if (sf->partition_search_type == SOURCE_VAR_BASED_PARTITION &&
-        cm->current_video_frame > 0) {
-      int check_freq = sf->search_type_check_frequency;
-
-      if ((cm->current_video_frame - 1) % check_freq == 0) {
-        cpi->use_large_partition_rate = 0;
-      }
-
-      if ((cm->current_video_frame - 1) % check_freq == 1) {
-        const int mbs_in_b32x32 = 1 << ((b_width_log2_lookup[BLOCK_32X32] -
-                                  b_width_log2_lookup[BLOCK_16X16]) +
-                                  (b_height_log2_lookup[BLOCK_32X32] -
-                                  b_height_log2_lookup[BLOCK_16X16]));
-        cpi->use_large_partition_rate = cpi->use_large_partition_rate * 100 *
-                                        mbs_in_b32x32 / cm->MBs;
-      }
-
-      if ((cm->current_video_frame - 1) % check_freq >= 1) {
-        if (cpi->use_large_partition_rate < 15)
-          sf->partition_search_type = FIXED_PARTITION;
-      }
-    }
+    if (sf->partition_search_type == SOURCE_VAR_BASED_PARTITION)
+      source_var_based_partition_search_method(cpi);
  }

  {
@@ -3269,7 +3258,6 @@ void vp9_encode_frame(VP9_COMP *cpi) {
    }
  } else {
    cm->reference_mode = SINGLE_REFERENCE;
-    cm->interp_filter = SWITCHABLE;
    encode_frame_internal(cpi);
  }
 }
@@ -3364,7 +3352,10 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
      vp9_setup_pre_planes(xd, ref, cfg, mi_row, mi_col,
                           &xd->block_refs[ref]->sf);
    }
-    vp9_build_inter_predictors_sb(xd, mi_row, mi_col, MAX(bsize, BLOCK_8X8));
+    if (!cpi->sf.reuse_inter_pred_sby)
+      vp9_build_inter_predictors_sby(xd, mi_row, mi_col, MAX(bsize, BLOCK_8X8));
+
+    vp9_build_inter_predictors_sbuv(xd, mi_row, mi_col, MAX(bsize, BLOCK_8X8));

    if (!x->skip) {
      mbmi->skip = 1;
--- a/vp9/encoder/vp9_encodeframe.h
+++ b/vp9/encoder/vp9_encodeframe.h
@@ -20,6 +20,13 @@ struct macroblock;
 struct yv12_buffer_config;
 struct VP9_COMP;

+// Constants used in SOURCE_VAR_BASED_PARTITION
+#define VAR_HIST_MAX_BG_VAR 1000
+#define VAR_HIST_FACTOR 10
+#define VAR_HIST_BINS (VAR_HIST_MAX_BG_VAR / VAR_HIST_FACTOR + 1)
+#define VAR_HIST_LARGE_CUT_OFF 75
+#define VAR_HIST_SMALL_CUT_OFF 45
+
 void vp9_setup_src_planes(struct macroblock *x,
                          const struct yv12_buffer_config *src,
                          int mi_row, int mi_col);
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -306,6 +306,56 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,
  MACROBLOCKD *const xd = &x->e_mbd;
  const struct macroblock_plane *const p = &x->plane[plane];
  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const scan_order *const scan_order = &vp9_default_scan_orders[tx_size];
+  int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+  int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  uint16_t *const eob = &p->eobs[block];
+  const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+  int i, j;
+  const int16_t *src_diff;
+  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
+  src_diff = &p->src_diff[4 * (j * diff_stride + i)];
+
+  switch (tx_size) {
+    case TX_32X32:
+      fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
+      vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
+                           p->quant, p->quant_shift, qcoeff, dqcoeff,
+                           pd->dequant, p->zbin_extra, eob, scan_order->scan,
+                           scan_order->iscan);
+      break;
+    case TX_16X16:
+      vp9_fdct16x16(src_diff, coeff, diff_stride);
+      vp9_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
+                      p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+                      pd->dequant, p->zbin_extra, eob,
+                      scan_order->scan, scan_order->iscan);
+      break;
+    case TX_8X8:
+      vp9_fdct8x8(src_diff, coeff, diff_stride);
+      vp9_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp,
+                      p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+                      pd->dequant, p->zbin_extra, eob,
+                      scan_order->scan, scan_order->iscan);
+      break;
+    case TX_4X4:
+      x->fwd_txm4x4(src_diff, coeff, diff_stride);
+      vp9_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp,
+                      p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+                      pd->dequant, p->zbin_extra, eob,
+                      scan_order->scan, scan_order->iscan);
+      break;
+    default:
+      assert(0);
+  }
+}
+
+void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block,
+                        BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const struct macroblock_plane *const p = &x->plane[plane];
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
  int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);
  int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
  int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
@@ -424,11 +474,15 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,

  if (x->skip_txfm == 0) {
    // full forward transform and quantization
-    if (!x->skip_recode)
-      vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
+    if (!x->skip_recode) {
+      if (x->quant_fp)
+        vp9_xform_quant_fp(x, plane, block, plane_bsize, tx_size);
+      else
+        vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
+    }
  } else if (x->skip_txfm == 2) {
    // fast path forward transform and quantization
-    vp9_xform_quant_fp(x, plane, block, plane_bsize, tx_size);
+    vp9_xform_quant_dc(x, plane, block, plane_bsize, tx_size);
  } else {
    // skip forward transform
    p->eobs[block] = 0;
@@ -507,7 +561,7 @@ void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) {

    if (x->optimize && (!x->skip_recode || !x->skip_optimize)) {
      const struct macroblockd_plane* const pd = &xd->plane[plane];
-      const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi) : mbmi->tx_size;
+      const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi, pd) : mbmi->tx_size;
      vp9_get_entropy_contexts(bsize, tx_size, pd,
                               ctx.ta[plane], ctx.tl[plane]);
    }
--- a/vp9/encoder/vp9_encodemb.h
+++ b/vp9/encoder/vp9_encodemb.h
@@ -24,6 +24,8 @@ void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize);
 void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize);
 void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,
                        BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
+void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block,
+                        BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
 void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
                     BLOCK_SIZE plane_bsize, TX_SIZE tx_size);

--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -32,6 +32,7 @@
 #include "vp9/encoder/vp9_mcomp.h"
 #include "vp9/encoder/vp9_quantize.h"
 #include "vp9/encoder/vp9_ratectrl.h"
+#include "vp9/encoder/vp9_rdopt.h"
 #include "vp9/encoder/vp9_speed_features.h"
 #include "vp9/encoder/vp9_svc_layercontext.h"
 #include "vp9/encoder/vp9_tokenize.h"
@@ -46,9 +47,6 @@ extern "C" {

 #define DEFAULT_GF_INTERVAL         10

-#define MAX_MODES 30
-#define MAX_REFS  6
-
 typedef struct {
  int nmvjointcost[MV_JOINTS];
  int nmvcosts[2][MV_VALS];
@@ -66,56 +64,6 @@ typedef struct {
  FRAME_CONTEXT fc;
 } CODING_CONTEXT;

-// This enumerator type needs to be kept aligned with the mode order in
-// const MODE_DEFINITION vp9_mode_order[MAX_MODES] used in the rd code.
-typedef enum {
-  THR_NEARESTMV,
-  THR_NEARESTA,
-  THR_NEARESTG,
-
-  THR_DC,
-
-  THR_NEWMV,
-  THR_NEWA,
-  THR_NEWG,
-
-  THR_NEARMV,
-  THR_NEARA,
-  THR_COMP_NEARESTLA,
-  THR_COMP_NEARESTGA,
-
-  THR_TM,
-
-  THR_COMP_NEARLA,
-  THR_COMP_NEWLA,
-  THR_NEARG,
-  THR_COMP_NEARGA,
-  THR_COMP_NEWGA,
-
-  THR_ZEROMV,
-  THR_ZEROG,
-  THR_ZEROA,
-  THR_COMP_ZEROLA,
-  THR_COMP_ZEROGA,
-
-  THR_H_PRED,
-  THR_V_PRED,
-  THR_D135_PRED,
-  THR_D207_PRED,
-  THR_D153_PRED,
-  THR_D63_PRED,
-  THR_D117_PRED,
-  THR_D45_PRED,
-} THR_MODES;
-
-typedef enum {
-  THR_LAST,
-  THR_GOLD,
-  THR_ALTR,
-  THR_COMP_LA,
-  THR_COMP_GA,
-  THR_INTRA,
-} THR_MODES_SUB8X8;

 typedef enum {
  // encode_breakout is disabled.
@@ -293,32 +241,6 @@ static INLINE int is_best_mode(MODE mode) {
  return mode == ONE_PASS_BEST || mode == TWO_PASS_SECOND_BEST;
 }

-typedef struct RD_OPT {
-  // Thresh_mult is used to set a threshold for the rd score. A higher value
-  // means that we will accept the best mode so far more often. This number
-  // is used in combination with the current block size, and thresh_freq_fact
-  // to pick a threshold.
-  int thresh_mult[MAX_MODES];
-  int thresh_mult_sub8x8[MAX_REFS];
-
-  int threshes[MAX_SEGMENTS][BLOCK_SIZES][MAX_MODES];
-  int thresh_freq_fact[BLOCK_SIZES][MAX_MODES];
-
-  int64_t comp_pred_diff[REFERENCE_MODES];
-  int64_t prediction_type_threshes[MAX_REF_FRAMES][REFERENCE_MODES];
-  int64_t tx_select_diff[TX_MODES];
-  // FIXME(rbultje) can this overflow?
-  int tx_select_threshes[MAX_REF_FRAMES][TX_MODES];
-
-  int64_t filter_diff[SWITCHABLE_FILTER_CONTEXTS];
-  int64_t filter_threshes[MAX_REF_FRAMES][SWITCHABLE_FILTER_CONTEXTS];
-  int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS];
-  int64_t mask_filter;
-
-  int RDMULT;
-  int RDDIV;
-} RD_OPT;
-
 typedef struct VP9_COMP {
  QUANTS quants;
  MACROBLOCK mb;
@@ -326,11 +248,7 @@ typedef struct VP9_COMP {
  VP9EncoderConfig oxcf;
  struct lookahead_ctx    *lookahead;
  struct lookahead_entry  *source;
-#if CONFIG_MULTIPLE_ARF
-  struct lookahead_entry  *alt_ref_source[REF_FRAMES];
-#else
  struct lookahead_entry  *alt_ref_source;
-#endif
  struct lookahead_entry  *last_source;

  YV12_BUFFER_CONFIG *Source;
@@ -344,14 +262,13 @@ typedef struct VP9_COMP {
  int alt_is_last;  // Alt same as last ( short circuit altref search)
  int gold_is_alt;  // don't do both alt and gold search ( just do gold).

+  int skippable_frame;
+
  int scaled_ref_idx[3];
  int lst_fb_idx;
  int gld_fb_idx;
  int alt_fb_idx;

-#if CONFIG_MULTIPLE_ARF
-  int alt_ref_fb_idx[REF_FRAMES - 3];
-#endif
  int refresh_last_frame;
  int refresh_golden_frame;
  int refresh_alt_ref_frame;
@@ -369,13 +286,6 @@ typedef struct VP9_COMP {
  TOKENEXTRA *tok;
  unsigned int tok_count[4][1 << 6];

-#if CONFIG_MULTIPLE_ARF
-  // Position within a frame coding order (including any additional ARF frames).
-  unsigned int sequence_number;
-  // Next frame in naturally occurring order that has not yet been coded.
-  int next_frame_in_order;
-#endif
-
  // Ambient reconstruction err target for force key frames
  int ambient_err;

@@ -425,9 +335,6 @@ typedef struct VP9_COMP {

  unsigned char *complexity_map;

-  unsigned char *active_map;
-  unsigned int active_map_enabled;
-
  CYCLIC_REFRESH *cyclic_refresh;

  fractional_mv_step_fp *find_fractional_mv_step;
@@ -441,6 +348,10 @@ typedef struct VP9_COMP {
  uint64_t time_pick_lpf;
  uint64_t time_encode_sb_row;

+#if CONFIG_FP_MB_STATS
+  int use_fp_mb_stats;
+#endif
+
  TWO_PASS twopass;

  YV12_BUFFER_CONFIG alt_ref_buffer;
@@ -494,7 +405,11 @@ typedef struct VP9_COMP {

  SVC svc;

-  int use_large_partition_rate;
+  // Store frame variance info in SOURCE_VAR_BASED_PARTITION search type.
+  diff *source_diff_var;
+  // The threshold used in SOURCE_VAR_BASED_PARTITION search type.
+  unsigned int source_var_thresh;
+  int frames_till_next_var_check;

  int frame_flags;

@@ -511,18 +426,8 @@ typedef struct VP9_COMP {
  PC_TREE *pc_root;
  int partition_cost[PARTITION_CONTEXTS][PARTITION_TYPES];

-#if CONFIG_MULTIPLE_ARF
-  // ARF tracking variables.
+  int multi_arf_allowed;
  int multi_arf_enabled;
-  unsigned int frame_coding_order_period;
-  unsigned int new_frame_coding_order_period;
-  int frame_coding_order[MAX_LAG_BUFFERS * 2];
-  int arf_buffer_idx[MAX_LAG_BUFFERS * 3 / 2];
-  int arf_weight[MAX_LAG_BUFFERS];
-  int arf_buffered;
-  int this_frame_weight;
-  int max_arf_level;
-#endif

 #if CONFIG_DENOISING
  VP9_DENOISER denoiser;
@@ -531,7 +436,8 @@ typedef struct VP9_COMP {

 void vp9_initialize_enc();

-struct VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf);
+struct VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
+                                       BufferPool *const pool);
 void vp9_remove_compressor(VP9_COMP *cpi);

 void vp9_change_config(VP9_COMP *cpi, const VP9EncoderConfig *oxcf);
@@ -589,8 +495,9 @@ static INLINE int get_ref_frame_idx(const VP9_COMP *cpi,

 static INLINE YV12_BUFFER_CONFIG *get_ref_frame_buffer(
    VP9_COMP *cpi, MV_REFERENCE_FRAME ref_frame) {
-  VP9_COMMON * const cm = &cpi->common;
-  return &cm->frame_bufs[cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)]]
+  VP9_COMMON *const cm = &cpi->common;
+  BufferPool *const pool = cm->buffer_pool;
+  return &pool->frame_bufs[cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)]]
      .buf;
 }

@@ -622,10 +529,14 @@ void vp9_update_reference_frames(VP9_COMP *cpi);

 int64_t vp9_rescale(int64_t val, int64_t num, int denom);

+void vp9_set_high_precision_mv(VP9_COMP *cpi, int allow_high_precision_mv);
+
 YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm,
                                          YV12_BUFFER_CONFIG *unscaled,
                                          YV12_BUFFER_CONFIG *scaled);

+void vp9_apply_encoding_flags(VP9_COMP *cpi, vpx_enc_frame_flags_t flags);
+
 static INLINE void set_ref_ptrs(VP9_COMMON *cm, MACROBLOCKD *xd,
                                MV_REFERENCE_FRAME ref0,
                                MV_REFERENCE_FRAME ref1) {
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -33,7 +33,6 @@
 #include "vp9/encoder/vp9_firstpass.h"
 #include "vp9/encoder/vp9_mcomp.h"
 #include "vp9/encoder/vp9_quantize.h"
-#include "vp9/encoder/vp9_ratectrl.h"
 #include "vp9/encoder/vp9_rdopt.h"
 #include "vp9/encoder/vp9_variance.h"

@@ -56,14 +55,7 @@
 #define DOUBLE_DIVIDE_CHECK(x) ((x) < 0 ? (x) - 0.000001 : (x) + 0.000001)

 #define MIN_KF_BOOST        300
-
-#if CONFIG_MULTIPLE_ARF
-// Set MIN_GF_INTERVAL to 1 for the full decomposition.
-#define MIN_GF_INTERVAL             2
-#else
-#define MIN_GF_INTERVAL             4
-#endif
-
+#define MIN_GF_INTERVAL     4
 #define LONG_TERM_VBR_CORRECTION

 static void swap_yv12(YV12_BUFFER_CONFIG *a, YV12_BUFFER_CONFIG *b) {
@@ -114,6 +106,34 @@ static int read_frame_stats(const TWO_PASS *p,
  return 1;
 }

+#if CONFIG_FP_MB_STATS
+static int input_mb_stats(FIRSTPASS_FRAME_MB_STATS *fp_frame_stats,
+                          const VP9_COMMON *const cm) {
+  FILE *fpfile;
+  int ret;
+
+  fpfile = fopen("firstpass_mb.stt", "r");
+  fseek(fpfile, cm->current_video_frame * cm->MBs * sizeof(FIRSTPASS_MB_STATS),
+        SEEK_SET);
+  ret = fread(fp_frame_stats->mb_stats, sizeof(FIRSTPASS_MB_STATS), cm->MBs,
+              fpfile);
+  fclose(fpfile);
+  if (ret < cm->MBs) {
+    return EOF;
+  }
+  return 1;
+}
+
+static void output_mb_stats(FIRSTPASS_FRAME_MB_STATS *fp_frame_stats,
+                          const VP9_COMMON *const cm) {
+  FILE *fpfile;
+
+  fpfile = fopen("firstpass_mb.stt", "a");
+  fwrite(fp_frame_stats->mb_stats, sizeof(FIRSTPASS_MB_STATS), cm->MBs, fpfile);
+  fclose(fpfile);
+}
+#endif
+
 static int input_stats(TWO_PASS *p, FIRSTPASS_STATS *fps) {
  if (p->stats_in >= p->stats_in_end)
    return EOF;
@@ -460,6 +480,10 @@ void vp9_first_pass(VP9_COMP *cpi) {
  const MV zero_mv = {0, 0};
  const YV12_BUFFER_CONFIG *first_ref_buf = lst_yv12;

+#if CONFIG_FP_MB_STATS
+  FIRSTPASS_FRAME_MB_STATS *this_frame_mb_stats = &twopass->this_frame_mb_stats;
+#endif
+
  vp9_clear_system_state();

  set_first_pass_params(cpi);
@@ -497,6 +521,8 @@ void vp9_first_pass(VP9_COMP *cpi) {
                                        &cpi->scaled_source);
  }

+  vp9_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
+
  vp9_setup_src_planes(x, cpi->Source, 0, 0);
  vp9_setup_pre_planes(xd, 0, first_ref_buf, 0, 0, NULL);
  vp9_setup_dst_planes(xd->plane, new_yv12, 0, 0);
@@ -504,8 +530,6 @@ void vp9_first_pass(VP9_COMP *cpi) {
  xd->mi = cm->mi_grid_visible;
  xd->mi[0] = cm->mi;

-  vp9_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
-
  vp9_frame_init_quantizer(cpi);

  for (i = 0; i < MAX_MB_PLANE; ++i) {
@@ -587,6 +611,17 @@ void vp9_first_pass(VP9_COMP *cpi) {
      // Accumulate the intra error.
      intra_error += (int64_t)this_error;

+#if CONFIG_FP_MB_STATS
+      if (cpi->use_fp_mb_stats) {
+        this_frame_mb_stats->mb_stats[mb_row * cm->mb_cols + mb_col].mode =
+            DC_PRED;
+        this_frame_mb_stats->mb_stats[mb_row * cm->mb_cols + mb_col].err =
+            this_error;
+        this_frame_mb_stats->mb_stats[mb_row * cm->mb_cols + mb_col].mv.as_int
+            = 0;
+      }
+#endif
+
      // Set up limit values for motion vectors to prevent them extending
      // outside the UMV borders.
      x->mv_col_min = -((mb_col * 16) + BORDER_MV_PIXELS_B16);
@@ -615,7 +650,8 @@ void vp9_first_pass(VP9_COMP *cpi) {
                                                &unscaled_last_source_buf_2d);

        // TODO(pengchong): Replace the hard-coded threshold
-        if (raw_motion_error > 25) {
+        if (raw_motion_error > 25 ||
+            (cpi->use_svc && cpi->svc.number_temporal_layers == 1)) {
          // Test last reference frame using the previous best mv as the
          // starting point (best reference) for the search.
          first_pass_motion_search(cpi, x, &best_ref_mv.as_mv, &mv.as_mv,
@@ -711,6 +747,17 @@ void vp9_first_pass(VP9_COMP *cpi) {

          best_ref_mv.as_int = mv.as_int;

+#if CONFIG_FP_MB_STATS
+          if (cpi->use_fp_mb_stats) {
+            this_frame_mb_stats->mb_stats[mb_row * cm->mb_cols + mb_col].mode =
+                NEWMV;
+            this_frame_mb_stats->mb_stats[mb_row * cm->mb_cols + mb_col].err =
+                motion_error;
+            this_frame_mb_stats->mb_stats[mb_row * cm->mb_cols + mb_col].mv.
+                as_int = mv.as_int;
+          }
+#endif
+
          if (mv.as_int) {
            ++mvcount;

@@ -815,6 +862,12 @@ void vp9_first_pass(VP9_COMP *cpi) {
    twopass->this_frame_stats = fps;
    output_stats(&twopass->this_frame_stats, cpi->output_pkt_list);
    accumulate_stats(&twopass->total_stats, &fps);
+
+#if CONFIG_FP_MB_STATS
+    if (cpi->use_fp_mb_stats) {
+      output_mb_stats(this_frame_mb_stats, cm);
+    }
+#endif
  }

  // Copy the previous Last Frame back into gf and and arf buffers if
@@ -1221,144 +1274,6 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset,
  return arf_boost;
 }

-#if CONFIG_MULTIPLE_ARF
-// Work out the frame coding order for a GF or an ARF group.
-// The current implementation codes frames in their natural order for a
-// GF group, and inserts additional ARFs into an ARF group using a
-// binary split approach.
-// NOTE: this function is currently implemented recursively.
-static void schedule_frames(VP9_COMP *cpi, const int start, const int end,
-                            const int arf_idx, const int gf_or_arf_group,
-                            const int level) {
-  int i, abs_end, half_range;
-  int *cfo = cpi->frame_coding_order;
-  int idx = cpi->new_frame_coding_order_period;
-
-  // If (end < 0) an ARF should be coded at position (-end).
-  assert(start >= 0);
-
-  // printf("start:%d end:%d\n", start, end);
-
-  // GF Group: code frames in logical order.
-  if (gf_or_arf_group == 0) {
-    assert(end >= start);
-    for (i = start; i <= end; ++i) {
-      cfo[idx] = i;
-      cpi->arf_buffer_idx[idx] = arf_idx;
-      cpi->arf_weight[idx] = -1;
-      ++idx;
-    }
-    cpi->new_frame_coding_order_period = idx;
-    return;
-  }
-
-  // ARF Group: Work out the ARF schedule and mark ARF frames as negative.
-  if (end < 0) {
-    // printf("start:%d end:%d\n", -end, -end);
-    // ARF frame is at the end of the range.
-    cfo[idx] = end;
-    // What ARF buffer does this ARF use as predictor.
-    cpi->arf_buffer_idx[idx] = (arf_idx > 2) ? (arf_idx - 1) : 2;
-    cpi->arf_weight[idx] = level;
-    ++idx;
-    abs_end = -end;
-  } else {
-    abs_end = end;
-  }
-
-  half_range = (abs_end - start) >> 1;
-
-  // ARFs may not be adjacent, they must be separated by at least
-  // MIN_GF_INTERVAL non-ARF frames.
-  if ((start + MIN_GF_INTERVAL) >= (abs_end - MIN_GF_INTERVAL)) {
-    // printf("start:%d end:%d\n", start, abs_end);
-    // Update the coding order and active ARF.
-    for (i = start; i <= abs_end; ++i) {
-      cfo[idx] = i;
-      cpi->arf_buffer_idx[idx] = arf_idx;
-      cpi->arf_weight[idx] = -1;
-      ++idx;
-    }
-    cpi->new_frame_coding_order_period = idx;
-  } else {
-    // Place a new ARF at the mid-point of the range.
-    cpi->new_frame_coding_order_period = idx;
-    schedule_frames(cpi, start, -(start + half_range), arf_idx + 1,
-                    gf_or_arf_group, level + 1);
-    schedule_frames(cpi, start + half_range + 1, abs_end, arf_idx,
-                    gf_or_arf_group, level + 1);
-  }
-}
-
-#define FIXED_ARF_GROUP_SIZE 16
-
-void define_fixed_arf_period(VP9_COMP *cpi) {
-  int i;
-  int max_level = INT_MIN;
-
-  assert(cpi->multi_arf_enabled);
-  assert(cpi->oxcf.lag_in_frames >= FIXED_ARF_GROUP_SIZE);
-
-  // Save the weight of the last frame in the sequence before next
-  // sequence pattern overwrites it.
-  cpi->this_frame_weight = cpi->arf_weight[cpi->sequence_number];
-  assert(cpi->this_frame_weight >= 0);
-
-  cpi->twopass.gf_zeromotion_pct = 0;
-
-  // Initialize frame coding order variables.
-  cpi->new_frame_coding_order_period = 0;
-  cpi->next_frame_in_order = 0;
-  cpi->arf_buffered = 0;
-  vp9_zero(cpi->frame_coding_order);
-  vp9_zero(cpi->arf_buffer_idx);
-  vpx_memset(cpi->arf_weight, -1, sizeof(cpi->arf_weight));
-
-  if (cpi->rc.frames_to_key <= (FIXED_ARF_GROUP_SIZE + 8)) {
-    // Setup a GF group close to the keyframe.
-    cpi->rc.source_alt_ref_pending = 0;
-    cpi->rc.baseline_gf_interval = cpi->rc.frames_to_key;
-    schedule_frames(cpi, 0, (cpi->rc.baseline_gf_interval - 1), 2, 0, 0);
-  } else {
-    // Setup a fixed period ARF group.
-    cpi->rc.source_alt_ref_pending = 1;
-    cpi->rc.baseline_gf_interval = FIXED_ARF_GROUP_SIZE;
-    schedule_frames(cpi, 0, -(cpi->rc.baseline_gf_interval - 1), 2, 1, 0);
-  }
-
-  // Replace level indicator of -1 with correct level.
-  for (i = 0; i < cpi->new_frame_coding_order_period; ++i) {
-    if (cpi->arf_weight[i] > max_level) {
-      max_level = cpi->arf_weight[i];
-    }
-  }
-  ++max_level;
-  for (i = 0; i < cpi->new_frame_coding_order_period; ++i) {
-    if (cpi->arf_weight[i] == -1) {
-      cpi->arf_weight[i] = max_level;
-    }
-  }
-  cpi->max_arf_level = max_level;
-#if 0
-  printf("\nSchedule: ");
-  for (i = 0; i < cpi->new_frame_coding_order_period; ++i) {
-    printf("%4d ", cpi->frame_coding_order[i]);
-  }
-  printf("\n");
-  printf("ARFref:   ");
-  for (i = 0; i < cpi->new_frame_coding_order_period; ++i) {
-    printf("%4d ", cpi->arf_buffer_idx[i]);
-  }
-  printf("\n");
-  printf("Weight:   ");
-  for (i = 0; i < cpi->new_frame_coding_order_period; ++i) {
-    printf("%4d ", cpi->arf_weight[i]);
-  }
-  printf("\n");
-#endif
-}
-#endif
-
 // Calculate a section intra ratio used in setting max loop filter.
 static int calculate_section_intra_ratio(const FIRSTPASS_STATS *begin,
                                         const FIRSTPASS_STATS *end,
@@ -1428,6 +1343,18 @@ static int calculate_boost_bits(int frame_count,
  return MAX((int)(((int64_t)boost * total_group_bits) / allocation_chunks), 0);
 }

+// Current limit on maximum number of active arfs in a GF/ARF group.
+#define MAX_ACTIVE_ARFS 2
+#define ARF_SLOT1 2
+#define ARF_SLOT2 3
+// This function indirects the choice of buffers for arfs.
+// At the moment the values are fixed but this may change as part of
+// the integration process with other codec features that swap buffers around.
+static void get_arf_buffer_indices(unsigned char *arf_buffer_indices) {
+  arf_buffer_indices[0] = ARF_SLOT1;
+  arf_buffer_indices[1] = ARF_SLOT2;
+}
+
 static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits,
                                   double group_error, int gf_arf_bits) {
  RATE_CONTROL *const rc = &cpi->rc;
@@ -1435,42 +1362,86 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits,
  TWO_PASS *twopass = &cpi->twopass;
  FIRSTPASS_STATS frame_stats;
  int i;
-  int group_frame_index = 1;
+  int frame_index = 1;
  int target_frame_size;
  int key_frame;
  const int max_bits = frame_max_bits(&cpi->rc, &cpi->oxcf);
  int64_t total_group_bits = gf_group_bits;
  double modified_err = 0.0;
  double err_fraction;
+  int mid_boost_bits = 0;
+  int mid_frame_idx;
+  unsigned char arf_buffer_indices[MAX_ACTIVE_ARFS];

  key_frame = cpi->common.frame_type == KEY_FRAME ||
              vp9_is_upper_layer_key_frame(cpi);

+  get_arf_buffer_indices(arf_buffer_indices);
+
  // For key frames the frame target rate is already set and it
  // is also the golden frame.
-  // NOTE: We dont bother to check for the special case of ARF overlay
-  // frames here, as there is clamping code for this in the function
-  // vp9_rc_clamp_pframe_target_size(), which applies to one and two pass
-  // encodes.
  if (!key_frame) {
-    twopass->gf_group_bit_allocation[0] = gf_arf_bits;
+    if (rc->source_alt_ref_active) {
+      twopass->gf_group.update_type[0] = OVERLAY_UPDATE;
+      twopass->gf_group.rf_level[0] = INTER_NORMAL;
+      twopass->gf_group.bit_allocation[0] = 0;
+      twopass->gf_group.arf_update_idx[0] = arf_buffer_indices[0];
+      twopass->gf_group.arf_ref_idx[0] = arf_buffer_indices[0];
+    } else {
+      twopass->gf_group.update_type[0] = GF_UPDATE;
+      twopass->gf_group.rf_level[0] = GF_ARF_STD;
+      twopass->gf_group.bit_allocation[0] = gf_arf_bits;
+      twopass->gf_group.arf_update_idx[0] = arf_buffer_indices[0];
+      twopass->gf_group.arf_ref_idx[0] = arf_buffer_indices[0];
+    }

    // Step over the golden frame / overlay frame
    if (EOF == input_stats(twopass, &frame_stats))
      return;
  }

-  // Store the bits to spend on the ARF if there is one.
-  if (rc->source_alt_ref_pending) {
-    twopass->gf_group_bit_allocation[group_frame_index++] = gf_arf_bits;
-  }
-
-  // Deduct the boost bits for arf or gf if it is not a key frame.
+  // Deduct the boost bits for arf (or gf if it is not a key frame)
+  // from the group total.
  if (rc->source_alt_ref_pending || !key_frame)
    total_group_bits -= gf_arf_bits;

+  // Store the bits to spend on the ARF if there is one.
+  if (rc->source_alt_ref_pending) {
+    if (cpi->multi_arf_enabled) {
+      // A portion of the gf / arf extra bits are set asside for lower level
+      // boosted frames in the middle of the group.
+      mid_boost_bits += gf_arf_bits >> 5;
+      gf_arf_bits -= (gf_arf_bits >> 5);
+    }
+
+    twopass->gf_group.update_type[frame_index] = ARF_UPDATE;
+    twopass->gf_group.rf_level[frame_index] = GF_ARF_STD;
+    twopass->gf_group.bit_allocation[frame_index] = gf_arf_bits;
+    twopass->gf_group.arf_src_offset[frame_index] =
+      (unsigned char)(rc->baseline_gf_interval - 1);
+    twopass->gf_group.arf_update_idx[frame_index] = arf_buffer_indices[0];
+    twopass->gf_group.arf_ref_idx[frame_index] =
+      arf_buffer_indices[cpi->multi_arf_enabled && rc->source_alt_ref_active];
+    ++frame_index;
+
+    if (cpi->multi_arf_enabled) {
+      // Set aside a slot for a level 1 arf.
+      twopass->gf_group.update_type[frame_index] = ARF_UPDATE;
+      twopass->gf_group.rf_level[frame_index] = GF_ARF_LOW;
+      twopass->gf_group.arf_src_offset[frame_index] =
+        (unsigned char)((rc->baseline_gf_interval >> 1) - 1);
+      twopass->gf_group.arf_update_idx[frame_index] = arf_buffer_indices[1];
+      twopass->gf_group.arf_ref_idx[frame_index] = arf_buffer_indices[0];
+      ++frame_index;
+    }
+  }
+
+  // Define middle frame
+  mid_frame_idx = frame_index + (rc->baseline_gf_interval >> 1) - 1;
+
  // Allocate bits to the other frames in the group.
  for (i = 0; i < rc->baseline_gf_interval - 1; ++i) {
+    int arf_idx = 0;
    if (EOF == input_stats(twopass, &frame_stats))
      break;

@@ -1482,10 +1453,48 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits,
      err_fraction = 0.0;

    target_frame_size = (int)((double)total_group_bits * err_fraction);
+
+    if (rc->source_alt_ref_pending && cpi->multi_arf_enabled) {
+      mid_boost_bits += (target_frame_size >> 4);
+      target_frame_size -= (target_frame_size >> 4);
+
+      if (frame_index <= mid_frame_idx)
+        arf_idx = 1;
+    }
+    twopass->gf_group.arf_update_idx[frame_index] = arf_buffer_indices[arf_idx];
+    twopass->gf_group.arf_ref_idx[frame_index] = arf_buffer_indices[arf_idx];
+
    target_frame_size = clamp(target_frame_size, 0,
                              MIN(max_bits, (int)total_group_bits));

-    twopass->gf_group_bit_allocation[group_frame_index++] = target_frame_size;
+    twopass->gf_group.update_type[frame_index] = LF_UPDATE;
+    twopass->gf_group.rf_level[frame_index] = INTER_NORMAL;
+
+    twopass->gf_group.bit_allocation[frame_index] = target_frame_size;
+    ++frame_index;
+  }
+
+  // Note:
+  // We need to configure the frame at the end of the sequence + 1 that will be
+  // the start frame for the next group. Otherwise prior to the call to
+  // vp9_rc_get_second_pass_params() the data will be undefined.
+  twopass->gf_group.arf_update_idx[frame_index] = arf_buffer_indices[0];
+  twopass->gf_group.arf_ref_idx[frame_index] = arf_buffer_indices[0];
+
+  if (rc->source_alt_ref_pending) {
+    twopass->gf_group.update_type[frame_index] = OVERLAY_UPDATE;
+    twopass->gf_group.rf_level[frame_index] = INTER_NORMAL;
+
+    // Final setup for second arf and its overlay.
+    if (cpi->multi_arf_enabled) {
+      twopass->gf_group.bit_allocation[2] =
+        twopass->gf_group.bit_allocation[mid_frame_idx] + mid_boost_bits;
+      twopass->gf_group.update_type[mid_frame_idx] = OVERLAY_UPDATE;
+      twopass->gf_group.bit_allocation[mid_frame_idx] = 0;
+    }
+  } else {
+    twopass->gf_group.update_type[frame_index] = GF_UPDATE;
+    twopass->gf_group.rf_level[frame_index] = GF_ARF_STD;
  }
 }

@@ -1528,8 +1537,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
  // Reset the GF group data structures unless this is a key
  // frame in which case it will already have been done.
  if (cpi->common.frame_type != KEY_FRAME) {
-    twopass->gf_group_index = 0;
-    vp9_zero(twopass->gf_group_bit_allocation);
+    vp9_zero(twopass->gf_group);
  }

  vp9_clear_system_state();
@@ -1651,24 +1659,14 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
    }
  }

-#if CONFIG_MULTIPLE_ARF
-  if (cpi->multi_arf_enabled) {
-    // Initialize frame coding order variables.
-    cpi->new_frame_coding_order_period = 0;
-    cpi->next_frame_in_order = 0;
-    cpi->arf_buffered = 0;
-    vp9_zero(cpi->frame_coding_order);
-    vp9_zero(cpi->arf_buffer_idx);
-    vpx_memset(cpi->arf_weight, -1, sizeof(cpi->arf_weight));
-  }
-#endif
-
  // Set the interval until the next gf.
  if (cpi->common.frame_type == KEY_FRAME || rc->source_alt_ref_active)
    rc->baseline_gf_interval = i - 1;
  else
    rc->baseline_gf_interval = i;

+  rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+
  // Should we use the alternate reference frame.
  if (allow_alt_ref &&
      (i < cpi->oxcf.lag_in_frames) &&
@@ -1681,62 +1679,11 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
                                   &b_boost);
    rc->source_alt_ref_pending = 1;

-#if CONFIG_MULTIPLE_ARF
-    // Set the ARF schedule.
-    if (cpi->multi_arf_enabled) {
-      schedule_frames(cpi, 0, -(rc->baseline_gf_interval - 1), 2, 1, 0);
-    }
-#endif
  } else {
    rc->gfu_boost = (int)boost_score;
    rc->source_alt_ref_pending = 0;
-#if CONFIG_MULTIPLE_ARF
-    // Set the GF schedule.
-    if (cpi->multi_arf_enabled) {
-      schedule_frames(cpi, 0, rc->baseline_gf_interval - 1, 2, 0, 0);
-      assert(cpi->new_frame_coding_order_period ==
-             rc->baseline_gf_interval);
-    }
-#endif
  }

-#if CONFIG_MULTIPLE_ARF
-  if (cpi->multi_arf_enabled && (cpi->common.frame_type != KEY_FRAME)) {
-    int max_level = INT_MIN;
-    // Replace level indicator of -1 with correct level.
-    for (i = 0; i < cpi->frame_coding_order_period; ++i) {
-      if (cpi->arf_weight[i] > max_level) {
-        max_level = cpi->arf_weight[i];
-      }
-    }
-    ++max_level;
-    for (i = 0; i < cpi->frame_coding_order_period; ++i) {
-      if (cpi->arf_weight[i] == -1) {
-        cpi->arf_weight[i] = max_level;
-      }
-    }
-    cpi->max_arf_level = max_level;
-  }
-#if 0
-  if (cpi->multi_arf_enabled) {
-    printf("\nSchedule: ");
-    for (i = 0; i < cpi->new_frame_coding_order_period; ++i) {
-      printf("%4d ", cpi->frame_coding_order[i]);
-    }
-    printf("\n");
-    printf("ARFref:   ");
-    for (i = 0; i < cpi->new_frame_coding_order_period; ++i) {
-      printf("%4d ", cpi->arf_buffer_idx[i]);
-    }
-    printf("\n");
-    printf("Weight:   ");
-    for (i = 0; i < cpi->new_frame_coding_order_period; ++i) {
-      printf("%4d ", cpi->arf_weight[i]);
-    }
-    printf("\n");
-  }
-#endif
-#endif
  // Reset the file position.
  reset_fpf_position(twopass, start_pos);

@@ -1886,8 +1833,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
  cpi->common.frame_type = KEY_FRAME;

  // Reset the GF group data structures.
-  twopass->gf_group_index = 0;
-  vp9_zero(twopass->gf_group_bit_allocation);
+  vp9_zero(twopass->gf_group);

  // Is this a forced key frame by interval.
  rc->this_key_frame_forced = rc->next_key_frame_forced;
@@ -2078,7 +2024,9 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
  twopass->kf_group_bits -= kf_bits;

  // Save the bits to spend on the key frame.
-  twopass->gf_group_bit_allocation[0] = kf_bits;
+  twopass->gf_group.bit_allocation[0] = kf_bits;
+  twopass->gf_group.update_type[0] = KF_UPDATE;
+  twopass->gf_group.rf_level[0] = KF_STD;

  // Note the total error score of the kf group minus the key frame itself.
  twopass->kf_group_error_left = (int)(kf_group_err - kf_mod_err);
@@ -2106,6 +2054,44 @@ void vbr_rate_correction(int * this_frame_target,
  }
 }

+// Define the reference buffers that will be updated post encode.
+void configure_buffer_updates(VP9_COMP *cpi) {
+  TWO_PASS *const twopass = &cpi->twopass;
+
+  cpi->rc.is_src_frame_alt_ref = 0;
+  switch (twopass->gf_group.update_type[twopass->gf_group.index]) {
+    case KF_UPDATE:
+      cpi->refresh_last_frame = 1;
+      cpi->refresh_golden_frame = 1;
+      cpi->refresh_alt_ref_frame = 1;
+      break;
+    case LF_UPDATE:
+      cpi->refresh_last_frame = 1;
+      cpi->refresh_golden_frame = 0;
+      cpi->refresh_alt_ref_frame = 0;
+      break;
+    case GF_UPDATE:
+      cpi->refresh_last_frame = 1;
+      cpi->refresh_golden_frame = 1;
+      cpi->refresh_alt_ref_frame = 0;
+      break;
+    case OVERLAY_UPDATE:
+      cpi->refresh_last_frame = 0;
+      cpi->refresh_golden_frame = 1;
+      cpi->refresh_alt_ref_frame = 0;
+      cpi->rc.is_src_frame_alt_ref = 1;
+      break;
+    case ARF_UPDATE:
+      cpi->refresh_last_frame = 0;
+      cpi->refresh_golden_frame = 0;
+      cpi->refresh_alt_ref_frame = 1;
+      break;
+    default:
+      assert(0);
+  }
+}
+
+
 void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
  VP9_COMMON *const cm = &cpi->common;
  RATE_CONTROL *const rc = &cpi->rc;
@@ -2130,14 +2116,12 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
  if (!twopass->stats_in)
    return;

-  // Increment the gf group index.
-  ++twopass->gf_group_index;
-
  // If this is an arf frame then we dont want to read the stats file or
  // advance the input pointer as we already have what we need.
-  if (cpi->refresh_alt_ref_frame) {
+  if (twopass->gf_group.update_type[twopass->gf_group.index] == ARF_UPDATE) {
    int target_rate;
-    target_rate = twopass->gf_group_bit_allocation[twopass->gf_group_index];
+    configure_buffer_updates(cpi);
+    target_rate = twopass->gf_group.bit_allocation[twopass->gf_group.index];
    target_rate = vp9_rc_clamp_pframe_target_size(cpi, target_rate);
    rc->base_frame_target = target_rate;
 #ifdef LONG_TERM_VBR_CORRECTION
@@ -2201,15 +2185,7 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {

  // Define a new GF/ARF group. (Should always enter here for key frames).
  if (rc->frames_till_gf_update_due == 0) {
-#if CONFIG_MULTIPLE_ARF
-    if (cpi->multi_arf_enabled) {
-      define_fixed_arf_period(cpi);
-    } else {
-#endif
-      define_gf_group(cpi, &this_frame_copy);
-#if CONFIG_MULTIPLE_ARF
-    }
-#endif
+    define_gf_group(cpi, &this_frame_copy);

    if (twopass->gf_zeromotion_pct > 995) {
      // As long as max_thresh for encode breakout is small enough, it is ok
@@ -2233,7 +2209,9 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
    }
  }

-  target_rate = twopass->gf_group_bit_allocation[twopass->gf_group_index];
+  configure_buffer_updates(cpi);
+
+  target_rate = twopass->gf_group.bit_allocation[twopass->gf_group.index];
  if (cpi->common.frame_type == KEY_FRAME)
    target_rate = vp9_rc_clamp_iframe_target_size(cpi, target_rate);
  else
@@ -2249,6 +2227,12 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {

  // Update the total stats remaining structure.
  subtract_stats(&twopass->total_left_stats, &this_frame);
+
+#if CONFIG_FP_MB_STATS
+  if (cpi->use_fp_mb_stats) {
+    input_mb_stats(&twopass->this_frame_mb_stats, cm);
+  }
+#endif
 }

 void vp9_twopass_postencode_update(VP9_COMP *cpi) {
@@ -2296,4 +2280,7 @@ void vp9_twopass_postencode_update(VP9_COMP *cpi) {
    twopass->kf_group_bits -= bits_used;
  }
  twopass->kf_group_bits = MAX(twopass->kf_group_bits, 0);
+
+  // Increment the gf group index ready for the next frame.
+  ++twopass->gf_group.index;
 }
--- a/vp9/encoder/vp9_firstpass.h
+++ b/vp9/encoder/vp9_firstpass.h
@@ -12,11 +12,24 @@
 #define VP9_ENCODER_VP9_FIRSTPASS_H_

 #include "vp9/encoder/vp9_lookahead.h"
+#include "vp9/encoder/vp9_ratectrl.h"

 #ifdef __cplusplus
 extern "C" {
 #endif

+#if CONFIG_FP_MB_STATS
+typedef struct {
+  PREDICTION_MODE mode;
+  int err;
+  int_mv mv;
+} FIRSTPASS_MB_STATS;
+
+typedef struct {
+  FIRSTPASS_MB_STATS *mb_stats;
+} FIRSTPASS_FRAME_MB_STATS;
+#endif
+
 typedef struct {
  double frame;
  double intra_error;
@@ -39,6 +52,25 @@ typedef struct {
  int64_t spatial_layer_id;
 } FIRSTPASS_STATS;

+typedef enum {
+  KF_UPDATE = 0,
+  LF_UPDATE = 1,
+  GF_UPDATE = 2,
+  ARF_UPDATE = 3,
+  OVERLAY_UPDATE = 4,
+  FRAME_UPDATE_TYPES = 5
+} FRAME_UPDATE_TYPE;
+
+typedef struct {
+  unsigned char index;
+  RATE_FACTOR_LEVEL rf_level[(MAX_LAG_BUFFERS * 2) + 1];
+  FRAME_UPDATE_TYPE update_type[(MAX_LAG_BUFFERS * 2) + 1];
+  unsigned char arf_src_offset[(MAX_LAG_BUFFERS * 2) + 1];
+  unsigned char arf_update_idx[(MAX_LAG_BUFFERS * 2) + 1];
+  unsigned char arf_ref_idx[(MAX_LAG_BUFFERS * 2) + 1];
+  int bit_allocation[(MAX_LAG_BUFFERS * 2) + 1];
+} GF_GROUP;
+
 typedef struct {
  unsigned int section_intra_rating;
  unsigned int next_iiratio;
@@ -56,6 +88,10 @@ typedef struct {
  double kf_intra_err_min;
  double gf_intra_err_min;

+#if CONFIG_FP_MB_STATS
+  FIRSTPASS_FRAME_MB_STATS this_frame_mb_stats;
+#endif
+
  // Projected total bits available for a key frame group of frames
  int64_t kf_group_bits;

@@ -68,8 +104,7 @@ typedef struct {

  int active_worst_quality;

-  int gf_group_index;
-  int gf_group_bit_allocation[MAX_LAG_BUFFERS * 2];
+  GF_GROUP gf_group;
 } TWO_PASS;

 struct VP9_COMP;
--- a/vp9/encoder/vp9_lookahead.c
+++ b/vp9/encoder/vp9_lookahead.c
@@ -18,18 +18,6 @@
 #include "vp9/encoder/vp9_extend.h"
 #include "vp9/encoder/vp9_lookahead.h"

-// The max of past frames we want to keep in the queue.
-#define MAX_PRE_FRAMES 1
-
-struct lookahead_ctx {
-  unsigned int max_sz;         /* Absolute size of the queue */
-  unsigned int sz;             /* Number of buffers currently in the queue */
-  unsigned int read_idx;       /* Read index */
-  unsigned int write_idx;      /* Write index */
-  struct lookahead_entry *buf; /* Buffer list */
-};
-
-
 /* Return the buffer at the given absolute index and increment the index */
 static struct lookahead_entry *pop(struct lookahead_ctx *ctx,
                                   unsigned int *idx) {
--- a/vp9/encoder/vp9_lookahead.h
+++ b/vp9/encoder/vp9_lookahead.h
@@ -14,6 +14,11 @@
 #include "vpx_scale/yv12config.h"
 #include "vpx/vpx_integer.h"

+#ifdef CONFIG_SPATIAL_SVC
+#include "vpx/vp8cx.h"
+#include "vpx/vpx_encoder.h"
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -25,10 +30,22 @@ struct lookahead_entry {
  int64_t             ts_start;
  int64_t             ts_end;
  unsigned int        flags;
+
+#ifdef CONFIG_SPATIAL_SVC
+  vpx_svc_parameters_t svc_params[VPX_SS_MAX_LAYERS];
+#endif
 };

+// The max of past frames we want to keep in the queue.
+#define MAX_PRE_FRAMES 1

-struct lookahead_ctx;
+struct lookahead_ctx {
+  unsigned int max_sz;         /* Absolute size of the queue */
+  unsigned int sz;             /* Number of buffers currently in the queue */
+  unsigned int read_idx;       /* Read index */
+  unsigned int write_idx;      /* Write index */
+  struct lookahead_entry *buf; /* Buffer list */
+};

 /**\brief Initializes the lookahead stage
 *
--- a/vp9/encoder/vp9_picklpf.c
+++ b/vp9/encoder/vp9_picklpf.c
@@ -144,7 +144,9 @@ void vp9_pick_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi,
  lf->sharpness_level = cm->frame_type == KEY_FRAME ? 0
                                                    : cpi->oxcf.sharpness;

-  if (method == LPF_PICK_FROM_Q) {
+  if (method == LPF_PICK_MINIMAL_LPF && lf->filter_level) {
+      lf->filter_level = 0;
+  } else if (method >= LPF_PICK_FROM_Q) {
    const int min_filter_level = 0;
    const int max_filter_level = get_max_filter_level(cpi);
    const int q = vp9_ac_quant(cm->base_qindex, 0);
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -23,9 +23,89 @@
 #include "vp9/common/vp9_reconintra.h"

 #include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_pickmode.h"
 #include "vp9/encoder/vp9_ratectrl.h"
 #include "vp9/encoder/vp9_rdopt.h"

+static int mv_refs_rt(const VP9_COMMON *cm, const MACROBLOCKD *xd,
+                       const TileInfo *const tile,
+                       MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
+                       int_mv *mv_ref_list,
+                       int mi_row, int mi_col) {
+  const int *ref_sign_bias = cm->ref_frame_sign_bias;
+  int i, refmv_count = 0;
+
+  const POSITION *const mv_ref_search = mv_ref_blocks[mi->mbmi.sb_type];
+
+  int different_ref_found = 0;
+  int context_counter = 0;
+  int const_motion = 0;
+
+  // Blank the reference vector list
+  vpx_memset(mv_ref_list, 0, sizeof(*mv_ref_list) * MAX_MV_REF_CANDIDATES);
+
+  // The nearest 2 blocks are treated differently
+  // if the size < 8x8 we get the mv from the bmi substructure,
+  // and we also need to keep a mode count.
+  for (i = 0; i < 2; ++i) {
+    const POSITION *const mv_ref = &mv_ref_search[i];
+    if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
+      const MODE_INFO *const candidate_mi = xd->mi[mv_ref->col + mv_ref->row *
+                                                   xd->mi_stride];
+      const MB_MODE_INFO *const candidate = &candidate_mi->mbmi;
+      // Keep counts for entropy encoding.
+      context_counter += mode_2_counter[candidate->mode];
+      different_ref_found = 1;
+
+      if (candidate->ref_frame[0] == ref_frame)
+        ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 0, mv_ref->col, -1));
+    }
+  }
+
+  const_motion = 1;
+
+  // Check the rest of the neighbors in much the same way
+  // as before except we don't need to keep track of sub blocks or
+  // mode counts.
+  for (; i < MVREF_NEIGHBOURS && !refmv_count; ++i) {
+    const POSITION *const mv_ref = &mv_ref_search[i];
+    if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
+      const MB_MODE_INFO *const candidate = &xd->mi[mv_ref->col + mv_ref->row *
+                                                    xd->mi_stride]->mbmi;
+      different_ref_found = 1;
+
+      if (candidate->ref_frame[0] == ref_frame)
+        ADD_MV_REF_LIST(candidate->mv[0]);
+    }
+  }
+
+  // Since we couldn't find 2 mvs from the same reference frame
+  // go back through the neighbors and find motion vectors from
+  // different reference frames.
+  if (different_ref_found && !refmv_count) {
+    for (i = 0; i < MVREF_NEIGHBOURS; ++i) {
+      const POSITION *mv_ref = &mv_ref_search[i];
+      if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
+        const MB_MODE_INFO *const candidate = &xd->mi[mv_ref->col + mv_ref->row
+                                              * xd->mi_stride]->mbmi;
+
+        // If the candidate is INTRA we don't want to consider its mv.
+        IF_DIFF_REF_FRAME_ADD_MV(candidate);
+      }
+    }
+  }
+
+ Done:
+
+  mi->mbmi.mode_context[ref_frame] = counter_to_context[context_counter];
+
+  // Clamp vectors
+  for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i)
+    clamp_mv_ref(&mv_ref_list[i].as_mv, xd);
+
+  return const_motion;
+}
+
 static void full_pixel_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
                                    BLOCK_SIZE bsize, int mi_row, int mi_col,
                                    int_mv *tmp_mv, int *rate_mv) {
@@ -172,15 +252,129 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize,
  else
    x->skip_txfm = 0;

-  // TODO(jingning) This is a temporary solution to account for frames with
-  // light changes. Need to customize the rate-distortion modeling for non-RD
-  // mode decision.
-  if ((sse >> 3) > var)
-    sse = var;
-  vp9_model_rd_from_var_lapndz(var + sse, 1 << num_pels_log2_lookup[bsize],
-                               ac_quant >> 3, &rate, &dist);
-  *out_rate_sum = rate;
+  if (cpi->common.tx_mode == TX_MODE_SELECT) {
+    if (sse > (var << 2))
+      xd->mi[0]->mbmi.tx_size = MIN(max_txsize_lookup[bsize],
+                          tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
+    else
+      xd->mi[0]->mbmi.tx_size = TX_8X8;
+  } else {
+    xd->mi[0]->mbmi.tx_size = MIN(max_txsize_lookup[bsize],
+                         tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
+  }
+
+  vp9_model_rd_from_var_lapndz(sse - var, 1 << num_pels_log2_lookup[bsize],
+                               dc_quant >> 3, &rate, &dist);
+  *out_rate_sum = rate >> 1;
  *out_dist_sum = dist << 3;
+
+  vp9_model_rd_from_var_lapndz(var, 1 << num_pels_log2_lookup[bsize],
+                               ac_quant >> 3, &rate, &dist);
+  *out_rate_sum += rate;
+  *out_dist_sum += dist << 4;
+}
+
+static int get_pred_buffer(PRED_BUFFER *p, int len) {
+  int i;
+
+  for (i = 0; i < len; i++) {
+    if (!p[i].in_use) {
+      p[i].in_use = 1;
+      return i;
+    }
+  }
+  return -1;
+}
+
+static void free_pred_buffer(PRED_BUFFER *p) {
+  p->in_use = 0;
+}
+
+static void encode_breakout_test(VP9_COMP *cpi, MACROBLOCK *x,
+                                 BLOCK_SIZE bsize, int mi_row, int mi_col,
+                                 MV_REFERENCE_FRAME ref_frame,
+                                 PREDICTION_MODE this_mode,
+                                 unsigned int var_y, unsigned int sse_y,
+                                 struct buf_2d yv12_mb[][MAX_MB_PLANE],
+                                 int *rate, int64_t *dist) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+
+  const BLOCK_SIZE uv_size = get_plane_block_size(bsize, &xd->plane[1]);
+  unsigned int var = var_y, sse = sse_y;
+  // Skipping threshold for ac.
+  unsigned int thresh_ac;
+  // Skipping threshold for dc.
+  unsigned int thresh_dc;
+  if (x->encode_breakout > 0) {
+    // Set a maximum for threshold to avoid big PSNR loss in low bit rate
+    // case. Use extreme low threshold for static frames to limit
+    // skipping.
+    const unsigned int max_thresh = 36000;
+    // The encode_breakout input
+    const unsigned int min_thresh =
+        MIN(((unsigned int)x->encode_breakout << 4), max_thresh);
+
+    // Calculate threshold according to dequant value.
+    thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) / 9;
+    thresh_ac = clamp(thresh_ac, min_thresh, max_thresh);
+
+    // Adjust ac threshold according to partition size.
+    thresh_ac >>=
+        8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
+
+    thresh_dc = (xd->plane[0].dequant[0] * xd->plane[0].dequant[0] >> 6);
+  } else {
+    thresh_ac = 0;
+    thresh_dc = 0;
+  }
+
+  // Y skipping condition checking for ac and dc.
+  if (var <= thresh_ac && (sse - var) <= thresh_dc) {
+    unsigned int sse_u, sse_v;
+    unsigned int var_u, var_v;
+
+    // Skip UV prediction unless breakout is zero (lossless) to save
+    // computation with low impact on the result
+    if (x->encode_breakout == 0) {
+      xd->plane[1].pre[0] = yv12_mb[ref_frame][1];
+      xd->plane[2].pre[0] = yv12_mb[ref_frame][2];
+      vp9_build_inter_predictors_sbuv(xd, mi_row, mi_col, bsize);
+    }
+
+    var_u = cpi->fn_ptr[uv_size].vf(x->plane[1].src.buf,
+                                    x->plane[1].src.stride,
+                                    xd->plane[1].dst.buf,
+                                    xd->plane[1].dst.stride, &sse_u);
+
+    // U skipping condition checking
+    if ((var_u * 4 <= thresh_ac) && (sse_u - var_u <= thresh_dc)) {
+      var_v = cpi->fn_ptr[uv_size].vf(x->plane[2].src.buf,
+                                      x->plane[2].src.stride,
+                                      xd->plane[2].dst.buf,
+                                      xd->plane[2].dst.stride, &sse_v);
+
+      // V skipping condition checking
+      if ((var_v * 4 <= thresh_ac) && (sse_v - var_v <= thresh_dc)) {
+        x->skip = 1;
+
+        // The cost of skip bit needs to be added.
+        *rate = cpi->inter_mode_cost[mbmi->mode_context[ref_frame]]
+                                     [INTER_OFFSET(this_mode)];
+
+        // More on this part of rate
+        // rate += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
+
+        // Scaling factor for SSE from spatial domain to frequency
+        // domain is 16. Adjust distortion accordingly.
+        // TODO(yunqingwang): In this function, only y-plane dist is
+        // calculated.
+        *dist = (sse << 4);  // + ((sse_u + sse_v) << 4);
+
+        // *disable_skip = 1;
+      }
+    }
+  }
 }

 // TODO(jingning) placeholder for inter-frame non-RD mode decision.
@@ -197,6 +391,8 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
  struct macroblockd_plane *const pd = &xd->plane[0];
  PREDICTION_MODE this_mode, best_mode = ZEROMV;
  MV_REFERENCE_FRAME ref_frame, best_ref_frame = LAST_FRAME;
+  TX_SIZE best_tx_size = MIN(max_txsize_lookup[bsize],
+                             tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
  INTERP_FILTER best_pred_filter = EIGHTTAP;
  int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
  struct buf_2d yv12_mb[4][MAX_MB_PLANE];
@@ -224,10 +420,40 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
  const int *const rd_thresh_freq_fact = cpi->rd.thresh_freq_fact[bsize];
  // Mode index conversion form THR_MODES to PREDICTION_MODE for a ref frame.
  int mode_idx[MB_MODE_COUNT] = {0};
-  INTERP_FILTER filter_ref = SWITCHABLE;
+  INTERP_FILTER filter_ref = cm->interp_filter;
  int bsl = mi_width_log2_lookup[bsize];
-  const int pred_filter_search = (((mi_row + mi_col) >> bsl) +
-                                      get_chessboard_index(cm)) % 2;
+  const int pred_filter_search = cm->interp_filter == SWITCHABLE ?
+      (((mi_row + mi_col) >> bsl) + get_chessboard_index(cm)) % 2 : 0;
+  int const_motion[MAX_REF_FRAMES] = { 0 };
+
+  // For speed 6, the result of interp filter is reused later in actual encoding
+  // process.
+  int bh = num_4x4_blocks_high_lookup[bsize] << 2;
+  int bw = num_4x4_blocks_wide_lookup[bsize] << 2;
+  int pixels_in_block = bh * bw;
+  // tmp[3] points to dst buffer, and the other 3 point to allocated buffers.
+  PRED_BUFFER tmp[4];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, pred_buf, 3 * 64 * 64);
+  struct buf_2d orig_dst = pd->dst;
+  PRED_BUFFER *best_pred = NULL;
+  PRED_BUFFER *this_mode_pred = NULL;
+  int i;
+
+#if CONFIG_DENOISING
+  vp9_denoiser_reset_frame_stats(&cpi->denoiser);
+#endif
+
+  if (cpi->sf.reuse_inter_pred_sby) {
+    for (i = 0; i < 3; i++) {
+      tmp[i].data = &pred_buf[pixels_in_block * i];
+      tmp[i].stride = bw;
+      tmp[i].in_use = 0;
+    }
+
+    tmp[3].data = pd->dst.buf;
+    tmp[3].stride = pd->dst.stride;
+    tmp[3].in_use = 0;
+  }

  x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;

@@ -241,18 +467,36 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
  mbmi->ref_frame[0] = NONE;
  mbmi->ref_frame[1] = NONE;
  mbmi->tx_size = MIN(max_txsize_lookup[bsize],
-                      tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
-  mbmi->interp_filter = cpi->common.interp_filter == SWITCHABLE ?
-                        EIGHTTAP : cpi->common.interp_filter;
+                      tx_mode_to_biggest_tx_size[cm->tx_mode]);
+  mbmi->interp_filter = cm->interp_filter == SWITCHABLE ?
+                        EIGHTTAP : cm->interp_filter;
  mbmi->skip = 0;
  mbmi->segment_id = segment_id;

  for (ref_frame = LAST_FRAME; ref_frame <= LAST_FRAME ; ++ref_frame) {
    x->pred_mv_sad[ref_frame] = INT_MAX;
    if (cpi->ref_frame_flags & flag_list[ref_frame]) {
-      vp9_setup_buffer_inter(cpi, x, tile,
-                             ref_frame, bsize, mi_row, mi_col,
-                             frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb);
+      const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
+      int_mv *const candidates = mbmi->ref_mvs[ref_frame];
+      const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf;
+      vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col,
+                           sf, sf);
+
+      if (cm->coding_use_prev_mi)
+        vp9_find_mv_refs(cm, xd, tile, xd->mi[0], ref_frame,
+                         candidates, mi_row, mi_col);
+      else
+        const_motion[ref_frame] = mv_refs_rt(cm, xd, tile, xd->mi[0],
+                                             ref_frame, candidates,
+                                             mi_row, mi_col);
+
+      vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv, candidates,
+                            &frame_mv[NEARESTMV][ref_frame],
+                            &frame_mv[NEARMV][ref_frame]);
+
+      if (!vp9_is_scaled(sf) && bsize >= BLOCK_8X8)
+        vp9_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride,
+                    ref_frame, bsize);
    }
    frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
    frame_mv[ZEROMV][ref_frame].as_int = 0;
@@ -286,6 +530,10 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
    for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
      int rate_mv = 0;

+      if (const_motion[ref_frame] &&
+          (this_mode == NEARMV || this_mode == ZEROMV))
+        continue;
+
      if (!(cpi->sf.inter_mode_mask[bsize] & (1 << this_mode)))
        continue;

@@ -324,6 +572,16 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
      // Search for the best prediction filter type, when the resulting
      // motion vector is at sub-pixel accuracy level for luma component, i.e.,
      // the last three bits are all zeros.
+      if (cpi->sf.reuse_inter_pred_sby) {
+        if (this_mode == NEARESTMV) {
+          this_mode_pred = &tmp[3];
+        } else {
+          this_mode_pred = &tmp[get_pred_buffer(tmp, 3)];
+          pd->dst.buf = this_mode_pred->data;
+          pd->dst.stride = bw;
+        }
+      }
+
      if ((this_mode == NEWMV || filter_ref == SWITCHABLE) &&
          pred_filter_search &&
          ((mbmi->mv[0].as_mv.row & 0x07) != 0 ||
@@ -332,8 +590,10 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
        int64_t pf_dist[3];
        unsigned int pf_var[3];
        unsigned int pf_sse[3];
+        TX_SIZE pf_tx_size[3];
        int64_t best_cost = INT64_MAX;
        INTERP_FILTER best_filter = SWITCHABLE, filter;
+        PRED_BUFFER *current_pred = this_mode_pred;

        for (filter = EIGHTTAP; filter <= EIGHTTAP_SHARP; ++filter) {
          int64_t cost;
@@ -344,14 +604,32 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
          cost = RDCOST(x->rdmult, x->rddiv,
                        vp9_get_switchable_rate(cpi) + pf_rate[filter],
                        pf_dist[filter]);
+          pf_tx_size[filter] = mbmi->tx_size;
          if (cost < best_cost) {
-              best_filter = filter;
-              best_cost = cost;
-              skip_txfm = x->skip_txfm;
+            best_filter = filter;
+            best_cost = cost;
+            skip_txfm = x->skip_txfm;
+
+            if (cpi->sf.reuse_inter_pred_sby) {
+              if (this_mode_pred != current_pred) {
+                free_pred_buffer(this_mode_pred);
+                this_mode_pred = current_pred;
+              }
+
+              if (filter < EIGHTTAP_SHARP) {
+                current_pred = &tmp[get_pred_buffer(tmp, 3)];
+                pd->dst.buf = current_pred->data;
+                pd->dst.stride = bw;
+              }
+            }
          }
        }

+        if (cpi->sf.reuse_inter_pred_sby && this_mode_pred != current_pred)
+          free_pred_buffer(current_pred);
+
        mbmi->interp_filter = best_filter;
+        mbmi->tx_size = pf_tx_size[mbmi->interp_filter];
        rate = pf_rate[mbmi->interp_filter];
        dist = pf_dist[mbmi->interp_filter];
        var_y = pf_var[mbmi->interp_filter];
@@ -370,77 +648,17 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,

      // Skipping checking: test to see if this block can be reconstructed by
      // prediction only.
-      if (!x->in_active_map) {
-        x->skip = 1;
-      } else if (cpi->allow_encode_breakout && x->encode_breakout) {
-        const BLOCK_SIZE uv_size = get_plane_block_size(bsize, &xd->plane[1]);
-        unsigned int var = var_y, sse = sse_y;
-        // Skipping threshold for ac.
-        unsigned int thresh_ac;
-        // Skipping threshold for dc.
-        unsigned int thresh_dc;
-        // Set a maximum for threshold to avoid big PSNR loss in low bit rate
-        // case. Use extreme low threshold for static frames to limit skipping.
-        const unsigned int max_thresh = 36000;
-        // The encode_breakout input
-        const unsigned int min_thresh =
-            MIN(((unsigned int)x->encode_breakout << 4), max_thresh);
-
-        // Calculate threshold according to dequant value.
-        thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) / 9;
-        thresh_ac = clamp(thresh_ac, min_thresh, max_thresh);
-
-        // Adjust ac threshold according to partition size.
-        thresh_ac >>= 8 - (b_width_log2_lookup[bsize] +
-            b_height_log2_lookup[bsize]);
-
-        thresh_dc = (xd->plane[0].dequant[0] * xd->plane[0].dequant[0] >> 6);
-
-        // Y skipping condition checking for ac and dc.
-        if (var <= thresh_ac && (sse - var) <= thresh_dc) {
-          unsigned int sse_u, sse_v;
-          unsigned int var_u, var_v;
-
-          // Skip u v prediction for less calculation, that won't affect
-          // result much.
-          var_u = cpi->fn_ptr[uv_size].vf(x->plane[1].src.buf,
-                                          x->plane[1].src.stride,
-                                          xd->plane[1].dst.buf,
-                                          xd->plane[1].dst.stride, &sse_u);
-
-          // U skipping condition checking
-          if ((var_u * 4 <= thresh_ac) && (sse_u - var_u <= thresh_dc)) {
-            var_v = cpi->fn_ptr[uv_size].vf(x->plane[2].src.buf,
-                                            x->plane[2].src.stride,
-                                            xd->plane[2].dst.buf,
-                                            xd->plane[2].dst.stride, &sse_v);
-
-            // V skipping condition checking
-            if ((var_v * 4 <= thresh_ac) && (sse_v - var_v <= thresh_dc)) {
-              x->skip = 1;
-
-              // The cost of skip bit needs to be added.
-              rate = rate_mv;
-              rate += cpi->inter_mode_cost[mbmi->mode_context[ref_frame]]
-                                           [INTER_OFFSET(this_mode)];
-
-              // More on this part of rate
-              // rate += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
-
-              // Scaling factor for SSE from spatial domain to frequency
-              // domain is 16. Adjust distortion accordingly.
-              // TODO(yunqingwang): In this function, only y-plane dist is
-              // calculated.
-              dist = (sse << 4);  // + ((sse_u + sse_v) << 4);
-              this_rd = RDCOST(x->rdmult, x->rddiv, rate, dist);
-              // *disable_skip = 1;
-            }
-          }
+      if (cpi->allow_encode_breakout) {
+        encode_breakout_test(cpi, x, bsize, mi_row, mi_col, ref_frame,
+                             this_mode, var_y, sse_y, yv12_mb, &rate, &dist);
+        if (x->skip) {
+          rate += rate_mv;
+          this_rd = RDCOST(x->rdmult, x->rddiv, rate, dist);
        }
      }

 #if CONFIG_DENOISING
-    vp9_denoiser_update_frame_stats();
+      vp9_denoiser_update_frame_stats(&cpi->denoiser, mbmi, sse_y, this_mode);
 #endif

      if (this_rd < best_rd || x->skip) {
@@ -449,8 +667,19 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
        *returndistortion = dist;
        best_mode = this_mode;
        best_pred_filter = mbmi->interp_filter;
+        best_tx_size = mbmi->tx_size;
        best_ref_frame = ref_frame;
        skip_txfm = x->skip_txfm;
+
+        if (cpi->sf.reuse_inter_pred_sby) {
+          if (best_pred != NULL)
+            free_pred_buffer(best_pred);
+
+          best_pred = this_mode_pred;
+        }
+      } else {
+        if (cpi->sf.reuse_inter_pred_sby)
+          free_pred_buffer(this_mode_pred);
      }

      if (x->skip)
@@ -458,34 +687,83 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
    }
  }

+  // If best prediction is not in dst buf, then copy the prediction block from
+  // temp buf to dst buf.
+  if (cpi->sf.reuse_inter_pred_sby && best_pred->data != orig_dst.buf) {
+    uint8_t *copy_from, *copy_to;

-  mbmi->mode = best_mode;
+    pd->dst = orig_dst;
+    copy_to = pd->dst.buf;
+
+    copy_from = best_pred->data;
+
+    vp9_convolve_copy(copy_from, bw, copy_to, pd->dst.stride, NULL, 0, NULL, 0,
+                      bw, bh);
+  }
+
+  mbmi->mode          = best_mode;
  mbmi->interp_filter = best_pred_filter;
-  mbmi->ref_frame[0] = best_ref_frame;
-  mbmi->mv[0].as_int = frame_mv[best_mode][best_ref_frame].as_int;
+  mbmi->tx_size       = best_tx_size;
+  mbmi->ref_frame[0]  = best_ref_frame;
+  mbmi->mv[0].as_int  = frame_mv[best_mode][best_ref_frame].as_int;
  xd->mi[0]->bmi[0].as_mv[0].as_int = mbmi->mv[0].as_int;
  x->skip_txfm = skip_txfm;

  // Perform intra prediction search, if the best SAD is above a certain
  // threshold.
  if (!x->skip && best_rd > inter_mode_thresh &&
-      bsize < cpi->sf.max_intra_bsize) {
-    for (this_mode = DC_PRED; this_mode <= DC_PRED; ++this_mode) {
-      vp9_predict_intra_block(xd, 0, b_width_log2(bsize),
-                              mbmi->tx_size, this_mode,
-                              &p->src.buf[0], p->src.stride,
-                              &pd->dst.buf[0], pd->dst.stride, 0, 0, 0);
+      bsize <= cpi->sf.max_intra_bsize) {
+    int i, j;
+    const int width  = num_4x4_blocks_wide_lookup[bsize];
+    const int height = num_4x4_blocks_high_lookup[bsize];
+
+    int rate2 = 0;
+    int64_t dist2 = 0;
+    const int dst_stride = pd->dst.stride;
+    const int src_stride = p->src.stride;
+    int block_idx = 0;
+
+    TX_SIZE tmp_tx_size = MIN(max_txsize_lookup[bsize],
+                              tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
+    const int step = 1 << tmp_tx_size;
+
+    for (this_mode = DC_PRED; this_mode <= DC_PRED; ++this_mode) {
+      if (cpi->sf.reuse_inter_pred_sby) {
+        pd->dst.buf = tmp[0].data;
+        pd->dst.stride = bw;
+      }
+
+      for (j = 0; j < height; j += step) {
+        for (i = 0; i < width; i += step) {
+          vp9_predict_intra_block(xd, block_idx, b_width_log2(bsize),
+                                  tmp_tx_size, this_mode,
+                                  &p->src.buf[4 * (j * dst_stride + i)],
+                                  src_stride,
+                                  &pd->dst.buf[4 * (j * dst_stride + i)],
+                                  dst_stride, i, j, 0);
+          model_rd_for_sb_y(cpi, bsize, x, xd, &rate, &dist, &var_y, &sse_y);
+          rate2 += rate;
+          dist2 += dist;
+          ++block_idx;
+        }
+      }
+
+      rate = rate2;
+      dist = dist2;

-      model_rd_for_sb_y(cpi, bsize, x, xd, &rate, &dist, &var_y, &sse_y);
      rate += cpi->mbmode_cost[this_mode];
      rate += intra_cost_penalty;
      this_rd = RDCOST(x->rdmult, x->rddiv, rate, dist);

+      if (cpi->sf.reuse_inter_pred_sby)
+        pd->dst = orig_dst;
+
      if (this_rd + intra_mode_cost < best_rd) {
        best_rd = this_rd;
        *returnrate = rate;
        *returndistortion = dist;
        mbmi->mode = this_mode;
+        mbmi->tx_size = tmp_tx_size;
        mbmi->ref_frame[0] = INTRA_FRAME;
        mbmi->uv_mode = this_mode;
        mbmi->mv[0].as_int = INVALID_MV;
@@ -494,9 +772,9 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
      }
    }
  }
+
 #if CONFIG_DENOISING
-  vp9_denoiser_denoise(&cpi->denoiser, x, cpi->common.mi_grid_visible, mi_row,
-                       mi_col, bsize);
+  vp9_denoiser_denoise(&cpi->denoiser, x, mi_row, mi_col, bsize);
 #endif

  return INT64_MAX;
--- a/vp9/encoder/vp9_pickmode.h
+++ b/vp9/encoder/vp9_pickmode.h
@@ -17,6 +17,12 @@
 extern "C" {
 #endif

+typedef struct {
+  uint8_t *data;
+  int stride;
+  int in_use;
+} PRED_BUFFER;
+
 int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                            const struct TileInfo *const tile,
                            int mi_row, int mi_col,
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -42,9 +42,9 @@ void vp9_quantize_dc(const int16_t *coeff_ptr, int skip_block,
 }

 void vp9_quantize_dc_32x32(const int16_t *coeff_ptr, int skip_block,
-                     const int16_t *round_ptr, const int16_t quant,
-                     int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
-                     const int16_t dequant_ptr, uint16_t *eob_ptr) {
+                           const int16_t *round_ptr, const int16_t quant,
+                           int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+                           const int16_t dequant_ptr, uint16_t *eob_ptr) {
  int eob = -1;

  if (!skip_block) {
@@ -63,6 +63,47 @@ void vp9_quantize_dc_32x32(const int16_t *coeff_ptr, int skip_block,
  *eob_ptr = eob + 1;
 }

+void vp9_quantize_fp_c(const int16_t *coeff_ptr, intptr_t count,
+                       int skip_block,
+                       const int16_t *zbin_ptr, const int16_t *round_ptr,
+                       const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
+                       int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+                       const int16_t *dequant_ptr,
+                       int zbin_oq_value, uint16_t *eob_ptr,
+                       const int16_t *scan, const int16_t *iscan) {
+  int i, eob = -1;
+  // TODO(jingning) Decide the need of these arguments after the
+  // quantization process is completed.
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+  (void)zbin_oq_value;
+  (void)iscan;
+
+  vpx_memset(qcoeff_ptr, 0, count * sizeof(int16_t));
+  vpx_memset(dqcoeff_ptr, 0, count * sizeof(int16_t));
+
+  if (!skip_block) {
+    // Quantization pass: All coefficients with index >= zero_flag are
+    // skippable. Note: zero_flag can be zero.
+    for (i = 0; i < count; i++) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);
+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+
+      int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
+      tmp = (tmp * quant_ptr[rc != 0]) >> 16;
+
+      qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+
+      if (tmp)
+        eob = i;
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+
 void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t count,
                      int skip_block,
                      const int16_t *zbin_ptr, const int16_t *round_ptr,
@@ -207,11 +248,16 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
    const int qrounding_factor = q == 0 ? 64 : 48;

    for (i = 0; i < 2; ++i) {
+      int qrounding_factor_fp = i == 0 ? 48 : 42;
+      if (q == 0)
+        qrounding_factor_fp = 64;
+
      // y
      quant = i == 0 ? vp9_dc_quant(q, cm->y_dc_delta_q)
                     : vp9_ac_quant(q, 0);
      invert_quant(&quants->y_quant[q][i], &quants->y_quant_shift[q][i], quant);
      quants->y_quant_fp[q][i] = (1 << 16) / quant;
+      quants->y_round_fp[q][i] = (qrounding_factor_fp * quant) >> 7;
      quants->y_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7);
      quants->y_round[q][i] = (qrounding_factor * quant) >> 7;
      cm->y_dequant[q][i] = quant;
@@ -222,6 +268,7 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
      invert_quant(&quants->uv_quant[q][i],
                   &quants->uv_quant_shift[q][i], quant);
      quants->uv_quant_fp[q][i] = (1 << 16) / quant;
+      quants->uv_round_fp[q][i] = (qrounding_factor_fp * quant) >> 7;
      quants->uv_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7);
      quants->uv_round[q][i] = (qrounding_factor * quant) >> 7;
      cm->uv_dequant[q][i] = quant;
@@ -240,6 +287,7 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
    for (i = 2; i < 8; i++) {
      quants->y_quant[q][i] = quants->y_quant[q][1];
      quants->y_quant_fp[q][i] = quants->y_quant_fp[q][1];
+      quants->y_round_fp[q][i] = quants->y_round_fp[q][1];
      quants->y_quant_shift[q][i] = quants->y_quant_shift[q][1];
      quants->y_zbin[q][i] = quants->y_zbin[q][1];
      quants->y_round[q][i] = quants->y_round[q][1];
@@ -247,6 +295,7 @@ void vp9_init_quantizer(VP9_COMP *cpi) {

      quants->uv_quant[q][i] = quants->uv_quant[q][1];
      quants->uv_quant_fp[q][i] = quants->uv_quant_fp[q][1];
+      quants->uv_round_fp[q][i] = quants->uv_round_fp[q][1];
      quants->uv_quant_shift[q][i] = quants->uv_quant_shift[q][1];
      quants->uv_zbin[q][i] = quants->uv_zbin[q][1];
      quants->uv_round[q][i] = quants->uv_round[q][1];
@@ -276,6 +325,7 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) {
  // Y
  x->plane[0].quant = quants->y_quant[qindex];
  x->plane[0].quant_fp = quants->y_quant_fp[qindex];
+  x->plane[0].round_fp = quants->y_round_fp[qindex];
  x->plane[0].quant_shift = quants->y_quant_shift[qindex];
  x->plane[0].zbin = quants->y_zbin[qindex];
  x->plane[0].round = quants->y_round[qindex];
@@ -286,6 +336,7 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) {
  for (i = 1; i < 3; i++) {
    x->plane[i].quant = quants->uv_quant[qindex];
    x->plane[i].quant_fp = quants->uv_quant_fp[qindex];
+    x->plane[i].round_fp = quants->uv_round_fp[qindex];
    x->plane[i].quant_shift = quants->uv_quant_shift[qindex];
    x->plane[i].zbin = quants->uv_zbin[qindex];
    x->plane[i].round = quants->uv_round[qindex];
--- a/vp9/encoder/vp9_quantize.h
+++ b/vp9/encoder/vp9_quantize.h
@@ -28,6 +28,8 @@ typedef struct {
  // if we want to deprecate the current use of y_quant.
  DECLARE_ALIGNED(16, int16_t, y_quant_fp[QINDEX_RANGE][8]);
  DECLARE_ALIGNED(16, int16_t, uv_quant_fp[QINDEX_RANGE][8]);
+  DECLARE_ALIGNED(16, int16_t, y_round_fp[QINDEX_RANGE][8]);
+  DECLARE_ALIGNED(16, int16_t, uv_round_fp[QINDEX_RANGE][8]);

  DECLARE_ALIGNED(16, int16_t, uv_quant[QINDEX_RANGE][8]);
  DECLARE_ALIGNED(16, int16_t, uv_quant_shift[QINDEX_RANGE][8]);
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -186,6 +186,8 @@ static void update_buffer_level(VP9_COMP *cpi, int encoded_frame_size) {
 }

 void vp9_rc_init(const VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) {
+  int i;
+
  if (pass == 0 && oxcf->rc_mode == VPX_CBR) {
    rc->avg_frame_qindex[KEY_FRAME] = oxcf->worst_allowed_q;
    rc->avg_frame_qindex[INTER_FRAME] = oxcf->worst_allowed_q;
@@ -227,9 +229,9 @@ void vp9_rc_init(const VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) {
  rc->tot_q = 0.0;
  rc->avg_q = vp9_convert_qindex_to_q(oxcf->worst_allowed_q);

-  rc->rate_correction_factor = 1.0;
-  rc->key_frame_rate_correction_factor = 1.0;
-  rc->gf_rate_correction_factor = 1.0;
+  for (i = 0; i < RATE_FACTOR_LEVELS; ++i) {
+    rc->rate_correction_factors[i] = 1.0;
+  }
 }

 int vp9_rc_drop_frame(VP9_COMP *cpi) {
@@ -271,28 +273,40 @@ int vp9_rc_drop_frame(VP9_COMP *cpi) {
 }

 static double get_rate_correction_factor(const VP9_COMP *cpi) {
+  const RATE_CONTROL *const rc = &cpi->rc;
+
  if (cpi->common.frame_type == KEY_FRAME) {
-    return cpi->rc.key_frame_rate_correction_factor;
+    return rc->rate_correction_factors[KF_STD];
+  } else if (cpi->pass == 2) {
+    RATE_FACTOR_LEVEL rf_lvl =
+      cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index];
+    return rc->rate_correction_factors[rf_lvl];
  } else {
    if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) &&
-        !cpi->rc.is_src_frame_alt_ref &&
+        !rc->is_src_frame_alt_ref &&
        !(cpi->use_svc && cpi->oxcf.rc_mode == VPX_CBR))
-      return cpi->rc.gf_rate_correction_factor;
+      return rc->rate_correction_factors[GF_ARF_STD];
    else
-      return cpi->rc.rate_correction_factor;
+      return rc->rate_correction_factors[INTER_NORMAL];
  }
 }

 static void set_rate_correction_factor(VP9_COMP *cpi, double factor) {
+  RATE_CONTROL *const rc = &cpi->rc;
+
  if (cpi->common.frame_type == KEY_FRAME) {
-    cpi->rc.key_frame_rate_correction_factor = factor;
+    rc->rate_correction_factors[KF_STD] = factor;
+  } else if (cpi->pass == 2) {
+    RATE_FACTOR_LEVEL rf_lvl =
+      cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index];
+    rc->rate_correction_factors[rf_lvl] = factor;
  } else {
    if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) &&
-        !cpi->rc.is_src_frame_alt_ref &&
+        !rc->is_src_frame_alt_ref &&
        !(cpi->use_svc && cpi->oxcf.rc_mode == VPX_CBR))
-      cpi->rc.gf_rate_correction_factor = factor;
+      rc->rate_correction_factors[GF_ARF_STD] = factor;
    else
-      cpi->rc.rate_correction_factor = factor;
+      rc->rate_correction_factors[INTER_NORMAL] = factor;
  }
 }

@@ -628,8 +642,8 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi,

  if (frame_is_intra_only(cm)) {
    active_best_quality = rc->best_quality;
-#if !CONFIG_MULTIPLE_ARF
-    // Handle the special case for key frames forced when we have75 reached
+
+    // Handle the special case for key frames forced when we have reached
    // the maximum key frame interval. Here force the Q to a range
    // based on the ambient Q to reduce the risk of popping.
    if (rc->this_key_frame_forced) {
@@ -660,13 +674,6 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi,
      active_best_quality += vp9_compute_qdelta(rc, q_val,
                                                q_val * q_adj_factor);
    }
-#else
-    double current_q;
-    // Force the KF quantizer to be 30% of the active_worst_quality.
-    current_q = vp9_convert_qindex_to_q(active_worst_quality);
-    active_best_quality = active_worst_quality
-        + vp9_compute_qdelta(rc, current_q, current_q * 0.3);
-#endif
  } else if (!rc->is_src_frame_alt_ref &&
             (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
    // Use the lower of active_worst_quality and recent
@@ -768,23 +775,7 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi,
        q = *top_index;
    }
  }
-#if CONFIG_MULTIPLE_ARF
-  // Force the quantizer determined by the coding order pattern.
-  if (cpi->multi_arf_enabled && (cm->frame_type != KEY_FRAME) &&
-      cpi->oxcf.rc_mode != VPX_Q) {
-    double new_q;
-    double current_q = vp9_convert_qindex_to_q(active_worst_quality);
-    int level = cpi->this_frame_weight;
-    assert(level >= 0);
-    new_q = current_q * (1.0 - (0.2 * (cpi->max_arf_level - level)));
-    q = active_worst_quality +
-        vp9_compute_qdelta(rc, current_q, new_q);

-    *bottom_index = q;
-    *top_index    = q;
-    printf("frame:%d q:%d\n", cm->current_video_frame, q);
-  }
-#endif
  assert(*top_index <= rc->worst_quality &&
         *top_index >= rc->best_quality);
  assert(*bottom_index <= rc->worst_quality &&
@@ -805,7 +796,6 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi,
  int q;

  if (frame_is_intra_only(cm) || vp9_is_upper_layer_key_frame(cpi)) {
-#if !CONFIG_MULTIPLE_ARF
    // Handle the special case for key frames forced when we have75 reached
    // the maximum key frame interval. Here force the Q to a range
    // based on the ambient Q to reduce the risk of popping.
@@ -840,13 +830,6 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi,
      active_best_quality += vp9_compute_qdelta(rc, q_val,
                                                q_val * q_adj_factor);
    }
-#else
-    double current_q;
-    // Force the KF quantizer to be 30% of the active_worst_quality.
-    current_q = vp9_convert_qindex_to_q(active_worst_quality);
-    active_best_quality = active_worst_quality
-        + vp9_compute_qdelta(rc, current_q, current_q * 0.3);
-#endif
  } else if (!rc->is_src_frame_alt_ref &&
             (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
    // Use the lower of active_worst_quality and recent
@@ -909,21 +892,20 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi,
  *bottom_index = active_best_quality;

 #if LIMIT_QRANGE_FOR_ALTREF_AND_KEY
+  vp9_clear_system_state();
  {
-    int qdelta = 0;
-    vp9_clear_system_state();
-
-    // Limit Q range for the adaptive loop.
-    if ((cm->frame_type == KEY_FRAME || vp9_is_upper_layer_key_frame(cpi)) &&
-        !rc->this_key_frame_forced) {
-      qdelta = vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type,
-                                          active_worst_quality, 2.0);
-    } else if (!rc->is_src_frame_alt_ref &&
-               (oxcf->rc_mode != VPX_CBR) &&
-               (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
-      qdelta = vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type,
-                                          active_worst_quality, 1.75);
-    }
+    const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+    const double rate_factor_deltas[RATE_FACTOR_LEVELS] = {
+      1.00,  // INTER_NORMAL
+      1.00,  // INTER_HIGH
+      1.50,  // GF_ARF_LOW
+      1.75,  // GF_ARF_STD
+      2.00,  // KF_STD
+    };
+    const double rate_factor =
+      rate_factor_deltas[gf_group->rf_level[gf_group->index]];
+    int qdelta = vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type,
+                                            active_worst_quality, rate_factor);
    *top_index = active_worst_quality + qdelta;
    *top_index = (*top_index > *bottom_index) ? *top_index : *bottom_index;
  }
@@ -945,23 +927,7 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi,
        q = *top_index;
    }
  }
-#if CONFIG_MULTIPLE_ARF
-  // Force the quantizer determined by the coding order pattern.
-  if (cpi->multi_arf_enabled && (cm->frame_type != KEY_FRAME) &&
-      cpi->oxcf.rc_mode != VPX_Q) {
-    double new_q;
-    double current_q = vp9_convert_qindex_to_q(active_worst_quality);
-    int level = cpi->this_frame_weight;
-    assert(level >= 0);
-    new_q = current_q * (1.0 - (0.2 * (cpi->max_arf_level - level)));
-    q = active_worst_quality +
-        vp9_compute_qdelta(rc, current_q, new_q);

-    *bottom_index = q;
-    *top_index    = q;
-    printf("frame:%d q:%d\n", cm->current_video_frame, q);
-  }
-#endif
  assert(*top_index <= rc->worst_quality &&
         *top_index >= rc->best_quality);
  assert(*bottom_index <= rc->worst_quality &&
@@ -1026,11 +992,8 @@ static void update_alt_ref_frame_stats(VP9_COMP *cpi) {
  RATE_CONTROL *const rc = &cpi->rc;
  rc->frames_since_golden = 0;

-#if CONFIG_MULTIPLE_ARF
-  if (!cpi->multi_arf_enabled)
-#endif
-    // Clear the alternate reference update pending flag.
-    rc->source_alt_ref_pending = 0;
+  // Mark the alt ref as done (setting to 0 means no further alt refs pending).
+  rc->source_alt_ref_pending = 0;

  // Set the alternate reference frame active flag
  rc->source_alt_ref_active = 1;
@@ -1044,8 +1007,13 @@ static void update_golden_frame_stats(VP9_COMP *cpi) {
    // this frame refreshes means next frames don't unless specified by user
    rc->frames_since_golden = 0;

-    if (!rc->source_alt_ref_pending)
+    if (cpi->pass == 2) {
+      if (!rc->source_alt_ref_pending &&
+          cpi->twopass.gf_group.rf_level[0] == GF_ARF_STD)
      rc->source_alt_ref_active = 0;
+    } else if (!rc->source_alt_ref_pending) {
+      rc->source_alt_ref_active = 0;
+    }

    // Decrement count down till next gf
    if (rc->frames_till_gf_update_due > 0)
@@ -1388,6 +1356,8 @@ void vp9_rc_set_gf_max_interval(const VP9EncoderConfig *const oxcf,

  // Extended interval for genuinely static scenes
  rc->static_scene_max_gf_interval = oxcf->key_freq >> 1;
+  if (rc->static_scene_max_gf_interval > (MAX_LAG_BUFFERS * 2))
+    rc->static_scene_max_gf_interval = MAX_LAG_BUFFERS * 2;

  if (is_altref_enabled(oxcf)) {
    if (rc->static_scene_max_gf_interval > oxcf->lag_in_frames - 1)
--- a/vp9/encoder/vp9_ratectrl.h
+++ b/vp9/encoder/vp9_ratectrl.h
@@ -23,6 +23,15 @@ extern "C" {
 // Bits Per MB at different Q (Multiplied by 512)
 #define BPER_MB_NORMBITS    9

+typedef enum {
+  INTER_NORMAL = 0,
+  INTER_HIGH = 1,
+  GF_ARF_LOW = 2,
+  GF_ARF_STD = 3,
+  KF_STD = 4,
+  RATE_FACTOR_LEVELS = 5
+} RATE_FACTOR_LEVEL;
+
 typedef struct {
  // Rate targetting variables
  int base_frame_target;           // A baseline frame target before adjustment
@@ -37,9 +46,7 @@ typedef struct {
  int last_boost;
  int kf_boost;

-  double rate_correction_factor;
-  double key_frame_rate_correction_factor;
-  double gf_rate_correction_factor;
+  double rate_correction_factors[RATE_FACTOR_LEVELS];

  int frames_since_golden;
  int frames_till_gf_update_due;
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -595,7 +595,7 @@ static INLINE int cost_coeffs(MACROBLOCK *x,
  int c, cost;
  // Check for consistency of tx_size with mode info
  assert(type == PLANE_TYPE_Y ? mbmi->tx_size == tx_size
-                              : get_uv_tx_size(mbmi) == tx_size);
+                              : get_uv_tx_size(mbmi, pd) == tx_size);

  if (eob == 0) {
    // single eob token
@@ -1277,9 +1277,6 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
    MODE_INFO *above_mi = xd->mi[-xd->mi_stride];
    MODE_INFO *left_mi = xd->left_available ? xd->mi[-1] : NULL;

-    if (!(cpi->sf.intra_y_mode_mask[max_txsize_lookup[bsize]] & (1 << mode)))
-      continue;
-
    if (cpi->common.frame_type == KEY_FRAME) {
      const PREDICTION_MODE A = vp9_above_block_mode(mic, above_mi, 0);
      const PREDICTION_MODE L = vp9_left_block_mode(mic, left_mi, 0);
@@ -1330,7 +1327,7 @@ static void super_block_uvrd(const VP9_COMP *cpi, MACROBLOCK *x,
                             int64_t ref_best_rd) {
  MACROBLOCKD *const xd = &x->e_mbd;
  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-  const TX_SIZE uv_tx_size = get_uv_tx_size(mbmi);
+  const TX_SIZE uv_tx_size = get_uv_tx_size(mbmi, &xd->plane[1]);
  int plane;
  int pnrate = 0, pnskip = 1;
  int64_t pndist = 0, pnsse = 0;
@@ -1450,16 +1447,8 @@ static void choose_intra_uv_mode(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,

 static int cost_mv_ref(const VP9_COMP *cpi, PREDICTION_MODE mode,
                       int mode_context) {
-  const MACROBLOCK *const x = &cpi->mb;
-  const int segment_id = x->e_mbd.mi[0]->mbmi.segment_id;
-
-  // Don't account for mode here if segment skip is enabled.
-  if (!vp9_segfeature_active(&cpi->common.seg, segment_id, SEG_LVL_SKIP)) {
-    assert(is_inter_mode(mode));
-    return cpi->inter_mode_cost[mode_context][INTER_OFFSET(mode)];
-  } else {
-    return 0;
-  }
+  assert(is_inter_mode(mode));
+  return cpi->inter_mode_cost[mode_context][INTER_OFFSET(mode)];
 }

 static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
@@ -2077,9 +2066,9 @@ static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x,
  return bsi->segment_rd;
 }

-static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x,
-                    uint8_t *ref_y_buffer, int ref_y_stride,
-                    int ref_frame, BLOCK_SIZE block_size ) {
+void vp9_mv_pred(VP9_COMP *cpi, MACROBLOCK *x,
+                 uint8_t *ref_y_buffer, int ref_y_stride,
+                 int ref_frame, BLOCK_SIZE block_size) {
  MACROBLOCKD *xd = &x->e_mbd;
  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
  int_mv this_mv;
@@ -2218,12 +2207,12 @@ static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
             sizeof(*best_filter_diff) * SWITCHABLE_FILTER_CONTEXTS);
 }

-static void setup_pred_block(const MACROBLOCKD *xd,
-                             struct buf_2d dst[MAX_MB_PLANE],
-                             const YV12_BUFFER_CONFIG *src,
-                             int mi_row, int mi_col,
-                             const struct scale_factors *scale,
-                             const struct scale_factors *scale_uv) {
+void vp9_setup_pred_block(const MACROBLOCKD *xd,
+                          struct buf_2d dst[MAX_MB_PLANE],
+                          const YV12_BUFFER_CONFIG *src,
+                          int mi_row, int mi_col,
+                          const struct scale_factors *scale,
+                          const struct scale_factors *scale_uv) {
  int i;

  dst[0].buf = src->y_buffer;
@@ -2261,7 +2250,7 @@ void vp9_setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,

  // TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this
  // use the UV scaling factors.
-  setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf);
+  vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf);

  // Gets an initial list of candidate vectors from neighbours and orders them
  vp9_find_mv_refs(cm, xd, tile, mi, ref_frame, candidates, mi_row, mi_col);
@@ -2275,16 +2264,17 @@ void vp9_setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
  // in full and choose the best as the centre point for subsequent searches.
  // The current implementation doesn't support scaling.
  if (!vp9_is_scaled(sf) && block_size >= BLOCK_8X8)
-    mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride,
-            ref_frame, block_size);
+    vp9_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride,
+                ref_frame, block_size);
 }

 const YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const VP9_COMP *cpi,
                                                   int ref_frame) {
  const VP9_COMMON *const cm = &cpi->common;
+  const RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
  const int ref_idx = cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)];
  const int scaled_idx = cpi->scaled_ref_idx[ref_frame - 1];
-  return (scaled_idx != ref_idx) ? &cm->frame_bufs[scaled_idx].buf : NULL;
+  return (scaled_idx != ref_idx) ? &frame_bufs[scaled_idx].buf : NULL;
 }

 int vp9_get_switchable_rate(const VP9_COMP *cpi) {
@@ -2802,13 +2792,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
    *rate2 += vp9_get_switchable_rate(cpi);

  if (!is_comp_pred) {
-    if (!x->in_active_map ||
-        vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
-      if (psse)
-        *psse = 0;
-      *distortion = 0;
-      x->skip = 1;
-    } else if (cpi->allow_encode_breakout && x->encode_breakout) {
+    if (cpi->allow_encode_breakout && x->encode_breakout) {
      const BLOCK_SIZE y_size = get_plane_block_size(bsize, &xd->plane[0]);
      const BLOCK_SIZE uv_size = get_plane_block_size(bsize, &xd->plane[1]);
      unsigned int var, sse;
@@ -2928,6 +2912,7 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
                               PICK_MODE_CONTEXT *ctx, int64_t best_rd) {
  VP9_COMMON *const cm = &cpi->common;
  MACROBLOCKD *const xd = &x->e_mbd;
+  struct macroblockd_plane *const pd = xd->plane;
  int rate_y = 0, rate_uv = 0, rate_y_tokenonly = 0, rate_uv_tokenonly = 0;
  int y_skip = 0, uv_skip = 0;
  int64_t dist_y = 0, dist_uv = 0, tx_cache[TX_MODES] = { 0 };
@@ -2943,7 +2928,9 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
      *returnrate = INT_MAX;
      return;
    }
-    max_uv_tx_size = get_uv_tx_size_impl(xd->mi[0]->mbmi.tx_size, bsize);
+    max_uv_tx_size = get_uv_tx_size_impl(xd->mi[0]->mbmi.tx_size, bsize,
+                                         pd[1].subsampling_x,
+                                         pd[1].subsampling_y);
    rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv, &rate_uv_tokenonly,
                            &dist_uv, &uv_skip, bsize, max_uv_tx_size);
  } else {
@@ -2953,7 +2940,9 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
      *returnrate = INT_MAX;
      return;
    }
-    max_uv_tx_size = get_uv_tx_size_impl(xd->mi[0]->mbmi.tx_size, bsize);
+    max_uv_tx_size = get_uv_tx_size_impl(xd->mi[0]->mbmi.tx_size, bsize,
+                                         pd[1].subsampling_x,
+                                         pd[1].subsampling_y);
    rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv, &rate_uv_tokenonly,
                            &dist_uv, &uv_skip, BLOCK_8X8, max_uv_tx_size);
  }
@@ -3015,6 +3004,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
  MACROBLOCKD *const xd = &x->e_mbd;
  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
  const struct segmentation *const seg = &cm->seg;
+  struct macroblockd_plane *const pd = xd->plane;
  PREDICTION_MODE this_mode;
  MV_REFERENCE_FRAME ref_frame, second_ref_frame;
  unsigned char segment_id = mbmi->segment_id;
@@ -3117,13 +3107,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
    }
  }

-  // If the segment skip feature is enabled....
-  // then do nothing if the current mode is not allowed..
-  if (vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)) {
-    mode_skip_mask = ~(1 << THR_ZEROMV);
-    inter_mode_mask = (1 << ZEROMV);
-  }
-
  // Disable this drop out case if the ref frame
  // segment level feature is enabled for this segment. This is to
  // prevent the possibility that we end up unable to pick any mode.
@@ -3162,21 +3145,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
    mode_skip_mask |= all_intra_modes;
  }

-  if (!x->in_active_map) {
-    int mode_index;
-    assert(cpi->ref_frame_flags & VP9_LAST_FLAG);
-    if (frame_mv[NEARESTMV][LAST_FRAME].as_int == 0)
-      mode_index = THR_NEARESTMV;
-    else if (frame_mv[NEARMV][LAST_FRAME].as_int == 0)
-      mode_index = THR_NEARMV;
-    else
-      mode_index = THR_ZEROMV;
-    mode_skip_mask = ~(1 << mode_index);
-    mode_skip_start = MAX_MODES;
-    inter_mode_mask = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV) |
-                      (1 << NEWMV);
-  }
-
  for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {
    int mode_excluded = 0;
    int64_t this_rd = INT64_MAX;
@@ -3266,17 +3234,14 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
        }
      }
    } else {
-      if (x->in_active_map &&
-          !vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
-        const MV_REFERENCE_FRAME ref_frames[2] = {ref_frame, second_ref_frame};
-        if (!check_best_zero_mv(cpi, mbmi->mode_context, frame_mv,
-                                inter_mode_mask, this_mode, ref_frames))
-          continue;
-      }
+      const MV_REFERENCE_FRAME ref_frames[2] = {ref_frame, second_ref_frame};
+      if (!check_best_zero_mv(cpi, mbmi->mode_context, frame_mv,
+                              inter_mode_mask, this_mode, ref_frames))
+        continue;
    }

    mbmi->mode = this_mode;
-    mbmi->uv_mode = x->in_active_map ? DC_PRED : this_mode;
+    mbmi->uv_mode = DC_PRED;
    mbmi->ref_frame[0] = ref_frame;
    mbmi->ref_frame[1] = second_ref_frame;
    // Evaluate all sub-pel filters irrespective of whether we can use
@@ -3304,7 +3269,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
      if (rate_y == INT_MAX)
        continue;

-      uv_tx = get_uv_tx_size_impl(mbmi->tx_size, bsize);
+      uv_tx = get_uv_tx_size_impl(mbmi->tx_size, bsize, pd[1].subsampling_x,
+                                  pd[1].subsampling_y);
      if (rate_uv_intra[uv_tx] == INT_MAX) {
        choose_intra_uv_mode(cpi, ctx, bsize, uv_tx,
                             &rate_uv_intra[uv_tx], &rate_uv_tokenonly[uv_tx],
@@ -3348,31 +3314,20 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
    }

    if (!disable_skip) {
-      // Test for the condition where skip block will be activated
-      // because there are no non zero coefficients and make any
-      // necessary adjustment for rate. Ignore if skip is coded at
-      // segment level as the cost wont have been added in.
-      // Is Mb level skip allowed (i.e. not coded at segment level).
-      const int mb_skip_allowed = !vp9_segfeature_active(seg, segment_id,
-                                                         SEG_LVL_SKIP);
-
      if (skippable) {
+        vp9_prob skip_prob = vp9_get_skip_prob(cm, xd);
+
        // Back out the coefficient coding costs
        rate2 -= (rate_y + rate_uv);
        // for best yrd calculation
        rate_uv = 0;

-        if (mb_skip_allowed) {
-          int prob_skip_cost;
-
-          // Cost the skip mb case
-          vp9_prob skip_prob = vp9_get_skip_prob(cm, xd);
-          if (skip_prob) {
-            prob_skip_cost = vp9_cost_bit(skip_prob, 1);
-            rate2 += prob_skip_cost;
-          }
+        // Cost the skip mb case
+        if (skip_prob) {
+          int prob_skip_cost = vp9_cost_bit(skip_prob, 1);
+          rate2 += prob_skip_cost;
        }
-      } else if (mb_skip_allowed && ref_frame != INTRA_FRAME && !xd->lossless) {
+      } else if (ref_frame != INTRA_FRAME && !xd->lossless) {
        if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) <
            RDCOST(x->rdmult, x->rddiv, 0, total_sse)) {
          // Add in the cost of the no skip flag.
@@ -3387,7 +3342,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
          rate_uv = 0;
          this_skip2 = 1;
        }
-      } else if (mb_skip_allowed) {
+      } else {
        // Add in the cost of the no skip flag.
        rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
      }
@@ -3549,7 +3504,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
    if (vp9_mode_order[best_mode_index].ref_frame[0] == INTRA_FRAME) {
      TX_SIZE uv_tx_size;
      *mbmi = best_mbmode;
-      uv_tx_size = get_uv_tx_size(mbmi);
+      uv_tx_size = get_uv_tx_size(mbmi, &xd->plane[1]);
      rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv_intra[uv_tx_size],
                              &rate_uv_tokenonly[uv_tx_size],
                              &dist_uv[uv_tx_size],
@@ -3596,16 +3551,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
    vp9_zero(best_tx_diff);
  }

-  if (!x->in_active_map) {
-    assert(mbmi->ref_frame[0] == LAST_FRAME);
-    assert(mbmi->ref_frame[1] == NONE);
-    assert(mbmi->mode == NEARESTMV ||
-           mbmi->mode == NEARMV ||
-           mbmi->mode == ZEROMV);
-    assert(frame_mv[mbmi->mode][LAST_FRAME].as_int == 0);
-    assert(mbmi->mode == mbmi->uv_mode);
-  }
-
  set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
  store_coding_context(x, ctx, best_mode_index,
                       best_pred_diff, best_tx_diff, best_filter_diff);
@@ -3613,6 +3558,111 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
  return best_rd;
 }

+int64_t vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi, MACROBLOCK *x,
+                                           int *returnrate,
+                                           int64_t *returndistortion,
+                                           BLOCK_SIZE bsize,
+                                           PICK_MODE_CONTEXT *ctx,
+                                           int64_t best_rd_so_far) {
+  VP9_COMMON *const cm = &cpi->common;
+  RD_OPT *const rd_opt = &cpi->rd;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  const struct segmentation *const seg = &cm->seg;
+  unsigned char segment_id = mbmi->segment_id;
+  const int comp_pred = 0;
+  int i;
+  int64_t best_tx_diff[TX_MODES];
+  int64_t best_pred_diff[REFERENCE_MODES];
+  int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
+  unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
+  vp9_prob comp_mode_p;
+  INTERP_FILTER best_filter = SWITCHABLE;
+  int64_t this_rd = INT64_MAX;
+  int rate2 = 0;
+  const int64_t distortion2 = 0;
+
+  x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
+
+  estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp,
+                           &comp_mode_p);
+
+  for (i = 0; i < MAX_REF_FRAMES; ++i)
+    x->pred_sse[i] = INT_MAX;
+  for (i = LAST_FRAME; i < MAX_REF_FRAMES; ++i)
+    x->pred_mv_sad[i] = INT_MAX;
+
+  *returnrate = INT_MAX;
+
+  assert(vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP));
+
+  mbmi->mode = ZEROMV;
+  mbmi->uv_mode = DC_PRED;
+  mbmi->ref_frame[0] = LAST_FRAME;
+  mbmi->ref_frame[1] = NONE;
+  mbmi->mv[0].as_int = 0;
+  x->skip = 1;
+
+  // Search for best switchable filter by checking the variance of
+  // pred error irrespective of whether the filter will be used
+  rd_opt->mask_filter = 0;
+  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
+    rd_opt->filter_cache[i] = INT64_MAX;
+
+  if (cm->interp_filter != BILINEAR) {
+    best_filter = EIGHTTAP;
+    if (cm->interp_filter == SWITCHABLE &&
+        x->source_variance >= cpi->sf.disable_filter_search_var_thresh) {
+      int rs;
+      int best_rs = INT_MAX;
+      for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
+        mbmi->interp_filter = i;
+        rs = vp9_get_switchable_rate(cpi);
+        if (rs < best_rs) {
+          best_rs = rs;
+          best_filter = mbmi->interp_filter;
+        }
+      }
+    }
+  }
+  // Set the appropriate filter
+  if (cm->interp_filter == SWITCHABLE) {
+    mbmi->interp_filter = best_filter;
+    rate2 += vp9_get_switchable_rate(cpi);
+  } else {
+    mbmi->interp_filter = cm->interp_filter;
+  }
+
+  if (cm->reference_mode == REFERENCE_MODE_SELECT)
+    rate2 += vp9_cost_bit(comp_mode_p, comp_pred);
+
+  // Estimate the reference frame signaling cost and add it
+  // to the rolling cost variable.
+  rate2 += ref_costs_single[LAST_FRAME];
+  this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+
+  *returnrate = rate2;
+  *returndistortion = distortion2;
+
+  if (this_rd >= best_rd_so_far)
+    return INT64_MAX;
+
+  assert((cm->interp_filter == SWITCHABLE) ||
+         (cm->interp_filter == mbmi->interp_filter));
+
+  update_rd_thresh_fact(cpi, bsize, THR_ZEROMV);
+
+  vp9_zero(best_pred_diff);
+  vp9_zero(best_filter_diff);
+  vp9_zero(best_tx_diff);
+
+  if (!x->select_tx_size)
+    swap_block_ptr(x, ctx, 1, 0, 0, MAX_MB_PLANE);
+  store_coding_context(x, ctx, THR_ZEROMV,
+                       best_pred_diff, best_tx_diff, best_filter_diff);
+
+  return this_rd;
+}

 int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
                                      const TileInfo *const tile,
@@ -4224,6 +4274,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
 void vp9_set_rd_speed_thresholds(VP9_COMP *cpi) {
  int i;
  RD_OPT *const rd = &cpi->rd;
+  SPEED_FEATURES *const sf = &cpi->sf;

  // Set baseline threshold values
  for (i = 0; i < MAX_MODES; ++i)
@@ -4301,6 +4352,10 @@ void vp9_set_rd_speed_thresholds(VP9_COMP *cpi) {
    rd->thresh_mult[THR_COMP_NEARGA   ] = INT_MAX;
    rd->thresh_mult[THR_COMP_NEWGA    ] = INT_MAX;
  }
+
+  // Adjust threshold only in real time mode, which only use last reference
+  // frame.
+  rd->thresh_mult[THR_NEWMV] += sf->elevate_newmv_thresh;
 }

 void vp9_set_rd_speed_thresholds_sub8x8(VP9_COMP *cpi) {
--- a/vp9/encoder/vp9_rdopt.h
+++ b/vp9/encoder/vp9_rdopt.h
@@ -13,7 +13,10 @@

 #include <limits.h>

-#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/common/vp9_blockd.h"
+
+#include "vp9/encoder/vp9_block.h"
+#include "vp9/encoder/vp9_context_tree.h"

 #ifdef __cplusplus
 extern "C" {
@@ -30,21 +33,104 @@ extern "C" {

 #define INVALID_MV 0x80008000

+#define MAX_MODES 30
+#define MAX_REFS  6
+
+// This enumerator type needs to be kept aligned with the mode order in
+// const MODE_DEFINITION vp9_mode_order[MAX_MODES] used in the rd code.
+typedef enum {
+  THR_NEARESTMV,
+  THR_NEARESTA,
+  THR_NEARESTG,
+
+  THR_DC,
+
+  THR_NEWMV,
+  THR_NEWA,
+  THR_NEWG,
+
+  THR_NEARMV,
+  THR_NEARA,
+  THR_COMP_NEARESTLA,
+  THR_COMP_NEARESTGA,
+
+  THR_TM,
+
+  THR_COMP_NEARLA,
+  THR_COMP_NEWLA,
+  THR_NEARG,
+  THR_COMP_NEARGA,
+  THR_COMP_NEWGA,
+
+  THR_ZEROMV,
+  THR_ZEROG,
+  THR_ZEROA,
+  THR_COMP_ZEROLA,
+  THR_COMP_ZEROGA,
+
+  THR_H_PRED,
+  THR_V_PRED,
+  THR_D135_PRED,
+  THR_D207_PRED,
+  THR_D153_PRED,
+  THR_D63_PRED,
+  THR_D117_PRED,
+  THR_D45_PRED,
+} THR_MODES;
+
+typedef enum {
+  THR_LAST,
+  THR_GOLD,
+  THR_ALTR,
+  THR_COMP_LA,
+  THR_COMP_GA,
+  THR_INTRA,
+} THR_MODES_SUB8X8;
+
+typedef struct RD_OPT {
+  // Thresh_mult is used to set a threshold for the rd score. A higher value
+  // means that we will accept the best mode so far more often. This number
+  // is used in combination with the current block size, and thresh_freq_fact
+  // to pick a threshold.
+  int thresh_mult[MAX_MODES];
+  int thresh_mult_sub8x8[MAX_REFS];
+
+  int threshes[MAX_SEGMENTS][BLOCK_SIZES][MAX_MODES];
+  int thresh_freq_fact[BLOCK_SIZES][MAX_MODES];
+
+  int64_t comp_pred_diff[REFERENCE_MODES];
+  int64_t prediction_type_threshes[MAX_REF_FRAMES][REFERENCE_MODES];
+  int64_t tx_select_diff[TX_MODES];
+  // FIXME(rbultje) can this overflow?
+  int tx_select_threshes[MAX_REF_FRAMES][TX_MODES];
+
+  int64_t filter_diff[SWITCHABLE_FILTER_CONTEXTS];
+  int64_t filter_threshes[MAX_REF_FRAMES][SWITCHABLE_FILTER_CONTEXTS];
+  int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS];
+  int64_t mask_filter;
+
+  int RDMULT;
+  int RDDIV;
+} RD_OPT;
+
+
 struct TileInfo;
+struct VP9_COMP;
+struct macroblock;

-int vp9_compute_rd_mult(const VP9_COMP *cpi, int qindex);
+int vp9_compute_rd_mult(const struct VP9_COMP *cpi, int qindex);

-void vp9_initialize_rd_consts(VP9_COMP *cpi);
+void vp9_initialize_rd_consts(struct VP9_COMP *cpi);

-void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex);
+void vp9_initialize_me_consts(struct VP9_COMP *cpi, int qindex);

 void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n,
                                  unsigned int qstep, int *rate,
                                  int64_t *dist);

-int vp9_get_switchable_rate(const VP9_COMP *cpi);
+int vp9_get_switchable_rate(const struct VP9_COMP *cpi);

-void vp9_setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
+void vp9_setup_buffer_inter(struct VP9_COMP *cpi, struct macroblock *x,
                            const TileInfo *const tile,
                            MV_REFERENCE_FRAME ref_frame,
                            BLOCK_SIZE block_size,
@@ -53,14 +139,14 @@ void vp9_setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
                            int_mv frame_near_mv[MAX_REF_FRAMES],
                            struct buf_2d yv12_mb[4][MAX_MB_PLANE]);

-const YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const VP9_COMP *cpi,
+const YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const struct VP9_COMP *cpi,
                                                   int ref_frame);

-void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
+void vp9_rd_pick_intra_mode_sb(struct VP9_COMP *cpi, struct macroblock *x,
                               int *r, int64_t *d, BLOCK_SIZE bsize,
                               PICK_MODE_CONTEXT *ctx, int64_t best_rd);

-int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
+int64_t vp9_rd_pick_inter_mode_sb(struct VP9_COMP *cpi, struct macroblock *x,
                                  const struct TileInfo *const tile,
                                  int mi_row, int mi_col,
                                  int *returnrate,
@@ -69,7 +155,16 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
                                  PICK_MODE_CONTEXT *ctx,
                                  int64_t best_rd_so_far);

-int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
+int64_t vp9_rd_pick_inter_mode_sb_seg_skip(struct VP9_COMP *cpi,
+                                           struct macroblock *x,
+                                           int *returnrate,
+                                           int64_t *returndistortion,
+                                           BLOCK_SIZE bsize,
+                                           PICK_MODE_CONTEXT *ctx,
+                                           int64_t best_rd_so_far);
+
+int64_t vp9_rd_pick_inter_mode_sub8x8(struct VP9_COMP *cpi,
+                                      struct macroblock *x,
                                      const struct TileInfo *const tile,
                                      int mi_row, int mi_col,
                                      int *returnrate,
@@ -85,15 +180,25 @@ void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size,
                              ENTROPY_CONTEXT t_above[16],
                              ENTROPY_CONTEXT t_left[16]);

-void vp9_set_rd_speed_thresholds(VP9_COMP *cpi);
+void vp9_set_rd_speed_thresholds(struct VP9_COMP *cpi);

-void vp9_set_rd_speed_thresholds_sub8x8(VP9_COMP *cpi);
+void vp9_set_rd_speed_thresholds_sub8x8(struct VP9_COMP *cpi);

 static INLINE int rd_less_than_thresh(int64_t best_rd, int thresh,
                                      int thresh_fact) {
    return best_rd < ((int64_t)thresh * thresh_fact >> 5) || thresh == INT_MAX;
 }

+void vp9_mv_pred(struct VP9_COMP *cpi, MACROBLOCK *x,
+                 uint8_t *ref_y_buffer, int ref_y_stride,
+                 int ref_frame, BLOCK_SIZE block_size);
+
+void vp9_setup_pred_block(const MACROBLOCKD *xd,
+                          struct buf_2d dst[MAX_MB_PLANE],
+                          const YV12_BUFFER_CONFIG *src,
+                          int mi_row, int mi_col,
+                          const struct scale_factors *scale,
+                          const struct scale_factors *scale_uv);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
--- a/vp9/encoder/vp9_segmentation.c
+++ b/vp9/encoder/vp9_segmentation.c
@@ -27,6 +27,8 @@ void vp9_enable_segmentation(struct segmentation *seg) {

 void vp9_disable_segmentation(struct segmentation *seg) {
  seg->enabled = 0;
+  seg->update_map = 0;
+  seg->update_data = 0;
 }

 void vp9_set_segment_data(struct segmentation *seg,
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -84,16 +84,17 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,

  if (speed >= 2) {
    if (MIN(cm->width, cm->height) >= 720) {
-      sf->lf_motion_threshold = LOW_MOITION_THRESHOLD;
+      sf->lf_motion_threshold = LOW_MOTION_THRESHOLD;
      sf->last_partitioning_redo_frequency = 3;
      sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT
                                              : DISABLE_ALL_INTER_SPLIT;
    } else {
      sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY;
      sf->last_partitioning_redo_frequency = 2;
-      sf->lf_motion_threshold = NO_MOITION_THRESHOLD;
+      sf->lf_motion_threshold = NO_MOTION_THRESHOLD;
    }
-    sf->adaptive_pred_interp_filter = 2;
+
+    sf->adaptive_pred_interp_filter = 0;
    sf->reference_masking = 1;
    sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
                                 FLAG_SKIP_INTRA_BESTINTER |
@@ -114,7 +115,7 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,
    else
      sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT;

-    sf->lf_motion_threshold = LOW_MOITION_THRESHOLD;
+    sf->lf_motion_threshold = LOW_MOTION_THRESHOLD;
    sf->last_partitioning_redo_frequency = 3;
    sf->recode_loop = ALLOW_RECODE_KFMAXBW;
    sf->adaptive_rd_thresh = 3;
@@ -198,7 +199,7 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
    sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
    sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
    sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_LOW_MOTION;
-    sf->lf_motion_threshold = LOW_MOITION_THRESHOLD;
+    sf->lf_motion_threshold = LOW_MOTION_THRESHOLD;
    sf->adjust_partitioning_from_last_frame = 1;
    sf->last_partitioning_redo_frequency = 3;
    sf->use_lp32x32fdct = 1;
@@ -249,6 +250,8 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
  }

  if (speed >= 5) {
+    sf->auto_min_max_partition_size = (cm->frame_type == KEY_FRAME) ?
+        RELAXED_NEIGHBORING_MIN_MAX : STRICT_NEIGHBORING_MIN_MAX;
    sf->max_partition_size = BLOCK_32X32;
    sf->min_partition_size = BLOCK_8X8;
    sf->partition_check =
@@ -267,12 +270,25 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
    // Adaptively switch between SOURCE_VAR_BASED_PARTITION and FIXED_PARTITION.
    sf->partition_search_type = SOURCE_VAR_BASED_PARTITION;
    sf->search_type_check_frequency = 50;
-    sf->source_var_thresh = 360;

-    sf->tx_size_search_method = USE_TX_8X8;
+    sf->tx_size_search_method = (cm->frame_type == KEY_FRAME) ?
+        USE_LARGESTALL : USE_TX_8X8;
+    sf->max_intra_bsize = BLOCK_8X8;
+
+    // This feature is only enabled when partition search is disabled.
+    sf->reuse_inter_pred_sby = 1;
+
+    // Increase mode checking threshold for NEWMV.
+    sf->elevate_newmv_thresh = 2000;
  }
-
  if (speed >= 7) {
+    sf->use_quant_fp = cm->frame_type == KEY_FRAME ? 0 : 1;
+    sf->lpf_pick = LPF_PICK_MINIMAL_LPF;
+    sf->encode_breakout_thresh = (MIN(cm->width, cm->height) >= 720) ?
+        800 : 300;
+    sf->elevate_newmv_thresh = 2500;
+  }
+  if (speed >= 8) {
    int i;
    for (i = 0; i < BLOCK_SIZES; ++i)
      sf->inter_mode_mask[i] = INTER_NEAREST;
@@ -303,6 +319,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
  sf->use_lp32x32fdct = 0;
  sf->adaptive_motion_search = 0;
  sf->adaptive_pred_interp_filter = 0;
+  sf->use_quant_fp = 0;
  sf->reference_masking = 0;
  sf->partition_search_type = SEARCH_PARTITION;
  sf->less_rectangular_check = 0;
@@ -335,11 +352,13 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
  for (i = 0; i < BLOCK_SIZES; ++i)
    sf->inter_mode_mask[i] = INTER_ALL;
  sf->max_intra_bsize = BLOCK_64X64;
+  sf->reuse_inter_pred_sby = 0;
  // This setting only takes effect when partition_search_type is set
  // to FIXED_PARTITION.
  sf->always_this_block_size = BLOCK_16X16;
  sf->search_type_check_frequency = 50;
-  sf->source_var_thresh = 100;
+  sf->encode_breakout_thresh = 0;
+  sf->elevate_newmv_thresh = 0;

  // Recode loop tolerence %.
  sf->recode_tolerance = 25;
@@ -383,4 +402,8 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
  if (!cpi->oxcf.frame_periodic_boost) {
    sf->max_delta_qindex = 0;
  }
+
+  if (cpi->encode_breakout && oxcf->mode == REALTIME &&
+      sf->encode_breakout_thresh > cpi->encode_breakout)
+    cpi->encode_breakout = sf->encode_breakout_thresh;
 }
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h
@@ -44,8 +44,8 @@ typedef enum {
 } SUBPEL_SEARCH_METHODS;

 typedef enum {
-  NO_MOITION_THRESHOLD = 0,
-  LOW_MOITION_THRESHOLD = 7
+  NO_MOTION_THRESHOLD = 0,
+  LOW_MOTION_THRESHOLD = 7
 } MOTION_THRESHOLD;

 typedef enum {
@@ -73,6 +73,8 @@ typedef enum {
  LPF_PICK_FROM_SUBIMAGE,
  // Estimate the level based on quantizer and frame type
  LPF_PICK_FROM_Q,
+  // Pick 0 to disable LPF if LPF was enabled last frame
+  LPF_PICK_MINIMAL_LPF
 } LPF_PICK_METHOD;

 typedef enum {
@@ -282,6 +284,9 @@ typedef struct SPEED_FEATURES {
  // was selected, and 2 means we use 8 tap if no 8x8 filter mode was selected.
  int adaptive_pred_interp_filter;

+  // Fast quantization process path
+  int use_quant_fp;
+
  // Search through variable block partition types in non-RD mode decision
  // encoding process for RTC.
  int partition_check;
@@ -351,8 +356,17 @@ typedef struct SPEED_FEATURES {
  // FIXED_PARTITION search type should be used.
  int search_type_check_frequency;

-  // The threshold used in SOURCE_VAR_BASED_PARTITION search type.
-  unsigned int source_var_thresh;
+  // When partition is pre-set, the inter prediction result from pick_inter_mode
+  // can be reused in final block encoding process. It is enabled only for real-
+  // time mode speed 6.
+  int reuse_inter_pred_sby;
+
+  // This variable sets the encode_breakout threshold. Currently, it is only
+  // enabled in real time mode.
+  int encode_breakout_thresh;
+
+  // In real time encoding, increase the threshold for NEWMV.
+  int elevate_newmv_thresh;
 } SPEED_FEATURES;

 struct VP9_COMP;
--- a/vp9/encoder/vp9_svc_layercontext.c
+++ b/vp9/encoder/vp9_svc_layercontext.c
@@ -12,6 +12,7 @@

 #include "vp9/encoder/vp9_encoder.h"
 #include "vp9/encoder/vp9_svc_layercontext.h"
+#include "vp9/encoder/vp9_extend.h"

 void vp9_init_layer_context(VP9_COMP *const cpi) {
  SVC *const svc = &cpi->svc;
@@ -31,6 +32,7 @@ void vp9_init_layer_context(VP9_COMP *const cpi) {
  for (layer = 0; layer < layer_end; ++layer) {
    LAYER_CONTEXT *const lc = &svc->layer_context[layer];
    RATE_CONTROL *const lrc = &lc->rc;
+    int i;
    lc->current_video_frame_in_layer = 0;
    lrc->avg_frame_qindex[INTER_FRAME] = oxcf->worst_allowed_q;
    lrc->ni_av_qi = oxcf->worst_allowed_q;
@@ -42,8 +44,10 @@ void vp9_init_layer_context(VP9_COMP *const cpi) {
    lrc->ni_frames = 0;
    lrc->decimation_count = 0;
    lrc->decimation_factor = 0;
-    lrc->rate_correction_factor = 1.0;
-    lrc->key_frame_rate_correction_factor = 1.0;
+
+    for (i = 0; i < RATE_FACTOR_LEVELS; ++i) {
+      lrc->rate_correction_factors[i] = 1.0;
+    }

    if (svc->number_temporal_layers > 1) {
      lc->target_bandwidth = oxcf->ts_target_bitrate[layer];
@@ -206,3 +210,101 @@ int vp9_is_upper_layer_key_frame(const VP9_COMP *const cpi) {
         cpi->svc.spatial_layer_id > 0 &&
         cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame;
 }
+
+int vp9_svc_lookahead_push(const VP9_COMP *const cpi, struct lookahead_ctx *ctx,
+                           YV12_BUFFER_CONFIG *src, int64_t ts_start,
+                           int64_t ts_end, unsigned int flags) {
+  struct lookahead_entry *buf;
+  int i, index;
+
+  if (vp9_lookahead_push(ctx, src, ts_start, ts_end, flags))
+    return 1;
+
+  index = ctx->write_idx - 1;
+  if (index < 0)
+    index += ctx->max_sz;
+
+  buf = ctx->buf + index;
+
+  if (buf == NULL)
+    return 1;
+
+  // Store svc parameters for each layer
+  for (i = 0; i < cpi->svc.number_spatial_layers; ++i)
+    buf->svc_params[i] = cpi->svc.layer_context[i].svc_params_received;
+
+  return 0;
+}
+
+static int copy_svc_params(VP9_COMP *const cpi, struct lookahead_entry *buf) {
+  int layer_id;
+  vpx_svc_parameters_t *layer_param;
+  vpx_enc_frame_flags_t flags;
+
+  // Find the next layer to be encoded
+  for (layer_id = 0; layer_id < cpi->svc.number_spatial_layers; ++layer_id) {
+    if (buf->svc_params[layer_id].spatial_layer >=0)
+      break;
+  }
+
+  if (layer_id == cpi->svc.number_spatial_layers)
+    return 1;
+
+  layer_param = &buf->svc_params[layer_id];
+  buf->flags = flags = layer_param->flags;
+  cpi->svc.spatial_layer_id = layer_param->spatial_layer;
+  cpi->svc.temporal_layer_id = layer_param->temporal_layer;
+  cpi->lst_fb_idx = layer_param->lst_fb_idx;
+  cpi->gld_fb_idx = layer_param->gld_fb_idx;
+  cpi->alt_fb_idx = layer_param->alt_fb_idx;
+
+  if (vp9_set_size_literal(cpi, layer_param->width, layer_param->height) != 0)
+    return VPX_CODEC_INVALID_PARAM;
+
+  cpi->oxcf.worst_allowed_q =
+      vp9_quantizer_to_qindex(layer_param->max_quantizer);
+  cpi->oxcf.best_allowed_q =
+      vp9_quantizer_to_qindex(layer_param->min_quantizer);
+
+  vp9_change_config(cpi, &cpi->oxcf);
+
+  vp9_set_high_precision_mv(cpi, 1);
+
+  // Retrieve the encoding flags for each layer and apply it to encoder.
+  // It includes reference frame flags and update frame flags.
+  vp9_apply_encoding_flags(cpi, flags);
+
+  return 0;
+}
+
+struct lookahead_entry *vp9_svc_lookahead_peek(VP9_COMP *const cpi,
+                                               struct lookahead_ctx *ctx,
+                                               int index, int copy_params) {
+  struct lookahead_entry *buf = vp9_lookahead_peek(ctx, index);
+
+  if (buf != NULL && copy_params != 0) {
+    if (copy_svc_params(cpi, buf) != 0)
+      return NULL;
+  }
+  return buf;
+}
+
+struct lookahead_entry *vp9_svc_lookahead_pop(VP9_COMP *const cpi,
+                                              struct lookahead_ctx *ctx,
+                                              int drain) {
+  struct lookahead_entry *buf = NULL;
+
+  if (ctx->sz && (drain || ctx->sz == ctx->max_sz - MAX_PRE_FRAMES)) {
+    buf = vp9_svc_lookahead_peek(cpi, ctx, 0, 1);
+    if (buf != NULL) {
+      // Only remove the buffer when pop the highest layer. Simply set the
+      // spatial_layer to -1 for lower layers.
+      buf->svc_params[cpi->svc.spatial_layer_id].spatial_layer = -1;
+      if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) {
+        vp9_lookahead_pop(ctx, drain);
+      }
+    }
+  }
+
+  return buf;
+}
--- a/vp9/encoder/vp9_svc_layercontext.h
+++ b/vp9/encoder/vp9_svc_layercontext.h
@@ -28,6 +28,7 @@ typedef struct {
  struct vpx_fixed_buf rc_twopass_stats_in;
  unsigned int current_video_frame_in_layer;
  int is_key_frame;
+  vpx_svc_parameters_t svc_params_received;
 } LAYER_CONTEXT;

 typedef struct {
@@ -74,6 +75,23 @@ void vp9_inc_frame_in_layer(SVC *svc);
 // Check if current layer is key frame in spatial upper layer
 int vp9_is_upper_layer_key_frame(const struct VP9_COMP *const cpi);

+// Copy the source image, flags and svc parameters into a new framebuffer
+// with the expected stride/border
+int vp9_svc_lookahead_push(const struct VP9_COMP *const cpi,
+                           struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,
+                           int64_t ts_start, int64_t ts_end,
+                           unsigned int flags);
+
+// Get the next source buffer to encode
+struct lookahead_entry *vp9_svc_lookahead_pop(struct VP9_COMP *const cpi,
+                                              struct lookahead_ctx *ctx,
+                                              int drain);
+
+// Get a future source buffer to encode
+struct lookahead_entry *vp9_svc_lookahead_peek(struct VP9_COMP *const cpi,
+                                               struct lookahead_ctx *ctx,
+                                               int index, int copy_params);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
--- a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
+++ b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
@@ -217,3 +217,185 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
 INIT_XMM ssse3
 QUANTIZE_FN b, 7
 QUANTIZE_FN b_32x32, 7
+
+%macro QUANTIZE_FP 2
+cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
+                                shift, qcoeff, dqcoeff, dequant, zbin_oq, \
+                                eob, scan, iscan
+  cmp                    dword skipm, 0
+  jne .blank
+
+  ; actual quantize loop - setup pointers, rounders, etc.
+  movifnidn                   coeffq, coeffmp
+  movifnidn                  ncoeffq, ncoeffmp
+  mov                             r2, dequantmp
+  movifnidn                    zbinq, zbinmp
+  movifnidn                   roundq, roundmp
+  movifnidn                   quantq, quantmp
+  mova                            m1, [roundq]             ; m1 = round
+  mova                            m2, [quantq]             ; m2 = quant
+%ifidn %1, b_32x32
+; TODO(jingning) to be continued with 32x32 quantization process
+  pcmpeqw                         m5, m5
+  psrlw                           m5, 15
+  paddw                           m0, m5
+  paddw                           m1, m5
+  psrlw                           m0, 1                    ; m0 = (m0 + 1) / 2
+  psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2
+%endif
+  mova                            m3, [r2q]                ; m3 = dequant
+  mov                             r3, qcoeffmp
+  mov                             r4, dqcoeffmp
+  mov                             r5, iscanmp
+%ifidn %1, b_32x32
+  psllw                           m4, 1
+%endif
+  pxor                            m5, m5                   ; m5 = dedicated zero
+  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob
+  lea                         coeffq, [  coeffq+ncoeffq*2]
+  lea                         iscanq, [  iscanq+ncoeffq*2]
+  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
+  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
+  neg                        ncoeffq
+
+  ; get DC and first 15 AC coeffs
+  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
+  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
+  pabsw                           m6, m9                   ; m6 = abs(m9)
+  pabsw                          m11, m10                  ; m11 = abs(m10)
+  pcmpeqw                         m7, m7
+  pcmpeqw                        m12, m12
+
+  paddsw                          m6, m1                   ; m6 += round
+  punpckhqdq                      m1, m1
+  paddsw                         m11, m1                   ; m11 += round
+  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16
+  punpckhqdq                      m2, m2
+  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
+  psignw                          m8, m9                   ; m8 = reinsert sign
+  psignw                         m13, m10                  ; m13 = reinsert sign
+  mova        [qcoeffq+ncoeffq*2+ 0], m8
+  mova        [qcoeffq+ncoeffq*2+16], m13
+%ifidn %1, b_32x32
+  pabsw                           m8, m8
+  pabsw                          m13, m13
+%endif
+  pmullw                          m8, m3                   ; dqc[i] = qc[i] * q
+  punpckhqdq                      m3, m3
+  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
+%ifidn %1, b_32x32
+  psrlw                           m8, 1
+  psrlw                          m13, 1
+  psignw                          m8, m9
+  psignw                         m13, m10
+%endif
+  mova       [dqcoeffq+ncoeffq*2+ 0], m8
+  mova       [dqcoeffq+ncoeffq*2+16], m13
+  pcmpeqw                         m8, m5                   ; m8 = c[i] == 0
+  pcmpeqw                        m13, m5                   ; m13 = c[i] == 0
+  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
+  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = scan[i]
+  psubw                           m6, m7                   ; m6 = scan[i] + 1
+  psubw                          m11, m12                  ; m11 = scan[i] + 1
+  pandn                           m8, m6                   ; m8 = max(eob)
+  pandn                          m13, m11                  ; m13 = max(eob)
+  pmaxsw                          m8, m13
+  add                        ncoeffq, mmsize
+  jz .accumulate_eob
+
+.ac_only_loop:
+  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
+  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
+  pabsw                           m6, m9                   ; m6 = abs(m9)
+  pabsw                          m11, m10                  ; m11 = abs(m10)
+  pcmpeqw                         m7, m7
+  pcmpeqw                        m12, m12
+%ifidn %1, b_32x32
+  pmovmskb                        r6, m7
+  pmovmskb                        r2, m12
+  or                              r6, r2
+  jz .skip_iter
+%endif
+  paddsw                          m6, m1                   ; m6 += round
+  paddsw                         m11, m1                   ; m11 += round
+  pmulhw                         m14, m6, m2               ; m14 = m6*q>>16
+  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
+  psignw                         m14, m9                   ; m14 = reinsert sign
+  psignw                         m13, m10                  ; m13 = reinsert sign
+  mova        [qcoeffq+ncoeffq*2+ 0], m14
+  mova        [qcoeffq+ncoeffq*2+16], m13
+%ifidn %1, b_32x32
+  pabsw                          m14, m14
+  pabsw                          m13, m13
+%endif
+  pmullw                         m14, m3                   ; dqc[i] = qc[i] * q
+  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
+%ifidn %1, b_32x32
+  psrlw                          m14, 1
+  psrlw                          m13, 1
+  psignw                         m14, m9
+  psignw                         m13, m10
+%endif
+  mova       [dqcoeffq+ncoeffq*2+ 0], m14
+  mova       [dqcoeffq+ncoeffq*2+16], m13
+  pcmpeqw                        m14, m5                   ; m14 = c[i] == 0
+  pcmpeqw                        m13, m5                   ; m13 = c[i] == 0
+  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
+  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = scan[i]
+  psubw                           m6, m7                   ; m6 = scan[i] + 1
+  psubw                          m11, m12                  ; m11 = scan[i] + 1
+  pandn                          m14, m6                   ; m14 = max(eob)
+  pandn                          m13, m11                  ; m13 = max(eob)
+  pmaxsw                          m8, m14
+  pmaxsw                          m8, m13
+  add                        ncoeffq, mmsize
+  jl .ac_only_loop
+
+%ifidn %1, b_32x32
+  jmp .accumulate_eob
+.skip_iter:
+  mova        [qcoeffq+ncoeffq*2+ 0], m5
+  mova        [qcoeffq+ncoeffq*2+16], m5
+  mova       [dqcoeffq+ncoeffq*2+ 0], m5
+  mova       [dqcoeffq+ncoeffq*2+16], m5
+  add                        ncoeffq, mmsize
+  jl .ac_only_loop
+%endif
+
+.accumulate_eob:
+  ; horizontally accumulate/max eobs and write into [eob] memory pointer
+  mov                             r2, eobmp
+  pshufd                          m7, m8, 0xe
+  pmaxsw                          m8, m7
+  pshuflw                         m7, m8, 0xe
+  pmaxsw                          m8, m7
+  pshuflw                         m7, m8, 0x1
+  pmaxsw                          m8, m7
+  pextrw                          r6, m8, 0
+  mov                             [r2], r6
+  RET
+
+  ; skip-block, i.e. just write all zeroes
+.blank:
+  mov                             r0, dqcoeffmp
+  movifnidn                  ncoeffq, ncoeffmp
+  mov                             r2, qcoeffmp
+  mov                             r3, eobmp
+  DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
+  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
+  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
+  neg                        ncoeffq
+  pxor                            m7, m7
+.blank_loop:
+  mova       [dqcoeffq+ncoeffq*2+ 0], m7
+  mova       [dqcoeffq+ncoeffq*2+16], m7
+  mova        [qcoeffq+ncoeffq*2+ 0], m7
+  mova        [qcoeffq+ncoeffq*2+16], m7
+  add                        ncoeffq, mmsize
+  jl .blank_loop
+  mov                    word [eobq], 0
+  RET
+%endmacro
+
+INIT_XMM ssse3
+QUANTIZE_FP fp, 7
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -88,8 +88,10 @@ struct vpx_codec_alg_priv {
  size_t                  pending_frame_magnitude;
  vpx_image_t             preview_img;
  vp8_postproc_cfg_t      preview_ppcfg;
-  vpx_codec_pkt_list_decl(64) pkt_list;
-  unsigned int                fixed_kf_cntr;
+  vpx_codec_pkt_list_decl(128) pkt_list;
+  unsigned int                 fixed_kf_cntr;
+  // BufferPool that holds all reference frames.
+  BufferPool              *buffer_pool;
 };

 static VP9_REFFRAME ref_frame_to_vp9_reframe(vpx_ref_frame_type_t frame) {
@@ -630,6 +632,10 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx,
    ctx->priv->alg_priv = priv;
    ctx->priv->init_flags = ctx->init_flags;
    ctx->priv->enc.total_encoders = 1;
+    ctx->priv->alg_priv->buffer_pool =
+        (BufferPool *)vpx_calloc(1, sizeof(BufferPool));
+    if (ctx->priv->alg_priv->buffer_pool == NULL)
+      return VPX_CODEC_MEM_ERROR;

    if (ctx->config.enc) {
      // Update the reference to the config structure to an internal copy.
@@ -667,7 +673,8 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx,
      set_encoder_config(&ctx->priv->alg_priv->oxcf,
                         &ctx->priv->alg_priv->cfg,
                         &ctx->priv->alg_priv->extra_cfg);
-      cpi = vp9_create_compressor(&ctx->priv->alg_priv->oxcf);
+      cpi = vp9_create_compressor(&ctx->priv->alg_priv->oxcf,
+                                  ctx->priv->alg_priv->buffer_pool);
      if (cpi == NULL)
        res = VPX_CODEC_MEM_ERROR;
      else
@@ -681,6 +688,7 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx,
 static vpx_codec_err_t encoder_destroy(vpx_codec_alg_priv_t *ctx) {
  free(ctx->cx_data);
  vp9_remove_compressor(ctx->cpi);
+  vpx_free(ctx->buffer_pool);
  free(ctx);
  return VPX_CODEC_OK;
 }
@@ -795,42 +803,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t  *ctx,
    return VPX_CODEC_INVALID_PARAM;
  }

-  if (flags & (VP8_EFLAG_NO_REF_LAST | VP8_EFLAG_NO_REF_GF |
-               VP8_EFLAG_NO_REF_ARF)) {
-    int ref = 7;
-
-    if (flags & VP8_EFLAG_NO_REF_LAST)
-      ref ^= VP9_LAST_FLAG;
-
-    if (flags & VP8_EFLAG_NO_REF_GF)
-      ref ^= VP9_GOLD_FLAG;
-
-    if (flags & VP8_EFLAG_NO_REF_ARF)
-      ref ^= VP9_ALT_FLAG;
-
-    vp9_use_as_reference(ctx->cpi, ref);
-  }
-
-  if (flags & (VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF |
-               VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_FORCE_GF |
-               VP8_EFLAG_FORCE_ARF)) {
-    int upd = 7;
-
-    if (flags & VP8_EFLAG_NO_UPD_LAST)
-      upd ^= VP9_LAST_FLAG;
-
-    if (flags & VP8_EFLAG_NO_UPD_GF)
-      upd ^= VP9_GOLD_FLAG;
-
-    if (flags & VP8_EFLAG_NO_UPD_ARF)
-      upd ^= VP9_ALT_FLAG;
-
-    vp9_update_reference(ctx->cpi, upd);
-  }
-
-  if (flags & VP8_EFLAG_NO_UPD_ENTROPY) {
-    vp9_update_entropy(ctx->cpi, 0);
-  }
+  vp9_apply_encoding_flags(ctx->cpi, flags);

  // Handle fixed keyframe intervals
  if (ctx->cfg.kf_mode == VPX_KF_AUTO &&
@@ -843,7 +816,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t  *ctx,

  // Initialize the encoder instance on the first frame.
  if (res == VPX_CODEC_OK && ctx->cpi != NULL) {
-    unsigned int lib_flags;
+    unsigned int lib_flags = 0;
    YV12_BUFFER_CONFIG sd;
    int64_t dst_time_stamp, dst_end_time_stamp;
    size_t size, cx_data_sz;
@@ -853,9 +826,6 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t  *ctx,
    if (ctx->base.init_flags & VPX_CODEC_USE_PSNR)
      ((VP9_COMP *)ctx->cpi)->b_calculate_psnr = 1;

-    // Convert API flags to internal codec lib flags
-    lib_flags = (flags & VPX_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0;
-
    /* vp9 use 10,000,000 ticks/second as time stamp */
    dst_time_stamp = (pts * 10000000 * ctx->cfg.g_timebase.num)
                     / ctx->cfg.g_timebase.den;
@@ -865,7 +835,9 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t  *ctx,
    if (img != NULL) {
      res = image2yuvconfig(img, &sd);

-      if (vp9_receive_raw_frame(ctx->cpi, lib_flags,
+      // Store the original flags in to the frame buffer. Will extract the
+      // key frame flag when we actually encode this frame.
+      if (vp9_receive_raw_frame(ctx->cpi, flags,
                                &sd, dst_time_stamp, dst_end_time_stamp)) {
        VP9_COMP *cpi = (VP9_COMP *)ctx->cpi;
        res = update_error_state(ctx, &cpi->common.error);
@@ -874,7 +846,6 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t  *ctx,

    cx_data = ctx->cx_data;
    cx_data_sz = ctx->cx_data_sz;
-    lib_flags = 0;

    /* Any pending invisible frames? */
    if (ctx->pending_cx_data) {
@@ -902,7 +873,12 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t  *ctx,
        VP9_COMP *const cpi = (VP9_COMP *)ctx->cpi;

        // Pack invisible frames with the next visible frame
-        if (cpi->common.show_frame == 0) {
+        if (cpi->common.show_frame == 0
+#ifdef CONFIG_SPATIAL_SVC
+            || (cpi->use_svc && cpi->svc.number_temporal_layers == 1 &&
+                cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1)
+#endif
+            ) {
          if (ctx->pending_cx_data == 0)
            ctx->pending_cx_data = cx_data;
          ctx->pending_cx_data_sz += size;
@@ -925,7 +901,12 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t  *ctx,
          / ctx->cfg.g_timebase.num / 10000000);
        pkt.data.frame.flags = lib_flags << 16;

-        if (lib_flags & FRAMEFLAGS_KEY)
+        if (lib_flags & FRAMEFLAGS_KEY
+#ifdef CONFIG_SPATIAL_SVC
+            || (cpi->use_svc && cpi->svc.number_temporal_layers == 1 &&
+                cpi->svc.layer_context[0].is_key_frame)
+#endif
+            )
          pkt.data.frame.flags |= VPX_FRAME_IS_KEY;

        if (cpi->common.show_frame == 0) {
@@ -1165,24 +1146,19 @@ static vpx_codec_err_t ctrl_set_svc_parameters(vpx_codec_alg_priv_t *ctx,
  VP9_COMP *const cpi = ctx->cpi;
  vpx_svc_parameters_t *const params = va_arg(args, vpx_svc_parameters_t *);

-  if (params == NULL)
+  if (params == NULL || params->spatial_layer < 0 ||
+      params->spatial_layer >= cpi->svc.number_spatial_layers)
    return VPX_CODEC_INVALID_PARAM;

-  cpi->svc.spatial_layer_id = params->spatial_layer;
-  cpi->svc.temporal_layer_id = params->temporal_layer;
+  if (params->spatial_layer == 0) {
+    int i;
+    for (i = 0; i < cpi->svc.number_spatial_layers; ++i) {
+      cpi->svc.layer_context[i].svc_params_received.spatial_layer = -1;
+    }
+  }

-  cpi->lst_fb_idx = params->lst_fb_idx;
-  cpi->gld_fb_idx = params->gld_fb_idx;
-  cpi->alt_fb_idx = params->alt_fb_idx;
-
-  if (vp9_set_size_literal(ctx->cpi, params->width, params->height) != 0)
-    return VPX_CODEC_INVALID_PARAM;
-
-  ctx->cfg.rc_max_quantizer = params->max_quantizer;
-  ctx->cfg.rc_min_quantizer = params->min_quantizer;
-
-  set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg);
-  vp9_change_config(ctx->cpi, &ctx->oxcf);
+  cpi->svc.layer_context[params->spatial_layer].svc_params_received =
+      *params;

  return VPX_CODEC_OK;
 }
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -32,7 +32,6 @@ struct vpx_codec_alg_priv {
  vpx_codec_priv_t        base;
  vpx_codec_dec_cfg_t     cfg;
  vp9_stream_info_t       si;
-  struct VP9Decoder *pbi;
  int                     postproc_cfg_set;
  vp8_postproc_cfg_t      postproc_cfg;
  vpx_decrypt_cb          decrypt_cb;
@@ -42,6 +41,14 @@ struct vpx_codec_alg_priv {
  int                     frame_parallel_decode;  // frame-based threading.
  int                     last_show_frame;  // Index of last output frame.

+  VP9Worker               *frame_workers;
+  int                     num_frame_workers;
+  int                     next_submit_thread_id;
+  int                     next_output_thread_id;
+
+  // BufferPool that holds all reference frames. Shared by all the FrameWorkers.
+  BufferPool              *buffer_pool;
+
  // External frame buffer info to save for VP9 common.
  void *ext_priv;  // Private data associated with the external frame buffers.
  vpx_get_frame_buffer_cb_fn_t get_ext_fb_cb;
@@ -85,11 +92,18 @@ static vpx_codec_err_t decoder_init(vpx_codec_ctx_t *ctx,
 }

 static vpx_codec_err_t decoder_destroy(vpx_codec_alg_priv_t *ctx) {
-  if (ctx->pbi) {
-    vp9_decoder_remove(ctx->pbi);
-    ctx->pbi = NULL;
+  if (ctx->frame_workers != NULL) {
+    int i;
+    for (i = 0; i < ctx->num_frame_workers; ++i) {
+      VP9Worker *const worker = &ctx->frame_workers[i];
+      FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
+      vp9_decoder_remove(worker_data->pbi);
+      vpx_free(worker_data);
+    }
  }

+  vpx_free(ctx->frame_workers);
+  vpx_free(ctx->buffer_pool);
  vpx_free(ctx);

  return VPX_CODEC_OK;
@@ -102,9 +116,6 @@ static vpx_codec_err_t decoder_peek_si_internal(const uint8_t *data,
                                                void *decrypt_state) {
  uint8_t clear_buffer[9];

-  if (data_sz <= 8)
-    return VPX_CODEC_UNSUP_BITSTREAM;
-
  if (data + data_sz <= data)
    return VPX_CODEC_INVALID_PARAM;

@@ -125,12 +136,16 @@ static vpx_codec_err_t decoder_peek_si_internal(const uint8_t *data,

    if (frame_marker != VP9_FRAME_MARKER)
      return VPX_CODEC_UNSUP_BITSTREAM;
+
    if (version > 1) return VPX_CODEC_UNSUP_BITSTREAM;

    if (vp9_rb_read_bit(&rb)) {  // show an existing frame
      return VPX_CODEC_OK;
    }

+    if (data_sz <= 8)
+      return VPX_CODEC_UNSUP_BITSTREAM;
+
    si->is_kf = !vp9_rb_read_bit(&rb);
    if (si->is_kf) {
      const int sRGB = 7;
@@ -187,32 +202,43 @@ static vpx_codec_err_t decoder_get_si(vpx_codec_alg_priv_t *ctx,
  return VPX_CODEC_OK;
 }

+static void set_error_detail(vpx_codec_alg_priv_t *ctx,
+                             const char *const error) {
+  ctx->base.err_detail = error;
+}
+
 static vpx_codec_err_t update_error_state(vpx_codec_alg_priv_t *ctx,
                           const struct vpx_internal_error_info *error) {
  if (error->error_code)
-    ctx->base.err_detail = error->has_detail ? error->detail : NULL;
+    set_error_detail(ctx, error->has_detail ? error->detail : NULL);

  return error->error_code;
 }

 static void init_buffer_callbacks(vpx_codec_alg_priv_t *ctx) {
-  VP9_COMMON *const cm = &ctx->pbi->common;
+  int i;

-  cm->new_fb_idx = -1;
+  for (i = 0; i < ctx->num_frame_workers; ++i) {
+    VP9Worker *const worker = &ctx->frame_workers[i];
+    FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
+    VP9_COMMON *const cm = &worker_data->pbi->common;
+    BufferPool *const pool = cm->buffer_pool;

-  if (ctx->get_ext_fb_cb != NULL && ctx->release_ext_fb_cb != NULL) {
-    cm->get_fb_cb = ctx->get_ext_fb_cb;
-    cm->release_fb_cb = ctx->release_ext_fb_cb;
-    cm->cb_priv = ctx->ext_priv;
-  } else {
-    cm->get_fb_cb = vp9_get_frame_buffer;
-    cm->release_fb_cb = vp9_release_frame_buffer;
+    cm->new_fb_idx = -1;
+    if (ctx->get_ext_fb_cb != NULL && ctx->release_ext_fb_cb != NULL) {
+      pool->get_fb_cb = ctx->get_ext_fb_cb;
+      pool->release_fb_cb = ctx->release_ext_fb_cb;
+      pool->cb_priv = ctx->ext_priv;
+    } else {
+      pool->get_fb_cb = vp9_get_frame_buffer;
+      pool->release_fb_cb = vp9_release_frame_buffer;

-    if (vp9_alloc_internal_frame_buffers(&cm->int_frame_buffers))
-      vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
-                         "Failed to initialize internal frame buffers");
+      if (vp9_alloc_internal_frame_buffers(&pool->int_frame_buffers))
+        vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                           "Failed to initialize internal frame buffers");

-    cm->cb_priv = &cm->int_frame_buffers;
+      pool->cb_priv = &pool->int_frame_buffers;
+    }
  }
 }

@@ -231,15 +257,64 @@ static void set_ppflags(const vpx_codec_alg_priv_t *ctx,
  flags->noise_level = ctx->postproc_cfg.noise_level;
 }

-static void init_decoder(vpx_codec_alg_priv_t *ctx) {
-  ctx->pbi = vp9_decoder_create();
-  if (ctx->pbi == NULL)
-    return;
+static int frame_worker_hook(void *arg1, void *arg2) {
+  FrameWorkerData *const worker_data = (FrameWorkerData *)arg1;
+  const uint8_t *data = worker_data->data;
+  (void)arg2;
+  worker_data->result = vp9_receive_compressed_data(worker_data->pbi,
+                                                    worker_data->data_size,
+                                                    &data);
+  worker_data->data_end = data;
+  return !worker_data->result;
+}
+
+static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) {
+  int i;
+  const VP9WorkerInterface *const winterface = vp9_get_worker_interface();

-  ctx->pbi->max_threads = ctx->cfg.threads;
-  ctx->pbi->inv_tile_order = ctx->invert_tile_order;
-  ctx->pbi->frame_parallel_decode = ctx->frame_parallel_decode;
  ctx->last_show_frame = -1;
+  ctx->next_submit_thread_id = 0;
+  ctx->next_output_thread_id = 0;
+  ctx->num_frame_workers =
+      (ctx->frame_parallel_decode == 1) ? ctx->cfg.threads: 1;
+  ctx->buffer_pool = (BufferPool *)vpx_calloc(1, sizeof(BufferPool));
+  if (ctx->buffer_pool == NULL)
+    return VPX_CODEC_MEM_ERROR;
+
+  ctx->frame_workers = (VP9Worker *)
+      vpx_malloc(ctx->num_frame_workers * sizeof(*ctx->frame_workers));
+  if (ctx->frame_workers == NULL) {
+    set_error_detail(ctx, "Failed to allocate frame_workers");
+    return VPX_CODEC_MEM_ERROR;
+  }
+
+  for (i = 0; i < ctx->num_frame_workers; ++i) {
+    VP9Worker *const worker = &ctx->frame_workers[i];
+    FrameWorkerData *worker_data = NULL;
+    winterface->init(worker);
+    worker->data1 = vpx_memalign(32, sizeof(FrameWorkerData));
+    if (worker->data1 == NULL) {
+      set_error_detail(ctx, "Failed to allocate worker_data");
+      return VPX_CODEC_MEM_ERROR;
+    }
+    worker_data = (FrameWorkerData *)worker->data1;
+    worker_data->pbi = vp9_decoder_create(ctx->buffer_pool);
+    if (worker_data->pbi == NULL) {
+      set_error_detail(ctx, "Failed to allocate worker_data");
+      return VPX_CODEC_MEM_ERROR;
+    }
+    worker_data->pbi->owner_frame_worker = worker;
+    worker_data->worker_id = i;
+    worker_data->frame_context_ready = 0;
+    // If decoding in serial mode, FrameWorker thread could create tile worker
+    // thread or loopfilter thread.
+    worker_data->pbi->max_threads =
+        (ctx->frame_parallel_decode == 0) ? ctx->cfg.threads : 0;
+
+    worker_data->pbi->inv_tile_order = ctx->invert_tile_order;
+    worker_data->pbi->frame_parallel_decode = ctx->frame_parallel_decode;
+    worker->hook = (VP9WorkerHook)frame_worker_hook;
+  }

  // If postprocessing was enabled by the application and a
  // configuration has not been provided, default it.
@@ -248,14 +323,15 @@ static void init_decoder(vpx_codec_alg_priv_t *ctx) {
    set_default_ppflags(&ctx->postproc_cfg);

  init_buffer_callbacks(ctx);
+
+  return VPX_CODEC_OK;
 }

 static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx,
                                  const uint8_t **data, unsigned int data_sz,
                                  void *user_priv, int64_t deadline) {
  vp9_ppflags_t flags = {0};
-  VP9_COMMON *cm = NULL;
-
+  const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
  (void)deadline;

  // Determine the stream parameters. Note that we rely on peek_si to
@@ -272,22 +348,37 @@ static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx,
      return VPX_CODEC_ERROR;
  }

-  // Initialize the decoder instance on the first frame
-  if (ctx->pbi == NULL) {
-    init_decoder(ctx);
-    if (ctx->pbi == NULL)
-      return VPX_CODEC_ERROR;
+  // Initialize the decoder workers on the first frame
+  if (ctx->frame_workers == NULL) {
+    const vpx_codec_err_t res = init_decoder(ctx);
+    if (res != VPX_CODEC_OK)
+      return res;
  }

-  // Set these even if already initialized.  The caller may have changed the
-  // decrypt config between frames.
-  ctx->pbi->decrypt_cb = ctx->decrypt_cb;
-  ctx->pbi->decrypt_state = ctx->decrypt_state;
+  if (!ctx->frame_parallel_decode) {
+    VP9Worker *const worker = ctx->frame_workers;
+    FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
+    worker_data->data = *data;
+    worker_data->data_size = data_sz;
+    worker_data->user_priv = user_priv;

-  cm = &ctx->pbi->common;
+    // Set these even if already initialized.  The caller may have changed the
+    // decrypt config between frames.
+    worker_data->pbi->decrypt_cb = ctx->decrypt_cb;
+    worker_data->pbi->decrypt_state = ctx->decrypt_state;

-  if (vp9_receive_compressed_data(ctx->pbi, data_sz, data))
-    return update_error_state(ctx, &cm->error);
+    worker->had_error = 0;
+    winterface->execute(worker);
+
+    // Update data pointer after decode.
+    *data = worker_data->data_end;
+
+    if (worker->had_error)
+      return update_error_state(ctx, &worker_data->pbi->common.error);
+  } else {
+    // TODO(hkuang): Implement frame parallel decode.
+    return VPX_CODEC_INCAPABLE;
+  }

  if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)
    set_ppflags(ctx, &flags);
@@ -306,10 +397,17 @@ static INLINE uint8_t read_marker(vpx_decrypt_cb decrypt_cb,
  return *data;
 }

-static void parse_superframe_index(const uint8_t *data, size_t data_sz,
-                                   uint32_t sizes[8], int *count,
-                                   vpx_decrypt_cb decrypt_cb,
-                                   void *decrypt_state) {
+static vpx_codec_err_t parse_superframe_index(const uint8_t *data,
+                                              size_t data_sz,
+                                              uint32_t sizes[8], int *count,
+                                              vpx_decrypt_cb decrypt_cb,
+                                              void *decrypt_state) {
+  // A chunk ending with a byte matching 0xc0 is an invalid chunk unless
+  // it is a super frame index. If the last byte of real video compression
+  // data is 0xc0 the encoder must add a 0 byte. If we have the marker but
+  // not the associated matching marker byte at the front of the index we have
+  // an invalid bitstream and need to return an error.
+
  uint8_t marker;

  assert(data_sz);
@@ -321,56 +419,45 @@ static void parse_superframe_index(const uint8_t *data, size_t data_sz,
    const uint32_t mag = ((marker >> 3) & 0x3) + 1;
    const size_t index_sz = 2 + mag * frames;

-    if (data_sz >= index_sz) {
-      uint8_t marker2 = read_marker(decrypt_cb, decrypt_state,
-                                    data + data_sz - index_sz);
+    // This chunk is marked as having a superframe index but doesn't have
+    // enough data for it, thus it's an invalid superframe index.
+    if (data_sz < index_sz)
+      return VPX_CODEC_CORRUPT_FRAME;

-      if (marker == marker2) {
-        // Found a valid superframe index.
-        uint32_t i, j;
-        const uint8_t *x = &data[data_sz - index_sz + 1];
+    {
+      const uint8_t marker2 = read_marker(decrypt_cb, decrypt_state,
+                                          data + data_sz - index_sz);

-        // Frames has a maximum of 8 and mag has a maximum of 4.
-        uint8_t clear_buffer[32];
-        assert(sizeof(clear_buffer) >= frames * mag);
-        if (decrypt_cb) {
-          decrypt_cb(decrypt_state, x, clear_buffer, frames * mag);
-          x = clear_buffer;
-        }
+      // This chunk is marked as having a superframe index but doesn't have
+      // the matching marker byte at the front of the index therefore it's an
+      // invalid chunk.
+      if (marker != marker2)
+        return VPX_CODEC_CORRUPT_FRAME;
+    }

-        for (i = 0; i < frames; ++i) {
-          uint32_t this_sz = 0;
+    {
+      // Found a valid superframe index.
+      uint32_t i, j;
+      const uint8_t *x = &data[data_sz - index_sz + 1];

-          for (j = 0; j < mag; ++j)
-            this_sz |= (*x++) << (j * 8);
-          sizes[i] = this_sz;
-        }
-
-        *count = frames;
+      // Frames has a maximum of 8 and mag has a maximum of 4.
+      uint8_t clear_buffer[32];
+      assert(sizeof(clear_buffer) >= frames * mag);
+      if (decrypt_cb) {
+        decrypt_cb(decrypt_state, x, clear_buffer, frames * mag);
+        x = clear_buffer;
      }
+
+      for (i = 0; i < frames; ++i) {
+        uint32_t this_sz = 0;
+
+        for (j = 0; j < mag; ++j)
+          this_sz |= (*x++) << (j * 8);
+        sizes[i] = this_sz;
+      }
+      *count = frames;
    }
  }
-}
-
-static vpx_codec_err_t decode_one_iter(vpx_codec_alg_priv_t *ctx,
-                                       const uint8_t **data_start_ptr,
-                                       const uint8_t *data_end,
-                                       uint32_t frame_size, void *user_priv,
-                                       long deadline) {
-  const vpx_codec_err_t res = decode_one(ctx, data_start_ptr, frame_size,
-                                         user_priv, deadline);
-  if (res != VPX_CODEC_OK)
-    return res;
-
-  // Account for suboptimal termination by the encoder.
-  while (*data_start_ptr < data_end) {
-    const uint8_t marker = read_marker(ctx->decrypt_cb, ctx->decrypt_state,
-                                       *data_start_ptr);
-    if (marker)
-      break;
-    (*data_start_ptr)++;
-  }
-
  return VPX_CODEC_OK;
 }

@@ -378,7 +465,7 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
                                      const uint8_t *data, unsigned int data_sz,
                                      void *user_priv, long deadline) {
  const uint8_t *data_start = data;
-  const uint8_t *const data_end = data + data_sz;
+  const uint8_t * const data_end = data + data_sz;
  vpx_codec_err_t res;
  uint32_t frame_sizes[8];
  int frame_count;
@@ -386,32 +473,86 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
  if (data == NULL || data_sz == 0)
    return VPX_CODEC_INVALID_PARAM;

-  parse_superframe_index(data, data_sz, frame_sizes, &frame_count,
-                         ctx->decrypt_cb, ctx->decrypt_state);
+  res = parse_superframe_index(data, data_sz, frame_sizes, &frame_count,
+                               ctx->decrypt_cb, ctx->decrypt_state);
+  if (res != VPX_CODEC_OK)
+    return res;

-  if (frame_count > 0) {
-    int i;
+  if (ctx->frame_parallel_decode) {
+    // Decode in frame parallel mode. When decoding in this mode, the frame
+    // passed to the decoder must be either a normal frame or a superframe with
+    // superframe index so the decoder could get each frame's start position
+    // in the superframe.
+    if (frame_count > 0) {
+      int i;

-    for (i = 0; i < frame_count; ++i) {
-      const uint32_t frame_size = frame_sizes[i];
-      if (data_start < data ||
-          frame_size > (uint32_t)(data_end - data_start)) {
-        ctx->base.err_detail = "Invalid frame size in index";
-        return VPX_CODEC_CORRUPT_FRAME;
+      for (i = 0; i < frame_count; ++i) {
+        const uint8_t *data_start_copy = data_start;
+        const uint32_t frame_size = frame_sizes[i];
+        vpx_codec_err_t res;
+        if (data_start < data
+            || frame_size > (uint32_t) (data_end - data_start)) {
+          set_error_detail(ctx, "Invalid frame size in index");
+          return VPX_CODEC_CORRUPT_FRAME;
+        }
+
+        res = decode_one(ctx, &data_start_copy, frame_size, user_priv,
+                         deadline);
+        if (res != VPX_CODEC_OK)
+          return res;
+
+        data_start += frame_size;
      }
-
-      res = decode_one_iter(ctx, &data_start, data_end, frame_size,
-                            user_priv, deadline);
+    } else {
+      res = decode_one(ctx, &data_start, data_sz, user_priv, deadline);
      if (res != VPX_CODEC_OK)
        return res;
+
+      // Extra data detected after the frame.
+      if (data_start < data_end - 1) {
+        set_error_detail(ctx, "Fail to decode frame in parallel mode");
+        return VPX_CODEC_INCAPABLE;
+      }
    }
  } else {
-    while (data_start < data_end) {
-      res = decode_one_iter(ctx, &data_start, data_end,
-                            (uint32_t)(data_end - data_start),
-                            user_priv, deadline);
-      if (res != VPX_CODEC_OK)
-        return res;
+    // Decode in serial mode.
+    if (frame_count > 0) {
+      int i;
+
+      for (i = 0; i < frame_count; ++i) {
+        const uint8_t *data_start_copy = data_start;
+        const uint32_t frame_size = frame_sizes[i];
+        vpx_codec_err_t res;
+        if (data_start < data
+            || frame_size > (uint32_t) (data_end - data_start)) {
+          set_error_detail(ctx, "Invalid frame size in index");
+          return VPX_CODEC_CORRUPT_FRAME;
+        }
+
+        res = decode_one(ctx, &data_start_copy, frame_size, user_priv,
+                         deadline);
+        if (res != VPX_CODEC_OK)
+          return res;
+
+        data_start += frame_size;
+      }
+    } else {
+      while (data_start < data_end) {
+        const uint32_t frame_size = (uint32_t) (data_end - data_start);
+        const vpx_codec_err_t res = decode_one(ctx, &data_start, frame_size,
+                                               user_priv, deadline);
+        if (res != VPX_CODEC_OK)
+          return res;
+
+        // Account for suboptimal termination by the encoder.
+        while (data_start < data_end) {
+          const uint8_t marker = read_marker(ctx->decrypt_cb,
+                                             ctx->decrypt_state, data_start);
+          if (marker)
+            break;
+          ++data_start;
+        }
+      }
    }
  }

@@ -424,25 +565,35 @@ static vpx_image_t *decoder_get_frame(vpx_codec_alg_priv_t *ctx,

  // iter acts as a flip flop, so an image is only returned on the first
  // call to get_frame.
-  if (*iter == NULL && ctx->pbi != NULL) {
+  if (*iter == NULL && ctx->frame_workers != NULL) {
    YV12_BUFFER_CONFIG sd;
    vp9_ppflags_t flags = {0, 0, 0};

-    if (vp9_get_raw_frame(ctx->pbi, &sd, &flags) == 0) {
-      VP9_COMMON *cm = &ctx->pbi->common;
-      yuvconfig2image(&ctx->img, &sd, NULL);
-      ctx->img.fb_priv = cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv;
+    VP9Worker *const worker = &ctx->frame_workers[ctx->next_output_thread_id];
+    FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
+    if (vp9_get_raw_frame(worker_data->pbi, &sd, &flags) == 0) {
+      VP9_COMMON *const cm = &worker_data->pbi->common;
+      BufferPool *const pool = cm->buffer_pool;
+      RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+      yuvconfig2image(&ctx->img, &sd, worker_data->user_priv);
+      ctx->img.fb_priv = frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv;
      img = &ctx->img;
      *iter = img;
      // Decrease reference count of last output frame in frame parallel mode.
      if (ctx->frame_parallel_decode && ctx->last_show_frame >= 0) {
-        --cm->frame_bufs[ctx->last_show_frame].ref_count;
-        if (cm->frame_bufs[ctx->last_show_frame].ref_count == 0) {
-          cm->release_fb_cb(cm->cb_priv,
-              &cm->frame_bufs[ctx->last_show_frame].raw_frame_buffer);
+#if CONFIG_MULTITHREAD
+        pthread_mutex_lock(&cm->buffer_pool->pool_mutex);
+#endif
+        --frame_bufs[ctx->last_show_frame].ref_count;
+        if (frame_bufs[ctx->last_show_frame].ref_count == 0) {
+          pool->release_fb_cb(pool->cb_priv,
+              &frame_bufs[ctx->last_show_frame].raw_frame_buffer);
        }
+#if CONFIG_MULTITHREAD
+        pthread_mutex_unlock(&cm->buffer_pool->pool_mutex);
+#endif
      }
-      ctx->last_show_frame = ctx->pbi->common.new_fb_idx;
+      ctx->last_show_frame = worker_data->pbi->common.new_fb_idx;
    }
  }

@@ -455,7 +606,7 @@ static vpx_codec_err_t decoder_set_fb_fn(
    vpx_release_frame_buffer_cb_fn_t cb_release, void *cb_priv) {
  if (cb_get == NULL || cb_release == NULL) {
    return VPX_CODEC_INVALID_PARAM;
-  } else if (ctx->pbi == NULL) {
+  } else if (ctx->frame_workers == NULL) {
    // If the decoder has already been initialized, do not accept changes to
    // the frame buffer functions.
    ctx->get_ext_fb_cb = cb_get;
@@ -471,12 +622,19 @@ static vpx_codec_err_t ctrl_set_reference(vpx_codec_alg_priv_t *ctx,
                                          va_list args) {
  vpx_ref_frame_t *const data = va_arg(args, vpx_ref_frame_t *);

+  // Only support this function in serial decode.
+  if (ctx->frame_parallel_decode) {
+    set_error_detail(ctx, "Not supported in frame parallel decode");
+    return VPX_CODEC_INCAPABLE;
+  }
+
  if (data) {
    vpx_ref_frame_t *const frame = (vpx_ref_frame_t *)data;
    YV12_BUFFER_CONFIG sd;
-
+    VP9Worker *const worker = ctx->frame_workers;
+    FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
    image2yuvconfig(&frame->img, &sd);
-    return vp9_set_reference_dec(&ctx->pbi->common,
+    return vp9_set_reference_dec(&worker_data->pbi->common,
                                 (VP9_REFFRAME)frame->frame_type, &sd);
  } else {
    return VPX_CODEC_INVALID_PARAM;
@@ -487,13 +645,19 @@ static vpx_codec_err_t ctrl_copy_reference(vpx_codec_alg_priv_t *ctx,
                                           va_list args) {
  vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);

+  // Only support this function in serial decode.
+  if (ctx->frame_parallel_decode) {
+    set_error_detail(ctx, "Not supported in frame parallel decode");
+    return VPX_CODEC_INCAPABLE;
+  }
+
  if (data) {
-    vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data;
+    vpx_ref_frame_t *frame = (vpx_ref_frame_t *) data;
    YV12_BUFFER_CONFIG sd;
-
+    VP9Worker *const worker = ctx->frame_workers;
+    FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
    image2yuvconfig(&frame->img, &sd);
-
-    return vp9_copy_reference_dec(ctx->pbi,
+    return vp9_copy_reference_dec(worker_data->pbi,
                                  (VP9_REFFRAME)frame->frame_type, &sd);
  } else {
    return VPX_CODEC_INVALID_PARAM;
@@ -504,10 +668,17 @@ static vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx,
                                          va_list args) {
  vp9_ref_frame_t *data = va_arg(args, vp9_ref_frame_t *);

+  // Only support this function in serial decode.
+  if (ctx->frame_parallel_decode) {
+    set_error_detail(ctx, "Not supported in frame parallel decode");
+    return VPX_CODEC_INCAPABLE;
+  }
+
  if (data) {
    YV12_BUFFER_CONFIG* fb;
-
-    vp9_get_reference_dec(ctx->pbi, data->idx, &fb);
+    VP9Worker *const worker = ctx->frame_workers;
+    FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
+    vp9_get_reference_dec(worker_data->pbi, data->idx, &fb);
    yuvconfig2image(&data->img, fb, NULL);
    return VPX_CODEC_OK;
  } else {
@@ -545,11 +716,20 @@ static vpx_codec_err_t ctrl_get_last_ref_updates(vpx_codec_alg_priv_t *ctx,
                                                 va_list args) {
  int *const update_info = va_arg(args, int *);

+  // Only support this function in serial decode.
+  if (ctx->frame_parallel_decode) {
+    set_error_detail(ctx, "Not supported in frame parallel decode");
+    return VPX_CODEC_INCAPABLE;
+  }
+
  if (update_info) {
-    if (ctx->pbi)
-      *update_info = ctx->pbi->refresh_frame_flags;
-    else
+    if (ctx->frame_workers) {
+      VP9Worker *const worker = ctx->frame_workers;
+      FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
+      *update_info = worker_data->pbi->refresh_frame_flags;
+    } else {
      return VPX_CODEC_ERROR;
+    }
    return VPX_CODEC_OK;
  } else {
    return VPX_CODEC_INVALID_PARAM;
@@ -561,11 +741,20 @@ static vpx_codec_err_t ctrl_get_frame_corrupted(vpx_codec_alg_priv_t *ctx,
                                                va_list args) {
  int *corrupted = va_arg(args, int *);

+  // Only support this function in serial decode.
+  if (ctx->frame_parallel_decode) {
+    set_error_detail(ctx, "Not supported in frame parallel decode");
+    return VPX_CODEC_INCAPABLE;
+  }
+
  if (corrupted) {
-    if (ctx->pbi)
-      *corrupted = ctx->pbi->common.frame_to_show->corrupted;
-    else
+    if (ctx->frame_workers) {
+      VP9Worker *const worker = ctx->frame_workers;
+      FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
+      *corrupted = worker_data->pbi->common.frame_to_show->corrupted;
+    } else {
      return VPX_CODEC_ERROR;
+    }
    return VPX_CODEC_OK;
  } else {
    return VPX_CODEC_INVALID_PARAM;
@@ -576,9 +765,17 @@ static vpx_codec_err_t ctrl_get_display_size(vpx_codec_alg_priv_t *ctx,
                                             va_list args) {
  int *const display_size = va_arg(args, int *);

+  // Only support this function in serial decode.
+  if (ctx->frame_parallel_decode) {
+    set_error_detail(ctx, "Not supported in frame parallel decode");
+    return VPX_CODEC_INCAPABLE;
+  }
+
  if (display_size) {
-    if (ctx->pbi) {
-      const VP9_COMMON *const cm = &ctx->pbi->common;
+    if (ctx->frame_workers) {
+      VP9Worker *const worker = ctx->frame_workers;
+      FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
+      const VP9_COMMON *const cm = &worker_data->pbi->common;
      display_size[0] = cm->display_width;
      display_size[1] = cm->display_height;
    } else {
--- a/vp9/vp9_iface_common.h
+++ b/vp9/vp9_iface_common.h
@@ -31,6 +31,7 @@ static void yuvconfig2image(vpx_image_t *img, const YV12_BUFFER_CONFIG  *yv12,
    img->fmt = VPX_IMG_FMT_I420;
    bps = 12;
  }
+  img->bit_depth = 8;
  img->w = yv12->y_stride;
  img->h = ALIGN_POWER_OF_TWO(yv12->y_height + 2 * VP9_ENC_BORDER_IN_PIXELS, 3);
  img->d_w = yv12->y_crop_width;
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -105,11 +105,9 @@ VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/vp9_sad_sse3.asm
 ifeq ($(CONFIG_USE_X86INC),yes)
 VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm
-VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_intrin_avx2.c
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_sse2.c
-VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_avx2.c
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance.asm
 endif

@@ -124,7 +122,9 @@ VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/vp9_ssim_opt_x86_64.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.c
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct32x32_sse2.c

-VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_dct_avx2.c
 VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_dct32x32_avx2.c
+VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_dct_avx2.c
+VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_intrin_avx2.c
+VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_avx2.c

 VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes))
--- a/vpx/src/svc_encodeframe.c
+++ b/vpx/src/svc_encodeframe.c
@@ -24,6 +24,7 @@
 #include "vpx/svc_context.h"
 #include "vpx/vp8cx.h"
 #include "vpx/vpx_encoder.h"
+#include "vpx_mem/vpx_mem.h"

 #ifdef __MINGW32__
 #define strtok_r strtok_s
@@ -47,17 +48,22 @@ _CRTIMP char *__cdecl strtok_s(char *str, const char *delim, char **context);
 static const char *DEFAULT_QUANTIZER_VALUES = "60,53,39,33,27";
 static const char *DEFAULT_SCALE_FACTORS = "4/16,5/16,7/16,11/16,16/16";

+// One encoded frame
+typedef struct FrameData {
+  void                     *buf;    // compressed data buffer
+  size_t                    size;  // length of compressed data
+  vpx_codec_frame_flags_t   flags;    /**< flags for this frame */
+  struct FrameData         *next;
+} FrameData;
+
 typedef struct SvcInternal {
  char options[OPTION_BUFFER_SIZE];        // set by vpx_svc_set_options
  char quantizers[OPTION_BUFFER_SIZE];     // set by vpx_svc_set_quantizers
-  char quantizers_keyframe[OPTION_BUFFER_SIZE];  // set by
-                                                 // vpx_svc_set_quantizers
  char scale_factors[OPTION_BUFFER_SIZE];  // set by vpx_svc_set_scale_factors

  // values extracted from option, quantizers
  int scaling_factor_num[VPX_SS_MAX_LAYERS];
  int scaling_factor_den[VPX_SS_MAX_LAYERS];
-  int quantizer_keyframe[VPX_SS_MAX_LAYERS];
  int quantizer[VPX_SS_MAX_LAYERS];

  // accumulated statistics
@@ -72,15 +78,15 @@ typedef struct SvcInternal {

  // state variables
  int encode_frame_count;
+  int frame_received;
  int frame_within_gop;
  vpx_enc_frame_flags_t enc_frame_flags;
  int layers;
  int layer;
  int is_keyframe;

-  size_t frame_size;
-  size_t buffer_size;
-  void *buffer;
+  FrameData *frame_list;
+  FrameData *frame_temp;

  char *rc_stats_buf;
  size_t rc_stats_buf_size;
@@ -90,128 +96,54 @@ typedef struct SvcInternal {
  vpx_codec_ctx_t *codec_ctx;
 } SvcInternal;

-// Superframe is used to generate an index of individual frames (i.e., layers)
-struct Superframe {
-  int count;
-  uint32_t sizes[SUPERFRAME_SLOTS];
-  uint32_t magnitude;
-  uint8_t buffer[SUPERFRAME_BUFFER_SIZE];
-  size_t index_size;
-};
-
-// One encoded frame layer
-struct LayerData {
-  void *buf;    // compressed data buffer
-  size_t size;  // length of compressed data
-  struct LayerData *next;
-};
-
-// create LayerData from encoder output
-static struct LayerData *ld_create(void *buf, size_t size) {
-  struct LayerData *const layer_data =
-      (struct LayerData *)malloc(sizeof(*layer_data));
-  if (layer_data == NULL) {
+// create FrameData from encoder output
+static struct FrameData *fd_create(void *buf, size_t size,
+                                   vpx_codec_frame_flags_t flags) {
+  struct FrameData *const frame_data =
+      (struct FrameData *)vpx_malloc(sizeof(*frame_data));
+  if (frame_data == NULL) {
    return NULL;
  }
-  layer_data->buf = malloc(size);
-  if (layer_data->buf == NULL) {
-    free(layer_data);
+  frame_data->buf = vpx_malloc(size);
+  if (frame_data->buf == NULL) {
+    vpx_free(frame_data);
    return NULL;
  }
-  memcpy(layer_data->buf, buf, size);
-  layer_data->size = size;
-  return layer_data;
+  vpx_memcpy(frame_data->buf, buf, size);
+  frame_data->size = size;
+  frame_data->flags = flags;
+  return frame_data;
 }

-// free LayerData
-static void ld_free(struct LayerData *layer_data) {
-  if (layer_data) {
-    if (layer_data->buf) {
-      free(layer_data->buf);
-      layer_data->buf = NULL;
-    }
-    free(layer_data);
+// free FrameData
+static void fd_free(struct FrameData *p) {
+  if (p) {
+    if (p->buf)
+      vpx_free(p->buf);
+    vpx_free(p);
  }
 }

-// add layer data to list
-static void ld_list_add(struct LayerData **list, struct LayerData *layer_data) {
-  struct LayerData **p = list;
+// add FrameData to list
+static void fd_list_add(struct FrameData **list, struct FrameData *layer_data) {
+  struct FrameData **p = list;

  while (*p != NULL) p = &(*p)->next;
  *p = layer_data;
  layer_data->next = NULL;
 }

-// get accumulated size of layer data
-static size_t ld_list_get_buffer_size(struct LayerData *list) {
-  struct LayerData *p;
-  size_t size = 0;
-
-  for (p = list; p != NULL; p = p->next) {
-    size += p->size;
-  }
-  return size;
-}
-
-// copy layer data to buffer
-static void ld_list_copy_to_buffer(struct LayerData *list, uint8_t *buffer) {
-  struct LayerData *p;
-
-  for (p = list; p != NULL; p = p->next) {
-    buffer[0] = 1;
-    memcpy(buffer, p->buf, p->size);
-    buffer += p->size;
-  }
-}
-
-// free layer data list
-static void ld_list_free(struct LayerData *list) {
-  struct LayerData *p = list;
+// free FrameData list
+static void fd_free_list(struct FrameData *list) {
+  struct FrameData *p = list;

  while (p) {
    list = list->next;
-    ld_free(p);
+    fd_free(p);
    p = list;
  }
 }

-static void sf_create_index(struct Superframe *sf) {
-  uint8_t marker = 0xc0;
-  int i;
-  uint32_t mag, mask;
-  uint8_t *bufp;
-
-  if (sf->count == 0 || sf->count >= 8) return;
-
-  // Add the number of frames to the marker byte
-  marker |= sf->count - 1;
-
-  // Choose the magnitude
-  for (mag = 0, mask = 0xff; mag < 4; ++mag) {
-    if (sf->magnitude < mask) break;
-    mask <<= 8;
-    mask |= 0xff;
-  }
-  marker |= mag << 3;
-
-  // Write the index
-  sf->index_size = 2 + (mag + 1) * sf->count;
-  bufp = sf->buffer;
-
-  *bufp++ = marker;
-  for (i = 0; i < sf->count; ++i) {
-    int this_sz = sf->sizes[i];
-    uint32_t j;
-
-    for (j = 0; j <= mag; ++j) {
-      *bufp++ = this_sz & 0xff;
-      this_sz >>= 8;
-    }
-  }
-  *bufp++ = marker;
-}
-
 static SvcInternal *get_svc_internal(SvcContext *svc_ctx) {
  if (svc_ctx == NULL) return NULL;
  if (svc_ctx->internal == NULL) {
@@ -262,26 +194,8 @@ static int svc_log(SvcContext *svc_ctx, SVC_LOG_LEVEL level,
  return retval;
 }

-static vpx_codec_err_t set_option_encoding_mode(SvcContext *svc_ctx,
-                                                const char *value_str) {
-  if (strcmp(value_str, "i") == 0) {
-    svc_ctx->encoding_mode = INTER_LAYER_PREDICTION_I;
-  } else if (strcmp(value_str, "alt-ip") == 0) {
-    svc_ctx->encoding_mode = ALT_INTER_LAYER_PREDICTION_IP;
-  } else if (strcmp(value_str, "ip") == 0) {
-    svc_ctx->encoding_mode = INTER_LAYER_PREDICTION_IP;
-  } else if (strcmp(value_str, "gf") == 0) {
-    svc_ctx->encoding_mode = USE_GOLDEN_FRAME;
-  } else {
-    svc_log(svc_ctx, SVC_LOG_ERROR, "invalid encoding mode: %s", value_str);
-    return VPX_CODEC_INVALID_PARAM;
-  }
-  return VPX_CODEC_OK;
-}
-
 static vpx_codec_err_t parse_quantizer_values(SvcContext *svc_ctx,
-                                              const char *quantizer_values,
-                                              const int is_keyframe) {
+                                              const char *quantizer_values) {
  char *input_string;
  char *token;
  const char *delim = ",";
@@ -292,11 +206,6 @@ static vpx_codec_err_t parse_quantizer_values(SvcContext *svc_ctx,
  SvcInternal *const si = get_svc_internal(svc_ctx);

  if (quantizer_values == NULL || strlen(quantizer_values) == 0) {
-    if (is_keyframe) {
-      // If there non settings for key frame, we will apply settings from
-      // non key frame. So just simply return here.
-      return VPX_CODEC_INVALID_PARAM;
-    }
    input_string = strdup(DEFAULT_QUANTIZER_VALUES);
  } else {
    input_string = strdup(quantizer_values);
@@ -317,12 +226,7 @@ static vpx_codec_err_t parse_quantizer_values(SvcContext *svc_ctx,
    } else {
      q = 0;
    }
-    if (is_keyframe) {
-      si->quantizer_keyframe[i + VPX_SS_MAX_LAYERS - svc_ctx->spatial_layers]
-      = q;
-    } else {
-      si->quantizer[i + VPX_SS_MAX_LAYERS - svc_ctx->spatial_layers] = q;
-    }
+    si->quantizer[i + VPX_SS_MAX_LAYERS - svc_ctx->spatial_layers] = q;
  }
  if (res == VPX_CODEC_OK && found != svc_ctx->spatial_layers) {
    svc_log(svc_ctx, SVC_LOG_ERROR,
@@ -407,7 +311,6 @@ static vpx_codec_err_t parse_options(SvcContext *svc_ctx, const char *options) {
  char *option_name;
  char *option_value;
  char *input_ptr;
-  int is_keyframe_qaunt_set = 0;
  vpx_codec_err_t res = VPX_CODEC_OK;

  if (options == NULL) return VPX_CODEC_OK;
@@ -424,26 +327,14 @@ static vpx_codec_err_t parse_options(SvcContext *svc_ctx, const char *options) {
      res = VPX_CODEC_INVALID_PARAM;
      break;
    }
-    if (strcmp("encoding-mode", option_name) == 0) {
-      res = set_option_encoding_mode(svc_ctx, option_value);
-      if (res != VPX_CODEC_OK) break;
-    } else if (strcmp("layers", option_name) == 0) {
+    if (strcmp("layers", option_name) == 0) {
      svc_ctx->spatial_layers = atoi(option_value);
    } else if (strcmp("scale-factors", option_name) == 0) {
      res = parse_scale_factors(svc_ctx, option_value);
      if (res != VPX_CODEC_OK) break;
    } else if (strcmp("quantizers", option_name) == 0) {
-      res = parse_quantizer_values(svc_ctx, option_value, 0);
+      res = parse_quantizer_values(svc_ctx, option_value);
      if (res != VPX_CODEC_OK) break;
-      if (!is_keyframe_qaunt_set) {
-        SvcInternal *const si = get_svc_internal(svc_ctx);
-        memcpy(get_svc_internal(svc_ctx)->quantizer_keyframe, si->quantizer,
-               sizeof(si->quantizer));
-      }
-    } else if (strcmp("quantizers-keyframe", option_name) == 0) {
-      res = parse_quantizer_values(svc_ctx, option_value, 1);
-      if (res != VPX_CODEC_OK) break;
-      is_keyframe_qaunt_set = 1;
    } else {
      svc_log(svc_ctx, SVC_LOG_ERROR, "invalid option: %s\n", option_name);
      res = VPX_CODEC_INVALID_PARAM;
@@ -466,19 +357,13 @@ vpx_codec_err_t vpx_svc_set_options(SvcContext *svc_ctx, const char *options) {
 }

 vpx_codec_err_t vpx_svc_set_quantizers(SvcContext *svc_ctx,
-                                       const char *quantizers,
-                                       const int is_for_keyframe) {
+                                       const char *quantizers) {
  SvcInternal *const si = get_svc_internal(svc_ctx);
  if (svc_ctx == NULL || quantizers == NULL || si == NULL) {
    return VPX_CODEC_INVALID_PARAM;
  }
-  if (is_for_keyframe) {
-    strncpy(si->quantizers_keyframe, quantizers, sizeof(si->quantizers));
-    si->quantizers_keyframe[sizeof(si->quantizers_keyframe) - 1] = '\0';
-  } else {
-    strncpy(si->quantizers, quantizers, sizeof(si->quantizers));
-    si->quantizers[sizeof(si->quantizers) - 1] = '\0';
-  }
+  strncpy(si->quantizers, quantizers, sizeof(si->quantizers));
+  si->quantizers[sizeof(si->quantizers) - 1] = '\0';
  return VPX_CODEC_OK;
 }

@@ -525,13 +410,9 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
    return VPX_CODEC_INVALID_PARAM;
  }

-  res = parse_quantizer_values(svc_ctx, si->quantizers, 0);
+  res = parse_quantizer_values(svc_ctx, si->quantizers);
  if (res != VPX_CODEC_OK) return res;

-  res = parse_quantizer_values(svc_ctx, si->quantizers_keyframe, 1);
-  if (res != VPX_CODEC_OK)
-    memcpy(si->quantizer_keyframe, si->quantizer, sizeof(si->quantizer));
-
  res = parse_scale_factors(svc_ctx, si->scale_factors);
  if (res != VPX_CODEC_OK) return res;

@@ -574,8 +455,6 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
  // modify encoder configuration
  enc_cfg->ss_number_layers = si->layers;
  enc_cfg->ts_number_layers = 1;  // Temporal layers not used in this encoder.
-  // Lag in frames not currently supported
-  enc_cfg->g_lag_in_frames = 0;

  // TODO(ivanmaltz): determine if these values need to be set explicitly for
  // svc, or if the normal default/override mechanism can be used
@@ -608,6 +487,34 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
  return VPX_CODEC_OK;
 }

+static void accumulate_frame_size_for_each_layer(SvcInternal *const si,
+                                                 const uint8_t *const buf,
+                                                 const size_t size) {
+  uint8_t marker = buf[size - 1];
+  if ((marker & 0xe0) == 0xc0) {
+    const uint32_t frames = (marker & 0x7) + 1;
+    const uint32_t mag = ((marker >> 3) & 0x3) + 1;
+    const size_t index_sz = 2 + mag * frames;
+
+    uint8_t marker2 = buf[size - index_sz];
+
+    if (size >= index_sz && marker2 == marker) {
+      // found a valid superframe index
+      uint32_t i, j;
+      const uint8_t *x = &buf[size - index_sz + 1];
+
+      // frames has a maximum of 8 and mag has a maximum of 4.
+      for (i = 0; i < frames; i++) {
+        uint32_t this_sz = 0;
+
+        for (j = 0; j < mag; j++)
+          this_sz |= (*x++) << (j * 8);
+        si->bytes_sum[i] += this_sz;
+      }
+    }
+  }
+}
+
 // SVC Algorithm flags - these get mapped to VP8_EFLAG_* defined in vp8cx.h

 // encoder should reference the last frame
@@ -664,62 +571,14 @@ static void calculate_enc_frame_flags(SvcContext *svc_ctx) {
    return;
  }

-  switch (svc_ctx->encoding_mode) {
-    case ALT_INTER_LAYER_PREDICTION_IP:
-      if (si->layer == 0) {
-        flags = map_vp8_flags(USE_LAST | UPDATE_LAST);
-      } else if (is_keyframe) {
-        if (si->layer == si->layers - 1) {
-          flags = map_vp8_flags(USE_ARF | UPDATE_LAST);
-        } else {
-          flags = map_vp8_flags(USE_ARF | UPDATE_LAST | UPDATE_GF);
-        }
-      } else {
-        flags = map_vp8_flags(USE_LAST | USE_ARF | UPDATE_LAST);
-      }
-      break;
-    case INTER_LAYER_PREDICTION_I:
-      if (si->layer == 0) {
-        flags = map_vp8_flags(USE_LAST | UPDATE_LAST);
-      } else if (is_keyframe) {
-        flags = map_vp8_flags(USE_ARF | UPDATE_LAST);
-      } else {
-        flags = map_vp8_flags(USE_LAST | UPDATE_LAST);
-      }
-      break;
-    case INTER_LAYER_PREDICTION_IP:
-      if (si->layer == 0) {
-        flags = map_vp8_flags(USE_LAST | UPDATE_LAST);
-      } else if (is_keyframe) {
-        flags = map_vp8_flags(USE_ARF | UPDATE_LAST);
-      } else {
-        flags = map_vp8_flags(USE_LAST | USE_ARF | UPDATE_LAST);
-      }
-      break;
-    case USE_GOLDEN_FRAME:
-      if (2 * si->layers - SVC_REFERENCE_FRAMES <= si->layer) {
-        if (si->layer == 0) {
-          flags = map_vp8_flags(USE_LAST | USE_GF | UPDATE_LAST);
-        } else if (is_keyframe) {
-          flags = map_vp8_flags(USE_ARF | UPDATE_LAST | UPDATE_GF);
-        } else {
-          flags = map_vp8_flags(USE_LAST | USE_ARF | USE_GF | UPDATE_LAST);
-        }
-      } else {
-        if (si->layer == 0) {
-          flags = map_vp8_flags(USE_LAST | UPDATE_LAST);
-        } else if (is_keyframe) {
-          flags = map_vp8_flags(USE_ARF | UPDATE_LAST);
-        } else {
-          flags = map_vp8_flags(USE_LAST | UPDATE_LAST);
-        }
-      }
-      break;
-    default:
-      svc_log(svc_ctx, SVC_LOG_ERROR, "unexpected encoding mode: %d\n",
-              svc_ctx->encoding_mode);
-      break;
+  if (si->layer == 0) {
+    flags = map_vp8_flags(USE_LAST | UPDATE_LAST);
+  } else if (is_keyframe) {
+    flags = map_vp8_flags(USE_ARF | UPDATE_LAST);
+  } else {
+    flags = map_vp8_flags(USE_LAST | USE_ARF | UPDATE_LAST);
  }
+
  si->enc_frame_flags = flags;
 }

@@ -765,13 +624,6 @@ static void set_svc_parameters(SvcContext *svc_ctx,
  svc_params.flags = si->enc_frame_flags;

  layer = si->layer;
-  if (svc_ctx->encoding_mode == ALT_INTER_LAYER_PREDICTION_IP &&
-      si->frame_within_gop == 0) {
-    // layers 1 & 3 don't exist in this mode, use the higher one
-    if (layer == 0 || layer == 2) {
-      layer += 1;
-    }
-  }
  if (VPX_CODEC_OK != vpx_svc_get_layer_resolution(svc_ctx, layer,
                                                   &svc_params.width,
                                                   &svc_params.height)) {
@@ -780,13 +632,8 @@ static void set_svc_parameters(SvcContext *svc_ctx,
  layer_index = layer + VPX_SS_MAX_LAYERS - si->layers;

  if (codec_ctx->config.enc->g_pass == VPX_RC_ONE_PASS) {
-    if (vpx_svc_is_keyframe(svc_ctx)) {
-      svc_params.min_quantizer = si->quantizer_keyframe[layer_index];
-      svc_params.max_quantizer = si->quantizer_keyframe[layer_index];
-    } else {
-      svc_params.min_quantizer = si->quantizer[layer_index];
-      svc_params.max_quantizer = si->quantizer[layer_index];
-    }
+    svc_params.min_quantizer = si->quantizer[layer_index];
+    svc_params.max_quantizer = si->quantizer[layer_index];
  } else {
    svc_params.min_quantizer = codec_ctx->config.enc->rc_min_quantizer;
    svc_params.max_quantizer = codec_ctx->config.enc->rc_max_quantizer;
@@ -798,21 +645,8 @@ static void set_svc_parameters(SvcContext *svc_ctx,
  svc_params.lst_fb_idx = si->layer;

  // Use buffer i-1 for layer i Alt (Inter-layer prediction)
-  if (si->layer != 0) {
-    const int use_higher_layer =
-        svc_ctx->encoding_mode == ALT_INTER_LAYER_PREDICTION_IP &&
-        si->frame_within_gop == 0;
-    svc_params.alt_fb_idx = use_higher_layer ? si->layer - 2 : si->layer - 1;
-  }
-
-  if (svc_ctx->encoding_mode == ALT_INTER_LAYER_PREDICTION_IP) {
-    svc_params.gld_fb_idx = si->layer + 1;
-  } else {
-    if (si->layer < 2 * si->layers - SVC_REFERENCE_FRAMES)
-      svc_params.gld_fb_idx = svc_params.lst_fb_idx;
-    else
-      svc_params.gld_fb_idx = 2 * si->layers - 1 - si->layer;
-  }
+  svc_params.alt_fb_idx = (si->layer > 0) ? si->layer - 1 : 0;
+  svc_params.gld_fb_idx = svc_params.lst_fb_idx;

  svc_log(svc_ctx, SVC_LOG_DEBUG, "SVC frame: %d, layer: %d, %dx%d, q: %d\n",
          si->encode_frame_count, si->layer, svc_params.width,
@@ -846,15 +680,12 @@ vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
  vpx_codec_err_t res;
  vpx_codec_iter_t iter;
  const vpx_codec_cx_pkt_t *cx_pkt;
-  struct LayerData *cx_layer_list = NULL;
-  struct LayerData *layer_data;
-  struct Superframe superframe;
+  int layer_for_psnr = 0;
  SvcInternal *const si = get_svc_internal(svc_ctx);
  if (svc_ctx == NULL || codec_ctx == NULL || si == NULL) {
    return VPX_CODEC_INVALID_PARAM;
  }

-  memset(&superframe, 0, sizeof(superframe));
  svc_log_reset(svc_ctx);
  si->rc_stats_buf_used = 0;

@@ -863,7 +694,6 @@ vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
    si->frame_within_gop = 0;
  }
  si->is_keyframe = (si->frame_within_gop == 0);
-  si->frame_size = 0;

  if (rawimg != NULL) {
    svc_log(svc_ctx, SVC_LOG_DEBUG,
@@ -872,124 +702,85 @@ vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
            si->frame_within_gop);
  }

-  // encode each layer
-  for (si->layer = 0; si->layer < si->layers; ++si->layer) {
-    if (svc_ctx->encoding_mode == ALT_INTER_LAYER_PREDICTION_IP &&
-        si->is_keyframe && (si->layer == 1 || si->layer == 3)) {
-      svc_log(svc_ctx, SVC_LOG_DEBUG, "Skip encoding layer %d\n", si->layer);
-      continue;
-    }
-
-    if (rawimg != NULL) {
+  if (rawimg != NULL) {
+    // encode each layer
+    for (si->layer = 0; si->layer < si->layers; ++si->layer) {
      calculate_enc_frame_flags(svc_ctx);
      set_svc_parameters(svc_ctx, codec_ctx);
    }
+  }

-    res = vpx_codec_encode(codec_ctx, rawimg, pts, (uint32_t)duration,
-                           si->enc_frame_flags, deadline);
-    if (res != VPX_CODEC_OK) {
-      return res;
-    }
-    // save compressed data
-    iter = NULL;
-    while ((cx_pkt = vpx_codec_get_cx_data(codec_ctx, &iter))) {
-      switch (cx_pkt->kind) {
-        case VPX_CODEC_CX_FRAME_PKT: {
-          const uint32_t frame_pkt_size = (uint32_t)(cx_pkt->data.frame.sz);
-          si->bytes_sum[si->layer] += frame_pkt_size;
-          svc_log(svc_ctx, SVC_LOG_DEBUG,
-                  "SVC frame: %d, layer: %d, size: %u\n",
-                  si->encode_frame_count, si->layer, frame_pkt_size);
-          layer_data =
-              ld_create(cx_pkt->data.frame.buf, (size_t)frame_pkt_size);
-          if (layer_data == NULL) {
-            svc_log(svc_ctx, SVC_LOG_ERROR, "Error allocating LayerData\n");
-            return VPX_CODEC_OK;
+  res = vpx_codec_encode(codec_ctx, rawimg, pts, (uint32_t)duration, 0,
+                         deadline);
+  if (res != VPX_CODEC_OK) {
+    return res;
+  }
+  // save compressed data
+  iter = NULL;
+  while ((cx_pkt = vpx_codec_get_cx_data(codec_ctx, &iter))) {
+    switch (cx_pkt->kind) {
+      case VPX_CODEC_CX_FRAME_PKT: {
+        fd_list_add(&si->frame_list, fd_create(cx_pkt->data.frame.buf,
+                                               cx_pkt->data.frame.sz,
+                                               cx_pkt->data.frame.flags));
+        accumulate_frame_size_for_each_layer(si, cx_pkt->data.frame.buf,
+                                             cx_pkt->data.frame.sz);
+
+        svc_log(svc_ctx, SVC_LOG_DEBUG, "SVC frame: %d, kf: %d, size: %d, "
+                "pts: %d\n", si->frame_received,
+                (cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY) ? 1 : 0,
+                (int)cx_pkt->data.frame.sz, (int)cx_pkt->data.frame.pts);
+
+        ++si->frame_received;
+        layer_for_psnr = 0;
+        break;
+      }
+      case VPX_CODEC_PSNR_PKT: {
+        int i;
+        svc_log(svc_ctx, SVC_LOG_DEBUG,
+                "SVC frame: %d, layer: %d, PSNR(Total/Y/U/V): "
+                "%2.3f  %2.3f  %2.3f  %2.3f \n",
+                si->frame_received, layer_for_psnr,
+                cx_pkt->data.psnr.psnr[0], cx_pkt->data.psnr.psnr[1],
+                cx_pkt->data.psnr.psnr[2], cx_pkt->data.psnr.psnr[3]);
+        svc_log(svc_ctx, SVC_LOG_DEBUG,
+                "SVC frame: %d, layer: %d, SSE(Total/Y/U/V): "
+                "%2.3f  %2.3f  %2.3f  %2.3f \n",
+                si->frame_received, layer_for_psnr,
+                cx_pkt->data.psnr.sse[0], cx_pkt->data.psnr.sse[1],
+                cx_pkt->data.psnr.sse[2], cx_pkt->data.psnr.sse[3]);
+        for (i = 0; i < COMPONENTS; i++) {
+          si->psnr_sum[layer_for_psnr][i] += cx_pkt->data.psnr.psnr[i];
+          si->sse_sum[layer_for_psnr][i] += cx_pkt->data.psnr.sse[i];
+        }
+        ++layer_for_psnr;
+        break;
+      }
+      case VPX_CODEC_STATS_PKT: {
+        size_t new_size = si->rc_stats_buf_used +
+            cx_pkt->data.twopass_stats.sz;
+
+        if (new_size > si->rc_stats_buf_size) {
+          char *p = (char*)realloc(si->rc_stats_buf, new_size);
+          if (p == NULL) {
+            svc_log(svc_ctx, SVC_LOG_ERROR, "Error allocating stats buf\n");
+            return VPX_CODEC_MEM_ERROR;
          }
-          ld_list_add(&cx_layer_list, layer_data);
+          si->rc_stats_buf = p;
+          si->rc_stats_buf_size = new_size;
+        }

-          // save layer size in superframe index
-          superframe.sizes[superframe.count++] = frame_pkt_size;
-          superframe.magnitude |= frame_pkt_size;
-          break;
-        }
-        case VPX_CODEC_PSNR_PKT: {
-          int i;
-          svc_log(svc_ctx, SVC_LOG_DEBUG,
-                  "SVC frame: %d, layer: %d, PSNR(Total/Y/U/V): "
-                  "%2.3f  %2.3f  %2.3f  %2.3f \n",
-                  si->encode_frame_count, si->layer,
-                  cx_pkt->data.psnr.psnr[0], cx_pkt->data.psnr.psnr[1],
-                  cx_pkt->data.psnr.psnr[2], cx_pkt->data.psnr.psnr[3]);
-          svc_log(svc_ctx, SVC_LOG_DEBUG,
-                  "SVC frame: %d, layer: %d, SSE(Total/Y/U/V): "
-                  "%2.3f  %2.3f  %2.3f  %2.3f \n",
-                  si->encode_frame_count, si->layer,
-                  cx_pkt->data.psnr.sse[0], cx_pkt->data.psnr.sse[1],
-                  cx_pkt->data.psnr.sse[2], cx_pkt->data.psnr.sse[3]);
-          for (i = 0; i < COMPONENTS; i++) {
-            si->psnr_sum[si->layer][i] += cx_pkt->data.psnr.psnr[i];
-            si->sse_sum[si->layer][i] += cx_pkt->data.psnr.sse[i];
-          }
-          break;
-        }
-        case VPX_CODEC_STATS_PKT: {
-          size_t new_size = si->rc_stats_buf_used +
-              cx_pkt->data.twopass_stats.sz;
-
-          if (new_size > si->rc_stats_buf_size) {
-            char *p = (char*)realloc(si->rc_stats_buf, new_size);
-            if (p == NULL) {
-              svc_log(svc_ctx, SVC_LOG_ERROR, "Error allocating stats buf\n");
-              break;
-            }
-            si->rc_stats_buf = p;
-            si->rc_stats_buf_size = new_size;
-          }
-
-          memcpy(si->rc_stats_buf + si->rc_stats_buf_used,
-                 cx_pkt->data.twopass_stats.buf, cx_pkt->data.twopass_stats.sz);
-          si->rc_stats_buf_used += cx_pkt->data.twopass_stats.sz;
-          break;
-        }
-        default: {
-          break;
-        }
+        memcpy(si->rc_stats_buf + si->rc_stats_buf_used,
+               cx_pkt->data.twopass_stats.buf, cx_pkt->data.twopass_stats.sz);
+        si->rc_stats_buf_used += cx_pkt->data.twopass_stats.sz;
+        break;
+      }
+      default: {
+        break;
      }
    }
-    if (rawimg == NULL) {
-      break;
-    }
  }
-  if (codec_ctx->config.enc->g_pass != VPX_RC_FIRST_PASS) {
-    // add superframe index to layer data list
-    sf_create_index(&superframe);
-    layer_data = ld_create(superframe.buffer, superframe.index_size);
-    ld_list_add(&cx_layer_list, layer_data);

-    // get accumulated size of layer data
-    si->frame_size = ld_list_get_buffer_size(cx_layer_list);
-    if (si->frame_size > 0) {
-      // all layers encoded, create single buffer with concatenated layers
-      if (si->frame_size > si->buffer_size) {
-        free(si->buffer);
-        si->buffer = malloc(si->frame_size);
-        if (si->buffer == NULL) {
-          ld_list_free(cx_layer_list);
-          return VPX_CODEC_MEM_ERROR;
-        }
-        si->buffer_size = si->frame_size;
-      }
-      // copy layer data into packet
-      ld_list_copy_to_buffer(cx_layer_list, (uint8_t *)si->buffer);
-
-      ld_list_free(cx_layer_list);
-
-      svc_log(svc_ctx, SVC_LOG_DEBUG, "SVC frame: %d, kf: %d, size: %d, "
-              "pts: %d\n", si->encode_frame_count, si->is_keyframe,
-              (int)si->frame_size, (int)pts);
-    }
-  }
  if (rawimg != NULL) {
    ++si->frame_within_gop;
    ++si->encode_frame_count;
@@ -1004,16 +795,27 @@ const char *vpx_svc_get_message(const SvcContext *svc_ctx) {
  return si->message_buffer;
 }

-void *vpx_svc_get_buffer(const SvcContext *svc_ctx) {
-  const SvcInternal *const si = get_const_svc_internal(svc_ctx);
-  if (svc_ctx == NULL || si == NULL) return NULL;
-  return si->buffer;
+// We will maintain a list of output frame buffers since with lag_in_frame
+// we need to output all frame buffers at the end. vpx_svc_get_buffer() will
+// remove a frame buffer from the list the put it to a temporal pointer, which
+// will be removed at the next vpx_svc_get_buffer() or when closing encoder.
+void *vpx_svc_get_buffer(SvcContext *svc_ctx) {
+  SvcInternal *const si = get_svc_internal(svc_ctx);
+  if (svc_ctx == NULL || si == NULL || si->frame_list == NULL) return NULL;
+
+  if (si->frame_temp)
+    fd_free(si->frame_temp);
+
+  si->frame_temp = si->frame_list;
+  si->frame_list = si->frame_list->next;
+
+  return si->frame_temp->buf;
 }

 size_t vpx_svc_get_frame_size(const SvcContext *svc_ctx) {
  const SvcInternal *const si = get_const_svc_internal(svc_ctx);
-  if (svc_ctx == NULL || si == NULL) return 0;
-  return si->frame_size;
+  if (svc_ctx == NULL || si == NULL || si->frame_list == NULL) return 0;
+  return si->frame_list->size;
 }

 int vpx_svc_get_encode_frame_count(const SvcContext *svc_ctx) {
@@ -1024,8 +826,8 @@ int vpx_svc_get_encode_frame_count(const SvcContext *svc_ctx) {

 int vpx_svc_is_keyframe(const SvcContext *svc_ctx) {
  const SvcInternal *const si = get_const_svc_internal(svc_ctx);
-  if (svc_ctx == NULL || si == NULL) return 0;
-  return si->is_keyframe;
+  if (svc_ctx == NULL || si == NULL || si->frame_list == NULL) return 0;
+  return (si->frame_list->flags & VPX_FRAME_IS_KEY) != 0;
 }

 void vpx_svc_set_keyframe(SvcContext *svc_ctx) {
@@ -1041,7 +843,7 @@ static double calc_psnr(double d) {

 // dump accumulated statistics and reset accumulated values
 const char *vpx_svc_dump_statistics(SvcContext *svc_ctx) {
-  int number_of_frames, number_of_keyframes, encode_frame_count;
+  int number_of_frames, encode_frame_count;
  int i, j;
  uint32_t bytes_total = 0;
  double scale[COMPONENTS];
@@ -1058,14 +860,9 @@ const char *vpx_svc_dump_statistics(SvcContext *svc_ctx) {
  if (si->encode_frame_count <= 0) return vpx_svc_get_message(svc_ctx);

  svc_log(svc_ctx, SVC_LOG_INFO, "\n");
-  number_of_keyframes = encode_frame_count / si->kf_dist + 1;
  for (i = 0; i < si->layers; ++i) {
    number_of_frames = encode_frame_count;

-    if (svc_ctx->encoding_mode == ALT_INTER_LAYER_PREDICTION_IP &&
-        (i == 1 || i == 3)) {
-      number_of_frames -= number_of_keyframes;
-    }
    svc_log(svc_ctx, SVC_LOG_INFO,
            "Layer %d Average PSNR=[%2.3f, %2.3f, %2.3f, %2.3f], Bytes=[%u]\n",
            i, (double)si->psnr_sum[i][0] / number_of_frames,
@@ -1112,7 +909,8 @@ void vpx_svc_release(SvcContext *svc_ctx) {
  // SvcInternal if it was not already allocated
  si = (SvcInternal *)svc_ctx->internal;
  if (si != NULL) {
-    free(si->buffer);
+    fd_free(si->frame_temp);
+    fd_free_list(si->frame_list);
    if (si->rc_stats_buf) {
      free(si->rc_stats_buf);
    }
--- a/vpx/src/vpx_image.c
+++ b/vpx/src/vpx_image.c
@@ -40,13 +40,13 @@ static void img_buf_free(void *memblk) {
  }
 }

-static vpx_image_t *img_alloc_helper(vpx_image_t  *img,
-                                     vpx_img_fmt_t fmt,
-                                     unsigned int  d_w,
-                                     unsigned int  d_h,
-                                     unsigned int  buf_align,
-                                     unsigned int  stride_align,
-                                     unsigned char      *img_data) {
+static vpx_image_t *img_alloc_helper(vpx_image_t   *img,
+                                     vpx_img_fmt_t  fmt,
+                                     unsigned int   d_w,
+                                     unsigned int   d_h,
+                                     unsigned int   buf_align,
+                                     unsigned int   stride_align,
+                                     unsigned char *img_data) {

  unsigned int  h, w, s, xcs, ycs, bps;
  int           align;
@@ -94,6 +94,21 @@ static vpx_image_t *img_alloc_helper(vpx_image_t  *img,
    case VPX_IMG_FMT_VPXYV12:
      bps = 12;
      break;
+    case VPX_IMG_FMT_I422:
+      bps = 16;
+      break;
+    case VPX_IMG_FMT_I444:
+      bps = 24;
+      break;
+    case VPX_IMG_FMT_I42016:
+      bps = 24;
+      break;
+    case VPX_IMG_FMT_I42216:
+      bps = 32;
+      break;
+    case VPX_IMG_FMT_I44416:
+      bps = 48;
+      break;
    default:
      bps = 16;
      break;
@@ -105,6 +120,9 @@ static vpx_image_t *img_alloc_helper(vpx_image_t  *img,
    case VPX_IMG_FMT_YV12:
    case VPX_IMG_FMT_VPXI420:
    case VPX_IMG_FMT_VPXYV12:
+    case VPX_IMG_FMT_I422:
+    case VPX_IMG_FMT_I42016:
+    case VPX_IMG_FMT_I42216:
      xcs = 1;
      break;
    default:
@@ -156,6 +174,7 @@ static vpx_image_t *img_alloc_helper(vpx_image_t  *img,
    goto fail;

  img->fmt = fmt;
+  img->bit_depth = (fmt & VPX_IMG_FMT_HIGH) ? 16 : 8;
  img->w = w;
  img->h = h;
  img->x_chroma_shift = xcs;
--- a/Show More
+++ b/Show More