Add error handling for frame parallel decode and unit test for that.

Change-Id: I6e309e11f1641618d2424b7a2c0fe744b8974dec
Fix a bug in frame parallel decode and add a unit test for that.
2014-12-08 12:30:19 -08:00 · 2014-11-14 10:16:34 -08:00 · 2014-11-07 10:06:35 -08:00 · 2014-11-06 16:28:35 -08:00 · 2014-11-06 13:51:08 -08:00 · 2014-10-22 10:50:58 -07:00
166 changed files with 8872 additions and 3663 deletions
--- a/4
+++ b/4
@@ -55,6 +55,7 @@ COMPILING THE APPLICATIONS/LIBRARIES:
    armv6-linux-rvct
    armv6-linux-gcc
    armv6-none-rvct
+    arm64-darwin-gcc
    armv7-android-gcc
    armv7-darwin-gcc
    armv7-linux-rvct
@@ -62,6 +63,7 @@ COMPILING THE APPLICATIONS/LIBRARIES:
    armv7-none-rvct
    armv7-win32-vs11
    armv7-win32-vs12
+    armv7s-darwin-gcc
    mips32-linux-gcc
    ppc32-darwin8-gcc
    ppc32-darwin9-gcc
@@ -79,6 +81,7 @@ COMPILING THE APPLICATIONS/LIBRARIES:
    x86-darwin11-gcc
    x86-darwin12-gcc
    x86-darwin13-gcc
+    x86-iphonesimulator-gcc
    x86-linux-gcc
    x86-linux-icc
    x86-os2-gcc
@@ -95,6 +98,7 @@ COMPILING THE APPLICATIONS/LIBRARIES:
    x86_64-darwin11-gcc
    x86_64-darwin12-gcc
    x86_64-darwin13-gcc
+    x86_64-iphonesimulator-gcc
    x86_64-linux-gcc
    x86_64-linux-icc
    x86_64-solaris-gcc
--- a/build/make/Makefile
+++ b/build/make/Makefile
@@ -330,7 +330,10 @@ endef
 ifneq ($(target),)
 include $(SRC_PATH_BARE)/$(target:-$(TOOLCHAIN)=).mk
 endif
-ifeq ($(filter %clean,$(MAKECMDGOALS)),)
+
+skip_deps := $(filter %clean,$(MAKECMDGOALS))
+skip_deps += $(findstring testdata,$(MAKECMDGOALS))
+ifeq ($(strip $(skip_deps)),)
  # Older versions of make don't like -include directives with no arguments
  ifneq ($(filter %.d,$(OBJS-yes:.o=.d)),)
    -include $(filter %.d,$(OBJS-yes:.o=.d))
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -799,10 +799,10 @@ process_common_toolchain() {
    arm*)
        # on arm, isa versions are supersets
        case ${tgt_isa} in
-        armv8)
+        arm64|armv8)
            soft_enable neon
            ;;
-        armv7)
+        armv7|armv7s)
            soft_enable neon
            soft_enable neon_asm
            soft_enable media
@@ -831,7 +831,7 @@ process_common_toolchain() {
            arch_int=${arch_int%%te}
            check_add_asflags --defsym ARCHITECTURE=${arch_int}
            tune_cflags="-mtune="
-            if [ ${tgt_isa} = "armv7" ]; then
+            if [ ${tgt_isa} = "armv7" ] || [ ${tgt_isa} = "armv7s" ]; then
                if [ -z "${float_abi}" ]; then
                    check_cpp <<EOF && float_abi=hard || float_abi=softfp
 #ifndef __ARM_PCS_VFP
@@ -1048,14 +1048,6 @@ EOF
        esac
    ;;
    x86*)
-        bits=32
-        enabled x86_64 && bits=64
-        check_cpp <<EOF && bits=x32
-#ifndef __ILP32__
-#error "not x32"
-#endif
-EOF
-
        case  ${tgt_os} in
            win*)
                enabled gcc && add_cflags -fno-common
@@ -1094,8 +1086,6 @@ EOF
                esac
            ;;
            gcc*)
-                add_cflags -m${bits}
-                add_ldflags -m${bits}
                link_with_cc=gcc
                tune_cflags="-march="
                setup_gnu_toolchain
@@ -1120,6 +1110,20 @@ EOF
            ;;
        esac

+        bits=32
+        enabled x86_64 && bits=64
+        check_cpp <<EOF && bits=x32
+#ifndef __ILP32__
+#error "not x32"
+#endif
+EOF
+        case ${tgt_cc} in
+            gcc*)
+                add_cflags -m${bits}
+                add_ldflags -m${bits}
+            ;;
+        esac
+
        soft_enable runtime_cpu_detect
        # We can't use 'check_cflags' until the compiler is configured and CC is
        # populated.
@@ -1222,10 +1226,12 @@ EOF
        fi
    fi

-    # default use_x86inc to yes if pic is no or 64bit or we are not on darwin
-    if [ ${tgt_isa} = x86_64 -o ! "$pic" = "yes" -o \
-         "${tgt_os#darwin}" = "${tgt_os}"  ]; then
-      soft_enable use_x86inc
+    tgt_os_no_version=$(echo "${tgt_os}" | tr -d "[0-9]")
+    # Default use_x86inc to yes when we are 64 bit, non-pic, or on any
+    # non-Darwin target.
+    if [ "${tgt_isa}" = "x86_64" ] || [ "${pic}" != "yes" ] || \
+            [ "${tgt_os_no_version}" != "darwin" ]; then
+        soft_enable use_x86inc
    fi

    # Position Independent Code (PIC) support, for building relocatable
--- a/build/make/gen_msvs_proj.sh
+++ b/build/make/gen_msvs_proj.sh
@@ -137,7 +137,9 @@ for opt in "$@"; do
        ;;
        --lib) proj_kind="lib"
        ;;
-        --src-path-bare=*) src_path_bare=$(fix_path "$optval")
+        --src-path-bare=*)
+            src_path_bare=$(fix_path "$optval")
+            src_path_bare=${src_path_bare%/}
        ;;
        --static-crt) use_static_runtime=true
        ;;
@@ -151,9 +153,9 @@ for opt in "$@"; do
            esac
        ;;
        -I*)
-            opt="${opt%/}"
            opt=${opt##-I}
            opt=$(fix_path "$opt")
+            opt="${opt%/}"
            incs="${incs}${incs:+;}&quot;${opt}&quot;"
            yasmincs="${yasmincs} -I&quot;${opt}&quot;"
        ;;
@@ -414,7 +416,7 @@ generate_vcproj() {
                    vpx)
                        tag Tool \
                            Name="VCPreBuildEventTool" \
-                            CommandLine="call obj_int_extract.bat $src_path_bare $plat_no_ws\\\$(ConfigurationName)" \
+                            CommandLine="call obj_int_extract.bat &quot;$src_path_bare&quot; $plat_no_ws\\\$(ConfigurationName)" \

                        tag Tool \
                            Name="VCCLCompilerTool" \
--- a/build/make/gen_msvs_vcxproj.sh
+++ b/build/make/gen_msvs_vcxproj.sh
@@ -157,7 +157,9 @@ for opt in "$@"; do
        ;;
        --lib) proj_kind="lib"
        ;;
-        --src-path-bare=*) src_path_bare=$(fix_path "$optval")
+        --src-path-bare=*)
+            src_path_bare=$(fix_path "$optval")
+            src_path_bare=${src_path_bare%/}
        ;;
        --static-crt) use_static_runtime=true
        ;;
@@ -173,9 +175,9 @@ for opt in "$@"; do
            esac
        ;;
        -I*)
-            opt="${opt%/}"
            opt=${opt##-I}
            opt=$(fix_path "$opt")
+            opt="${opt%/}"
            incs="${incs}${incs:+;}&quot;${opt}&quot;"
            yasmincs="${yasmincs} -I&quot;${opt}&quot;"
        ;;
--- a/build/make/iosbuild.sh
+++ b/build/make/iosbuild.sh
@@ -0,0 +1,248 @@
+#!/bin/sh
+##
+##  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+##
+## This script generates 'VPX.framework'. An iOS app can encode and decode VPx
+## video by including 'VPX.framework'.
+##
+## Run iosbuild.sh to create 'VPX.framework' in the current directory.
+##
+set -e
+devnull='> /dev/null 2>&1'
+
+BUILD_ROOT="_iosbuild"
+DIST_DIR="_dist"
+FRAMEWORK_DIR="VPX.framework"
+HEADER_DIR="${FRAMEWORK_DIR}/Headers/vpx"
+MAKE_JOBS=1
+LIBVPX_SOURCE_DIR=$(dirname "$0" | sed -e s,/build/make,,)
+LIPO=$(xcrun -sdk iphoneos${SDK} -find lipo)
+ORIG_PWD="$(pwd)"
+TARGETS="arm64-darwin-gcc
+         armv6-darwin-gcc
+         armv7-darwin-gcc
+         armv7s-darwin-gcc
+         x86-iphonesimulator-gcc
+         x86_64-iphonesimulator-gcc"
+
+# Configures for the target specified by $1, and invokes make with the dist
+# target using $DIST_DIR as the distribution output directory.
+build_target() {
+  local target="$1"
+  local old_pwd="$(pwd)"
+
+  vlog "***Building target: ${target}***"
+
+  mkdir "${target}"
+  cd "${target}"
+  eval "../../${LIBVPX_SOURCE_DIR}/configure" --target="${target}" \
+      --disable-docs ${devnull}
+  export DIST_DIR
+  eval make -j ${MAKE_JOBS} dist ${devnull}
+  cd "${old_pwd}"
+
+  vlog "***Done building target: ${target}***"
+}
+
+# Returns the preprocessor symbol for the target specified by $1.
+target_to_preproc_symbol() {
+  target="$1"
+  case "${target}" in
+    arm64-*)
+      echo "__aarch64__"
+      ;;
+    armv6-*)
+      echo "__ARM_ARCH_6__"
+      ;;
+    armv7-*)
+      echo "__ARM_ARCH_7__"
+      ;;
+    armv7s-*)
+      echo "__ARM_ARCH_7S__"
+      ;;
+    x86-*)
+      echo "__i386__"
+      ;;
+    x86_64-*)
+      echo "__x86_64__"
+      ;;
+    *)
+      echo "#error ${target} unknown/unsupported"
+      return 1
+      ;;
+  esac
+}
+
+# Create a vpx_config.h shim that, based on preprocessor settings for the
+# current target CPU, includes the real vpx_config.h for the current target.
+# $1 is the list of targets.
+create_vpx_framework_config_shim() {
+  local targets="$1"
+  local config_file="${HEADER_DIR}/vpx_config.h"
+  local preproc_symbol=""
+  local target=""
+  local include_guard="VPX_FRAMEWORK_HEADERS_VPX_VPX_CONFIG_H_"
+
+  local file_header="/*
+ *  Copyright (c) $(date +%Y) The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+/* GENERATED FILE: DO NOT EDIT! */
+
+#ifndef ${include_guard}
+#define ${include_guard}
+
+#if defined"
+
+  printf "%s" "${file_header}" > "${config_file}"
+  for target in ${targets}; do
+    preproc_symbol=$(target_to_preproc_symbol "${target}")
+    printf " ${preproc_symbol}\n" >> "${config_file}"
+    printf "#include \"VPX/vpx/${target}/vpx_config.h\"\n" >> "${config_file}"
+    printf "#elif defined" >> "${config_file}"
+    mkdir "${HEADER_DIR}/${target}"
+    cp -p "${BUILD_ROOT}/${target}/vpx_config.h" "${HEADER_DIR}/${target}"
+  done
+
+  # Consume the last line of output from the loop: We don't want it.
+  sed -i '' -e '$d' "${config_file}"
+
+  printf "#endif\n\n" >> "${config_file}"
+  printf "#endif  // ${include_guard}" >> "${config_file}"
+}
+
+# Configures and builds each target specified by $1, and then builds
+# VPX.framework.
+build_framework() {
+  local lib_list=""
+  local targets="$1"
+  local target=""
+  local target_dist_dir=""
+
+  # Clean up from previous build(s).
+  rm -rf "${BUILD_ROOT}" "${FRAMEWORK_DIR}"
+
+  # Create output dirs.
+  mkdir -p "${BUILD_ROOT}"
+  mkdir -p "${HEADER_DIR}"
+
+  cd "${BUILD_ROOT}"
+
+  for target in ${targets}; do
+    build_target "${target}"
+    target_dist_dir="${BUILD_ROOT}/${target}/${DIST_DIR}"
+    lib_list="${lib_list} ${target_dist_dir}/lib/libvpx.a"
+  done
+
+  cd "${ORIG_PWD}"
+
+  # The basic libvpx API includes are all the same; just grab the most recent
+  # set.
+  cp -p "${target_dist_dir}"/include/vpx/* "${HEADER_DIR}"
+
+  # Build the fat library.
+  ${LIPO} -create ${lib_list} -output ${FRAMEWORK_DIR}/VPX
+
+  # Create the vpx_config.h shim that allows usage of vpx_config.h from
+  # within VPX.framework.
+  create_vpx_framework_config_shim "${targets}"
+
+  # Copy in vpx_version.h.
+  cp -p "${BUILD_ROOT}/${target}/vpx_version.h" "${HEADER_DIR}"
+
+  vlog "Created fat library ${FRAMEWORK_DIR}/VPX containing:"
+  for lib in ${lib_list}; do
+    vlog "  $(echo ${lib} | awk -F / '{print $2, $NF}')"
+  done
+
+  # TODO(tomfinegan): Verify that expected targets are included within
+  # VPX.framework/VPX via lipo -info.
+}
+
+# Trap function. Cleans up the subtree used to build all targets contained in
+# $TARGETS.
+cleanup() {
+  cd "${ORIG_PWD}"
+
+  if [ "${PRESERVE_BUILD_OUTPUT}" != "yes" ]; then
+    rm -rf "${BUILD_ROOT}"
+  fi
+}
+
+iosbuild_usage() {
+cat << EOF
+  Usage: ${0##*/} [arguments]
+    --help: Display this message and exit.
+    --jobs: Number of make jobs.
+    --preserve-build-output: Do not delete the build directory.
+    --show-build-output: Show output from each library build.
+    --verbose: Output information about the environment and each stage of the
+               build.
+EOF
+}
+
+vlog() {
+  if [ "${VERBOSE}" = "yes" ]; then
+    echo "$@"
+  fi
+}
+
+trap cleanup EXIT
+
+# Parse the command line.
+while [ -n "$1" ]; do
+  case "$1" in
+    --help)
+      iosbuild_usage
+      exit
+      ;;
+    --jobs)
+      MAKE_JOBS="$2"
+      shift
+      ;;
+    --preserve-build-output)
+      PRESERVE_BUILD_OUTPUT=yes
+      ;;
+    --show-build-output)
+      devnull=
+      ;;
+    --verbose)
+      VERBOSE=yes
+      ;;
+    *)
+      iosbuild_usage
+      exit 1
+      ;;
+  esac
+  shift
+done
+
+if [ "${VERBOSE}" = "yes" ]; then
+cat << EOF
+  BUILD_ROOT=${BUILD_ROOT}
+  DIST_DIR=${DIST_DIR}
+  FRAMEWORK_DIR=${FRAMEWORK_DIR}
+  HEADER_DIR=${HEADER_DIR}
+  MAKE_JOBS=${MAKE_JOBS}
+  PRESERVE_BUILD_OUTPUT=${PRESERVE_BUILD_OUTPUT}
+  LIBVPX_SOURCE_DIR=${LIBVPX_SOURCE_DIR}
+  LIPO=${LIPO}
+  ORIG_PWD=${ORIG_PWD}
+  TARGETS="${TARGETS}"
+EOF
+fi
+
+build_framework "${TARGETS}"
--- a/4
+++ b/4
@@ -96,6 +96,7 @@ all_platforms="${all_platforms} armv6-darwin-gcc"
 all_platforms="${all_platforms} armv6-linux-rvct"
 all_platforms="${all_platforms} armv6-linux-gcc"
 all_platforms="${all_platforms} armv6-none-rvct"
+all_platforms="${all_platforms} arm64-darwin-gcc"
 all_platforms="${all_platforms} armv7-android-gcc"   #neon Cortex-A8
 all_platforms="${all_platforms} armv7-darwin-gcc"    #neon Cortex-A8
 all_platforms="${all_platforms} armv7-linux-rvct"    #neon Cortex-A8
@@ -103,6 +104,7 @@ all_platforms="${all_platforms} armv7-linux-gcc"     #neon Cortex-A8
 all_platforms="${all_platforms} armv7-none-rvct"     #neon Cortex-A8
 all_platforms="${all_platforms} armv7-win32-vs11"
 all_platforms="${all_platforms} armv7-win32-vs12"
+all_platforms="${all_platforms} armv7s-darwin-gcc"
 all_platforms="${all_platforms} mips32-linux-gcc"
 all_platforms="${all_platforms} ppc32-darwin8-gcc"
 all_platforms="${all_platforms} ppc32-darwin9-gcc"
@@ -271,6 +273,8 @@ EXPERIMENT_LIST="
    alpha
    multiple_arf
    spatial_svc
+    denoising
+    fp_mb_stats
 "
 CONFIG_LIST="
    external_build
--- a/examples/vp9_spatial_svc_encoder.c
+++ b/examples/vp9_spatial_svc_encoder.c
@@ -28,16 +28,6 @@
 #include "vpx/vpx_encoder.h"
 #include "./vpxstats.h"

-static const struct arg_enum_list encoding_mode_enum[] = {
-  {"i", INTER_LAYER_PREDICTION_I},
-  {"alt-ip", ALT_INTER_LAYER_PREDICTION_IP},
-  {"ip", INTER_LAYER_PREDICTION_IP},
-  {"gf", USE_GOLDEN_FRAME},
-  {NULL, 0}
-};
-
-static const arg_def_t encoding_mode_arg = ARG_DEF_ENUM(
-    "m", "encoding-mode", 1, "Encoding mode algorithm", encoding_mode_enum);
 static const arg_def_t skip_frames_arg =
    ARG_DEF("s", "skip-frames", 1, "input frames to skip");
 static const arg_def_t frames_arg =
@@ -58,9 +48,6 @@ static const arg_def_t quantizers_arg =
    ARG_DEF("q", "quantizers", 1, "quantizers for non key frames, also will "
            "be applied to key frames if -qn is not specified (lowest to "
            "highest layer)");
-static const arg_def_t quantizers_keyframe_arg =
-    ARG_DEF("qn", "quantizers-keyframe", 1, "quantizers for key frames (lowest "
-        "to highest layer)");
 static const arg_def_t passes_arg =
    ARG_DEF("p", "passes", 1, "Number of passes (1/2)");
 static const arg_def_t pass_arg =
@@ -77,16 +64,13 @@ static const arg_def_t max_bitrate_arg =
    ARG_DEF(NULL, "max-bitrate", 1, "Maximum bitrate");

 static const arg_def_t *svc_args[] = {
-  &encoding_mode_arg, &frames_arg,        &width_arg,       &height_arg,
+  &frames_arg,        &width_arg,         &height_arg,
  &timebase_arg,      &bitrate_arg,       &skip_frames_arg, &layers_arg,
-  &kf_dist_arg,       &scale_factors_arg, &quantizers_arg,
-  &quantizers_keyframe_arg,               &passes_arg,      &pass_arg,
-  &fpf_name_arg,      &min_q_arg,         &max_q_arg,       &min_bitrate_arg,
-  &max_bitrate_arg,   NULL
+  &kf_dist_arg,       &scale_factors_arg, &quantizers_arg,  &passes_arg,
+  &pass_arg,          &fpf_name_arg,      &min_q_arg,       &max_q_arg,
+  &min_bitrate_arg,   &max_bitrate_arg,   NULL
 };

-static const SVC_ENCODING_MODE default_encoding_mode =
-    INTER_LAYER_PREDICTION_IP;
 static const uint32_t default_frames_to_skip = 0;
 static const uint32_t default_frames_to_code = 60 * 60;
 static const uint32_t default_width = 1920;
@@ -135,7 +119,6 @@ static void parse_command_line(int argc, const char **argv_,
  // initialize SvcContext with parameters that will be passed to vpx_svc_init
  svc_ctx->log_level = SVC_LOG_DEBUG;
  svc_ctx->spatial_layers = default_spatial_layers;
-  svc_ctx->encoding_mode = default_encoding_mode;

  // start with default encoder configuration
  res = vpx_codec_enc_config_default(vpx_codec_vp9_cx(), enc_cfg, 0);
@@ -161,9 +144,7 @@ static void parse_command_line(int argc, const char **argv_,
  for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) {
    arg.argv_step = 1;

-    if (arg_match(&arg, &encoding_mode_arg, argi)) {
-      svc_ctx->encoding_mode = arg_parse_enum_or_int(&arg);
-    } else if (arg_match(&arg, &frames_arg, argi)) {
+    if (arg_match(&arg, &frames_arg, argi)) {
      app_input->frames_to_code = arg_parse_uint(&arg);
    } else if (arg_match(&arg, &width_arg, argi)) {
      enc_cfg->g_w = arg_parse_uint(&arg);
@@ -183,9 +164,7 @@ static void parse_command_line(int argc, const char **argv_,
    } else if (arg_match(&arg, &scale_factors_arg, argi)) {
      vpx_svc_set_scale_factors(svc_ctx, arg.val);
    } else if (arg_match(&arg, &quantizers_arg, argi)) {
-      vpx_svc_set_quantizers(svc_ctx, arg.val, 0);
-    } else if (arg_match(&arg, &quantizers_keyframe_arg, argi)) {
-      vpx_svc_set_quantizers(svc_ctx, arg.val, 1);
+      vpx_svc_set_quantizers(svc_ctx, arg.val);
    } else if (arg_match(&arg, &passes_arg, argi)) {
      passes = arg_parse_uint(&arg);
      if (passes < 1 || passes > 2) {
@@ -270,12 +249,12 @@ static void parse_command_line(int argc, const char **argv_,

  printf(
      "Codec %s\nframes: %d, skip: %d\n"
-      "mode: %d, layers: %d\n"
+      "layers: %d\n"
      "width %d, height: %d,\n"
      "num: %d, den: %d, bitrate: %d,\n"
      "gop size: %d\n",
      vpx_codec_iface_name(vpx_codec_vp9_cx()), app_input->frames_to_code,
-      app_input->frames_to_skip, svc_ctx->encoding_mode,
+      app_input->frames_to_skip,
      svc_ctx->spatial_layers, enc_cfg->g_w, enc_cfg->g_h,
      enc_cfg->g_timebase.num, enc_cfg->g_timebase.den,
      enc_cfg->rc_target_bitrate, enc_cfg->kf_max_dist);
@@ -296,6 +275,7 @@ int main(int argc, const char **argv) {
  int frame_duration = 1; /* 1 timebase tick per frame */
  FILE *infile = NULL;
  int end_of_stream = 0;
+  int frame_size;

  memset(&svc_ctx, 0, sizeof(svc_ctx));
  svc_ctx.log_print = 1;
@@ -351,11 +331,10 @@ int main(int argc, const char **argv) {
      die_codec(&codec, "Failed to encode frame");
    }
    if (!(app_input.passes == 2 && app_input.pass == 1)) {
-      if (vpx_svc_get_frame_size(&svc_ctx) > 0) {
+      while ((frame_size = vpx_svc_get_frame_size(&svc_ctx)) > 0) {
        vpx_video_writer_write_frame(writer,
                                     vpx_svc_get_buffer(&svc_ctx),
-                                     vpx_svc_get_frame_size(&svc_ctx),
-                                     pts);
+                                     frame_size, pts);
      }
    }
    if (vpx_svc_get_rc_stats_buffer_size(&svc_ctx) > 0) {
--- a/libs.mk
+++ b/libs.mk
@@ -170,7 +170,7 @@ CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/vpx_once.h
 CODEC_SRCS-$(BUILD_LIBVPX) += $(BUILD_PFX)vpx_config.c
 INSTALL-SRCS-no += $(BUILD_PFX)vpx_config.c
 ifeq ($(ARCH_X86)$(ARCH_X86_64),yes)
-CODEC_SRCS-$(BUILD_LIBVPX) += third_party/x86inc/x86inc.asm
+INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += third_party/x86inc/x86inc.asm
 endif
 CODEC_EXPORTS-$(BUILD_LIBVPX) += vpx/exports_com
 CODEC_EXPORTS-$(CONFIG_ENCODERS) += vpx/exports_enc
--- a/test/codec_factory.h
+++ b/test/codec_factory.h
@@ -35,6 +35,10 @@ class CodecFactory {
  virtual Decoder* CreateDecoder(vpx_codec_dec_cfg_t cfg,
                                 unsigned long deadline) const = 0;

+  virtual Decoder* CreateDecoder(vpx_codec_dec_cfg_t cfg,
+                                 const vpx_codec_flags_t flags,
+                                 unsigned long deadline) const = 0;  // NOLINT
+
  virtual Encoder* CreateEncoder(vpx_codec_enc_cfg_t cfg,
                                 unsigned long deadline,
                                 const unsigned long init_flags,
@@ -72,6 +76,10 @@ class VP8Decoder : public Decoder {
  VP8Decoder(vpx_codec_dec_cfg_t cfg, unsigned long deadline)
      : Decoder(cfg, deadline) {}

+  VP8Decoder(vpx_codec_dec_cfg_t cfg, const vpx_codec_flags_t flag,
+             unsigned long deadline)  // NOLINT
+      : Decoder(cfg, flag, deadline) {}
+
 protected:
  virtual vpx_codec_iface_t* CodecInterface() const {
 #if CONFIG_VP8_DECODER
@@ -104,8 +112,14 @@ class VP8CodecFactory : public CodecFactory {

  virtual Decoder* CreateDecoder(vpx_codec_dec_cfg_t cfg,
                                 unsigned long deadline) const {
+    return CreateDecoder(cfg, 0, deadline);
+  }
+
+  virtual Decoder* CreateDecoder(vpx_codec_dec_cfg_t cfg,
+                                 const vpx_codec_flags_t flags,
+                                 unsigned long deadline) const {  // NOLINT
 #if CONFIG_VP8_DECODER
-    return new VP8Decoder(cfg, deadline);
+    return new VP8Decoder(cfg, flags, deadline);
 #else
    return NULL;
 #endif
@@ -154,6 +168,10 @@ class VP9Decoder : public Decoder {
  VP9Decoder(vpx_codec_dec_cfg_t cfg, unsigned long deadline)
      : Decoder(cfg, deadline) {}

+  VP9Decoder(vpx_codec_dec_cfg_t cfg, const vpx_codec_flags_t flag,
+             unsigned long deadline)  // NOLINT
+      : Decoder(cfg, flag, deadline) {}
+
 protected:
  virtual vpx_codec_iface_t* CodecInterface() const {
 #if CONFIG_VP9_DECODER
@@ -186,8 +204,14 @@ class VP9CodecFactory : public CodecFactory {

  virtual Decoder* CreateDecoder(vpx_codec_dec_cfg_t cfg,
                                 unsigned long deadline) const {
+    return CreateDecoder(cfg, 0, deadline);
+  }
+
+  virtual Decoder* CreateDecoder(vpx_codec_dec_cfg_t cfg,
+                                 const vpx_codec_flags_t flags,
+                                 unsigned long deadline) const {  // NOLINT
 #if CONFIG_VP9_DECODER
-    return new VP9Decoder(cfg, deadline);
+    return new VP9Decoder(cfg, flags, deadline);
 #else
    return NULL;
 #endif
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -645,6 +645,26 @@ INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest, ::testing::Values(
 #endif

 #if HAVE_AVX2
+// TODO(jzern): these prototypes can be removed after the avx2 versions are
+// reenabled in vp9_rtcd_defs.pl.
+extern "C" {
+void vp9_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
+                             uint8_t *dst, ptrdiff_t dst_stride,
+                             const int16_t *filter_x, int x_step_q4,
+                             const int16_t *filter_y, int y_step_q4,
+                             int w, int h);
+void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
+                              uint8_t *dst, ptrdiff_t dst_stride,
+                              const int16_t *filter_x, int x_step_q4,
+                              const int16_t *filter_y, int y_step_q4,
+                              int w, int h);
+void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
+                        uint8_t *dst, ptrdiff_t dst_stride,
+                        const int16_t *filter_x, int x_step_q4,
+                        const int16_t *filter_y, int y_step_q4,
+                        int w, int h);
+}
+
 const ConvolveFunctions convolve8_avx2(
    vp9_convolve8_horiz_avx2, vp9_convolve8_avg_horiz_ssse3,
    vp9_convolve8_vert_avx2, vp9_convolve8_avg_vert_ssse3,
@@ -655,8 +675,10 @@ INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest, ::testing::Values(
    make_tuple(8, 4, &convolve8_avx2),
    make_tuple(4, 8, &convolve8_avx2),
    make_tuple(8, 8, &convolve8_avx2),
+    make_tuple(8, 16, &convolve8_avx2)));
+
+INSTANTIATE_TEST_CASE_P(DISABLED_AVX2, ConvolveTest, ::testing::Values(
    make_tuple(16, 8, &convolve8_avx2),
-    make_tuple(8, 16, &convolve8_avx2),
    make_tuple(16, 16, &convolve8_avx2),
    make_tuple(32, 16, &convolve8_avx2),
    make_tuple(16, 32, &convolve8_avx2),
--- a/test/cpu_speed_test.cc
+++ b/test/cpu_speed_test.cc
@@ -14,30 +14,49 @@
 #include "test/encode_test_driver.h"
 #include "test/i420_video_source.h"
 #include "test/util.h"
+#include "test/y4m_video_source.h"

 namespace {

+const int kMaxPSNR = 100;
+
 class CpuSpeedTest : public ::libvpx_test::EncoderTest,
    public ::libvpx_test::CodecTestWith2Params<
        libvpx_test::TestMode, int> {
 protected:
-  CpuSpeedTest() : EncoderTest(GET_PARAM(0)) {}
+  CpuSpeedTest()
+      : EncoderTest(GET_PARAM(0)),
+        encoding_mode_(GET_PARAM(1)),
+        set_cpu_used_(GET_PARAM(2)),
+        min_psnr_(kMaxPSNR) {}
  virtual ~CpuSpeedTest() {}

  virtual void SetUp() {
    InitializeConfig();
-    SetMode(GET_PARAM(1));
-    set_cpu_used_ = GET_PARAM(2);
+    SetMode(encoding_mode_);
+    if (encoding_mode_ != ::libvpx_test::kRealTime) {
+      cfg_.g_lag_in_frames = 25;
+      cfg_.rc_end_usage = VPX_VBR;
+    } else {
+      cfg_.g_lag_in_frames = 0;
+      cfg_.rc_end_usage = VPX_CBR;
+    }
+  }
+
+  virtual void BeginPassHook(unsigned int /*pass*/) {
+    min_psnr_ = kMaxPSNR;
  }

  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
                                  ::libvpx_test::Encoder *encoder) {
    if (video->frame() == 1) {
      encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
-      encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
-      encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
-      encoder->Control(VP8E_SET_ARNR_STRENGTH, 5);
-      encoder->Control(VP8E_SET_ARNR_TYPE, 3);
+      if (encoding_mode_ != ::libvpx_test::kRealTime) {
+        encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
+        encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
+        encoder->Control(VP8E_SET_ARNR_STRENGTH, 5);
+        encoder->Control(VP8E_SET_ARNR_TYPE, 3);
+      }
    }
  }

@@ -45,7 +64,15 @@ class CpuSpeedTest : public ::libvpx_test::EncoderTest,
    if (pkt->data.frame.flags & VPX_FRAME_IS_KEY) {
    }
  }
+
+  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
+    if (pkt->data.psnr.psnr[0] < min_psnr_)
+      min_psnr_ = pkt->data.psnr.psnr[0];
+  }
+
+  ::libvpx_test::TestMode encoding_mode_;
  int set_cpu_used_;
+  double min_psnr_;
 };

 TEST_P(CpuSpeedTest, TestQ0) {
@@ -53,7 +80,6 @@ TEST_P(CpuSpeedTest, TestQ0) {
  // without a mismatch when passing in a very low max q.  This pushes
  // the encoder to producing lots of big partitions which will likely
  // extend into the border and test the border condition.
-  cfg_.g_lag_in_frames = 25;
  cfg_.rc_2pass_vbr_minsection_pct = 5;
  cfg_.rc_2pass_vbr_minsection_pct = 2000;
  cfg_.rc_target_bitrate = 400;
@@ -63,16 +89,32 @@ TEST_P(CpuSpeedTest, TestQ0) {
  ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
                                       20);

+  init_flags_ = VPX_CODEC_USE_PSNR;
+
  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  EXPECT_GE(min_psnr_, kMaxPSNR);
 }

+TEST_P(CpuSpeedTest, TestScreencastQ0) {
+  ::libvpx_test::Y4mVideoSource video("screendata.y4m", 0, 25);
+  cfg_.g_timebase = video.timebase();
+  cfg_.rc_2pass_vbr_minsection_pct = 5;
+  cfg_.rc_2pass_vbr_minsection_pct = 2000;
+  cfg_.rc_target_bitrate = 400;
+  cfg_.rc_max_quantizer = 0;
+  cfg_.rc_min_quantizer = 0;
+
+  init_flags_ = VPX_CODEC_USE_PSNR;
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  EXPECT_GE(min_psnr_, kMaxPSNR);
+}

 TEST_P(CpuSpeedTest, TestEncodeHighBitrate) {
  // Validate that this non multiple of 64 wide clip encodes and decodes
  // without a mismatch when passing in a very low max q.  This pushes
  // the encoder to producing lots of big partitions which will likely
  // extend into the border and test the border condition.
-  cfg_.g_lag_in_frames = 25;
  cfg_.rc_2pass_vbr_minsection_pct = 5;
  cfg_.rc_2pass_vbr_minsection_pct = 2000;
  cfg_.rc_target_bitrate = 12000;
@@ -89,7 +131,6 @@ TEST_P(CpuSpeedTest, TestLowBitrate) {
  // when passing in a very high min q.  This pushes the encoder to producing
  // lots of small partitions which might will test the other condition.

-  cfg_.g_lag_in_frames = 25;
  cfg_.rc_2pass_vbr_minsection_pct = 5;
  cfg_.rc_2pass_vbr_minsection_pct = 2000;
  cfg_.rc_target_bitrate = 200;
@@ -108,6 +149,7 @@ using std::tr1::make_tuple;

 VP9_INSTANTIATE_TEST_CASE(
    CpuSpeedTest,
-    ::testing::Values(::libvpx_test::kTwoPassGood, ::libvpx_test::kOnePassGood),
+    ::testing::Values(::libvpx_test::kTwoPassGood, ::libvpx_test::kOnePassGood,
+                      ::libvpx_test::kRealTime),
    ::testing::Range(0, 8));
 }  // namespace
--- a/test/datarate_test.cc
+++ b/test/datarate_test.cc
@@ -576,7 +576,7 @@ TEST_P(DatarateTestVP9Large, BasicRateTargeting3TemporalLayersFrameDropping) {
    // Expect some frame drops in this test: for this 200 frames test,
    // expect at least 10% and not more than 60% drops.
    ASSERT_GE(num_drops_, 20);
-    ASSERT_LE(num_drops_, 120);
+    ASSERT_LE(num_drops_, 130);
  }
 }

--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -606,4 +606,29 @@ INSTANTIATE_TEST_CASE_P(
    ::testing::Values(
        make_tuple(&vp9_fdct16x16_c, &vp9_idct16x16_256_add_ssse3, 0)));
 #endif
+
+#if HAVE_AVX2
+// TODO(jzern): these prototypes can be removed after the avx2 versions are
+// reenabled in vp9_rtcd_defs.pl.
+extern "C" {
+void vp9_fdct16x16_avx2(const int16_t *input, int16_t *output, int stride);
+void vp9_fht16x16_avx2(const int16_t *input, int16_t *output, int stride,
+                       int tx_type);
+}
+INSTANTIATE_TEST_CASE_P(
+    DISABLED_AVX2, Trans16x16DCT,
+    ::testing::Values(
+        make_tuple(&vp9_fdct16x16_avx2,
+                   &vp9_idct16x16_256_add_c, 0)));
+INSTANTIATE_TEST_CASE_P(
+    AVX2, Trans16x16HT,
+    ::testing::Values(
+        make_tuple(&vp9_fht16x16_avx2, &vp9_iht16x16_256_add_c, 3)));
+INSTANTIATE_TEST_CASE_P(
+    DISABLED_AVX2, Trans16x16HT,
+    ::testing::Values(
+        make_tuple(&vp9_fht16x16_avx2, &vp9_iht16x16_256_add_c, 0),
+        make_tuple(&vp9_fht16x16_avx2, &vp9_iht16x16_256_add_c, 1),
+        make_tuple(&vp9_fht16x16_avx2, &vp9_iht16x16_256_add_c, 2)));
+#endif
 }  // namespace
--- a/test/decode_test_driver.cc
+++ b/test/decode_test_driver.cc
@@ -15,27 +15,73 @@

 namespace libvpx_test {

+const char kVP8Name[] = "WebM Project VP8";
+
+vpx_codec_err_t Decoder::PeekStream(const uint8_t *cxdata, size_t size,
+                                    vpx_codec_stream_info_t *stream_info) {
+  return vpx_codec_peek_stream_info(CodecInterface(),
+                                    cxdata, static_cast<unsigned int>(size),
+                                    stream_info);
+}
+
 vpx_codec_err_t Decoder::DecodeFrame(const uint8_t *cxdata, size_t size) {
+  return DecodeFrame(cxdata, size, NULL);
+}
+
+vpx_codec_err_t Decoder::DecodeFrame(const uint8_t *cxdata, size_t size,
+                                     void *user_priv) {
  vpx_codec_err_t res_dec;
  InitOnce();
  REGISTER_STATE_CHECK(
      res_dec = vpx_codec_decode(&decoder_,
                                 cxdata, static_cast<unsigned int>(size),
-                                 NULL, 0));
+                                 user_priv, 0));
  return res_dec;
 }

 void DecoderTest::RunLoop(CompressedVideoSource *video) {
-  vpx_codec_dec_cfg_t dec_cfg = {0};
-  Decoder* const decoder = codec_->CreateDecoder(dec_cfg, 0);
+  Decoder* const decoder = codec_->CreateDecoder(cfg_, flags_, 0);
  ASSERT_TRUE(decoder != NULL);
+  const char *codec_name = decoder->GetDecoderName();
+  const bool is_vp8 = strncmp(kVP8Name, codec_name, sizeof(kVP8Name) - 1) == 0;
+  bool end_of_file = false;

  // Decode frames.
-  for (video->Begin(); video->cxdata(); video->Next()) {
+  for (video->Begin(); !::testing::Test::HasFailure() && !end_of_file;
+       video->Next()) {
    PreDecodeFrameHook(*video, decoder);
-    vpx_codec_err_t res_dec = decoder->DecodeFrame(video->cxdata(),
-                                                   video->frame_size());
-    ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError();
+
+    vpx_codec_stream_info_t stream_info;
+    stream_info.sz = sizeof(stream_info);
+
+    if (video->cxdata() != NULL) {
+      const vpx_codec_err_t res_peek = decoder->PeekStream(video->cxdata(),
+                                                           video->frame_size(),
+                                                           &stream_info);
+      if (is_vp8) {
+        /* Vp8's implementation of PeekStream returns an error if the frame you
+         * pass it is not a keyframe, so we only expect VPX_CODEC_OK on the
+         * first frame, which must be a keyframe. */
+        if (video->frame_number() == 0)
+          ASSERT_EQ(VPX_CODEC_OK, res_peek) << "Peek return failed: "
+              << vpx_codec_err_to_string(res_peek);
+      } else {
+        /* The Vp9 implementation of PeekStream returns an error only if the
+         * data passed to it isn't a valid Vp9 chunk. */
+        ASSERT_EQ(VPX_CODEC_OK, res_peek) << "Peek return failed: "
+            << vpx_codec_err_to_string(res_peek);
+      }
+
+      vpx_codec_err_t res_dec = decoder->DecodeFrame(video->cxdata(),
+                                                     video->frame_size());
+      if (!HandleDecodeResult(res_dec, *video, decoder))
+        break;
+    } else {
+      // Signal end of the file to the decoder.
+      const vpx_codec_err_t res_dec = decoder->DecodeFrame(NULL, 0);
+      ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError();
+      end_of_file = true;
+    }

    DxDataIterator dec_iter = decoder->GetDxData();
    const vpx_image_t *img = NULL;
@@ -47,4 +93,12 @@ void DecoderTest::RunLoop(CompressedVideoSource *video) {

  delete decoder;
 }
+
+void DecoderTest::set_cfg(const vpx_codec_dec_cfg_t &dec_cfg) {
+  memcpy(&cfg_, &dec_cfg, sizeof(cfg_));
+}
+
+void DecoderTest::set_flags(const vpx_codec_flags_t flags) {
+  flags_ = flags;
+}
 }  // namespace libvpx_test
--- a/test/decode_test_driver.h
+++ b/test/decode_test_driver.h
@@ -41,7 +41,13 @@ class DxDataIterator {
 class Decoder {
 public:
  Decoder(vpx_codec_dec_cfg_t cfg, unsigned long deadline)
-      : cfg_(cfg), deadline_(deadline), init_done_(false) {
+      : cfg_(cfg), flags_(0), deadline_(deadline), init_done_(false) {
+    memset(&decoder_, 0, sizeof(decoder_));
+  }
+
+  Decoder(vpx_codec_dec_cfg_t cfg, const vpx_codec_flags_t flag,
+          unsigned long deadline)  // NOLINT
+      : cfg_(cfg), flags_(flag), deadline_(deadline), init_done_(false) {
    memset(&decoder_, 0, sizeof(decoder_));
  }

@@ -49,8 +55,14 @@ class Decoder {
    vpx_codec_destroy(&decoder_);
  }

+  vpx_codec_err_t PeekStream(const uint8_t *cxdata, size_t size,
+                             vpx_codec_stream_info_t *stream_info);
+
  vpx_codec_err_t DecodeFrame(const uint8_t *cxdata, size_t size);

+  vpx_codec_err_t DecodeFrame(const uint8_t *cxdata, size_t size,
+                              void *user_priv);
+
  DxDataIterator GetDxData() {
    return DxDataIterator(&decoder_);
  }
@@ -85,6 +97,10 @@ class Decoder {
        &decoder_, cb_get, cb_release, user_priv);
  }

+  const char* GetDecoderName() {
+    return vpx_codec_iface_name(CodecInterface());
+  }
+
 protected:
  virtual vpx_codec_iface_t* CodecInterface() const = 0;

@@ -92,7 +108,7 @@ class Decoder {
    if (!init_done_) {
      const vpx_codec_err_t res = vpx_codec_dec_init(&decoder_,
                                                     CodecInterface(),
-                                                     &cfg_, 0);
+                                                     &cfg_, flags_);
      ASSERT_EQ(VPX_CODEC_OK, res) << DecodeError();
      init_done_ = true;
    }
@@ -100,6 +116,7 @@ class Decoder {

  vpx_codec_ctx_t     decoder_;
  vpx_codec_dec_cfg_t cfg_;
+  vpx_codec_flags_t   flags_;
  unsigned int        deadline_;
  bool                init_done_;
 };
@@ -110,20 +127,35 @@ class DecoderTest {
  // Main decoding loop
  virtual void RunLoop(CompressedVideoSource *video);

+  virtual void set_cfg(const vpx_codec_dec_cfg_t &dec_cfg);
+  virtual void set_flags(const vpx_codec_flags_t flags);
+
  // Hook to be called before decompressing every frame.
  virtual void PreDecodeFrameHook(const CompressedVideoSource& video,
                                  Decoder *decoder) {}

+  // Hook to be called to handle decode result. Return true to continue.
+  virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec,
+                                  const CompressedVideoSource& /* video */,
+                                  Decoder *decoder) {
+    EXPECT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError();
+    return VPX_CODEC_OK == res_dec;
+  }
+
  // Hook to be called on every decompressed frame.
  virtual void DecompressedFrameHook(const vpx_image_t& img,
                                     const unsigned int frame_number) {}

 protected:
-  explicit DecoderTest(const CodecFactory *codec) : codec_(codec) {}
+  explicit DecoderTest(const CodecFactory *codec) : codec_(codec), flags_(0) {
+    memset(&cfg_, 0, sizeof(cfg_));
+  }

  virtual ~DecoderTest() {}

  const CodecFactory *codec_;
+  vpx_codec_dec_cfg_t cfg_;
+  vpx_codec_flags_t   flags_;
 };

 }  // namespace libvpx_test
--- a/test/decode_to_md5.sh
+++ b/test/decode_to_md5.sh
@@ -34,7 +34,10 @@ decode_to_md5() {
  local expected_md5="$3"
  local output_file="${VPX_TEST_OUTPUT_DIR}/decode_to_md5_${codec}"

-  [ -x "${decoder}" ] || return 1
+  if [ ! -x "${decoder}" ]; then
+    elog "${decoder} does not exist or is not executable."
+    return 1
+  fi

  eval "${decoder}" "${input_file}" "${output_file}" ${devnull}

--- a/test/decode_with_drops.sh
+++ b/test/decode_with_drops.sh
@@ -34,7 +34,10 @@ decode_with_drops() {
  local output_file="${VPX_TEST_OUTPUT_DIR}/decode_with_drops_${codec}"
  local drop_mode="$3"

-  [ -x "${decoder}" ] || return 1
+  if [ ! -x "${decoder}" ]; then
+    elog "${decoder} does not exist or is not executable."
+    return 1
+  fi

  eval "${decoder}" "${input_file}" "${output_file}" "${drop_mode}" ${devnull}

--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -376,4 +376,19 @@ INSTANTIATE_TEST_CASE_P(
        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 3)));
 #endif

+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(
+    AVX2, Trans4x4DCT,
+    ::testing::Values(
+        make_tuple(&vp9_fdct4x4_avx2,
+                   &vp9_idct4x4_16_add_c, 0)));
+INSTANTIATE_TEST_CASE_P(
+    AVX2, Trans4x4HT,
+    ::testing::Values(
+        make_tuple(&vp9_fht4x4_avx2, &vp9_iht4x4_16_add_c, 0),
+        make_tuple(&vp9_fht4x4_avx2, &vp9_iht4x4_16_add_c, 1),
+        make_tuple(&vp9_fht4x4_avx2, &vp9_iht4x4_16_add_c, 2),
+        make_tuple(&vp9_fht4x4_avx2, &vp9_iht4x4_16_add_c, 3)));
+#endif
+
 }  // namespace
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -367,4 +367,18 @@ INSTANTIATE_TEST_CASE_P(
    ::testing::Values(
        make_tuple(&vp9_fdct8x8_ssse3, &vp9_idct8x8_64_add_ssse3, 0)));
 #endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(
+    AVX2, FwdTrans8x8DCT,
+    ::testing::Values(
+        make_tuple(&vp9_fdct8x8_avx2, &vp9_idct8x8_64_add_c, 0)));
+INSTANTIATE_TEST_CASE_P(
+    AVX2, FwdTrans8x8HT,
+    ::testing::Values(
+        make_tuple(&vp9_fht8x8_avx2, &vp9_iht8x8_64_add_c, 0),
+        make_tuple(&vp9_fht8x8_avx2, &vp9_iht8x8_64_add_c, 1),
+        make_tuple(&vp9_fht8x8_avx2, &vp9_iht8x8_64_add_c, 2),
+        make_tuple(&vp9_fht8x8_avx2, &vp9_iht8x8_64_add_c, 3)));
+#endif
 }  // namespace
--- a/test/invalid_file_test.cc
+++ b/test/invalid_file_test.cc
@@ -0,0 +1,112 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+#include <vector>
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "./vpx_config.h"
+#include "test/codec_factory.h"
+#include "test/decode_test_driver.h"
+#include "test/ivf_video_source.h"
+#include "test/util.h"
+#if CONFIG_WEBM_IO
+#include "test/webm_video_source.h"
+#endif
+#include "vpx_mem/vpx_mem.h"
+
+namespace {
+
+class InvalidFileTest
+    : public ::libvpx_test::DecoderTest,
+      public ::libvpx_test::CodecTestWithParam<const char*> {
+ protected:
+  InvalidFileTest() : DecoderTest(GET_PARAM(0)), res_file_(NULL) {}
+
+  virtual ~InvalidFileTest() {
+    if (res_file_ != NULL)
+      fclose(res_file_);
+  }
+
+  void OpenResFile(const std::string &res_file_name_) {
+    res_file_ = libvpx_test::OpenTestDataFile(res_file_name_);
+    ASSERT_TRUE(res_file_ != NULL) << "Result file open failed. Filename: "
+        << res_file_name_;
+  }
+
+  virtual bool HandleDecodeResult(
+      const vpx_codec_err_t res_dec,
+      const libvpx_test::CompressedVideoSource &video,
+      libvpx_test::Decoder *decoder) {
+    EXPECT_TRUE(res_file_ != NULL);
+    int expected_res_dec;
+
+    // Read integer result.
+    const int res = fscanf(res_file_, "%d", &expected_res_dec);
+    EXPECT_NE(res, EOF) << "Read result data failed";
+
+    // Check results match.
+    EXPECT_EQ(expected_res_dec, res_dec)
+        << "Results don't match: frame number = " << video.frame_number();
+
+    return !HasFailure();
+  }
+
+ private:
+  FILE *res_file_;
+};
+
+TEST_P(InvalidFileTest, ReturnCode) {
+  const std::string filename = GET_PARAM(1);
+  libvpx_test::CompressedVideoSource *video = NULL;
+
+  // Open compressed video file.
+  if (filename.substr(filename.length() - 3, 3) == "ivf") {
+    video = new libvpx_test::IVFVideoSource(filename);
+  } else if (filename.substr(filename.length() - 4, 4) == "webm") {
+#if CONFIG_WEBM_IO
+    video = new libvpx_test::WebMVideoSource(filename);
+#else
+    fprintf(stderr, "WebM IO is disabled, skipping test vector %s\n",
+            filename.c_str());
+    return;
+#endif
+  }
+  video->Init();
+
+  // Construct result file name. The file holds a list of expected integer
+  // results, one for each decoded frame.  Any result that doesn't match
+  // the files list will cause a test failure.
+  const std::string res_filename = filename + ".res";
+  OpenResFile(res_filename);
+
+  // Decode frame, and check the md5 matching.
+  ASSERT_NO_FATAL_FAILURE(RunLoop(video));
+  delete video;
+}
+
+const char *const kVP9InvalidFileTests[] = {
+  "invalid-vp90-01.webm",
+  "invalid-vp90-02.webm",
+  "invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.ivf",
+  "invalid-vp90-03-v3.webm",
+  "invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf",
+  "invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf",
+};
+
+#define NELEMENTS(x) static_cast<int>(sizeof(x) / sizeof(x[0]))
+
+VP9_INSTANTIATE_TEST_CASE(InvalidFileTest,
+                          ::testing::ValuesIn(kVP9InvalidFileTests,
+                                              kVP9InvalidFileTests +
+                                              NELEMENTS(kVP9InvalidFileTests)));
+
+}  // namespace
--- a/test/md5_helper.h
+++ b/test/md5_helper.h
@@ -28,10 +28,11 @@ class MD5 {
      // plane, we never want to round down and thus skip a pixel so if
      // we are shifting by 1 (chroma_shift) we add 1 before doing the shift.
      // This works only for chroma_shift of 0 and 1.
+      const int bytes_per_sample = (img->fmt & VPX_IMG_FMT_HIGH) ? 2 : 1;
      const int h = plane ? (img->d_h + img->y_chroma_shift) >>
                    img->y_chroma_shift : img->d_h;
-      const int w = plane ? (img->d_w + img->x_chroma_shift) >>
-                    img->x_chroma_shift : img->d_w;
+      const int w = (plane ? (img->d_w + img->x_chroma_shift) >>
+                     img->x_chroma_shift : img->d_w) * bytes_per_sample;

      for (int y = 0; y < h; ++y) {
        MD5Update(&md5_, buf, w);
--- a/test/postproc.sh
+++ b/test/postproc.sh
@@ -32,7 +32,10 @@ postproc() {
  local codec="$2"
  local output_file="${VPX_TEST_OUTPUT_DIR}/postproc_${codec}.raw"

-  [ -x "${decoder}" ] || return 1
+  if [ ! -x "${decoder}" ]; then
+    elog "${decoder} does not exist or is not executable."
+    return 1
+  fi

  eval "${decoder}" "${input_file}" "${output_file}" ${devnull}

--- a/test/resize_util.sh
+++ b/test/resize_util.sh
@@ -33,7 +33,10 @@ resize_util() {

  # resize_util is available only when CONFIG_SHARED is disabled.
  if [ -z "$(vpx_config_option_enabled CONFIG_SHARED)" ]; then
-    [ -x "${resizer}" ] || return 1
+    if [ ! -x "${resizer}" ]; then
+      elog "${resizer} does not exist or is not executable."
+      return 1
+    fi

    eval "${resizer}" "${YUV_RAW_INPUT}" \
        "${YUV_RAW_INPUT_WIDTH}x${YUV_RAW_INPUT_HEIGHT}" \
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -627,4 +627,24 @@ INSTANTIATE_TEST_CASE_P(SSE3, SADTest, ::testing::Values(
 #endif  // CONFIG_USE_X86INC
 #endif  // HAVE_SSSE3

+#if HAVE_AVX2
+#if CONFIG_VP9_ENCODER
+// TODO(jzern): these prototypes can be removed after the avx2 versions are
+// reenabled in vp9_rtcd_defs.pl.
+extern "C" {
+void vp9_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride,
+                          const uint8_t *const ref_ptr[], int ref_stride,
+                          unsigned int *sad_array);
+void vp9_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride,
+                          const uint8_t *const ref_ptr[], int ref_stride,
+                          unsigned int *sad_array);
+}
+const sad_n_by_n_by_4_fn_t sad_64x64x4d_avx2 = vp9_sad64x64x4d_avx2;
+const sad_n_by_n_by_4_fn_t sad_32x32x4d_avx2 = vp9_sad32x32x4d_avx2;
+INSTANTIATE_TEST_CASE_P(DISABLED_AVX2, SADx4Test, ::testing::Values(
+                        make_tuple(32, 32, sad_32x32x4d_avx2),
+                        make_tuple(64, 64, sad_64x64x4d_avx2)));
+#endif  // CONFIG_VP9_ENCODER
+#endif  // HAVE_AVX2
+
 }  // namespace
--- a/test/simple_decoder.sh
+++ b/test/simple_decoder.sh
@@ -32,7 +32,10 @@ simple_decoder() {
  local codec="$2"
  local output_file="${VPX_TEST_OUTPUT_DIR}/simple_decoder_${codec}.raw"

-  [ -x "${decoder}" ] || return 1
+  if [ ! -x "${decoder}" ]; then
+    elog "${decoder} does not exist or is not executable."
+    return 1
+  fi

  eval "${decoder}" "${input_file}" "${output_file}" ${devnull}

--- a/test/simple_encoder.sh
+++ b/test/simple_encoder.sh
@@ -29,7 +29,10 @@ simple_encoder() {
  local codec="$1"
  local output_file="${VPX_TEST_OUTPUT_DIR}/simple_encoder_${codec}.ivf"

-  [ -x "${encoder}" ] || return 1
+  if [ ! -x "${encoder}" ]; then
+    elog "${encoder} does not exist or is not executable."
+    return 1
+  fi

  eval "${encoder}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \
      "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" 9999 \
--- a/test/svc_test.cc
+++ b/test/svc_test.cc
@@ -31,7 +31,6 @@ class SvcTest : public ::testing::Test {
  SvcTest()
      : codec_iface_(0),
        test_file_name_("hantro_collage_w352h288.yuv"),
-        stats_file_name_("hantro_collage_w352h288.stat"),
        codec_initialized_(false),
        decoder_(0) {
    memset(&svc_, 0, sizeof(svc_));
@@ -42,7 +41,6 @@ class SvcTest : public ::testing::Test {
  virtual ~SvcTest() {}

  virtual void SetUp() {
-    svc_.encoding_mode = INTER_LAYER_PREDICTION_IP;
    svc_.log_level = SVC_LOG_DEBUG;
    svc_.log_print = 0;

@@ -74,7 +72,6 @@ class SvcTest : public ::testing::Test {
  struct vpx_codec_enc_cfg codec_enc_;
  vpx_codec_iface_t *codec_iface_;
  std::string test_file_name_;
-  std::string stats_file_name_;
  bool codec_initialized_;
  Decoder *decoder_;
 };
@@ -133,22 +130,13 @@ TEST_F(SvcTest, SetLayersOption) {
  EXPECT_EQ(3, svc_.spatial_layers);
 }

-TEST_F(SvcTest, SetEncodingMode) {
-  vpx_codec_err_t res = vpx_svc_set_options(&svc_, "encoding-mode=alt-ip");
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  codec_initialized_ = true;
-  EXPECT_EQ(ALT_INTER_LAYER_PREDICTION_IP, svc_.encoding_mode);
-}
-
 TEST_F(SvcTest, SetMultipleOptions) {
-  vpx_codec_err_t res = vpx_svc_set_options(&svc_, "layers=2 encoding-mode=ip");
+  vpx_codec_err_t res =
+      vpx_svc_set_options(&svc_, "layers=2 scale-factors=1/3,2/3");
  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
  EXPECT_EQ(VPX_CODEC_OK, res);
  codec_initialized_ = true;
  EXPECT_EQ(2, svc_.spatial_layers);
-  EXPECT_EQ(INTER_LAYER_PREDICTION_IP, svc_.encoding_mode);
 }

 TEST_F(SvcTest, SetScaleFactorsOption) {
@@ -179,48 +167,20 @@ TEST_F(SvcTest, SetQuantizersOption) {
  codec_initialized_ = true;
 }

-TEST_F(SvcTest, SetKeyFrameQuantizersOption) {
-  svc_.spatial_layers = 2;
-  vpx_codec_err_t res = vpx_svc_set_options(&svc_,
-                                       "quantizers-keyframe=not-quantizers");
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
-  vpx_svc_set_options(&svc_, "quantizers-keyframe=40,45");
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  codec_initialized_ = true;
-}
-
 TEST_F(SvcTest, SetQuantizers) {
-  vpx_codec_err_t res = vpx_svc_set_quantizers(NULL, "40,30", 0);
+  vpx_codec_err_t res = vpx_svc_set_quantizers(NULL, "40,30");
  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);

-  res = vpx_svc_set_quantizers(&svc_, NULL, 0);
+  res = vpx_svc_set_quantizers(&svc_, NULL);
  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);

  svc_.spatial_layers = 2;
-  res = vpx_svc_set_quantizers(&svc_, "40", 0);
+  res = vpx_svc_set_quantizers(&svc_, "40");
  EXPECT_EQ(VPX_CODEC_OK, res);
  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);

-  res = vpx_svc_set_quantizers(&svc_, "40,30", 0);
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  codec_initialized_ = true;
-}
-
-TEST_F(SvcTest, SetKeyFrameQuantizers) {
-  vpx_codec_err_t res = vpx_svc_set_quantizers(NULL, "40,31", 1);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
-  res = vpx_svc_set_quantizers(&svc_, NULL, 1);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
-  res = vpx_svc_set_quantizers(&svc_, "40,30", 1);
+  res = vpx_svc_set_quantizers(&svc_, "40,30");
  EXPECT_EQ(VPX_CODEC_OK, res);
  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
  EXPECT_EQ(VPX_CODEC_OK, res);
@@ -251,7 +211,7 @@ TEST_F(SvcTest, SetScaleFactors) {
 TEST_F(SvcTest, FirstFrameHasLayers) {
  svc_.spatial_layers = 2;
  vpx_svc_set_scale_factors(&svc_, "4/16,16/16");
-  vpx_svc_set_quantizers(&svc_, "40,30", 0);
+  vpx_svc_set_quantizers(&svc_, "40,30");

  vpx_codec_err_t res =
      vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
@@ -267,9 +227,17 @@ TEST_F(SvcTest, FirstFrameHasLayers) {
                       video.duration(), VPX_DL_GOOD_QUALITY);
  EXPECT_EQ(VPX_CODEC_OK, res);

+  if (vpx_svc_get_frame_size(&svc_) == 0) {
+    // Flush encoder
+    res = vpx_svc_encode(&svc_, &codec_, NULL, 0,
+                         video.duration(), VPX_DL_GOOD_QUALITY);
+    EXPECT_EQ(VPX_CODEC_OK, res);
+  }
+
+  int frame_size = vpx_svc_get_frame_size(&svc_);
+  EXPECT_GT(frame_size, 0);
  const vpx_codec_err_t res_dec = decoder_->DecodeFrame(
-      static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)),
-      vpx_svc_get_frame_size(&svc_));
+      static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);

  // this test fails with a decoder error
  ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
@@ -278,7 +246,10 @@ TEST_F(SvcTest, FirstFrameHasLayers) {
 TEST_F(SvcTest, EncodeThreeFrames) {
  svc_.spatial_layers = 2;
  vpx_svc_set_scale_factors(&svc_, "4/16,16/16");
-  vpx_svc_set_quantizers(&svc_, "40,30", 0);
+  vpx_svc_set_quantizers(&svc_, "40,30");
+  int decoded_frames = 0;
+  vpx_codec_err_t res_dec;
+  int frame_size;

  vpx_codec_err_t res =
      vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
@@ -293,13 +264,14 @@ TEST_F(SvcTest, EncodeThreeFrames) {
  // This frame is a keyframe.
  res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
                       video.duration(), VPX_DL_GOOD_QUALITY);
-  ASSERT_EQ(VPX_CODEC_OK, res);
-  EXPECT_EQ(1, vpx_svc_is_keyframe(&svc_));

-  vpx_codec_err_t res_dec = decoder_->DecodeFrame(
-      static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)),
-      vpx_svc_get_frame_size(&svc_));
-  ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
+  if ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
+    EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
+    res_dec = decoder_->DecodeFrame(
+        static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
+    ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
+    ++decoded_frames;
+  }

  // FRAME 1
  video.Next();
@@ -307,12 +279,14 @@ TEST_F(SvcTest, EncodeThreeFrames) {
  res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
                       video.duration(), VPX_DL_GOOD_QUALITY);
  ASSERT_EQ(VPX_CODEC_OK, res);
-  EXPECT_EQ(0, vpx_svc_is_keyframe(&svc_));

-  res_dec = decoder_->DecodeFrame(
-      static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)),
-      vpx_svc_get_frame_size(&svc_));
-  ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
+  if ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
+    EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
+    res_dec = decoder_->DecodeFrame(
+        static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
+    ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
+    ++decoded_frames;
+  }

  // FRAME 2
  video.Next();
@@ -320,18 +294,35 @@ TEST_F(SvcTest, EncodeThreeFrames) {
  res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
                       video.duration(), VPX_DL_GOOD_QUALITY);
  ASSERT_EQ(VPX_CODEC_OK, res);
-  EXPECT_EQ(0, vpx_svc_is_keyframe(&svc_));

-  res_dec = decoder_->DecodeFrame(
-      static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)),
-      vpx_svc_get_frame_size(&svc_));
-  ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
+  if ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
+    EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
+    res_dec = decoder_->DecodeFrame(
+        static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
+    ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
+    ++decoded_frames;
+  }
+
+  // Flush encoder
+  res = vpx_svc_encode(&svc_, &codec_, NULL, 0,
+                       video.duration(), VPX_DL_GOOD_QUALITY);
+  EXPECT_EQ(VPX_CODEC_OK, res);
+
+  while ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
+    EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
+    res_dec = decoder_->DecodeFrame(
+        static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
+    ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
+    ++decoded_frames;
+  }
+
+  EXPECT_EQ(decoded_frames, 3);
 }

 TEST_F(SvcTest, GetLayerResolution) {
  svc_.spatial_layers = 2;
  vpx_svc_set_scale_factors(&svc_, "4/16,8/16");
-  vpx_svc_set_quantizers(&svc_, "40,30", 0);
+  vpx_svc_set_quantizers(&svc_, "40,30");

  vpx_codec_err_t res =
      vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
@@ -364,11 +355,13 @@ TEST_F(SvcTest, GetLayerResolution) {
  EXPECT_EQ(kHeight * 8 / 16, layer_height);
 }

-TEST_F(SvcTest, FirstPassEncode) {
+TEST_F(SvcTest, TwoPassEncode) {
+  // First pass encode
+  std::string stats_buf;
  svc_.spatial_layers = 2;
  codec_enc_.g_pass = VPX_RC_FIRST_PASS;
  vpx_svc_set_scale_factors(&svc_, "4/16,16/16");
-  vpx_svc_set_quantizers(&svc_, "40,30", 0);
+  vpx_svc_set_quantizers(&svc_, "40,30");

  vpx_codec_err_t res =
      vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
@@ -383,62 +376,61 @@ TEST_F(SvcTest, FirstPassEncode) {
  res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
                       video.duration(), VPX_DL_GOOD_QUALITY);
  ASSERT_EQ(VPX_CODEC_OK, res);
-  EXPECT_GT(vpx_svc_get_rc_stats_buffer_size(&svc_), 0U);
+  size_t stats_size = vpx_svc_get_rc_stats_buffer_size(&svc_);
+  EXPECT_GT(stats_size, 0U);
+  const char *stats_data = vpx_svc_get_rc_stats_buffer(&svc_);
+  ASSERT_TRUE(stats_data != NULL);
+  stats_buf.append(stats_data, stats_size);

  // FRAME 1
  video.Next();
  res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
                       video.duration(), VPX_DL_GOOD_QUALITY);
-  ASSERT_EQ(VPX_CODEC_OK, res);
-  EXPECT_GT(vpx_svc_get_rc_stats_buffer_size(&svc_), 0U);
+  stats_size = vpx_svc_get_rc_stats_buffer_size(&svc_);
+  EXPECT_GT(stats_size, 0U);
+  stats_data = vpx_svc_get_rc_stats_buffer(&svc_);
+  ASSERT_TRUE(stats_data != NULL);
+  stats_buf.append(stats_data, stats_size);

  // Flush encoder and test EOS packet
  res = vpx_svc_encode(&svc_, &codec_, NULL, video.pts(),
                       video.duration(), VPX_DL_GOOD_QUALITY);
-  ASSERT_EQ(VPX_CODEC_OK, res);
-  EXPECT_GT(vpx_svc_get_rc_stats_buffer_size(&svc_), 0U);
-}
+  stats_size = vpx_svc_get_rc_stats_buffer_size(&svc_);
+  EXPECT_GT(stats_size, 0U);
+  stats_data = vpx_svc_get_rc_stats_buffer(&svc_);
+  ASSERT_TRUE(stats_data != NULL);
+  stats_buf.append(stats_data, stats_size);

-TEST_F(SvcTest, SecondPassEncode) {
-  svc_.spatial_layers = 2;
+  // Tear down encoder
+  vpx_svc_release(&svc_);
+  vpx_codec_destroy(&codec_);
+
+  // Second pass encode
+  int decoded_frames = 0;
+  vpx_codec_err_t res_dec;
+  int frame_size;
  codec_enc_.g_pass = VPX_RC_LAST_PASS;
+  codec_enc_.rc_twopass_stats_in.buf = &stats_buf[0];
+  codec_enc_.rc_twopass_stats_in.sz = stats_buf.size();

-  FILE *const stats_file = libvpx_test::OpenTestDataFile(stats_file_name_);
-  ASSERT_TRUE(stats_file != NULL) << "Stats file open failed. Filename: "
-      << stats_file;
-
-  struct vpx_fixed_buf stats_buf;
-  fseek(stats_file, 0, SEEK_END);
-  stats_buf.sz = static_cast<size_t>(ftell(stats_file));
-  fseek(stats_file, 0, SEEK_SET);
-
-  stats_buf.buf = malloc(stats_buf.sz);
-  ASSERT_TRUE(stats_buf.buf != NULL);
-  const size_t bytes_read = fread(stats_buf.buf, 1, stats_buf.sz, stats_file);
-  ASSERT_EQ(bytes_read, stats_buf.sz);
-  fclose(stats_file);
-  codec_enc_.rc_twopass_stats_in = stats_buf;
-
-  vpx_codec_err_t res =
-      vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
+  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
  ASSERT_EQ(VPX_CODEC_OK, res);
  codec_initialized_ = true;

-  libvpx_test::I420VideoSource video(test_file_name_, kWidth, kHeight,
-                                     codec_enc_.g_timebase.den,
-                                     codec_enc_.g_timebase.num, 0, 30);
  // FRAME 0
  video.Begin();
  // This frame is a keyframe.
  res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
                       video.duration(), VPX_DL_GOOD_QUALITY);
  ASSERT_EQ(VPX_CODEC_OK, res);
-  EXPECT_EQ(1, vpx_svc_is_keyframe(&svc_));

-  vpx_codec_err_t res_dec = decoder_->DecodeFrame(
-      static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)),
-      vpx_svc_get_frame_size(&svc_));
-  ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
+  if ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
+    EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
+    res_dec = decoder_->DecodeFrame(
+        static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
+    ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
+    ++decoded_frames;
+  }

  // FRAME 1
  video.Next();
@@ -446,12 +438,14 @@ TEST_F(SvcTest, SecondPassEncode) {
  res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
                       video.duration(), VPX_DL_GOOD_QUALITY);
  ASSERT_EQ(VPX_CODEC_OK, res);
-  EXPECT_EQ(0, vpx_svc_is_keyframe(&svc_));

-  res_dec = decoder_->DecodeFrame(
-      static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)),
-      vpx_svc_get_frame_size(&svc_));
-  ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
+  if ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
+    EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
+    res_dec = decoder_->DecodeFrame(
+        static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
+    ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
+    ++decoded_frames;
+  }

  // FRAME 2
  video.Next();
@@ -459,14 +453,29 @@ TEST_F(SvcTest, SecondPassEncode) {
  res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
                       video.duration(), VPX_DL_GOOD_QUALITY);
  ASSERT_EQ(VPX_CODEC_OK, res);
-  EXPECT_EQ(0, vpx_svc_is_keyframe(&svc_));

-  res_dec = decoder_->DecodeFrame(
-      static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)),
-      vpx_svc_get_frame_size(&svc_));
-  ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
+  if ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
+    EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
+    res_dec = decoder_->DecodeFrame(
+        static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
+    ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
+    ++decoded_frames;
+  }

-  free(stats_buf.buf);
+  // Flush encoder
+  res = vpx_svc_encode(&svc_, &codec_, NULL, 0,
+                       video.duration(), VPX_DL_GOOD_QUALITY);
+  EXPECT_EQ(VPX_CODEC_OK, res);
+
+  while ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
+    EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
+    res_dec = decoder_->DecodeFrame(
+        static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
+    ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
+    ++decoded_frames;
+  }
+
+  EXPECT_EQ(decoded_frames, 3);
 }

 }  // namespace
--- a/test/test-data.sha1
+++ b/test/test-data.sha1
@@ -1,6 +1,20 @@
 d5dfb0151c9051f8c85999255645d7a23916d3c0  hantro_collage_w352h288.yuv
-998cec53307c94aa5835aaf8d5731f6a3c7c2e5a  hantro_collage_w352h288.stat
 b87815bf86020c592ccc7a846ba2e28ec8043902  hantro_odd.yuv
+fe346136b9b8c1e6f6084cc106485706915795e4  invalid-vp90-01.webm
+25751f5d3b05ff03f0719ad42cd625348eb8961e  invalid-vp90-01.webm.res
+d78e2fceba5ac942246503ec8366f879c4775ca5  invalid-vp90-02.webm
+2dadee5306245fa5eeb0f99652d0e17afbcba96d  invalid-vp90-02.webm.res
+df1a1453feb3c00d7d89746c7003b4163523bff3  invalid-vp90-03-v3.webm
+4935c62becc68c13642a03db1e6d3e2331c1c612  invalid-vp90-03-v3.webm.res
+a432f96ff0a787268e2f94a8092ab161a18d1b06  park_joy_90p_10_420.y4m
+0b194cc312c3a2e84d156a221b0a5eb615dfddc5  park_joy_90p_10_422.y4m
+ff0e0a21dc2adc95b8c1b37902713700655ced17  park_joy_90p_10_444.y4m
+614c32ae1eca391e867c70d19974f0d62664dd99  park_joy_90p_12_420.y4m
+c92825f1ea25c5c37855083a69faac6ac4641a9e  park_joy_90p_12_422.y4m
+b592189b885b6cc85db55cc98512a197d73d3b34  park_joy_90p_12_444.y4m
+4e0eb61e76f0684188d9bc9f3ce61f6b6b77bb2c  park_joy_90p_8_420.y4m
+7a193ff7dfeb96ba5f82b2afd7afa9e1fe83d947  park_joy_90p_8_422.y4m
+bdb7856e6bc93599bdda05c2e773a9f22b6c6d03  park_joy_90p_8_444.y4m
 b1f1c3ec79114b9a0651af24ce634afb44a9a419  rush_hour_444.y4m
 5184c46ddca8b1fadd16742e8500115bc8f749da  vp80-00-comprehensive-001.ivf
 65bf1bbbced81b97bd030f376d1b7f61a224793f  vp80-00-comprehensive-002.ivf
@@ -530,8 +544,6 @@ b6524e4084d15b5d0caaa3d3d1368db30cbee69c  vp90-2-03-deltaq.webm
 7f6d8879336239a43dbb6c9f13178cb11cf7ed09  vp90-2-05-resize.ivf.md5
 bf61ddc1f716eba58d4c9837d4e91031d9ce4ffe  vp90-2-06-bilinear.webm
 f6235f937552e11d8eb331ec55da6b3aa596b9ac  vp90-2-06-bilinear.webm.md5
-495256cfd123fe777b2c0406862ed8468a1f4677  vp91-2-04-yv444.webm
-65e3a7ffef61ab340d9140f335ecc49125970c2c  vp91-2-04-yv444.webm.md5
 0c83a1e414fde3bccd6dc451bbaee68e59974c76  vp90-2-07-frame_parallel.webm
 e5c2c9fb383e5bf3b563480adaeba5b7e3475ecd  vp90-2-07-frame_parallel.webm.md5
 086c7edcffd699ae7d99d710fd7e53b18910ca5b  vp90-2-08-tile_1x2_frame_parallel.webm
@@ -577,6 +589,8 @@ d48c5db1b0f8e60521a7c749696b8067886033a3  vp90-2-09-aq2.webm
 54638c38009198c38c8f3b25c182b709b6c1fd2e  vp90-2-09-lf_deltas.webm.md5
 510d95f3beb3b51c572611fdaeeece12277dac30  vp90-2-10-show-existing-frame.webm
 14d631096f4bfa2d71f7f739aec1448fb3c33bad  vp90-2-10-show-existing-frame.webm.md5
+d2feea7728e8d2c615981d0f47427a4a5a45d881  vp90-2-10-show-existing-frame2.webm
+5f7c7811baa3e4f03be1dd78c33971b727846821  vp90-2-10-show-existing-frame2.webm.md5
 b4318e75f73a6a08992c7326de2fb589c2a794c7  vp90-2-11-size-351x287.webm
 b3c48382cf7d0454e83a02497c229d27720f9e20  vp90-2-11-size-351x287.webm.md5
 8e0096475ea2535bac71d3e2fc09e0c451c444df  vp90-2-11-size-351x288.webm
@@ -639,4 +653,20 @@ e615575ded499ea1d992f3b38e3baa434509cdcd  vp90-2-15-segkey.webm
 e3ab35d4316c5e81325c50f5236ceca4bc0d35df  vp90-2-15-segkey.webm.md5
 9b7ca2cac09d34c4a5d296c1900f93b1e2f69d0d  vp90-2-15-segkey_adpq.webm
 8f46ba5f785d0c2170591a153e0d0d146a7c8090  vp90-2-15-segkey_adpq.webm.md5
-
+0321d507ce62dedc8a51b4e9011f7a19aed9c3dc  vp91-2-04-yuv444.webm
+367e423dd41fdb49aa028574a2cfec5c2f325c5c  vp91-2-04-yuv444.webm.md5
+76024eb753cdac6a5e5703aaea189d35c3c30ac7  invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.ivf
+d3964f9dad9f60363c81b688324d95b4ec7c8038  invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.ivf.res
+83f50908c8dc0ef8760595447a2ff7727489542e  invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf
+456d1493e52d32a5c30edf44a27debc1fa6b253a  invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf.res
+c123d1f9f02fb4143abb5e271916e3a3080de8f6  invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf
+456d1493e52d32a5c30edf44a27debc1fa6b253a  invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf.res
+f97088c7359fc8d3d5aa5eafe57bc7308b3ee124  vp90-2-20-big_superframe-01.webm
+47d7d409785afa33b123376de0c907336e6c7bd7  vp90-2-20-big_superframe-01.webm.md5
+65ade6d2786209582c50d34cfe22b3cdb033abaf  vp90-2-20-big_superframe-02.webm
+7c0ed8d04c4d06c5411dd2e5de2411d37f092db5  vp90-2-20-big_superframe-02.webm.md5
+667ec8718c982aef6be07eb94f083c2efb9d2d16  vp90-2-07-frame_parallel-1.webm
+bfc82bf848e9c05020d61e3ffc1e62f25df81d19  vp90-2-07-frame_parallel-1.webm.md5
+efd5a51d175cfdacd169ed23477729dc558030dc  invalid-vp90-2-07-frame_parallel-1.webm
+9f912712ec418be69adb910e2ca886a63c4cec08  invalid-vp90-2-07-frame_parallel-2.webm
+445f5a53ca9555341852997ccdd480a51540bd14  invalid-vp90-2-07-frame_parallel-3.webm
--- a/test/test.mk
+++ b/test/test.mk
@@ -15,7 +15,7 @@ LIBVPX_TEST_SRCS-yes += video_source.h
 ##
 ## Black box tests only use the public API.
 ##
-LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += ../md5_utils.h ../md5_utils.c
+LIBVPX_TEST_SRCS-yes                   += ../md5_utils.h ../md5_utils.c
 LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += ivf_video_source.h
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += ../y4minput.h ../y4minput.c
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += aq_segment_test.cc
@@ -30,6 +30,8 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += cq_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += keyframe_test.cc

 LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += external_frame_buffer_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += user_priv_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_frame_parallel_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += active_map_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += borders_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += cpu_speed_test.cc
@@ -41,6 +43,9 @@ LIBVPX_TEST_SRCS-yes                   += decode_test_driver.h
 LIBVPX_TEST_SRCS-yes                   += encode_test_driver.cc
 LIBVPX_TEST_SRCS-yes                   += encode_test_driver.h

+## Y4m parsing.
+LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += y4m_test.cc ../y4menc.c ../y4menc.h
+
 ## WebM Parsing
 ifeq ($(CONFIG_WEBM_IO), yes)
 LIBWEBM_PARSER_SRCS                    += ../third_party/libwebm/mkvparser.cpp
@@ -54,6 +59,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += ../webmdec.h
 LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += webm_video_source.h
 endif

+LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += invalid_file_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += test_vector_test.cc

 # Currently we only support decoder perf tests for vp9. Also they read from WebM
@@ -131,9 +137,20 @@ endif # CONFIG_SHARED
 ## TEST DATA
 ##
 LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += hantro_collage_w352h288.yuv
-LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += hantro_collage_w352h288.stat
 LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += hantro_odd.yuv
+
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_420.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_422.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_444.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_420.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_422.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_444.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_420.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_422.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_444.y4m
+
 LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rush_hour_444.y4m
+LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += screendata.y4m

 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-001.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-001.ivf.md5
@@ -667,6 +684,8 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-06-bilinear.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-06-bilinear.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-07-frame_parallel.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-07-frame_parallel.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-07-frame_parallel-1.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-07-frame_parallel-1.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile-4x1.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile-4x1.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile-4x4.webm
@@ -691,6 +710,8 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-subpixel-00.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-subpixel-00.ivf.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-10-show-existing-frame.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-10-show-existing-frame.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-10-show-existing-frame2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-10-show-existing-frame2.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-11-size-351x287.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-11-size-351x287.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-11-size-351x288.webm
@@ -705,8 +726,6 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-12-droppable_3.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-12-droppable_3.ivf.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-13-largescaling.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-13-largescaling.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yv444.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yv444.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-2.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-2.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-4.webm
@@ -755,6 +774,29 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey_adpq.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey_adpq.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yuv444.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yuv444.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-20-big_superframe-01.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-20-big_superframe-01.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-20-big_superframe-02.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-20-big_superframe-02.webm.md5
+
+# Invalid files for testing libvpx error checking.
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01.webm.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-02.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-02.webm.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-03-v3.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-03-v3.webm.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-07-frame_parallel-1.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-07-frame_parallel-2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-07-frame_parallel-3.webm

 ifeq ($(CONFIG_DECODE_PERF_TESTS),yes)
 # BBB VP9 streams
--- a/test/test_vector_test.cc
+++ b/test/test_vector_test.cc
@@ -12,6 +12,7 @@
 #include <cstdlib>
 #include <string>
 #include "third_party/googletest/src/include/gtest/gtest.h"
+#include "../tools_common.h"
 #include "./vpx_config.h"
 #include "test/codec_factory.h"
 #include "test/decode_test_driver.h"
@@ -26,10 +27,24 @@

 namespace {

+enum DecodeMode {
+  kSerialMode,
+  kFrameParallMode
+};
+
+const int kDecodeMode = 0;
+const int kThreads = 1;
+const int kFileName = 2;
+
+typedef std::tr1::tuple<int, int, const char *> DecodeParam;
+
 class TestVectorTest : public ::libvpx_test::DecoderTest,
-    public ::libvpx_test::CodecTestWithParam<const char*> {
+    public ::libvpx_test::CodecTestWithParam<DecodeParam> {
 protected:
-  TestVectorTest() : DecoderTest(GET_PARAM(0)), md5_file_(NULL) {}
+  TestVectorTest()
+      : DecoderTest(GET_PARAM(0)),
+        md5_file_(NULL) {
+  }

  virtual ~TestVectorTest() {
    if (md5_file_)
@@ -71,8 +86,25 @@ class TestVectorTest : public ::libvpx_test::DecoderTest,
 // checksums match the correct md5 data, then the test is passed. Otherwise,
 // the test failed.
 TEST_P(TestVectorTest, MD5Match) {
-  const std::string filename = GET_PARAM(1);
+  const DecodeParam input = GET_PARAM(1);
+  const std::string filename = std::tr1::get<kFileName>(input);
+  const int threads = std::tr1::get<kThreads>(input);
+  const int mode = std::tr1::get<kDecodeMode>(input);
  libvpx_test::CompressedVideoSource *video = NULL;
+  vpx_codec_flags_t flags = 0;
+  vpx_codec_dec_cfg_t cfg = {0};
+  char str[256];
+
+  if (mode == kFrameParallMode) {
+    flags |= VPX_CODEC_USE_FRAME_THREADING;
+  }
+
+  cfg.threads = threads;
+
+  snprintf(str, sizeof(str) / sizeof(str[0]) - 1,
+           "file: %s  mode: %s threads: %d",
+           filename.c_str(), mode == 0 ? "Serial" : "Parallel", threads);
+  SCOPED_TRACE(str);

  // Open compressed video file.
  if (filename.substr(filename.length() - 3, 3) == "ivf") {
@@ -92,18 +124,53 @@ TEST_P(TestVectorTest, MD5Match) {
  const std::string md5_filename = filename + ".md5";
  OpenMD5File(md5_filename);

+  // Set decode config and flags.
+  set_cfg(cfg);
+  set_flags(flags);
+
  // Decode frame, and check the md5 matching.
  ASSERT_NO_FATAL_FAILURE(RunLoop(video));
  delete video;
 }

-VP8_INSTANTIATE_TEST_CASE(TestVectorTest,
-                          ::testing::ValuesIn(libvpx_test::kVP8TestVectors,
-                                              libvpx_test::kVP8TestVectors +
-                                              libvpx_test::kNumVP8TestVectors));
-VP9_INSTANTIATE_TEST_CASE(TestVectorTest,
-                          ::testing::ValuesIn(libvpx_test::kVP9TestVectors,
-                                              libvpx_test::kVP9TestVectors +
-                                              libvpx_test::kNumVP9TestVectors));
+// Test VP8 decode in serial mode with single thread.
+// NOTE: VP8 only support serial mode.
+INSTANTIATE_TEST_CASE_P(
+    VP8, TestVectorTest,
+    ::testing::Combine(
+        ::testing::Values(
+            static_cast<const libvpx_test::CodecFactory *>(&libvpx_test::kVP8)),
+        ::testing::Combine(
+            ::testing::Values(0),  // Serial Mode.
+            ::testing::Values(1),  // Single thread.
+            ::testing::ValuesIn(libvpx_test::kVP8TestVectors,
+                                libvpx_test::kVP8TestVectors +
+                                    libvpx_test::kNumVP8TestVectors))));

+// Test VP9 decode in serial mode with single thread.
+INSTANTIATE_TEST_CASE_P(
+    VP9, TestVectorTest,
+    ::testing::Combine(
+        ::testing::Values(
+            static_cast<const libvpx_test::CodecFactory *>(&libvpx_test::kVP9)),
+        ::testing::Combine(
+            ::testing::Values(0),  // Serial Mode.
+            ::testing::Values(1),  // Single thread.
+            ::testing::ValuesIn(libvpx_test::kVP9TestVectors,
+                                libvpx_test::kVP9TestVectors +
+                                    libvpx_test::kNumVP9TestVectors))));
+
+
+// Test VP9 decode in frame parallel mode with different number of threads.
+INSTANTIATE_TEST_CASE_P(
+    VP9MultiThreadedFrameParallel, TestVectorTest,
+    ::testing::Combine(
+        ::testing::Values(
+            static_cast<const libvpx_test::CodecFactory *>(&libvpx_test::kVP9)),
+        ::testing::Combine(
+            ::testing::Values(1),        // Frame Parallel mode.
+            ::testing::Range(2, 9),      // With 2 ~ 8 threads.
+            ::testing::ValuesIn(libvpx_test::kVP9TestVectors,
+                                libvpx_test::kVP9TestVectors +
+                                    libvpx_test::kNumVP9TestVectors))));
 }  // namespace
--- a/test/test_vectors.cc
+++ b/test/test_vectors.cc
@@ -154,17 +154,19 @@ const char *const kVP9TestVectors[] = {
  "vp90-2-03-size-226x210.webm", "vp90-2-03-size-226x224.webm",
  "vp90-2-03-size-226x226.webm", "vp90-2-03-deltaq.webm",
  "vp90-2-05-resize.ivf", "vp90-2-06-bilinear.webm",
-  "vp90-2-07-frame_parallel.webm", "vp90-2-08-tile_1x2_frame_parallel.webm",
+  "vp90-2-07-frame_parallel.webm", "vp90-2-07-frame_parallel-1.webm",
+  "vp90-2-08-tile_1x2_frame_parallel.webm",
  "vp90-2-08-tile_1x2.webm", "vp90-2-08-tile_1x4_frame_parallel.webm",
  "vp90-2-08-tile_1x4.webm", "vp90-2-08-tile_1x8_frame_parallel.webm",
  "vp90-2-08-tile_1x8.webm", "vp90-2-08-tile-4x4.webm",
  "vp90-2-08-tile-4x1.webm", "vp90-2-09-subpixel-00.ivf",
  "vp90-2-02-size-lf-1920x1080.webm", "vp90-2-09-aq2.webm",
  "vp90-2-09-lf_deltas.webm", "vp90-2-10-show-existing-frame.webm",
+  "vp90-2-10-show-existing-frame2.webm",
  "vp90-2-11-size-351x287.webm", "vp90-2-11-size-351x288.webm",
  "vp90-2-11-size-352x287.webm", "vp90-2-12-droppable_1.ivf",
  "vp90-2-12-droppable_2.ivf", "vp90-2-12-droppable_3.ivf",
-  "vp90-2-13-largescaling.webm", "vp91-2-04-yv444.webm",
+  "vp90-2-13-largescaling.webm",
  "vp90-2-14-resize-fp-tiles-1-16.webm",
  "vp90-2-14-resize-fp-tiles-1-2-4-8-16.webm",
  "vp90-2-14-resize-fp-tiles-1-2.webm", "vp90-2-14-resize-fp-tiles-1-4.webm",
@@ -178,7 +180,9 @@ const char *const kVP9TestVectors[] = {
  "vp90-2-14-resize-fp-tiles-4-2.webm", "vp90-2-14-resize-fp-tiles-4-8.webm",
  "vp90-2-14-resize-fp-tiles-8-16.webm", "vp90-2-14-resize-fp-tiles-8-1.webm",
  "vp90-2-14-resize-fp-tiles-8-2.webm", "vp90-2-14-resize-fp-tiles-8-4.webm",
-  "vp90-2-15-segkey.webm", "vp90-2-15-segkey_adpq.webm"
+  "vp90-2-15-segkey.webm", "vp90-2-15-segkey_adpq.webm",
+  "vp91-2-04-yuv444.webm",
+  "vp90-2-20-big_superframe-01.webm", "vp90-2-20-big_superframe-02.webm",
 };
 const int kNumVP9TestVectors = NELEMENTS(kVP9TestVectors);
 #endif  // CONFIG_VP9_DECODER
--- a/test/tools_common.sh
+++ b/test/tools_common.sh
@@ -17,6 +17,10 @@ VPX_TEST_TOOLS_COMMON_SH=included
 set -e
 devnull='> /dev/null 2>&1'

+elog() {
+  echo "$@" 1>&2
+}
+
 vlog() {
  if [ "${VPX_TEST_VERBOSE_OUTPUT}" = "yes" ]; then
    echo "$@"
@@ -456,10 +460,19 @@ vlog "$(basename "${0%.*}") test configuration:
  LIBVPX_BIN_PATH=${LIBVPX_BIN_PATH}
  LIBVPX_CONFIG_PATH=${LIBVPX_CONFIG_PATH}
  LIBVPX_TEST_DATA_PATH=${LIBVPX_TEST_DATA_PATH}
-  VPX_TEST_OUTPUT_DIR=${VPX_TEST_OUTPUT_DIR}
-  VPX_TEST_VERBOSE_OUTPUT=${VPX_TEST_VERBOSE_OUTPUT}
+  VP8_IVF_FILE=${VP8_IVF_FILE}
+  VP9_IVF_FILE=${VP9_IVF_FILE}
+  VP9_WEBM_FILE=${VP9_WEBM_FILE}
+  VPX_TEST_EXE_SUFFIX=${VPX_TEST_EXE_SUFFIX}
  VPX_TEST_FILTER=${VPX_TEST_FILTER}
+  VPX_TEST_OUTPUT_DIR=${VPX_TEST_OUTPUT_DIR}
+  VPX_TEST_RAND=${VPX_TEST_RAND}
  VPX_TEST_RUN_DISABLED_TESTS=${VPX_TEST_RUN_DISABLED_TESTS}
-  VPX_TEST_SHOW_PROGRAM_OUTPUT=${VPX_TEST_SHOW_PROGRAM_OUTPUT}"
+  VPX_TEST_SHOW_PROGRAM_OUTPUT=${VPX_TEST_SHOW_PROGRAM_OUTPUT}
+  VPX_TEST_TEMP_ROOT=${VPX_TEST_TEMP_ROOT}
+  VPX_TEST_VERBOSE_OUTPUT=${VPX_TEST_VERBOSE_OUTPUT}
+  YUV_RAW_INPUT=${YUV_RAW_INPUT}
+  YUV_RAW_INPUT_WIDTH=${YUV_RAW_INPUT_WIDTH}
+  YUV_RAW_INPUT_HEIGHT=${YUV_RAW_INPUT_HEIGHT}"

 fi  # End $VPX_TEST_TOOLS_COMMON_SH pseudo include guard.
--- a/test/twopass_encoder.sh
+++ b/test/twopass_encoder.sh
@@ -29,7 +29,10 @@ twopass_encoder() {
  local codec="$1"
  local output_file="${VPX_TEST_OUTPUT_DIR}/twopass_encoder_${codec}.ivf"

-  [ -x "${encoder}" ] || return 1
+  if [ ! -x "${encoder}" ]; then
+    elog "${encoder} does not exist or is not executable."
+    return 1
+  fi

  eval "${encoder}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \
      "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" \
--- a/test/user_priv_test.cc
+++ b/test/user_priv_test.cc
@@ -0,0 +1,100 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "./vpx_config.h"
+#include "test/acm_random.h"
+#include "test/codec_factory.h"
+#include "test/decode_test_driver.h"
+#include "test/ivf_video_source.h"
+#include "test/md5_helper.h"
+#include "test/util.h"
+#if CONFIG_WEBM_IO
+#include "test/webm_video_source.h"
+#endif
+#include "vpx_mem/vpx_mem.h"
+#include "vpx/vp8.h"
+
+namespace {
+
+using std::string;
+using libvpx_test::ACMRandom;
+
+#if CONFIG_WEBM_IO
+
+void CheckUserPrivateData(void *user_priv, int *target) {
+  // actual pointer value should be the same as expected.
+  EXPECT_EQ(reinterpret_cast<void *>(target), user_priv) <<
+      "user_priv pointer value does not match.";
+}
+
+// Decodes |filename|. Passes in user_priv data when calling DecodeFrame and
+// compares the user_priv from return img with the original user_priv to see if
+// they match. Both the pointer values and the values inside the addresses
+// should match.
+string DecodeFile(const string &filename) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  libvpx_test::WebMVideoSource video(filename);
+  video.Init();
+
+  vpx_codec_dec_cfg_t cfg = {0};
+  libvpx_test::VP9Decoder decoder(cfg, 0);
+
+  libvpx_test::MD5 md5;
+  int frame_num = 0;
+  for (video.Begin(); !::testing::Test::HasFailure() && video.cxdata();
+       video.Next()) {
+    void *user_priv = reinterpret_cast<void *>(&frame_num);
+    const vpx_codec_err_t res =
+        decoder.DecodeFrame(video.cxdata(), video.frame_size(),
+                            (frame_num == 0) ? NULL : user_priv);
+    if (res != VPX_CODEC_OK) {
+      EXPECT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError();
+      break;
+    }
+    libvpx_test::DxDataIterator dec_iter = decoder.GetDxData();
+    const vpx_image_t *img = NULL;
+
+    // Get decompressed data.
+    while ((img = dec_iter.Next())) {
+      if (frame_num == 0) {
+        CheckUserPrivateData(img->user_priv, NULL);
+      } else {
+        CheckUserPrivateData(img->user_priv, &frame_num);
+
+        // Also test ctrl_get_reference api.
+        struct vp9_ref_frame ref;
+        // Randomly fetch a reference frame.
+        ref.idx = rnd.Rand8() % 3;
+        decoder.Control(VP9_GET_REFERENCE, &ref);
+
+        CheckUserPrivateData(ref.img.user_priv, NULL);
+      }
+      md5.Add(img);
+    }
+
+    frame_num++;
+  }
+  return string(md5.Get());
+}
+
+TEST(UserPrivTest, VideoDecode) {
+  // no tiles or frame parallel; this exercises the decoding to test the
+  // user_priv.
+  EXPECT_STREQ("b35a1b707b28e82be025d960aba039bc",
+               DecodeFile("vp90-2-03-size-226x226.webm").c_str());
+}
+
+#endif  // CONFIG_WEBM_IO
+
+}  // namespace
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -702,6 +702,57 @@ INSTANTIATE_TEST_CASE_P(
                      make_tuple(6, 6, subpel_avg_variance64x64_ssse3)));
 #endif
 #endif
+
+#if HAVE_AVX2
+// TODO(jzern): these prototypes can be removed after the avx2 versions are
+// reenabled in vp9_rtcd_defs.pl.
+extern "C" {
+unsigned int vp9_sub_pixel_variance32x32_avx2(
+    const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset,
+    const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance64x64_avx2(
+    const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset,
+    const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_avg_variance32x32_avx2(
+    const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset,
+    const uint8_t *ref_ptr, int ref_stride, unsigned int *sse,
+    const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance64x64_avx2(
+    const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset,
+    const uint8_t *ref_ptr, int ref_stride, unsigned int *sse,
+    const uint8_t *second_pred);
+}
+const vp9_variance_fn_t variance16x16_avx2 = vp9_variance16x16_avx2;
+const vp9_variance_fn_t variance32x16_avx2 = vp9_variance32x16_avx2;
+const vp9_variance_fn_t variance32x32_avx2 = vp9_variance32x32_avx2;
+const vp9_variance_fn_t variance64x32_avx2 = vp9_variance64x32_avx2;
+const vp9_variance_fn_t variance64x64_avx2 = vp9_variance64x64_avx2;
+INSTANTIATE_TEST_CASE_P(
+    AVX2, VP9VarianceTest,
+    ::testing::Values(make_tuple(4, 4, variance16x16_avx2),
+                      make_tuple(5, 4, variance32x16_avx2),
+                      make_tuple(5, 5, variance32x32_avx2),
+                      make_tuple(6, 5, variance64x32_avx2),
+                      make_tuple(6, 6, variance64x64_avx2)));
+
+const vp9_subpixvariance_fn_t subpel_variance32x32_avx2 =
+    vp9_sub_pixel_variance32x32_avx2;
+const vp9_subpixvariance_fn_t subpel_variance64x64_avx2 =
+    vp9_sub_pixel_variance64x64_avx2;
+INSTANTIATE_TEST_CASE_P(
+    DISABLED_AVX2, VP9SubpelVarianceTest,
+    ::testing::Values(make_tuple(5, 5, subpel_variance32x32_avx2),
+                      make_tuple(6, 6, subpel_variance64x64_avx2)));
+
+const vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_avx2 =
+    vp9_sub_pixel_avg_variance32x32_avx2;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_avx2 =
+    vp9_sub_pixel_avg_variance64x64_avx2;
+INSTANTIATE_TEST_CASE_P(
+    DISABLED_AVX2, VP9SubpelAvgVarianceTest,
+    ::testing::Values(make_tuple(5, 5, subpel_avg_variance32x32_avx2),
+                      make_tuple(6, 6, subpel_avg_variance64x64_avx2)));
+#endif  // HAVE_AVX2
 #endif  // CONFIG_VP9_ENCODER

 }  // namespace vp9
--- a/test/video_source.h
+++ b/test/video_source.h
@@ -50,6 +50,15 @@ static FILE *OpenTestDataFile(const std::string& file_name) {
  return fopen(path_to_source.c_str(), "rb");
 }

+static FILE *OpenTestOutFile(const std::string& file_name) {
+  const std::string path_to_source = GetDataPath() + "/" + file_name;
+  return fopen(path_to_source.c_str(), "wb");
+}
+
+static FILE *OpenTempOutFile() {
+  return tmpfile();
+}
+
 // Abstract base class for test video sources, which provide a stream of
 // vpx_image_t images with associated timestamps and duration.
 class VideoSource {
--- a/test/vp8cx_set_ref.sh
+++ b/test/vp8cx_set_ref.sh
@@ -34,7 +34,10 @@ vpx_set_ref() {
  local output_file="${VPX_TEST_OUTPUT_DIR}/vp8cx_set_ref_${codec}.ivf"
  local ref_frame_num=90

-  [ -x "${encoder}" ] || return 1
+  if [ ! -x "${encoder}" ]; then
+    elog "${encoder} does not exist or is not executable."
+    return 1
+  fi

  eval "${encoder}" "${YUV_RAW_INPUT_WIDTH}" "${YUV_RAW_INPUT_HEIGHT}" \
      "${YUV_RAW_INPUT}" "${output_file}" "${ref_frame_num}" \
--- a/test/vp9_frame_parallel_test.cc
+++ b/test/vp9_frame_parallel_test.cc
@@ -0,0 +1,208 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "./vpx_config.h"
+#include "test/codec_factory.h"
+#include "test/decode_test_driver.h"
+#include "test/ivf_video_source.h"
+#include "test/md5_helper.h"
+#include "test/util.h"
+#if CONFIG_WEBM_IO
+#include "test/webm_video_source.h"
+#endif
+#include "vpx_mem/vpx_mem.h"
+
+namespace {
+
+using std::string;
+
+#if CONFIG_WEBM_IO
+
+struct FileList {
+  const char *name;
+  // md5 sum for decoded frames which does not include skipped frames.
+  const char *expected_md5;
+  const int pause_frame_num;
+};
+
+// Decodes |filename| with |num_threads|. Pause at the specified frame_num,
+// seek to next key frame and then continue decoding until the end. Return
+// the md5 of the decoded frames which does not include skipped frames.
+string DecodeFile(const string &filename, int num_threads, int pause_num) {
+  libvpx_test::WebMVideoSource video(filename);
+  video.Init();
+  int in_frames = 0;
+  int out_frames = 0;
+
+  vpx_codec_dec_cfg_t cfg = {0};
+  cfg.threads = num_threads;
+  vpx_codec_flags_t flags = 0;
+  flags |= VPX_CODEC_USE_FRAME_THREADING;
+  libvpx_test::VP9Decoder decoder(cfg, flags, 0);
+
+  libvpx_test::MD5 md5;
+  video.Begin();
+
+  do {
+    ++in_frames;
+    const vpx_codec_err_t res =
+        decoder.DecodeFrame(video.cxdata(), video.frame_size());
+    if (res != VPX_CODEC_OK) {
+      EXPECT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError();
+      break;
+    }
+
+    // Pause at specified frame number.
+    if (in_frames == pause_num) {
+      // Flush the decoder and then seek to next key frame.
+      decoder.DecodeFrame(NULL, 0);
+      video.SeekToNextKeyFrame();
+    } else {
+      video.Next();
+    }
+
+    // Flush the decoder at the end of the video.
+    if (!video.cxdata())
+      decoder.DecodeFrame(NULL, 0);
+
+    libvpx_test::DxDataIterator dec_iter = decoder.GetDxData();
+    const vpx_image_t *img;
+
+    // Get decompressed data
+    while ((img = dec_iter.Next())) {
+      ++out_frames;
+      md5.Add(img);
+    }
+  } while (video.cxdata() != NULL);
+
+  EXPECT_EQ(in_frames, out_frames) <<
+      "Input frame count does not match output frame count";
+
+  return string(md5.Get());
+}
+
+void DecodeFiles(const FileList files[]) {
+  for (const FileList *iter = files; iter->name != NULL; ++iter) {
+    SCOPED_TRACE(iter->name);
+    for (int t = 2; t <= 8; ++t) {
+      EXPECT_EQ(iter->expected_md5,
+                DecodeFile(iter->name, t, iter->pause_frame_num))
+          << "threads = " << t;
+    }
+  }
+}
+
+TEST(VP9MultiThreadedFrameParallel, PauseSeekResume) {
+  // vp90-2-07-frame_parallel-1.webm is a 40 frame video file with
+  // one key frame for every ten frames.
+  static const FileList files[] = {
+    { "vp90-2-07-frame_parallel-1.webm",
+      "6ea7c3875d67252e7caf2bc6e75b36b1", 6},
+    { "vp90-2-07-frame_parallel-1.webm",
+      "4bb634160c7356a8d7d4299b6dc83a45", 12},
+    { "vp90-2-07-frame_parallel-1.webm",
+      "89772591e6ef461f9fa754f916c78ed8", 26},
+    { NULL, NULL, 0},
+  };
+  DecodeFiles(files);
+}
+
+struct InvalidFileList {
+  const char *name;
+  // md5 sum for decoded frames which does not include corrupted frames.
+  const char *expected_md5;
+  // Expected number of decoded frames which does not include corrupted frames.
+  const int expected_frame_count;
+};
+
+// Decodes |filename| with |num_threads|. Return the md5 of the decoded
+// frames which does not include corrupted frames.
+string DecodeInvalidFile(const string &filename, int num_threads,
+                         int expected_frame_count) {
+  libvpx_test::WebMVideoSource video(filename);
+  video.Init();
+
+  vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
+  cfg.threads = num_threads;
+  const vpx_codec_flags_t flags = VPX_CODEC_USE_FRAME_THREADING;
+  libvpx_test::VP9Decoder decoder(cfg, flags, 0);
+
+  libvpx_test::MD5 md5;
+  video.Begin();
+
+  int out_frames = 0;
+  do {
+    const vpx_codec_err_t res =
+        decoder.DecodeFrame(video.cxdata(), video.frame_size());
+    // TODO(hkuang): frame parallel mode should return an error on corruption.
+    if (res != VPX_CODEC_OK) {
+      EXPECT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError();
+      break;
+    }
+
+    video.Next();
+
+    // Flush the decoder at the end of the video.
+    if (!video.cxdata())
+      decoder.DecodeFrame(NULL, 0);
+
+    libvpx_test::DxDataIterator dec_iter = decoder.GetDxData();
+    const vpx_image_t *img;
+
+    // Get decompressed data
+    while ((img = dec_iter.Next())) {
+      ++out_frames;
+      md5.Add(img);
+    }
+  } while (video.cxdata() != NULL);
+
+  EXPECT_EQ(expected_frame_count, out_frames) <<
+      "Input frame count does not match expected output frame count";
+
+  return string(md5.Get());
+}
+
+void DecodeInvalidFiles(const InvalidFileList files[]) {
+  for (const InvalidFileList *iter = files; iter->name != NULL; ++iter) {
+    SCOPED_TRACE(iter->name);
+    for (int t = 2; t <= 8; ++t) {
+      EXPECT_EQ(iter->expected_md5,
+                DecodeInvalidFile(iter->name, t, iter->expected_frame_count))
+          << "threads = " << t;
+    }
+  }
+}
+
+TEST(VP9MultiThreadedFrameParallel, InvalidFileTest) {
+  static const InvalidFileList files[] = {
+    // invalid-vp90-2-07-frame_parallel-1.webm is a 40 frame video file with
+    // one key frame for every ten frames. The 11th frame has corrupted data.
+    { "invalid-vp90-2-07-frame_parallel-1.webm",
+      "0549d0f45f60deaef8eb708e6c0eb6cb", 30},
+    // invalid-vp90-2-07-frame_parallel-2.webm is a 40 frame video file with
+    // one key frame for every ten frames. The 1st and 31st frames have
+    // corrupted data.
+    { "invalid-vp90-2-07-frame_parallel-2.webm",
+      "6a1f3cf6f9e7a364212fadb9580d525e", 20},
+    // invalid-vp90-2-07-frame_parallel-3.webm is a 40 frame video file with
+    // one key frame for every ten frames. The 13th frame has corrupted data.
+    { "invalid-vp90-2-07-frame_parallel-3.webm",
+      "a567c8259d27ad32b1b7f58db5ac89dd", 32},
+    { NULL, NULL, 0},
+  };
+  DecodeInvalidFiles(files);
+}
+
+#endif  // CONFIG_WEBM_IO
+}  // namespace
--- a/test/vp9_lossless_test.cc
+++ b/test/vp9_lossless_test.cc
@@ -36,6 +36,17 @@ class LosslessTestLarge : public ::libvpx_test::EncoderTest,
    SetMode(encoding_mode_);
  }

+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    if (video->frame() == 1) {
+      // Only call Control if quantizer > 0 to verify that using quantizer
+      // alone will activate lossless
+      if (cfg_.rc_max_quantizer > 0 || cfg_.rc_min_quantizer > 0) {
+        encoder->Control(VP9E_SET_LOSSLESS, 1);
+      }
+    }
+  }
+
  virtual void BeginPassHook(unsigned int /*pass*/) {
    psnr_ = kMaxPsnr;
    nframes_ = 0;
@@ -91,5 +102,24 @@ TEST_P(LosslessTestLarge, TestLossLessEncoding444) {
  EXPECT_GE(psnr_lossless, kMaxPsnr);
 }

+TEST_P(LosslessTestLarge, TestLossLessEncodingCtrl) {
+  const vpx_rational timebase = { 33333333, 1000000000 };
+  cfg_.g_timebase = timebase;
+  cfg_.rc_target_bitrate = 2000;
+  cfg_.g_lag_in_frames = 25;
+  // Intentionally set Q > 0, to make sure control can be used to activate
+  // lossless
+  cfg_.rc_min_quantizer = 10;
+  cfg_.rc_max_quantizer = 20;
+
+  init_flags_ = VPX_CODEC_USE_PSNR;
+
+  libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                     timebase.den, timebase.num, 0, 10);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  const double psnr_lossless = GetMinPsnr();
+  EXPECT_GE(psnr_lossless, kMaxPsnr);
+}
+
 VP9_INSTANTIATE_TEST_CASE(LosslessTestLarge, ALL_TEST_MODES);
 }  // namespace
--- a/test/vp9_spatial_svc_encoder.sh
+++ b/test/vp9_spatial_svc_encoder.sh
@@ -34,7 +34,10 @@ vp9_spatial_svc_encoder() {

  shift

-  [ -x "${encoder}" ] || return 1
+  if [ ! -x "${encoder}" ]; then
+    elog "${encoder} does not exist or is not executable."
+    return 1
+  fi

  eval "${encoder}" -w "${YUV_RAW_INPUT_WIDTH}" -h "${YUV_RAW_INPUT_HEIGHT}" \
      -k "${max_kf}" -f "${frames_to_encode}" "$@" "${YUV_RAW_INPUT}" \
--- a/test/vp9_thread_test.cc
+++ b/test/vp9_thread_test.cc
@@ -18,7 +18,7 @@
 #if CONFIG_WEBM_IO
 #include "test/webm_video_source.h"
 #endif
-#include "vp9/decoder/vp9_thread.h"
+#include "vp9/common/vp9_thread.h"

 namespace {

@@ -28,11 +28,11 @@ class VP9WorkerThreadTest : public ::testing::TestWithParam<bool> {
 protected:
  virtual ~VP9WorkerThreadTest() {}
  virtual void SetUp() {
-    vp9_worker_init(&worker_);
+    vp9_get_worker_interface()->init(&worker_);
  }

  virtual void TearDown() {
-    vp9_worker_end(&worker_);
+    vp9_get_worker_interface()->end(&worker_);
  }

  VP9Worker worker_;
@@ -45,10 +45,11 @@ int ThreadHook(void* data, void* return_value) {
 }

 TEST_P(VP9WorkerThreadTest, HookSuccess) {
-  EXPECT_NE(vp9_worker_sync(&worker_), 0);  // should be a no-op.
+  // should be a no-op.
+  EXPECT_NE(vp9_get_worker_interface()->sync(&worker_), 0);

  for (int i = 0; i < 2; ++i) {
-    EXPECT_NE(vp9_worker_reset(&worker_), 0);
+    EXPECT_NE(vp9_get_worker_interface()->reset(&worker_), 0);

    int hook_data = 0;
    int return_value = 1;  // return successfully from the hook
@@ -58,20 +59,21 @@ TEST_P(VP9WorkerThreadTest, HookSuccess) {

    const bool synchronous = GetParam();
    if (synchronous) {
-      vp9_worker_execute(&worker_);
+      vp9_get_worker_interface()->execute(&worker_);
    } else {
-      vp9_worker_launch(&worker_);
+      vp9_get_worker_interface()->launch(&worker_);
    }
-    EXPECT_NE(vp9_worker_sync(&worker_), 0);
+    EXPECT_NE(vp9_get_worker_interface()->sync(&worker_), 0);
    EXPECT_FALSE(worker_.had_error);
    EXPECT_EQ(5, hook_data);

-    EXPECT_NE(vp9_worker_sync(&worker_), 0);  // should be a no-op.
+    // should be a no-op.
+    EXPECT_NE(vp9_get_worker_interface()->sync(&worker_), 0);
  }
 }

 TEST_P(VP9WorkerThreadTest, HookFailure) {
-  EXPECT_NE(vp9_worker_reset(&worker_), 0);
+  EXPECT_NE(vp9_get_worker_interface()->reset(&worker_), 0);

  int hook_data = 0;
  int return_value = 0;  // return failure from the hook
@@ -81,26 +83,49 @@ TEST_P(VP9WorkerThreadTest, HookFailure) {

  const bool synchronous = GetParam();
  if (synchronous) {
-    vp9_worker_execute(&worker_);
+    vp9_get_worker_interface()->execute(&worker_);
  } else {
-    vp9_worker_launch(&worker_);
+    vp9_get_worker_interface()->launch(&worker_);
  }
-  EXPECT_FALSE(vp9_worker_sync(&worker_));
+  EXPECT_FALSE(vp9_get_worker_interface()->sync(&worker_));
  EXPECT_EQ(1, worker_.had_error);

  // Ensure _reset() clears the error and _launch() can be called again.
  return_value = 1;
-  EXPECT_NE(vp9_worker_reset(&worker_), 0);
+  EXPECT_NE(vp9_get_worker_interface()->reset(&worker_), 0);
  EXPECT_FALSE(worker_.had_error);
-  vp9_worker_launch(&worker_);
-  EXPECT_NE(vp9_worker_sync(&worker_), 0);
+  vp9_get_worker_interface()->launch(&worker_);
+  EXPECT_NE(vp9_get_worker_interface()->sync(&worker_), 0);
  EXPECT_FALSE(worker_.had_error);
 }

+TEST(VP9WorkerThreadTest, TestInterfaceAPI) {
+  EXPECT_EQ(0, vp9_set_worker_interface(NULL));
+  EXPECT_TRUE(vp9_get_worker_interface() != NULL);
+  for (int i = 0; i < 6; ++i) {
+    VP9WorkerInterface winterface = *vp9_get_worker_interface();
+    switch (i) {
+      default:
+      case 0: winterface.init = NULL; break;
+      case 1: winterface.reset = NULL; break;
+      case 2: winterface.sync = NULL; break;
+      case 3: winterface.launch = NULL; break;
+      case 4: winterface.execute = NULL; break;
+      case 5: winterface.end = NULL; break;
+    }
+    EXPECT_EQ(0, vp9_set_worker_interface(&winterface));
+  }
+}
+
 // -----------------------------------------------------------------------------
 // Multi-threaded decode tests

 #if CONFIG_WEBM_IO
+struct FileList {
+  const char *name;
+  const char *expected_md5;
+};
+
 // Decodes |filename| with |num_threads|. Returns the md5 of the decoded frames.
 string DecodeFile(const string& filename, int num_threads) {
  libvpx_test::WebMVideoSource video(filename);
@@ -130,39 +155,77 @@ string DecodeFile(const string& filename, int num_threads) {
  return string(md5.Get());
 }

-TEST(VP9DecodeMTTest, MTDecode) {
-  // no tiles or frame parallel; this exercises loop filter threading.
-  EXPECT_STREQ("b35a1b707b28e82be025d960aba039bc",
-               DecodeFile("vp90-2-03-size-226x226.webm", 2).c_str());
+void DecodeFiles(const FileList files[]) {
+  for (const FileList *iter = files; iter->name != NULL; ++iter) {
+    SCOPED_TRACE(iter->name);
+    for (int t = 2; t <= 8; ++t) {
+      EXPECT_EQ(iter->expected_md5, DecodeFile(iter->name, t))
+          << "threads = " << t;
+    }
+  }
 }

-TEST(VP9DecodeMTTest, MTDecode2) {
-  static const struct {
-    const char *name;
-    const char *expected_md5;
-  } files[] = {
+// Trivial serialized thread worker interface implementation.
+// Note any worker that requires synchronization between other workers will
+// hang.
+namespace impl {
+
+void Init(VP9Worker *const worker) { memset(worker, 0, sizeof(*worker)); }
+int Reset(VP9Worker *const /*worker*/) { return 1; }
+int Sync(VP9Worker *const worker) { return !worker->had_error; }
+
+void Execute(VP9Worker *const worker) {
+  worker->had_error |= !worker->hook(worker->data1, worker->data2);
+}
+
+void Launch(VP9Worker *const worker) { Execute(worker); }
+void End(VP9Worker *const /*worker*/) {}
+
+}  // namespace impl
+
+TEST(VP9WorkerThreadTest, TestSerialInterface) {
+  static const VP9WorkerInterface serial_interface = {
+    impl::Init, impl::Reset, impl::Sync, impl::Launch, impl::Execute, impl::End
+  };
+  // TODO(jzern): Avoid using a file that will use the row-based thread
+  // loopfilter, with the simple serialized implementation it will hang. This is
+  // due to its expectation that rows will be run in parallel as they wait on
+  // progress in the row above before proceeding.
+  static const char expected_md5[] = "b35a1b707b28e82be025d960aba039bc";
+  static const char filename[] = "vp90-2-03-size-226x226.webm";
+  VP9WorkerInterface default_interface = *vp9_get_worker_interface();
+
+  EXPECT_NE(vp9_set_worker_interface(&serial_interface), 0);
+  EXPECT_EQ(expected_md5, DecodeFile(filename, 2));
+
+  // Reset the interface.
+  EXPECT_NE(vp9_set_worker_interface(&default_interface), 0);
+  EXPECT_EQ(expected_md5, DecodeFile(filename, 2));
+}
+
+TEST(VP9DecodeMultiThreadedTest, Decode) {
+  // no tiles or frame parallel; this exercises loop filter threading.
+  EXPECT_EQ("b35a1b707b28e82be025d960aba039bc",
+            DecodeFile("vp90-2-03-size-226x226.webm", 2));
+}
+
+TEST(VP9DecodeMultiThreadedTest, Decode2) {
+  static const FileList files[] = {
    { "vp90-2-08-tile_1x2_frame_parallel.webm",
      "68ede6abd66bae0a2edf2eb9232241b6" },
    { "vp90-2-08-tile_1x4_frame_parallel.webm",
      "368ebc6ebf3a5e478d85b2c3149b2848" },
    { "vp90-2-08-tile_1x8_frame_parallel.webm",
      "17e439da2388aff3a0f69cb22579c6c1" },
+    { NULL, NULL }
  };

-  for (int i = 0; i < static_cast<int>(sizeof(files) / sizeof(files[0])); ++i) {
-    for (int t = 2; t <= 8; ++t) {
-      EXPECT_STREQ(files[i].expected_md5, DecodeFile(files[i].name, t).c_str())
-          << "threads = " << t;
-    }
-  }
+  DecodeFiles(files);
 }

 // Test tile quantity changes within one file.
-TEST(VP9DecodeMTTest, MTDecode3) {
-  static const struct {
-    const char *name;
-    const char *expected_md5;
-  } files[] = {
+TEST(VP9DecodeMultiThreadedTest, Decode3) {
+  static const FileList files[] = {
    { "vp90-2-14-resize-fp-tiles-1-16.webm",
      "0cd5e632c326297e975f38949c31ea94" },
    { "vp90-2-14-resize-fp-tiles-1-2-4-8-16.webm",
@@ -207,14 +270,10 @@ TEST(VP9DecodeMTTest, MTDecode3) {
      "ae96f21f21b6370cc0125621b441fc52" },
    { "vp90-2-14-resize-fp-tiles-8-4.webm",
      "3eb4f24f10640d42218f7fd7b9fd30d4" },
+    { NULL, NULL }
  };

-  for (int i = 0; i < static_cast<int>(sizeof(files) / sizeof(files[0])); ++i) {
-    for (int t = 2; t <= 8; ++t) {
-      EXPECT_STREQ(files[i].expected_md5, DecodeFile(files[i].name, t).c_str())
-          << "threads = " << t;
-    }
-  }
+  DecodeFiles(files);
 }
 #endif  // CONFIG_WEBM_IO

--- a/test/vpx_temporal_svc_encoder.sh
+++ b/test/vpx_temporal_svc_encoder.sh
@@ -39,7 +39,10 @@ vpx_tsvc_encoder() {

  shift 2

-  [ -x "${encoder}" ] || return 1
+  if [ ! -x "${encoder}" ]; then
+    elog "${encoder} does not exist or is not executable."
+    return 1
+  fi

  eval "${encoder}" "${YUV_RAW_INPUT}" "${output_file}" "${codec}" \
      "${YUV_RAW_INPUT_WIDTH}" "${YUV_RAW_INPUT_HEIGHT}" \
--- a/test/webm_video_source.h
+++ b/test/webm_video_source.h
@@ -69,6 +69,18 @@ class WebMVideoSource : public CompressedVideoSource {
    }
  }

+  void SeekToNextKeyFrame() {
+    ASSERT_TRUE(vpx_ctx_->file != NULL);
+    do {
+      const int status = webm_read_frame(webm_ctx_, &buf_, &buf_sz_, &buf_sz_);
+      ASSERT_GE(status, 0) << "webm_read_frame failed";
+      ++frame_;
+      if (status == 1) {
+        end_of_file_ = true;
+      }
+    } while (!webm_ctx_->is_key_frame && !end_of_file_);
+  }
+
  virtual const uint8_t *cxdata() const {
    return end_of_file_ ? NULL : buf_;
  }
--- a/test/y4m_test.cc
+++ b/test/y4m_test.cc
@@ -0,0 +1,193 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string>
+#include "test/md5_helper.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "./vpx_config.h"
+#include "./y4menc.h"
+
+namespace {
+
+using std::string;
+using std::tr1::make_tuple;
+
+static const unsigned int kWidth  = 160;
+static const unsigned int kHeight = 90;
+static const unsigned int kFrames = 10;
+
+typedef std::tr1::tuple<const char *, const unsigned int,
+        const vpx_img_fmt, const char *> test_entry_type;
+
+static const test_entry_type kY4mTestVectors[] = {
+  make_tuple("park_joy_90p_8_420.y4m", 8, VPX_IMG_FMT_I420,
+             "e5406275b9fc6bb3436c31d4a05c1cab"),
+  make_tuple("park_joy_90p_8_422.y4m", 8, VPX_IMG_FMT_I422,
+             "284a47a47133b12884ec3a14e959a0b6"),
+  make_tuple("park_joy_90p_8_444.y4m", 8, VPX_IMG_FMT_I444,
+             "90517ff33843d85de712fd4fe60dbed0"),
+  make_tuple("park_joy_90p_10_420.y4m", 10, VPX_IMG_FMT_I42016,
+             "63f21f9f717d8b8631bd2288ee87137b"),
+  make_tuple("park_joy_90p_10_422.y4m", 10, VPX_IMG_FMT_I42216,
+             "48ab51fb540aed07f7ff5af130c9b605"),
+  make_tuple("park_joy_90p_10_444.y4m", 10, VPX_IMG_FMT_I44416,
+             "067bfd75aa85ff9bae91fa3e0edd1e3e"),
+  make_tuple("park_joy_90p_12_420.y4m", 12, VPX_IMG_FMT_I42016,
+             "9e6d8f6508c6e55625f6b697bc461cef"),
+  make_tuple("park_joy_90p_12_422.y4m", 12, VPX_IMG_FMT_I42216,
+             "b239c6b301c0b835485be349ca83a7e3"),
+  make_tuple("park_joy_90p_12_444.y4m", 12, VPX_IMG_FMT_I44416,
+             "5a6481a550821dab6d0192f5c63845e9")
+};
+
+static void write_image_file(const vpx_image_t *img, FILE *file) {
+  int plane, y;
+  for (plane = 0; plane < 3; ++plane) {
+    const unsigned char *buf = img->planes[plane];
+    const int stride = img->stride[plane];
+    const int bytes_per_sample = (img->fmt & VPX_IMG_FMT_HIGH) ? 2 : 1;
+    const int h = (plane ? (img->d_h + img->y_chroma_shift) >>
+                   img->y_chroma_shift : img->d_h);
+    const int w = (plane ? (img->d_w + img->x_chroma_shift) >>
+                   img->x_chroma_shift : img->d_w);
+    for (y = 0; y < h; ++y) {
+      fwrite(buf, bytes_per_sample, w, file);
+      buf += stride;
+    }
+  }
+}
+
+class Y4mVideoSourceTest
+    : public ::testing::TestWithParam<test_entry_type>,
+      public ::libvpx_test::Y4mVideoSource {
+ protected:
+  Y4mVideoSourceTest() : Y4mVideoSource("", 0, 0) {}
+
+  virtual ~Y4mVideoSourceTest() {
+    CloseSource();
+  }
+
+  virtual void Init(const std::string &file_name, int limit) {
+    file_name_ = file_name;
+    start_ = 0;
+    limit_ = limit;
+    frame_ = 0;
+    Begin();
+  }
+
+  // Checks y4m header information
+  void HeaderChecks(unsigned int bit_depth, vpx_img_fmt_t fmt) {
+    ASSERT_TRUE(input_file_ != NULL);
+    ASSERT_EQ(y4m_.pic_w, (int)kWidth);
+    ASSERT_EQ(y4m_.pic_h, (int)kHeight);
+    ASSERT_EQ(img()->d_w, kWidth);
+    ASSERT_EQ(img()->d_h, kHeight);
+    ASSERT_EQ(y4m_.bit_depth, bit_depth);
+    ASSERT_EQ(y4m_.vpx_fmt, fmt);
+    if (fmt == VPX_IMG_FMT_I420 || fmt == VPX_IMG_FMT_I42016) {
+      ASSERT_EQ(y4m_.bps, (int)y4m_.bit_depth * 3 / 2);
+      ASSERT_EQ(img()->x_chroma_shift, 1U);
+      ASSERT_EQ(img()->y_chroma_shift, 1U);
+    }
+    if (fmt == VPX_IMG_FMT_I422 || fmt == VPX_IMG_FMT_I42216) {
+      ASSERT_EQ(y4m_.bps, (int)y4m_.bit_depth * 2);
+      ASSERT_EQ(img()->x_chroma_shift, 1U);
+      ASSERT_EQ(img()->y_chroma_shift, 0U);
+    }
+    if (fmt == VPX_IMG_FMT_I444 || fmt == VPX_IMG_FMT_I44416) {
+      ASSERT_EQ(y4m_.bps, (int)y4m_.bit_depth * 3);
+      ASSERT_EQ(img()->x_chroma_shift, 0U);
+      ASSERT_EQ(img()->y_chroma_shift, 0U);
+    }
+  }
+
+  // Checks MD5 of the raw frame data
+  void Md5Check(const string &expected_md5) {
+    ASSERT_TRUE(input_file_ != NULL);
+    libvpx_test::MD5 md5;
+    for (unsigned int i = start_; i < limit_; i++) {
+      md5.Add(img());
+      Next();
+    }
+    ASSERT_EQ(string(md5.Get()), expected_md5);
+  }
+};
+
+TEST_P(Y4mVideoSourceTest, SourceTest) {
+  const char *filename = GET_PARAM(0);
+  const unsigned int bit_depth = GET_PARAM(1);
+  const vpx_img_fmt format = GET_PARAM(2);
+  const char *md5raw = GET_PARAM(3);
+
+  Init(filename, kFrames);
+  HeaderChecks(bit_depth, format);
+  Md5Check(md5raw);
+}
+
+INSTANTIATE_TEST_CASE_P(C, Y4mVideoSourceTest,
+                        ::testing::ValuesIn(kY4mTestVectors));
+
+class Y4mVideoWriteTest
+    : public Y4mVideoSourceTest {
+ protected:
+  Y4mVideoWriteTest() : Y4mVideoSourceTest() {}
+
+  virtual void ReplaceInputFp(FILE *input_file) {
+    CloseSource();
+    frame_ = 0;
+    input_file_ = input_file;
+    rewind(input_file_);
+    ReadSourceToStart();
+  }
+
+  // Writes out a y4m file and then reads it back
+  void WriteY4mAndReadBack() {
+    ASSERT_TRUE(input_file_ != NULL);
+    char buf[Y4M_BUFFER_SIZE] = {0};
+    const struct VpxRational framerate = {y4m_.fps_n, y4m_.fps_d};
+    FILE *out_file = libvpx_test::OpenTempOutFile();
+    ASSERT_TRUE(out_file != NULL);
+    y4m_write_file_header(buf, sizeof(buf),
+                          kWidth, kHeight,
+                          &framerate, y4m_.vpx_fmt,
+                          y4m_.bit_depth);
+    fputs(buf, out_file);
+    for (unsigned int i = start_; i < limit_; i++) {
+      y4m_write_frame_header(buf, sizeof(buf));
+      fputs(buf, out_file);
+      write_image_file(img(), out_file);
+      Next();
+    }
+    ReplaceInputFp(out_file);
+  }
+
+  virtual void Init(const std::string &file_name, int limit) {
+    Y4mVideoSourceTest::Init(file_name, limit);
+    WriteY4mAndReadBack();
+  }
+};
+
+TEST_P(Y4mVideoWriteTest, WriteTest) {
+  const char *filename = GET_PARAM(0);
+  const unsigned int bit_depth = GET_PARAM(1);
+  const vpx_img_fmt format = GET_PARAM(2);
+  const char *md5raw = GET_PARAM(3);
+
+  Init(filename, kFrames);
+  HeaderChecks(bit_depth, format);
+  Md5Check(md5raw);
+}
+
+INSTANTIATE_TEST_CASE_P(C, Y4mVideoWriteTest,
+                        ::testing::ValuesIn(kY4mTestVectors));
+
+}  // namespace
--- a/test/y4m_video_source.h
+++ b/test/y4m_video_source.h
@@ -38,24 +38,30 @@ class Y4mVideoSource : public VideoSource {
    CloseSource();
  }

-  virtual void Begin() {
+  virtual void OpenSource() {
    CloseSource();
    input_file_ = OpenTestDataFile(file_name_);
    ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: "
-        << file_name_;
+                                     << file_name_;
+  }

-    y4m_input_open(&y4m_, input_file_, NULL, 0, 0);
+  virtual void ReadSourceToStart() {
+    ASSERT_TRUE(input_file_ != NULL);
+    ASSERT_FALSE(y4m_input_open(&y4m_, input_file_, NULL, 0, 0));
    framerate_numerator_ = y4m_.fps_n;
    framerate_denominator_ = y4m_.fps_d;
-
    frame_ = 0;
    for (unsigned int i = 0; i < start_; i++) {
-        Next();
+      Next();
    }
-
    FillFrame();
  }

+  virtual void Begin() {
+    OpenSource();
+    ReadSourceToStart();
+  }
+
  virtual void Next() {
    ++frame_;
    FillFrame();
--- a/third_party/libmkv/EbmlIDs.h
+++ b/third_party/libmkv/EbmlIDs.h
@@ -0,0 +1,231 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef MKV_DEFS_HPP
+#define MKV_DEFS_HPP 1
+
+/* Commenting out values not available in webm, but available in matroska */
+
+enum mkv {
+  EBML = 0x1A45DFA3,
+  EBMLVersion = 0x4286,
+  EBMLReadVersion = 0x42F7,
+  EBMLMaxIDLength = 0x42F2,
+  EBMLMaxSizeLength = 0x42F3,
+  DocType = 0x4282,
+  DocTypeVersion = 0x4287,
+  DocTypeReadVersion = 0x4285,
+/* CRC_32 = 0xBF, */
+  Void = 0xEC,
+  SignatureSlot = 0x1B538667,
+  SignatureAlgo = 0x7E8A,
+  SignatureHash = 0x7E9A,
+  SignaturePublicKey = 0x7EA5,
+  Signature = 0x7EB5,
+  SignatureElements = 0x7E5B,
+  SignatureElementList = 0x7E7B,
+  SignedElement = 0x6532,
+  /* segment */
+  Segment = 0x18538067,
+  /* Meta Seek Information */
+  SeekHead = 0x114D9B74,
+  Seek = 0x4DBB,
+  SeekID = 0x53AB,
+  SeekPosition = 0x53AC,
+  /* Segment Information */
+  Info = 0x1549A966,
+/* SegmentUID = 0x73A4, */
+/* SegmentFilename = 0x7384, */
+/* PrevUID = 0x3CB923, */
+/* PrevFilename = 0x3C83AB, */
+/* NextUID = 0x3EB923, */
+/* NextFilename = 0x3E83BB, */
+/* SegmentFamily = 0x4444, */
+/* ChapterTranslate = 0x6924, */
+/* ChapterTranslateEditionUID = 0x69FC, */
+/* ChapterTranslateCodec = 0x69BF, */
+/* ChapterTranslateID = 0x69A5, */
+  TimecodeScale = 0x2AD7B1,
+  Segment_Duration = 0x4489,
+  DateUTC = 0x4461,
+/* Title = 0x7BA9, */
+  MuxingApp = 0x4D80,
+  WritingApp = 0x5741,
+  /* Cluster */
+  Cluster = 0x1F43B675,
+  Timecode = 0xE7,
+/* SilentTracks = 0x5854, */
+/* SilentTrackNumber = 0x58D7, */
+/* Position = 0xA7, */
+  PrevSize = 0xAB,
+  BlockGroup = 0xA0,
+  Block = 0xA1,
+/* BlockVirtual = 0xA2, */
+  BlockAdditions = 0x75A1,
+  BlockMore = 0xA6,
+  BlockAddID = 0xEE,
+  BlockAdditional = 0xA5,
+  BlockDuration = 0x9B,
+/* ReferencePriority = 0xFA, */
+  ReferenceBlock = 0xFB,
+/* ReferenceVirtual = 0xFD, */
+/* CodecState = 0xA4, */
+/* Slices = 0x8E, */
+/* TimeSlice = 0xE8, */
+  LaceNumber = 0xCC,
+/* FrameNumber = 0xCD, */
+/* BlockAdditionID = 0xCB, */
+/* MkvDelay = 0xCE, */
+/* Cluster_Duration = 0xCF, */
+  SimpleBlock = 0xA3,
+/* EncryptedBlock = 0xAF, */
+  /* Track */
+  Tracks = 0x1654AE6B,
+  TrackEntry = 0xAE,
+  TrackNumber = 0xD7,
+  TrackUID = 0x73C5,
+  TrackType = 0x83,
+  FlagEnabled = 0xB9,
+  FlagDefault = 0x88,
+  FlagForced = 0x55AA,
+  FlagLacing = 0x9C,
+/* MinCache = 0x6DE7, */
+/* MaxCache = 0x6DF8, */
+  DefaultDuration = 0x23E383,
+/* TrackTimecodeScale = 0x23314F, */
+/* TrackOffset = 0x537F, */
+  MaxBlockAdditionID = 0x55EE,
+  Name = 0x536E,
+  Language = 0x22B59C,
+  CodecID = 0x86,
+  CodecPrivate = 0x63A2,
+  CodecName = 0x258688,
+/* AttachmentLink = 0x7446, */
+/* CodecSettings = 0x3A9697, */
+/* CodecInfoURL = 0x3B4040, */
+/* CodecDownloadURL = 0x26B240, */
+/* CodecDecodeAll = 0xAA, */
+/* TrackOverlay = 0x6FAB, */
+/* TrackTranslate = 0x6624, */
+/* TrackTranslateEditionUID = 0x66FC, */
+/* TrackTranslateCodec = 0x66BF, */
+/* TrackTranslateTrackID = 0x66A5, */
+  /* video */
+  Video = 0xE0,
+  FlagInterlaced = 0x9A,
+  StereoMode = 0x53B8,
+  AlphaMode = 0x53C0,
+  PixelWidth = 0xB0,
+  PixelHeight = 0xBA,
+  PixelCropBottom = 0x54AA,
+  PixelCropTop = 0x54BB,
+  PixelCropLeft = 0x54CC,
+  PixelCropRight = 0x54DD,
+  DisplayWidth = 0x54B0,
+  DisplayHeight = 0x54BA,
+  DisplayUnit = 0x54B2,
+  AspectRatioType = 0x54B3,
+/* ColourSpace = 0x2EB524, */
+/* GammaValue = 0x2FB523, */
+  FrameRate = 0x2383E3,
+  /* end video */
+  /* audio */
+  Audio = 0xE1,
+  SamplingFrequency = 0xB5,
+  OutputSamplingFrequency = 0x78B5,
+  Channels = 0x9F,
+/* ChannelPositions = 0x7D7B, */
+  BitDepth = 0x6264,
+  /* end audio */
+  /* content encoding */
+/* ContentEncodings = 0x6d80, */
+/* ContentEncoding = 0x6240, */
+/* ContentEncodingOrder = 0x5031, */
+/* ContentEncodingScope = 0x5032, */
+/* ContentEncodingType = 0x5033, */
+/* ContentCompression = 0x5034, */
+/* ContentCompAlgo = 0x4254, */
+/* ContentCompSettings = 0x4255, */
+/* ContentEncryption = 0x5035, */
+/* ContentEncAlgo = 0x47e1, */
+/* ContentEncKeyID = 0x47e2, */
+/* ContentSignature = 0x47e3, */
+/* ContentSigKeyID = 0x47e4, */
+/* ContentSigAlgo = 0x47e5, */
+/* ContentSigHashAlgo = 0x47e6, */
+  /* end content encoding */
+  /* Cueing Data */
+  Cues = 0x1C53BB6B,
+  CuePoint = 0xBB,
+  CueTime = 0xB3,
+  CueTrackPositions = 0xB7,
+  CueTrack = 0xF7,
+  CueClusterPosition = 0xF1,
+  CueBlockNumber = 0x5378
+/* CueCodecState = 0xEA, */
+/* CueReference = 0xDB, */
+/* CueRefTime = 0x96, */
+/* CueRefCluster = 0x97, */
+/* CueRefNumber = 0x535F, */
+/* CueRefCodecState = 0xEB, */
+  /* Attachment */
+/* Attachments = 0x1941A469, */
+/* AttachedFile = 0x61A7, */
+/* FileDescription = 0x467E, */
+/* FileName = 0x466E, */
+/* FileMimeType = 0x4660, */
+/* FileData = 0x465C, */
+/* FileUID = 0x46AE, */
+/* FileReferral = 0x4675, */
+  /* Chapters */
+/* Chapters = 0x1043A770, */
+/* EditionEntry = 0x45B9, */
+/* EditionUID = 0x45BC, */
+/* EditionFlagHidden = 0x45BD, */
+/* EditionFlagDefault = 0x45DB, */
+/* EditionFlagOrdered = 0x45DD, */
+/* ChapterAtom = 0xB6, */
+/* ChapterUID = 0x73C4, */
+/* ChapterTimeStart = 0x91, */
+/* ChapterTimeEnd = 0x92, */
+/* ChapterFlagHidden = 0x98, */
+/* ChapterFlagEnabled = 0x4598, */
+/* ChapterSegmentUID = 0x6E67, */
+/* ChapterSegmentEditionUID = 0x6EBC, */
+/* ChapterPhysicalEquiv = 0x63C3, */
+/* ChapterTrack = 0x8F, */
+/* ChapterTrackNumber = 0x89, */
+/* ChapterDisplay = 0x80, */
+/* ChapString = 0x85, */
+/* ChapLanguage = 0x437C, */
+/* ChapCountry = 0x437E, */
+/* ChapProcess = 0x6944, */
+/* ChapProcessCodecID = 0x6955, */
+/* ChapProcessPrivate = 0x450D, */
+/* ChapProcessCommand = 0x6911, */
+/* ChapProcessTime = 0x6922, */
+/* ChapProcessData = 0x6933, */
+  /* Tagging */
+/* Tags = 0x1254C367, */
+/* Tag = 0x7373, */
+/* Targets = 0x63C0, */
+/* TargetTypeValue = 0x68CA, */
+/* TargetType = 0x63CA, */
+/* Tagging_TrackUID = 0x63C5, */
+/* Tagging_EditionUID = 0x63C9, */
+/* Tagging_ChapterUID = 0x63C4, */
+/* AttachmentUID = 0x63C6, */
+/* SimpleTag = 0x67C8, */
+/* TagName = 0x45A3, */
+/* TagLanguage = 0x447A, */
+/* TagDefault = 0x4484, */
+/* TagString = 0x4487, */
+/* TagBinary = 0x4485, */
+};
+#endif
--- a/third_party/libmkv/EbmlWriter.c
+++ b/third_party/libmkv/EbmlWriter.c
@@ -0,0 +1,157 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "EbmlWriter.h"
+#include <stdlib.h>
+#include <wchar.h>
+#include <string.h>
+#include <limits.h>
+#if defined(_MSC_VER)
+#define LITERALU64(n) n
+#else
+#define LITERALU64(n) n##LLU
+#endif
+
+void Ebml_WriteLen(EbmlGlobal *glob, int64_t val) {
+  /* TODO check and make sure we are not > than 0x0100000000000000LLU */
+  unsigned char size = 8; /* size in bytes to output */
+
+  /* mask to compare for byte size */
+  int64_t minVal = 0xff;
+
+  for (size = 1; size < 8; size ++) {
+    if (val < minVal)
+      break;
+
+    minVal = (minVal << 7);
+  }
+
+  val |= (((uint64_t)0x80) << ((size - 1) * 7));
+
+  Ebml_Serialize(glob, (void *) &val, sizeof(val), size);
+}
+
+void Ebml_WriteString(EbmlGlobal *glob, const char *str) {
+  const size_t size_ = strlen(str);
+  const uint64_t  size = size_;
+  Ebml_WriteLen(glob, size);
+  /* TODO: it's not clear from the spec whether the nul terminator
+   * should be serialized too.  For now we omit the null terminator.
+   */
+  Ebml_Write(glob, str, (unsigned long)size);
+}
+
+void Ebml_WriteUTF8(EbmlGlobal *glob, const wchar_t *wstr) {
+  const size_t strlen = wcslen(wstr);
+
+  /* TODO: it's not clear from the spec whether the nul terminator
+   * should be serialized too.  For now we include it.
+   */
+  const uint64_t  size = strlen;
+
+  Ebml_WriteLen(glob, size);
+  Ebml_Write(glob, wstr, (unsigned long)size);
+}
+
+void Ebml_WriteID(EbmlGlobal *glob, unsigned long class_id) {
+  int len;
+
+  if (class_id >= 0x01000000)
+    len = 4;
+  else if (class_id >= 0x00010000)
+    len = 3;
+  else if (class_id >= 0x00000100)
+    len = 2;
+  else
+    len = 1;
+
+  Ebml_Serialize(glob, (void *)&class_id, sizeof(class_id), len);
+}
+
+void Ebml_SerializeUnsigned64(EbmlGlobal *glob, unsigned long class_id, uint64_t ui) {
+  unsigned char sizeSerialized = 8 | 0x80;
+  Ebml_WriteID(glob, class_id);
+  Ebml_Serialize(glob, &sizeSerialized, sizeof(sizeSerialized), 1);
+  Ebml_Serialize(glob, &ui, sizeof(ui), 8);
+}
+
+void Ebml_SerializeUnsigned(EbmlGlobal *glob, unsigned long class_id, unsigned long ui) {
+  unsigned char size = 8; /* size in bytes to output */
+  unsigned char sizeSerialized = 0;
+  unsigned long minVal;
+
+  Ebml_WriteID(glob, class_id);
+  minVal = 0x7fLU; /* mask to compare for byte size */
+
+  for (size = 1; size < 4; size ++) {
+    if (ui < minVal) {
+      break;
+    }
+
+    minVal <<= 7;
+  }
+
+  sizeSerialized = 0x80 | size;
+  Ebml_Serialize(glob, &sizeSerialized, sizeof(sizeSerialized), 1);
+  Ebml_Serialize(glob, &ui, sizeof(ui), size);
+}
+/* TODO: perhaps this is a poor name for this id serializer helper function */
+void Ebml_SerializeBinary(EbmlGlobal *glob, unsigned long class_id, unsigned long bin) {
+  int size;
+  for (size = 4; size > 1; size--) {
+    if (bin & (unsigned int)0x000000ff << ((size - 1) * 8))
+      break;
+  }
+  Ebml_WriteID(glob, class_id);
+  Ebml_WriteLen(glob, size);
+  Ebml_WriteID(glob, bin);
+}
+
+void Ebml_SerializeFloat(EbmlGlobal *glob, unsigned long class_id, double d) {
+  unsigned char len = 0x88;
+
+  Ebml_WriteID(glob, class_id);
+  Ebml_Serialize(glob, &len, sizeof(len), 1);
+  Ebml_Serialize(glob,  &d, sizeof(d), 8);
+}
+
+void Ebml_WriteSigned16(EbmlGlobal *glob, short val) {
+  signed long out = ((val & 0x003FFFFF) | 0x00200000) << 8;
+  Ebml_Serialize(glob, &out, sizeof(out), 3);
+}
+
+void Ebml_SerializeString(EbmlGlobal *glob, unsigned long class_id, const char *s) {
+  Ebml_WriteID(glob, class_id);
+  Ebml_WriteString(glob, s);
+}
+
+void Ebml_SerializeUTF8(EbmlGlobal *glob, unsigned long class_id, wchar_t *s) {
+  Ebml_WriteID(glob,  class_id);
+  Ebml_WriteUTF8(glob,  s);
+}
+
+void Ebml_SerializeData(EbmlGlobal *glob, unsigned long class_id, unsigned char *data, unsigned long data_length) {
+  Ebml_WriteID(glob, class_id);
+  Ebml_WriteLen(glob, data_length);
+  Ebml_Write(glob,  data, data_length);
+}
+
+void Ebml_WriteVoid(EbmlGlobal *glob, unsigned long vSize) {
+  unsigned char tmp = 0;
+  unsigned long i = 0;
+
+  Ebml_WriteID(glob, 0xEC);
+  Ebml_WriteLen(glob, vSize);
+
+  for (i = 0; i < vSize; i++) {
+    Ebml_Write(glob, &tmp, 1);
+  }
+}
+
+/* TODO Serialize Date */
--- a/third_party/libmkv/EbmlWriter.h
+++ b/third_party/libmkv/EbmlWriter.h
@@ -0,0 +1,42 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef EBMLWRITER_HPP
+#define EBMLWRITER_HPP
+#include <stddef.h>
+#include "vpx/vpx_integer.h"
+
+/* note: you must define write and serialize functions as well as your own
+ * EBML_GLOBAL
+ *
+ * These functions MUST be implemented
+ */
+
+typedef struct EbmlGlobal EbmlGlobal;
+void  Ebml_Serialize(EbmlGlobal *glob, const void *, int, unsigned long);
+void  Ebml_Write(EbmlGlobal *glob, const void *, unsigned long);
+
+/*****/
+
+void Ebml_WriteLen(EbmlGlobal *glob, int64_t val);
+void Ebml_WriteString(EbmlGlobal *glob, const char *str);
+void Ebml_WriteUTF8(EbmlGlobal *glob, const wchar_t *wstr);
+void Ebml_WriteID(EbmlGlobal *glob, unsigned long class_id);
+void Ebml_SerializeUnsigned64(EbmlGlobal *glob, unsigned long class_id, uint64_t ui);
+void Ebml_SerializeUnsigned(EbmlGlobal *glob, unsigned long class_id, unsigned long ui);
+void Ebml_SerializeBinary(EbmlGlobal *glob, unsigned long class_id, unsigned long ui);
+void Ebml_SerializeFloat(EbmlGlobal *glob, unsigned long class_id, double d);
+/* TODO make this more generic to signed */
+void Ebml_WriteSigned16(EbmlGlobal *glob, short val);
+void Ebml_SerializeString(EbmlGlobal *glob, unsigned long class_id, const char *s);
+void Ebml_SerializeUTF8(EbmlGlobal *glob, unsigned long class_id, wchar_t *s);
+void Ebml_SerializeData(EbmlGlobal *glob, unsigned long class_id, unsigned char *data, unsigned long data_length);
+void Ebml_WriteVoid(EbmlGlobal *glob, unsigned long vSize);
+/* TODO need date function */
+#endif
--- a/tools_common.h
+++ b/tools_common.h
@@ -90,6 +90,7 @@ struct VpxInputContext {
  uint32_t width;
  uint32_t height;
  vpx_img_fmt_t fmt;
+  vpx_bit_depth_t bit_depth;
  int only_i420;
  uint32_t fourcc;
  struct VpxRational framerate;
--- a/vp8/common/postproc.c
+++ b/vp8/common/postproc.c
@@ -393,12 +393,12 @@ void vp8_de_noise(VP8_COMMON                 *cm,
                  int                         low_var_thresh,
                  int                         flag)
 {
+    int mbr;
    double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
    int ppl = (int)(level + .5);
-    int mb_rows = source->y_width >> 4;
-    int mb_cols = source->y_height >> 4;
+    int mb_rows = cm->mb_rows;
+    int mb_cols = cm->mb_cols;
    unsigned char *limits = cm->pp_limits_buffer;;
-    int mbr, mbc;
    (void) post;
    (void) low_var_thresh;
    (void) flag;
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl
@@ -463,9 +463,7 @@ $vp8_short_walsh4x4_neon_asm=vp8_short_walsh4x4_neon;
 # Quantizer
 #
 add_proto qw/void vp8_regular_quantize_b/, "struct block *, struct blockd *";
-specialize qw/vp8_regular_quantize_b sse2/;
-# TODO(johann) Update sse4 implementation and re-enable
-#$vp8_regular_quantize_b_sse4_1=vp8_regular_quantize_b_sse4;
+specialize qw/vp8_regular_quantize_b sse2 sse4_1/;

 add_proto qw/void vp8_fast_quantize_b/, "struct block *, struct blockd *";
 specialize qw/vp8_fast_quantize_b sse2 ssse3 media neon_asm/;
@@ -554,6 +552,9 @@ $vp8_yv12_copy_partial_frame_neon_asm=vp8_yv12_copy_partial_frame_neon;
 if (vpx_config("CONFIG_TEMPORAL_DENOISING") eq "yes") {
    add_proto qw/int vp8_denoiser_filter/, "unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising";
    specialize qw/vp8_denoiser_filter sse2 neon/;
+    add_proto qw/int vp8_denoiser_filter_uv/, "unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising";
+    specialize qw/vp8_denoiser_filter_uv sse2/;
+
 }

 # End of encoder only functions
--- a/vp8/common/x86/postproc_mmx.asm
+++ b/vp8/common/x86/postproc_mmx.asm
@@ -246,7 +246,6 @@ sym(vp8_mbpost_proc_down_mmx):
 ;                            unsigned char whiteclamp[16],
 ;                            unsigned char bothclamp[16],
 ;                            unsigned int Width, unsigned int Height, int Pitch)
-extern sym(rand)
 global sym(vp8_plane_add_noise_mmx) PRIVATE
 sym(vp8_plane_add_noise_mmx):
    push        rbp
@@ -258,7 +257,7 @@ sym(vp8_plane_add_noise_mmx):
    ; end prolog

 .addnoise_loop:
-    call sym(rand) WRT_PLT
+    call sym(LIBVPX_RAND) WRT_PLT
    mov     rcx, arg(1) ;noise
    and     rax, 0xff
    add     rcx, rax
--- a/vp8/common/x86/postproc_sse2.asm
+++ b/vp8/common/x86/postproc_sse2.asm
@@ -660,7 +660,6 @@ sym(vp8_mbpost_proc_across_ip_xmm):
 ;                            unsigned char whiteclamp[16],
 ;                            unsigned char bothclamp[16],
 ;                            unsigned int Width, unsigned int Height, int Pitch)
-extern sym(rand)
 global sym(vp8_plane_add_noise_wmt) PRIVATE
 sym(vp8_plane_add_noise_wmt):
    push        rbp
@@ -672,7 +671,7 @@ sym(vp8_plane_add_noise_wmt):
    ; end prolog

 .addnoise_loop:
-    call sym(rand) WRT_PLT
+    call sym(LIBVPX_RAND) WRT_PLT
    mov     rcx, arg(1) ;noise
    and     rax, 0xff
    add     rcx, rax
--- a/vp8/common/x86/postproc_x86.c
+++ b/vp8/common/x86/postproc_x86.c
@@ -1,24 +0,0 @@
-/*
- *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-/* On Android NDK, rand is inlined function, but postproc needs rand symbol */
-#if defined(__ANDROID__)
-#define rand __rand
-#include <stdlib.h>
-#undef rand
-
-extern int rand(void)
-{
-  return __rand();
-}
-#else
-/* ISO C forbids an empty translation unit. */
-int vp8_unused;
-#endif
--- a/vp8/encoder/bitstream.h
+++ b/vp8/encoder/bitstream.h
@@ -18,18 +18,18 @@ extern "C" {

 #if HAVE_EDSP
 void vp8cx_pack_tokens_armv5(vp8_writer *w, const TOKENEXTRA *p, int xcount,
-                             const vp8_token *,
+                             vp8_token *,
                             const vp8_extra_bit_struct *,
                             const vp8_tree_index *);
 void vp8cx_pack_tokens_into_partitions_armv5(VP8_COMP *,
                                             unsigned char * cx_data,
                                             const unsigned char *cx_data_end,
                                             int num_parts,
-                                             const vp8_token *,
+                                             vp8_token *,
                                             const vp8_extra_bit_struct *,
                                             const vp8_tree_index *);
 void vp8cx_pack_mb_row_tokens_armv5(VP8_COMP *cpi, vp8_writer *w,
-                                    const vp8_token *,
+                                    vp8_token *,
                                    const vp8_extra_bit_struct *,
                                    const vp8_tree_index *);
 # define pack_tokens(a,b,c)                  \
--- a/vp8/encoder/denoising.c
+++ b/vp8/encoder/denoising.c
@@ -191,10 +191,154 @@ int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride,
    return FILTER_BLOCK;
 }

-int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height)
+int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv,
+                             int mc_avg_uv_stride,
+                             unsigned char *running_avg_uv,
+                             int avg_uv_stride,
+                             unsigned char *sig,
+                             int sig_stride,
+                             unsigned int motion_magnitude,
+                             int increase_denoising) {
+    unsigned char *running_avg_uv_start = running_avg_uv;
+    unsigned char *sig_start = sig;
+    int sum_diff_thresh;
+    int r, c;
+    int sum_diff = 0;
+    int sum_block = 0;
+    int adj_val[3] = {3, 4, 6};
+    int shift_inc1 = 0;
+    int shift_inc2 = 1;
+    /* If motion_magnitude is small, making the denoiser more aggressive by
+     * increasing the adjustment for each level. Add another increment for
+     * blocks that are labeled for increase denoising. */
+    if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV) {
+      if (increase_denoising) {
+        shift_inc1 = 1;
+        shift_inc2 = 2;
+      }
+      adj_val[0] += shift_inc2;
+      adj_val[1] += shift_inc2;
+      adj_val[2] += shift_inc2;
+    }
+
+    // Avoid denoising color signal if its close to average level.
+    for (r = 0; r < 8; ++r) {
+      for (c = 0; c < 8; ++c) {
+        sum_block += sig[c];
+      }
+      sig += sig_stride;
+    }
+    if (abs(sum_block - (128 * 8 * 8)) < SUM_DIFF_FROM_AVG_THRESH_UV) {
+      return COPY_BLOCK;
+    }
+
+    sig -= sig_stride * 8;
+    for (r = 0; r < 8; ++r) {
+      for (c = 0; c < 8; ++c) {
+        int diff = 0;
+        int adjustment = 0;
+        int absdiff = 0;
+
+        diff = mc_running_avg_uv[c] - sig[c];
+        absdiff = abs(diff);
+
+        // When |diff| <= |3 + shift_inc1|, use pixel value from
+        // last denoised raw.
+        if (absdiff <= 3 + shift_inc1) {
+          running_avg_uv[c] = mc_running_avg_uv[c];
+          sum_diff += diff;
+        } else {
+          if (absdiff >= 4 && absdiff <= 7)
+            adjustment = adj_val[0];
+          else if (absdiff >= 8 && absdiff <= 15)
+            adjustment = adj_val[1];
+          else
+            adjustment = adj_val[2];
+          if (diff > 0) {
+            if ((sig[c] + adjustment) > 255)
+              running_avg_uv[c] = 255;
+            else
+              running_avg_uv[c] = sig[c] + adjustment;
+            sum_diff += adjustment;
+          } else {
+            if ((sig[c] - adjustment) < 0)
+              running_avg_uv[c] = 0;
+            else
+              running_avg_uv[c] = sig[c] - adjustment;
+            sum_diff -= adjustment;
+          }
+        }
+      }
+      /* Update pointers for next iteration. */
+      sig += sig_stride;
+      mc_running_avg_uv += mc_avg_uv_stride;
+      running_avg_uv += avg_uv_stride;
+    }
+
+    sum_diff_thresh= SUM_DIFF_THRESHOLD_UV;
+    if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH_UV;
+    if (abs(sum_diff) > sum_diff_thresh) {
+      // Before returning to copy the block (i.e., apply no denoising), check
+      // if we can still apply some (weaker) temporal filtering to this block,
+      // that would otherwise not be denoised at all. Simplest is to apply
+      // an additional adjustment to running_avg_y to bring it closer to sig.
+      // The adjustment is capped by a maximum delta, and chosen such that
+      // in most cases the resulting sum_diff will be within the
+      // accceptable range given by sum_diff_thresh.
+
+      // The delta is set by the excess of absolute pixel diff over threshold.
+      int delta = ((abs(sum_diff) - sum_diff_thresh) >> 8) + 1;
+      // Only apply the adjustment for max delta up to 3.
+      if (delta < 4) {
+        sig -= sig_stride * 8;
+        mc_running_avg_uv -= mc_avg_uv_stride * 8;
+        running_avg_uv -= avg_uv_stride * 8;
+        for (r = 0; r < 8; ++r) {
+          for (c = 0; c < 8; ++c) {
+            int diff = mc_running_avg_uv[c] - sig[c];
+            int adjustment = abs(diff);
+            if (adjustment > delta)
+              adjustment = delta;
+            if (diff > 0) {
+              // Bring denoised signal down.
+              if (running_avg_uv[c] - adjustment < 0)
+                running_avg_uv[c] = 0;
+              else
+                running_avg_uv[c] = running_avg_uv[c] - adjustment;
+              sum_diff -= adjustment;
+            } else if (diff < 0) {
+              // Bring denoised signal up.
+              if (running_avg_uv[c] + adjustment > 255)
+                running_avg_uv[c] = 255;
+              else
+                running_avg_uv[c] = running_avg_uv[c] + adjustment;
+              sum_diff += adjustment;
+            }
+          }
+          // TODO(marpan): Check here if abs(sum_diff) has gone below the
+          // threshold sum_diff_thresh, and if so, we can exit the row loop.
+          sig += sig_stride;
+          mc_running_avg_uv += mc_avg_uv_stride;
+          running_avg_uv += avg_uv_stride;
+        }
+        if (abs(sum_diff) > sum_diff_thresh)
+          return COPY_BLOCK;
+      } else {
+        return COPY_BLOCK;
+      }
+    }
+
+    vp8_copy_mem8x8(running_avg_uv_start, avg_uv_stride, sig_start,
+                    sig_stride);
+    return FILTER_BLOCK;
+}
+
+int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height,
+                          int num_mb_rows, int num_mb_cols)
 {
    int i;
    assert(denoiser);
+    denoiser->num_mb_cols = num_mb_cols;

    for (i = 0; i < MAX_REF_FRAMES; i++)
    {
@@ -222,6 +366,10 @@ int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height)

    vpx_memset(denoiser->yv12_mc_running_avg.buffer_alloc, 0,
               denoiser->yv12_mc_running_avg.frame_size);
+
+    denoiser->denoise_state = vpx_calloc((num_mb_rows * num_mb_cols), 1);
+    vpx_memset(denoiser->denoise_state, 0, (num_mb_rows * num_mb_cols));
+
    return 0;
 }

@@ -235,6 +383,7 @@ void vp8_denoiser_free(VP8_DENOISER *denoiser)
        vp8_yv12_de_alloc_frame_buffer(&denoiser->yv12_running_avg[i]);
    }
    vp8_yv12_de_alloc_frame_buffer(&denoiser->yv12_mc_running_avg);
+    vpx_free(denoiser->denoise_state);
 }


@@ -243,17 +392,28 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
                             unsigned int best_sse,
                             unsigned int zero_mv_sse,
                             int recon_yoffset,
-                             int recon_uvoffset)
+                             int recon_uvoffset,
+                             loop_filter_info_n *lfi_n,
+                             int mb_row,
+                             int mb_col,
+                             int block_index)
 {
    int mv_row;
    int mv_col;
    unsigned int motion_magnitude2;
    unsigned int sse_thresh;
    int sse_diff_thresh = 0;
+    // Denoise the UV channel.
+    int apply_color_denoise = 0;
+    // Spatial loop filter: only applied selectively based on
+    // temporal filter state of block relative to top/left neighbors.
+    int apply_spatial_loop_filter = 1;
    MV_REFERENCE_FRAME frame = x->best_reference_frame;
    MV_REFERENCE_FRAME zero_frame = x->best_zeromv_reference_frame;

    enum vp8_denoiser_decision decision = FILTER_BLOCK;
+    enum vp8_denoiser_decision decision_u = FILTER_BLOCK;
+    enum vp8_denoiser_decision decision_v = FILTER_BLOCK;

    if (zero_frame)
    {
@@ -263,7 +423,11 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
        MB_MODE_INFO saved_mbmi;
        MACROBLOCKD *filter_xd = &x->e_mbd;
        MB_MODE_INFO *mbmi = &filter_xd->mode_info_context->mbmi;
-        int sse_diff = zero_mv_sse - best_sse;
+        int sse_diff = 0;
+        // Bias on zero motion vector sse.
+        int zero_bias = 95;
+        zero_mv_sse = (unsigned int)((int64_t)zero_mv_sse * zero_bias / 100);
+        sse_diff = zero_mv_sse - best_sse;

        saved_mbmi = *mbmi;

@@ -359,9 +523,37 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,

        /* Filter. */
        decision = vp8_denoiser_filter(mc_running_avg_y, mc_avg_y_stride,
-                                         running_avg_y, avg_y_stride,
-                                         x->thismb, 16, motion_magnitude2,
-                                         x->increase_denoising);
+                                       running_avg_y, avg_y_stride,
+                                       x->thismb, 16, motion_magnitude2,
+                                       x->increase_denoising);
+        denoiser->denoise_state[block_index] = motion_magnitude2 > 0 ?
+            kFilterNonZeroMV : kFilterZeroMV;
+        // Only denoise UV for zero motion, and if y channel was denoised.
+        if (apply_color_denoise &&
+            motion_magnitude2 == 0 &&
+            decision == FILTER_BLOCK) {
+          unsigned char *mc_running_avg_u =
+              denoiser->yv12_mc_running_avg.u_buffer + recon_uvoffset;
+          unsigned char *running_avg_u =
+              denoiser->yv12_running_avg[INTRA_FRAME].u_buffer + recon_uvoffset;
+          unsigned char *mc_running_avg_v =
+              denoiser->yv12_mc_running_avg.v_buffer + recon_uvoffset;
+          unsigned char *running_avg_v =
+              denoiser->yv12_running_avg[INTRA_FRAME].v_buffer + recon_uvoffset;
+          int mc_avg_uv_stride = denoiser->yv12_mc_running_avg.uv_stride;
+          int avg_uv_stride = denoiser->yv12_running_avg[INTRA_FRAME].uv_stride;
+          int signal_stride = x->block[16].src_stride;
+          decision_u =
+              vp8_denoiser_filter_uv(mc_running_avg_u, mc_avg_uv_stride,
+                                      running_avg_u, avg_uv_stride,
+                                      x->block[16].src + *x->block[16].base_src,
+                                      signal_stride, motion_magnitude2, 0);
+          decision_v =
+              vp8_denoiser_filter_uv(mc_running_avg_v, mc_avg_uv_stride,
+                                      running_avg_v, avg_uv_stride,
+                                      x->block[20].src + *x->block[20].base_src,
+                                      signal_stride, motion_magnitude2, 0);
+        }
    }
    if (decision == COPY_BLOCK)
    {
@@ -372,5 +564,73 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
                x->thismb, 16,
                denoiser->yv12_running_avg[INTRA_FRAME].y_buffer + recon_yoffset,
                denoiser->yv12_running_avg[INTRA_FRAME].y_stride);
+        denoiser->denoise_state[block_index] = kNoFilter;
+    }
+    if (apply_color_denoise) {
+      if (decision_u == COPY_BLOCK) {
+        vp8_copy_mem8x8(
+            x->block[16].src + *x->block[16].base_src, x->block[16].src_stride,
+            denoiser->yv12_running_avg[INTRA_FRAME].u_buffer + recon_uvoffset,
+            denoiser->yv12_running_avg[INTRA_FRAME].uv_stride);
+      }
+      if (decision_v == COPY_BLOCK) {
+        vp8_copy_mem8x8(
+            x->block[20].src + *x->block[20].base_src, x->block[16].src_stride,
+            denoiser->yv12_running_avg[INTRA_FRAME].v_buffer + recon_uvoffset,
+            denoiser->yv12_running_avg[INTRA_FRAME].uv_stride);
+      }
+    }
+    // Option to selectively deblock the denoised signal, for y channel only.
+    if (apply_spatial_loop_filter) {
+      loop_filter_info lfi;
+      int apply_filter_col = 0;
+      int apply_filter_row = 0;
+      int apply_filter = 0;
+      int y_stride = denoiser->yv12_running_avg[INTRA_FRAME].y_stride;
+      int uv_stride =denoiser->yv12_running_avg[INTRA_FRAME].uv_stride;
+
+      // Fix filter level to some nominal value for now.
+      int filter_level = 32;
+
+      int hev_index = lfi_n->hev_thr_lut[INTER_FRAME][filter_level];
+      lfi.mblim = lfi_n->mblim[filter_level];
+      lfi.blim = lfi_n->blim[filter_level];
+      lfi.lim = lfi_n->lim[filter_level];
+      lfi.hev_thr = lfi_n->hev_thr[hev_index];
+
+      // Apply filter if there is a difference in the denoiser filter state
+      // between the current and left/top block, or if non-zero motion vector
+      // is used for the motion-compensated filtering.
+      if (mb_col > 0) {
+        apply_filter_col = !((denoiser->denoise_state[block_index] ==
+            denoiser->denoise_state[block_index - 1]) &&
+            denoiser->denoise_state[block_index] != kFilterNonZeroMV);
+        if (apply_filter_col) {
+          // Filter left vertical edge.
+          apply_filter = 1;
+          vp8_loop_filter_mbv(
+              denoiser->yv12_running_avg[INTRA_FRAME].y_buffer + recon_yoffset,
+              NULL, NULL, y_stride, uv_stride, &lfi);
+        }
+      }
+      if (mb_row > 0) {
+        apply_filter_row = !((denoiser->denoise_state[block_index] ==
+            denoiser->denoise_state[block_index - denoiser->num_mb_cols]) &&
+            denoiser->denoise_state[block_index] != kFilterNonZeroMV);
+        if (apply_filter_row) {
+          // Filter top horizontal edge.
+          apply_filter = 1;
+          vp8_loop_filter_mbh(
+              denoiser->yv12_running_avg[INTRA_FRAME].y_buffer + recon_yoffset,
+              NULL, NULL, y_stride, uv_stride, &lfi);
+        }
+      }
+      if (apply_filter) {
+        // Update the signal block |x|. Pixel changes are only to top and/or
+        // left boundary pixels: can we avoid full block copy here.
+        vp8_copy_mem16x16(
+            denoiser->yv12_running_avg[INTRA_FRAME].y_buffer + recon_yoffset,
+            y_stride, x->thismb, 16);
+      }
    }
 }
--- a/vp8/encoder/denoising.h
+++ b/vp8/encoder/denoising.h
@@ -12,6 +12,7 @@
 #define VP8_ENCODER_DENOISING_H_

 #include "block.h"
+#include "vp8/common/loopfilter.h"

 #ifdef __cplusplus
 extern "C" {
@@ -21,19 +22,33 @@ extern "C" {
 #define SUM_DIFF_THRESHOLD_HIGH (16 * 16 * 3)
 #define MOTION_MAGNITUDE_THRESHOLD (8*3)

+#define SUM_DIFF_THRESHOLD_UV (96)   // (8 * 8 * 1.5)
+#define SUM_DIFF_THRESHOLD_HIGH_UV (8 * 8 * 2)
+#define SUM_DIFF_FROM_AVG_THRESH_UV (8 * 8 * 4)
+#define MOTION_MAGNITUDE_THRESHOLD_UV (8*3)
+
 enum vp8_denoiser_decision
 {
  COPY_BLOCK,
  FILTER_BLOCK
 };

+enum vp8_denoiser_filter_state {
+  kNoFilter,
+  kFilterZeroMV,
+  kFilterNonZeroMV
+};
+
 typedef struct vp8_denoiser
 {
    YV12_BUFFER_CONFIG yv12_running_avg[MAX_REF_FRAMES];
    YV12_BUFFER_CONFIG yv12_mc_running_avg;
+    unsigned char* denoise_state;
+    int num_mb_cols;
 } VP8_DENOISER;

-int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height);
+int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height,
+                          int num_mb_rows, int num_mb_cols);

 void vp8_denoiser_free(VP8_DENOISER *denoiser);

@@ -42,7 +57,11 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
                             unsigned int best_sse,
                             unsigned int zero_mv_sse,
                             int recon_yoffset,
-                             int recon_uvoffset);
+                             int recon_uvoffset,
+                             loop_filter_info_n *lfi_n,
+                             int mb_row,
+                             int mb_col,
+                             int block_index);

 #ifdef __cplusplus
 }  // extern "C"
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -1246,7 +1246,7 @@ int vp8cx_encode_inter_macroblock
            x->zbin_mode_boost_enabled = 0;
        }
        vp8_rd_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate,
-                               &distortion, &intra_error);
+                               &distortion, &intra_error, mb_row, mb_col);

        /* switch back to the regular quantizer for the encode */
        if (cpi->sf.improved_quant)
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -98,6 +98,9 @@ extern double vp8_calc_ssimg
 #ifdef OUTPUT_YUV_SRC
 FILE *yuv_file;
 #endif
+#ifdef OUTPUT_YUV_DENOISED
+FILE *yuv_denoised_file;
+#endif

 #if 0
 FILE *framepsnr;
@@ -1748,7 +1751,8 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf)
      {
        int width = (cpi->oxcf.Width + 15) & ~15;
        int height = (cpi->oxcf.Height + 15) & ~15;
-        vp8_denoiser_allocate(&cpi->denoiser, width, height);
+        vp8_denoiser_allocate(&cpi->denoiser, width, height,
+                              cpi->common.mb_rows, cpi->common.mb_cols);
      }
    }
 #endif
@@ -1961,6 +1965,9 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
 #ifdef OUTPUT_YUV_SRC
    yuv_file = fopen("bd.yuv", "ab");
 #endif
+#ifdef OUTPUT_YUV_DENOISED
+    yuv_denoised_file = fopen("denoised.yuv", "ab");
+#endif

 #if 0
    framepsnr = fopen("framepsnr.stt", "a");
@@ -2410,6 +2417,9 @@ void vp8_remove_compressor(VP8_COMP **ptr)
 #ifdef OUTPUT_YUV_SRC
    fclose(yuv_file);
 #endif
+#ifdef OUTPUT_YUV_DENOISED
+    fclose(yuv_denoised_file);
+#endif

 #if 0

@@ -2610,7 +2620,7 @@ int vp8_update_entropy(VP8_COMP *cpi, int update)
 }


-#if OUTPUT_YUV_SRC
+#if defined(OUTPUT_YUV_SRC) || defined(OUTPUT_YUV_DENOISED)
 void vp8_write_yuv_frame(FILE *yuv_file, YV12_BUFFER_CONFIG *s)
 {
    unsigned char *src = s->y_buffer;
@@ -4430,6 +4440,11 @@ static void encode_frame_to_data_rate

    update_reference_frames(cpi);

+#ifdef OUTPUT_YUV_DENOISED
+    vp8_write_yuv_frame(yuv_denoised_file,
+                        &cpi->denoiser.yv12_running_avg[INTRA_FRAME]);
+#endif
+
 #if !(CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
    if (cpi->oxcf.error_resilient_mode)
    {
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -1168,6 +1168,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
 #if CONFIG_TEMPORAL_DENOISING
    if (cpi->oxcf.noise_sensitivity)
    {
+        int block_index = mb_row * cpi->common.mb_cols + mb_col;
        if (x->best_sse_inter_mode == DC_PRED)
        {
            /* No best MV found. */
@@ -1179,7 +1180,9 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
        }
        x->increase_denoising = 0;
        vp8_denoiser_denoise_mb(&cpi->denoiser, x, best_sse, zero_mv_sse,
-                                recon_yoffset, recon_uvoffset);
+                                recon_yoffset, recon_uvoffset,
+                                &cpi->common.lf_info, mb_row, mb_col,
+                                block_index);


        /* Reevaluate ZEROMV after denoising. */
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -1935,7 +1935,8 @@ static void update_best_mode(BEST_MODE* best_mode, int this_rd,

 void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
                            int recon_uvoffset, int *returnrate,
-                            int *returndistortion, int *returnintra)
+                            int *returndistortion, int *returnintra,
+                            int mb_row, int mb_col)
 {
    BLOCK *b = &x->block[0];
    BLOCKD *d = &x->e_mbd.block[0];
@@ -2510,6 +2511,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
 #if CONFIG_TEMPORAL_DENOISING
    if (cpi->oxcf.noise_sensitivity)
    {
+        int block_index = mb_row * cpi->common.mb_cols + mb_col;
        if (x->best_sse_inter_mode == DC_PRED)
        {
            /* No best MV found. */
@@ -2520,7 +2522,9 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
            best_sse = best_rd_sse;
        }
        vp8_denoiser_denoise_mb(&cpi->denoiser, x, best_sse, zero_mv_sse,
-                                recon_yoffset, recon_uvoffset);
+                                recon_yoffset, recon_uvoffset,
+                                &cpi->common.lf_info, mb_row, mb_col,
+                                block_index);


        /* Reevaluate ZEROMV after denoising. */
--- a/vp8/encoder/rdopt.h
+++ b/vp8/encoder/rdopt.h
@@ -70,7 +70,10 @@ static void insertsortsad(int arr[],int idx[], int len)
 }

 extern void vp8_initialize_rd_consts(VP8_COMP *cpi, MACROBLOCK *x, int Qvalue);
-extern void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra);
+extern void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x,
+                                   int recon_yoffset, int recon_uvoffset,
+                                   int *returnrate, int *returndistortion,
+                                   int *returnintra, int mb_row, int mb_col);
 extern void vp8_rd_pick_intra_mode(MACROBLOCK *x, int *rate);


--- a/vp8/encoder/x86/denoising_sse2.c
+++ b/vp8/encoder/x86/denoising_sse2.c
@@ -17,10 +17,23 @@
 #include <emmintrin.h>
 #include "vpx_ports/emmintrin_compat.h"

-union sum_union {
-    __m128i v;
-    signed char e[16];
-};
+/* Compute the sum of all pixel differences of this MB. */
+static INLINE unsigned int abs_sum_diff_16x1(__m128i acc_diff) {
+  const __m128i k_1 = _mm_set1_epi16(1);
+  const __m128i acc_diff_lo = _mm_srai_epi16(
+      _mm_unpacklo_epi8(acc_diff, acc_diff), 8);
+  const __m128i acc_diff_hi = _mm_srai_epi16(
+      _mm_unpackhi_epi8(acc_diff, acc_diff), 8);
+  const __m128i acc_diff_16 = _mm_add_epi16(acc_diff_lo, acc_diff_hi);
+  const __m128i hg_fe_dc_ba = _mm_madd_epi16(acc_diff_16, k_1);
+  const __m128i hgfe_dcba = _mm_add_epi32(hg_fe_dc_ba,
+                                          _mm_srli_si128(hg_fe_dc_ba, 8));
+  const __m128i hgfedcba = _mm_add_epi32(hgfe_dcba,
+                                         _mm_srli_si128(hgfe_dcba, 4));
+  unsigned int sum_diff = _mm_cvtsi128_si32(hgfedcba);
+
+  return abs(sum_diff);
+}

 int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y,
                             int mc_avg_y_stride,
@@ -31,7 +44,7 @@ int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y,
 {
    unsigned char *running_avg_y_start = running_avg_y;
    unsigned char *sig_start = sig;
-    int sum_diff_thresh;
+    unsigned int sum_diff_thresh;
    int r;
    int shift_inc  = (increase_denoising &&
        motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 1 : 0;
@@ -103,16 +116,10 @@ int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y,

    {
        /* Compute the sum of all pixel differences of this MB. */
-        union sum_union s;
-        int sum_diff = 0;
-        s.v = acc_diff;
-        sum_diff = s.e[0] + s.e[1] + s.e[2] + s.e[3] + s.e[4] + s.e[5]
-                 + s.e[6] + s.e[7] + s.e[8] + s.e[9] + s.e[10] + s.e[11]
-                 + s.e[12] + s.e[13] + s.e[14] + s.e[15];
-
+        unsigned int abs_sum_diff = abs_sum_diff_16x1(acc_diff);
        sum_diff_thresh = SUM_DIFF_THRESHOLD;
        if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH;
-        if (abs(sum_diff) > sum_diff_thresh) {
+        if (abs_sum_diff > sum_diff_thresh) {
          // Before returning to copy the block (i.e., apply no denoising),
          // checK if we can still apply some (weaker) temporal filtering to
          // this block, that would otherwise not be denoised at all. Simplest
@@ -123,7 +130,7 @@ int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y,

          // The delta is set by the excess of absolute pixel diff over the
          // threshold.
-          int delta = ((abs(sum_diff) - sum_diff_thresh) >> 8) + 1;
+          int delta = ((abs_sum_diff - sum_diff_thresh) >> 8) + 1;
          // Only apply the adjustment for max delta up to 3.
          if (delta < 4) {
            const __m128i k_delta = _mm_set1_epi8(delta);
@@ -162,16 +169,9 @@ int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y,
             mc_running_avg_y += mc_avg_y_stride;
             running_avg_y += avg_y_stride;
            }
-            {
-              // Update the sum of all pixel differences of this MB.
-              union sum_union s;
-              s.v = acc_diff;
-              sum_diff = s.e[0] + s.e[1] + s.e[2] + s.e[3] + s.e[4] + s.e[5]
-                       + s.e[6] + s.e[7] + s.e[8] + s.e[9] + s.e[10] + s.e[11]
-                       + s.e[12] + s.e[13] + s.e[14] + s.e[15];
-              if (abs(sum_diff) > sum_diff_thresh) {
-                return COPY_BLOCK;
-              }
+            abs_sum_diff = abs_sum_diff_16x1(acc_diff);
+            if (abs_sum_diff > sum_diff_thresh) {
+              return COPY_BLOCK;
            }
          } else {
            return COPY_BLOCK;
@@ -182,3 +182,198 @@ int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y,
    vp8_copy_mem16x16(running_avg_y_start, avg_y_stride, sig_start, sig_stride);
    return FILTER_BLOCK;
 }
+
+int vp8_denoiser_filter_uv_sse2(unsigned char *mc_running_avg,
+                             int mc_avg_stride,
+                             unsigned char *running_avg, int avg_stride,
+                             unsigned char *sig, int sig_stride,
+                             unsigned int motion_magnitude,
+                             int increase_denoising) {
+    unsigned char *running_avg_start = running_avg;
+    unsigned char *sig_start = sig;
+    unsigned int sum_diff_thresh;
+    int r;
+    int shift_inc  = (increase_denoising &&
+        motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV) ? 1 : 0;
+    __m128i acc_diff = _mm_setzero_si128();
+    const __m128i k_0 = _mm_setzero_si128();
+    const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);
+    const __m128i k_8 = _mm_set1_epi8(8);
+    const __m128i k_16 = _mm_set1_epi8(16);
+    /* Modify each level's adjustment according to motion_magnitude. */
+    const __m128i l3 = _mm_set1_epi8(
+                       (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV) ?
+                        7 + shift_inc : 6);
+    /* Difference between level 3 and level 2 is 2. */
+    const __m128i l32 = _mm_set1_epi8(2);
+    /* Difference between level 2 and level 1 is 1. */
+    const __m128i l21 = _mm_set1_epi8(1);
+
+    {
+      const __m128i k_1 = _mm_set1_epi16(1);
+      __m128i vec_sum_block = _mm_setzero_si128();
+
+      // Avoid denoising color signal if its close to average level.
+      for (r = 0; r < 8; ++r) {
+        const __m128i v_sig = _mm_loadl_epi64((__m128i *)(&sig[0]));
+        const __m128i v_sig_unpack = _mm_unpacklo_epi8(v_sig, k_0);
+        vec_sum_block = _mm_add_epi16(vec_sum_block, v_sig_unpack);
+        sig += sig_stride;
+      }
+      sig -= sig_stride * 8;
+      {
+        const __m128i hg_fe_dc_ba = _mm_madd_epi16(vec_sum_block, k_1);
+        const __m128i hgfe_dcba = _mm_add_epi32(hg_fe_dc_ba,
+                                                _mm_srli_si128(hg_fe_dc_ba, 8));
+        const __m128i hgfedcba = _mm_add_epi32(hgfe_dcba,
+                                               _mm_srli_si128(hgfe_dcba, 4));
+        const int sum_block = _mm_cvtsi128_si32(hgfedcba);
+        if (abs(sum_block - (128 * 8 * 8)) < SUM_DIFF_FROM_AVG_THRESH_UV) {
+          return COPY_BLOCK;
+        }
+      }
+    }
+
+    for (r = 0; r < 4; ++r) {
+        /* Calculate differences */
+        const __m128i v_sig_low = _mm_castpd_si128(
+            _mm_load_sd((double *)(&sig[0])));
+        const __m128i v_sig = _mm_castpd_si128(
+            _mm_loadh_pd(_mm_castsi128_pd(v_sig_low),
+                         (double *)(&sig[sig_stride])));
+        const __m128i v_mc_running_avg_low = _mm_castpd_si128(
+            _mm_load_sd((double *)(&mc_running_avg[0])));
+        const __m128i v_mc_running_avg = _mm_castpd_si128(
+            _mm_loadh_pd(_mm_castsi128_pd(v_mc_running_avg_low),
+                         (double *)(&mc_running_avg[mc_avg_stride])));
+        const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg, v_sig);
+        const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg);
+        /* Obtain the sign. FF if diff is negative. */
+        const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
+        /* Clamp absolute difference to 16 to be used to get mask. Doing this
+         * allows us to use _mm_cmpgt_epi8, which operates on signed byte. */
+        const __m128i clamped_absdiff = _mm_min_epu8(
+                                        _mm_or_si128(pdiff, ndiff), k_16);
+        /* Get masks for l2 l1 and l0 adjustments */
+        const __m128i mask2 = _mm_cmpgt_epi8(k_16, clamped_absdiff);
+        const __m128i mask1 = _mm_cmpgt_epi8(k_8, clamped_absdiff);
+        const __m128i mask0 = _mm_cmpgt_epi8(k_4, clamped_absdiff);
+        /* Get adjustments for l2, l1, and l0 */
+        __m128i adj2 = _mm_and_si128(mask2, l32);
+        const __m128i adj1 = _mm_and_si128(mask1, l21);
+        const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff);
+        __m128i adj,  padj, nadj;
+        __m128i v_running_avg;
+
+        /* Combine the adjustments and get absolute adjustments. */
+        adj2 = _mm_add_epi8(adj2, adj1);
+        adj = _mm_sub_epi8(l3, adj2);
+        adj = _mm_andnot_si128(mask0, adj);
+        adj = _mm_or_si128(adj, adj0);
+
+        /* Restore the sign and get positive and negative adjustments. */
+        padj = _mm_andnot_si128(diff_sign, adj);
+        nadj = _mm_and_si128(diff_sign, adj);
+
+        /* Calculate filtered value. */
+        v_running_avg = _mm_adds_epu8(v_sig, padj);
+        v_running_avg = _mm_subs_epu8(v_running_avg, nadj);
+
+        _mm_storel_pd((double *)&running_avg[0],
+                      _mm_castsi128_pd(v_running_avg));
+        _mm_storeh_pd((double *)&running_avg[avg_stride],
+                      _mm_castsi128_pd(v_running_avg));
+
+        /* Adjustments <=7, and each element in acc_diff can fit in signed
+         * char.
+         */
+        acc_diff = _mm_adds_epi8(acc_diff, padj);
+        acc_diff = _mm_subs_epi8(acc_diff, nadj);
+
+        /* Update pointers for next iteration. */
+        sig += sig_stride * 2;
+        mc_running_avg += mc_avg_stride * 2;
+        running_avg += avg_stride * 2;
+    }
+
+    {
+        unsigned int abs_sum_diff = abs_sum_diff_16x1(acc_diff);
+        sum_diff_thresh = SUM_DIFF_THRESHOLD_UV;
+        if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH_UV;
+        if (abs_sum_diff > sum_diff_thresh) {
+          // Before returning to copy the block (i.e., apply no denoising),
+          // checK if we can still apply some (weaker) temporal filtering to
+          // this block, that would otherwise not be denoised at all. Simplest
+          // is to apply an additional adjustment to running_avg_y to bring it
+          // closer to sig. The adjustment is capped by a maximum delta, and
+          // chosen such that in most cases the resulting sum_diff will be
+          // within the accceptable range given by sum_diff_thresh.
+
+          // The delta is set by the excess of absolute pixel diff over the
+          // threshold.
+          int delta = ((abs_sum_diff - sum_diff_thresh) >> 8) + 1;
+          // Only apply the adjustment for max delta up to 3.
+          if (delta < 4) {
+            const __m128i k_delta = _mm_set1_epi8(delta);
+            sig -= sig_stride * 8;
+            mc_running_avg -= mc_avg_stride * 8;
+            running_avg -= avg_stride * 8;
+            for (r = 0; r < 4; ++r) {
+              // Calculate differences.
+              const __m128i v_sig_low = _mm_castpd_si128(
+                  _mm_load_sd((double *)(&sig[0])));
+              const __m128i v_sig = _mm_castpd_si128(
+                  _mm_loadh_pd(_mm_castsi128_pd(v_sig_low),
+                               (double *)(&sig[sig_stride])));
+              const __m128i v_mc_running_avg_low = _mm_castpd_si128(
+                  _mm_load_sd((double *)(&mc_running_avg[0])));
+              const __m128i v_mc_running_avg = _mm_castpd_si128(
+                  _mm_loadh_pd(_mm_castsi128_pd(v_mc_running_avg_low),
+                               (double *)(&mc_running_avg[mc_avg_stride])));
+              const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg, v_sig);
+              const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg);
+              // Obtain the sign. FF if diff is negative.
+              const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
+              // Clamp absolute difference to delta to get the adjustment.
+              const __m128i adj =
+                  _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta);
+              // Restore the sign and get positive and negative adjustments.
+              __m128i padj, nadj;
+              const __m128i v_running_avg_low = _mm_castpd_si128(
+                  _mm_load_sd((double *)(&running_avg[0])));
+              __m128i v_running_avg = _mm_castpd_si128(
+                  _mm_loadh_pd(_mm_castsi128_pd(v_running_avg_low),
+                               (double *)(&running_avg[avg_stride])));
+              padj = _mm_andnot_si128(diff_sign, adj);
+              nadj = _mm_and_si128(diff_sign, adj);
+              // Calculate filtered value.
+              v_running_avg = _mm_subs_epu8(v_running_avg, padj);
+              v_running_avg = _mm_adds_epu8(v_running_avg, nadj);
+
+              _mm_storel_pd((double *)&running_avg[0],
+                            _mm_castsi128_pd(v_running_avg));
+              _mm_storeh_pd((double *)&running_avg[avg_stride],
+                            _mm_castsi128_pd(v_running_avg));
+
+             // Accumulate the adjustments.
+             acc_diff = _mm_subs_epi8(acc_diff, padj);
+             acc_diff = _mm_adds_epi8(acc_diff, nadj);
+
+             // Update pointers for next iteration.
+             sig += sig_stride * 2;
+             mc_running_avg += mc_avg_stride * 2;
+             running_avg += avg_stride * 2;
+            }
+            abs_sum_diff = abs_sum_diff_16x1(acc_diff);
+            if (abs_sum_diff > sum_diff_thresh) {
+              return COPY_BLOCK;
+            }
+          } else {
+            return COPY_BLOCK;
+          }
+        }
+    }
+
+    vp8_copy_mem8x8(running_avg_start, avg_stride, sig_start, sig_stride);
+    return FILTER_BLOCK;
+}
--- a/vp8/encoder/x86/quantize_sse2.c
+++ b/vp8/encoder/x86/quantize_sse2.c
@@ -26,11 +26,10 @@
        int cmp = (x[z] < boost) | (y[z] == 0); \
        zbin_boost_ptr++; \
        if (cmp) \
-            goto select_eob_end_##i; \
+            break; \
        qcoeff_ptr[z] = y[z]; \
        eob = i; \
        zbin_boost_ptr = b->zrun_zbin_boost; \
-        select_eob_end_##i:; \
    } while (0)

 void vp8_regular_quantize_b_sse2(BLOCK *b, BLOCKD *d)
--- a/vp8/encoder/x86/quantize_sse4.asm
+++ b/vp8/encoder/x86/quantize_sse4.asm
@@ -1,256 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-%include "vp8_asm_enc_offsets.asm"
-
-
-; void vp8_regular_quantize_b_sse4 | arg
-;  (BLOCK  *b,                     |  0
-;   BLOCKD *d)                     |  1
-
-global sym(vp8_regular_quantize_b_sse4) PRIVATE
-sym(vp8_regular_quantize_b_sse4):
-
-%if ABI_IS_32BIT
-    push        rbp
-    mov         rbp, rsp
-    GET_GOT     rbx
-    push        rdi
-    push        rsi
-
-    ALIGN_STACK 16, rax
-    %define qcoeff      0 ; 32
-    %define stack_size 32
-    sub         rsp, stack_size
-%else
-  %if LIBVPX_YASM_WIN64
-    SAVE_XMM 8, u
-    push        rdi
-    push        rsi
-  %endif
-%endif
-    ; end prolog
-
-%if ABI_IS_32BIT
-    mov         rdi, arg(0)                 ; BLOCK *b
-    mov         rsi, arg(1)                 ; BLOCKD *d
-%else
-  %if LIBVPX_YASM_WIN64
-    mov         rdi, rcx                    ; BLOCK *b
-    mov         rsi, rdx                    ; BLOCKD *d
-  %else
-    ;mov         rdi, rdi                    ; BLOCK *b
-    ;mov         rsi, rsi                    ; BLOCKD *d
-  %endif
-%endif
-
-    mov         rax, [rdi + vp8_block_coeff]
-    mov         rcx, [rdi + vp8_block_zbin]
-    mov         rdx, [rdi + vp8_block_round]
-    movd        xmm7, [rdi + vp8_block_zbin_extra]
-
-    ; z
-    movdqa      xmm0, [rax]
-    movdqa      xmm1, [rax + 16]
-
-    ; duplicate zbin_oq_value
-    pshuflw     xmm7, xmm7, 0
-    punpcklwd   xmm7, xmm7
-
-    movdqa      xmm2, xmm0
-    movdqa      xmm3, xmm1
-
-    ; sz
-    psraw       xmm0, 15
-    psraw       xmm1, 15
-
-    ; (z ^ sz)
-    pxor        xmm2, xmm0
-    pxor        xmm3, xmm1
-
-    ; x = abs(z)
-    psubw       xmm2, xmm0
-    psubw       xmm3, xmm1
-
-    ; zbin
-    movdqa      xmm4, [rcx]
-    movdqa      xmm5, [rcx + 16]
-
-    ; *zbin_ptr + zbin_oq_value
-    paddw       xmm4, xmm7
-    paddw       xmm5, xmm7
-
-    movdqa      xmm6, xmm2
-    movdqa      xmm7, xmm3
-
-    ; x - (*zbin_ptr + zbin_oq_value)
-    psubw       xmm6, xmm4
-    psubw       xmm7, xmm5
-
-    ; round
-    movdqa      xmm4, [rdx]
-    movdqa      xmm5, [rdx + 16]
-
-    mov         rax, [rdi + vp8_block_quant_shift]
-    mov         rcx, [rdi + vp8_block_quant]
-    mov         rdx, [rdi + vp8_block_zrun_zbin_boost]
-
-    ; x + round
-    paddw       xmm2, xmm4
-    paddw       xmm3, xmm5
-
-    ; quant
-    movdqa      xmm4, [rcx]
-    movdqa      xmm5, [rcx + 16]
-
-    ; y = x * quant_ptr >> 16
-    pmulhw      xmm4, xmm2
-    pmulhw      xmm5, xmm3
-
-    ; y += x
-    paddw       xmm2, xmm4
-    paddw       xmm3, xmm5
-
-    pxor        xmm4, xmm4
-%if ABI_IS_32BIT
-    movdqa      [rsp + qcoeff], xmm4
-    movdqa      [rsp + qcoeff + 16], xmm4
-%else
-    pxor        xmm8, xmm8
-%endif
-
-    ; quant_shift
-    movdqa      xmm5, [rax]
-
-    ; zrun_zbin_boost
-    mov         rax, rdx
-
-%macro ZIGZAG_LOOP 5
-    ; x
-    pextrw      ecx, %4, %2
-
-    ; if (x >= zbin)
-    sub         cx, WORD PTR[rdx]           ; x - zbin
-    lea         rdx, [rdx + 2]              ; zbin_boost_ptr++
-    jl          .rq_zigzag_loop_%1          ; x < zbin
-
-    pextrw      edi, %3, %2                 ; y
-
-    ; downshift by quant_shift[rc]
-    pextrb      ecx, xmm5, %1               ; quant_shift[rc]
-    sar         edi, cl                     ; also sets Z bit
-    je          .rq_zigzag_loop_%1          ; !y
-%if ABI_IS_32BIT
-    mov         WORD PTR[rsp + qcoeff + %1 *2], di
-%else
-    pinsrw      %5, edi, %2                 ; qcoeff[rc]
-%endif
-    mov         rdx, rax                    ; reset to b->zrun_zbin_boost
-.rq_zigzag_loop_%1:
-%endmacro
-; in vp8_default_zig_zag1d order: see vp8/common/entropy.c
-ZIGZAG_LOOP  0, 0, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  1, 1, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  4, 4, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  8, 0, xmm3, xmm7, xmm8
-ZIGZAG_LOOP  5, 5, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  2, 2, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  3, 3, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  6, 6, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  9, 1, xmm3, xmm7, xmm8
-ZIGZAG_LOOP 12, 4, xmm3, xmm7, xmm8
-ZIGZAG_LOOP 13, 5, xmm3, xmm7, xmm8
-ZIGZAG_LOOP 10, 2, xmm3, xmm7, xmm8
-ZIGZAG_LOOP  7, 7, xmm2, xmm6, xmm4
-ZIGZAG_LOOP 11, 3, xmm3, xmm7, xmm8
-ZIGZAG_LOOP 14, 6, xmm3, xmm7, xmm8
-ZIGZAG_LOOP 15, 7, xmm3, xmm7, xmm8
-
-    mov         rcx, [rsi + vp8_blockd_dequant]
-    mov         rdi, [rsi + vp8_blockd_dqcoeff]
-
-%if ABI_IS_32BIT
-    movdqa      xmm4, [rsp + qcoeff]
-    movdqa      xmm5, [rsp + qcoeff + 16]
-%else
-    %define     xmm5 xmm8
-%endif
-
-    ; y ^ sz
-    pxor        xmm4, xmm0
-    pxor        xmm5, xmm1
-    ; x = (y ^ sz) - sz
-    psubw       xmm4, xmm0
-    psubw       xmm5, xmm1
-
-    ; dequant
-    movdqa      xmm0, [rcx]
-    movdqa      xmm1, [rcx + 16]
-
-    mov         rcx, [rsi + vp8_blockd_qcoeff]
-
-    pmullw      xmm0, xmm4
-    pmullw      xmm1, xmm5
-
-    ; store qcoeff
-    movdqa      [rcx], xmm4
-    movdqa      [rcx + 16], xmm5
-
-    ; store dqcoeff
-    movdqa      [rdi], xmm0
-    movdqa      [rdi + 16], xmm1
-
-    mov         rcx, [rsi + vp8_blockd_eob]
-
-    ; select the last value (in zig_zag order) for EOB
-    pxor        xmm6, xmm6
-    pcmpeqw     xmm4, xmm6
-    pcmpeqw     xmm5, xmm6
-
-    packsswb    xmm4, xmm5
-    pshufb      xmm4, [GLOBAL(zig_zag1d)]
-    pmovmskb    edx, xmm4
-    xor         rdi, rdi
-    mov         eax, -1
-    xor         dx, ax
-    bsr         eax, edx
-    sub         edi, edx
-    sar         edi, 31
-    add         eax, 1
-    and         eax, edi
-
-    mov         BYTE PTR [rcx], al          ; store eob
-
-    ; begin epilog
-%if ABI_IS_32BIT
-    add         rsp, stack_size
-    pop         rsp
-
-    pop         rsi
-    pop         rdi
-    RESTORE_GOT
-    pop         rbp
-%else
-  %undef xmm5
-  %if LIBVPX_YASM_WIN64
-    pop         rsi
-    pop         rdi
-    RESTORE_XMM
-  %endif
-%endif
-
-    ret
-
-SECTION_RODATA
-align 16
-; vp8/common/entropy.c: vp8_default_zig_zag1d
-zig_zag1d:
-    db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
--- a/vp8/encoder/x86/quantize_sse4.c
+++ b/vp8/encoder/x86/quantize_sse4.c
@@ -0,0 +1,128 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <smmintrin.h> /* SSE4.1 */
+
+#include "./vp8_rtcd.h"
+#include "vp8/encoder/block.h"
+#include "vp8/common/entropy.h" /* vp8_default_inv_zig_zag */
+
+#define SELECT_EOB(i, z, x, y, q) \
+    do { \
+        short boost = *zbin_boost_ptr; \
+        short x_z = _mm_extract_epi16(x, z); \
+        short y_z = _mm_extract_epi16(y, z); \
+        int cmp = (x_z < boost) | (y_z == 0); \
+        zbin_boost_ptr++; \
+        if (cmp) \
+            break; \
+        q = _mm_insert_epi16(q, y_z, z); \
+        eob = i; \
+        zbin_boost_ptr = b->zrun_zbin_boost; \
+    } while (0)
+
+void vp8_regular_quantize_b_sse4_1(BLOCK *b, BLOCKD *d) {
+    char eob = 0;
+    short *zbin_boost_ptr  = b->zrun_zbin_boost;
+
+    __m128i sz0, x0, sz1, x1, y0, y1, x_minus_zbin0, x_minus_zbin1,
+            dqcoeff0, dqcoeff1;
+    __m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift));
+    __m128i quant_shift1 = _mm_load_si128((__m128i *)(b->quant_shift + 8));
+    __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));
+    __m128i z1 = _mm_load_si128((__m128i *)(b->coeff+8));
+    __m128i zbin_extra = _mm_cvtsi32_si128(b->zbin_extra);
+    __m128i zbin0 = _mm_load_si128((__m128i *)(b->zbin));
+    __m128i zbin1 = _mm_load_si128((__m128i *)(b->zbin + 8));
+    __m128i round0 = _mm_load_si128((__m128i *)(b->round));
+    __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8));
+    __m128i quant0 = _mm_load_si128((__m128i *)(b->quant));
+    __m128i quant1 = _mm_load_si128((__m128i *)(b->quant + 8));
+    __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));
+    __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));
+    __m128i qcoeff0 = _mm_setzero_si128();
+    __m128i qcoeff1 = _mm_setzero_si128();
+
+    /* Duplicate to all lanes. */
+    zbin_extra = _mm_shufflelo_epi16(zbin_extra, 0);
+    zbin_extra = _mm_unpacklo_epi16(zbin_extra, zbin_extra);
+
+    /* Sign of z: z >> 15 */
+    sz0 = _mm_srai_epi16(z0, 15);
+    sz1 = _mm_srai_epi16(z1, 15);
+
+    /* x = abs(z): (z ^ sz) - sz */
+    x0 = _mm_xor_si128(z0, sz0);
+    x1 = _mm_xor_si128(z1, sz1);
+    x0 = _mm_sub_epi16(x0, sz0);
+    x1 = _mm_sub_epi16(x1, sz1);
+
+    /* zbin[] + zbin_extra */
+    zbin0 = _mm_add_epi16(zbin0, zbin_extra);
+    zbin1 = _mm_add_epi16(zbin1, zbin_extra);
+
+    /* In C x is compared to zbin where zbin = zbin[] + boost + extra. Rebalance
+     * the equation because boost is the only value which can change:
+     * x - (zbin[] + extra) >= boost */
+    x_minus_zbin0 = _mm_sub_epi16(x0, zbin0);
+    x_minus_zbin1 = _mm_sub_epi16(x1, zbin1);
+
+    /* All the remaining calculations are valid whether they are done now with
+     * simd or later inside the loop one at a time. */
+    x0 = _mm_add_epi16(x0, round0);
+    x1 = _mm_add_epi16(x1, round1);
+
+    y0 = _mm_mulhi_epi16(x0, quant0);
+    y1 = _mm_mulhi_epi16(x1, quant1);
+
+    y0 = _mm_add_epi16(y0, x0);
+    y1 = _mm_add_epi16(y1, x1);
+
+    /* Instead of shifting each value independently we convert the scaling
+     * factor with 1 << (16 - shift) so we can use multiply/return high half. */
+    y0 = _mm_mulhi_epi16(y0, quant_shift0);
+    y1 = _mm_mulhi_epi16(y1, quant_shift1);
+
+    /* Return the sign: (y ^ sz) - sz */
+    y0 = _mm_xor_si128(y0, sz0);
+    y1 = _mm_xor_si128(y1, sz1);
+    y0 = _mm_sub_epi16(y0, sz0);
+    y1 = _mm_sub_epi16(y1, sz1);
+
+    /* The loop gets unrolled anyway. Avoid the vp8_default_zig_zag1d lookup. */
+    SELECT_EOB(1, 0, x_minus_zbin0, y0, qcoeff0);
+    SELECT_EOB(2, 1, x_minus_zbin0, y0, qcoeff0);
+    SELECT_EOB(3, 4, x_minus_zbin0, y0, qcoeff0);
+    SELECT_EOB(4, 0, x_minus_zbin1, y1, qcoeff1);
+    SELECT_EOB(5, 5, x_minus_zbin0, y0, qcoeff0);
+    SELECT_EOB(6, 2, x_minus_zbin0, y0, qcoeff0);
+    SELECT_EOB(7, 3, x_minus_zbin0, y0, qcoeff0);
+    SELECT_EOB(8, 6, x_minus_zbin0, y0, qcoeff0);
+    SELECT_EOB(9, 1, x_minus_zbin1, y1, qcoeff1);
+    SELECT_EOB(10, 4, x_minus_zbin1, y1, qcoeff1);
+    SELECT_EOB(11, 5, x_minus_zbin1, y1, qcoeff1);
+    SELECT_EOB(12, 2, x_minus_zbin1, y1, qcoeff1);
+    SELECT_EOB(13, 7, x_minus_zbin0, y0, qcoeff0);
+    SELECT_EOB(14, 3, x_minus_zbin1, y1, qcoeff1);
+    SELECT_EOB(15, 6, x_minus_zbin1, y1, qcoeff1);
+    SELECT_EOB(16, 7, x_minus_zbin1, y1, qcoeff1);
+
+    _mm_store_si128((__m128i *)(d->qcoeff), qcoeff0);
+    _mm_store_si128((__m128i *)(d->qcoeff + 8), qcoeff1);
+
+    dqcoeff0 = _mm_mullo_epi16(qcoeff0, dequant0);
+    dqcoeff1 = _mm_mullo_epi16(qcoeff1, dequant1);
+
+    _mm_store_si128((__m128i *)(d->dqcoeff), dqcoeff0);
+    _mm_store_si128((__m128i *)(d->dqcoeff + 8), dqcoeff1);
+
+    *d->eob = eob;
+}
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -107,7 +107,6 @@ VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/variance_impl_ssse3.asm
 VP8_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/sad_sse4.asm

 ifeq ($(CONFIG_POSTPROC),yes)
-VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/postproc_x86.c
 VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/postproc_mmx.asm
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/mfqe_sse2.asm
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/postproc_sse2.asm
--- a/vp8/vp8_dx_iface.c
+++ b/vp8/vp8_dx_iface.c
@@ -60,6 +60,7 @@ struct vpx_codec_alg_priv
    vpx_decrypt_cb          decrypt_cb;
    void                    *decrypt_state;
    vpx_image_t             img;
+    int                     flushed;
    int                     img_setup;
    struct frame_buffers    yv12_frame_buffers;
    void                    *user_priv;
@@ -89,6 +90,7 @@ static void vp8_init_ctx(vpx_codec_ctx_t *ctx)
    ctx->priv->alg_priv->decrypt_cb = NULL;
    ctx->priv->alg_priv->decrypt_state = NULL;
    ctx->priv->init_flags = ctx->init_flags;
+    ctx->priv->alg_priv->flushed = 0;

    if (ctx->config.dec)
    {
@@ -327,6 +329,13 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t  *ctx,
    unsigned int resolution_change = 0;
    unsigned int w, h;

+    if (data == NULL && data_sz == 0) {
+      ctx->flushed = 1;
+      return VPX_CODEC_OK;
+    }
+
+    /* Reset flushed when receiving a valid frame */
+    ctx->flushed = 0;

    /* Update the input fragment data */
    if(update_fragments(ctx, data, data_sz, &res) <= 0)
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -89,6 +89,7 @@ VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.c
 VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.c
+VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/quantize_sse4.c

 ifeq ($(CONFIG_TEMPORAL_DENOISING),yes)
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/denoising_sse2.c
@@ -97,7 +98,6 @@ endif
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/subtract_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp8_enc_stubs_sse2.c
-VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/quantize_sse4.asm
 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm
 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm
 VP8_CX_SRCS-$(ARCH_X86_64) += encoder/x86/ssim_opt_x86_64.asm
--- a/vp9/common/arm/neon/vp9_convolve_neon.c
+++ b/vp9/common/arm/neon/vp9_convolve_neon.c
@@ -25,12 +25,14 @@ void vp9_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride,
  // Account for the vertical phase needing 3 lines prior and 4 lines post
  int intermediate_height = h + 7;

-  if (x_step_q4 != 16 || y_step_q4 != 16)
-    return vp9_convolve8_c(src, src_stride,
-                           dst, dst_stride,
-                           filter_x, x_step_q4,
-                           filter_y, y_step_q4,
-                           w, h);
+  if (x_step_q4 != 16 || y_step_q4 != 16) {
+    vp9_convolve8_c(src, src_stride,
+                    dst, dst_stride,
+                    filter_x, x_step_q4,
+                    filter_y, y_step_q4,
+                    w, h);
+    return;
+  }

  /* Filter starting 3 lines back. The neon implementation will ignore the
   * given height and filter a multiple of 4 lines. Since this goes in to
@@ -57,12 +59,14 @@ void vp9_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
  DECLARE_ALIGNED_ARRAY(8, uint8_t, temp, 64 * 72);
  int intermediate_height = h + 7;

-  if (x_step_q4 != 16 || y_step_q4 != 16)
-    return vp9_convolve8_avg_c(src, src_stride,
-                               dst, dst_stride,
-                               filter_x, x_step_q4,
-                               filter_y, y_step_q4,
-                               w, h);
+  if (x_step_q4 != 16 || y_step_q4 != 16) {
+    vp9_convolve8_avg_c(src, src_stride,
+                        dst, dst_stride,
+                        filter_x, x_step_q4,
+                        filter_y, y_step_q4,
+                        w, h);
+    return;
+  }

  /* This implementation has the same issues as above. In addition, we only want
   * to average the values after both passes.
--- a/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
+++ b/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
@@ -9,6 +9,7 @@
 */

 #include "./vp9_rtcd.h"
+#include "vpx/vpx_integer.h"

 void vp9_lpf_horizontal_8_dual_neon(uint8_t *s, int p /* pitch */,
                                    const uint8_t *blimit0,
--- a/vp9/common/vp9_alloccommon.c
+++ b/vp9/common/vp9_alloccommon.c
@@ -12,11 +12,37 @@
 #include "vpx_mem/vpx_mem.h"

 #include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_entropymode.h"
 #include "vp9/common/vp9_entropymv.h"
 #include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/common/vp9_systemdependent.h"

+// TODO(hkuang): Don't need to lock the whole pool after implementing atomic
+// frame reference count.
+void lock_buffer_pool(BufferPool *const pool) {
+#if CONFIG_MULTITHREAD
+  pthread_mutex_lock(&pool->pool_mutex);
+#else
+  (void)pool;
+#endif
+}
+
+void unlock_buffer_pool(BufferPool *const pool) {
+#if CONFIG_MULTITHREAD
+  pthread_mutex_unlock(&pool->pool_mutex);
+#else
+  (void)pool;
+#endif
+}
+
+static INLINE void alloc_mi_array(VP9_COMMON *cm, int mi_size, int idx) {
+  CHECK_MEM_ERROR(cm, cm->mip_array[idx],
+                  vpx_calloc(mi_size, sizeof(*cm->mip_array[0])));
+  CHECK_MEM_ERROR(cm, cm->mi_grid_base_array[idx],
+                  vpx_calloc(mi_size, sizeof(*cm->mi_grid_base_array[0])));
+}
+
 static void clear_mi_border(const VP9_COMMON *cm, MODE_INFO *mi) {
  int i;

@@ -49,40 +75,47 @@ static void setup_mi(VP9_COMMON *cm) {
  vpx_memset(cm->mi_grid_base, 0, cm->mi_stride * (cm->mi_rows + 1) *
                                      sizeof(*cm->mi_grid_base));

-  clear_mi_border(cm, cm->prev_mip);
+  // Only clear mi border in non frame-parallel decode. In frame-parallel
+  // decode, prev_mip is managed by previous decoding thread. While in
+  // non frame-parallel decode, prev_mip and mip are both managed by
+  // current decoding thread.
+  if (!cm->frame_parallel_decode)
+    clear_mi_border(cm, cm->prev_mip);
 }

 static int alloc_mi(VP9_COMMON *cm, int mi_size) {
  int i;

-  for (i = 0; i < 2; ++i) {
-    cm->mip_array[i] =
-        (MODE_INFO *)vpx_calloc(mi_size, sizeof(*cm->mip));
-    if (cm->mip_array[i] == NULL)
-      return 1;
-
-    cm->mi_grid_base_array[i] =
-        (MODE_INFO **)vpx_calloc(mi_size, sizeof(*cm->mi_grid_base));
-    if (cm->mi_grid_base_array[i] == NULL)
-      return 1;
+  for (i = 0; i < NUM_PING_PONG_BUFFERS; ++i) {
+    // Delay reallocation as another thread is accessing prev_mi.
+    if (cm->frame_parallel_decode && i == cm->prev_mi_idx) {
+      cm->update_prev_mi = 1;
+      continue;
+    }
+    alloc_mi_array(cm, mi_size, i);
  }

-  // Init the index.
-  cm->mi_idx = 0;
-  cm->prev_mi_idx = 1;
-
  cm->mip = cm->mip_array[cm->mi_idx];
-  cm->prev_mip = cm->mip_array[cm->prev_mi_idx];
  cm->mi_grid_base = cm->mi_grid_base_array[cm->mi_idx];
-  cm->prev_mi_grid_base = cm->mi_grid_base_array[cm->prev_mi_idx];
+
+  if (!cm->frame_parallel_decode) {
+    cm->mi_idx = 0;
+    cm->prev_mi_idx = 1;
+    // In frame-parallel decode, prev_mip comes from another thread,
+    // so current decoding thread should not touch it.
+    cm->prev_mip = cm->mip_array[cm->prev_mi_idx];
+    cm->prev_mi_grid_base = cm->mi_grid_base_array[cm->prev_mi_idx];
+  }

  return 0;
 }

-static void free_mi(VP9_COMMON *cm) {
+static void free_mi(VP9_COMMON *cm, int decode_done) {
  int i;

-  for (i = 0; i < 2; ++i) {
+  for (i = 0; i < NUM_PING_PONG_BUFFERS; ++i) {
+    if (cm->frame_parallel_decode && i == cm->prev_mi_idx && !decode_done)
+      continue;
    vpx_free(cm->mip_array[i]);
    cm->mip_array[i] = NULL;
    vpx_free(cm->mi_grid_base_array[i]);
@@ -90,30 +123,71 @@ static void free_mi(VP9_COMMON *cm) {
  }

  cm->mip = NULL;
-  cm->prev_mip = NULL;
  cm->mi_grid_base = NULL;
-  cm->prev_mi_grid_base = NULL;
+
+  if (!cm->frame_parallel_decode) {
+    cm->prev_mip = NULL;
+    cm->prev_mi_grid_base = NULL;
+  }
+}
+
+static int alloc_seg_map(VP9_COMMON *cm, int seg_map_size) {
+  int i;
+
+  for (i = 0; i < NUM_PING_PONG_BUFFERS; ++i) {
+    cm->seg_map_array[i] = (uint8_t *)vpx_calloc(seg_map_size, 1);
+    if (cm->seg_map_array[i] == NULL)
+      return 1;
+  }
+
+  // Init the index.
+  cm->seg_map_idx = 0;
+  cm->prev_seg_map_idx = 1;
+
+  cm->current_frame_seg_map = cm->seg_map_array[cm->seg_map_idx];
+
+  if (!cm->frame_parallel_decode) {
+    cm->last_frame_seg_map = cm->seg_map_array[cm->prev_seg_map_idx];
+  }
+
+  return 0;
+}
+
+static void free_seg_map(VP9_COMMON *cm) {
+  int i;
+
+  for (i = 0; i < NUM_PING_PONG_BUFFERS; ++i) {
+    vpx_free(cm->seg_map_array[i]);
+    cm->seg_map_array[i] = NULL;
+  }
+
+  cm->current_frame_seg_map = NULL;
+
+  if (!cm->frame_parallel_decode) {
+    cm->last_frame_seg_map = NULL;
+  }
 }

 void vp9_free_frame_buffers(VP9_COMMON *cm) {
  int i;
+  BufferPool *const pool = cm->buffer_pool;

  for (i = 0; i < FRAME_BUFFERS; ++i) {
-    vp9_free_frame_buffer(&cm->frame_bufs[i].buf);
+    vp9_free_frame_buffer(&pool->frame_bufs[i].buf);

-    if (cm->frame_bufs[i].ref_count > 0 &&
-        cm->frame_bufs[i].raw_frame_buffer.data != NULL) {
-      cm->release_fb_cb(cm->cb_priv, &cm->frame_bufs[i].raw_frame_buffer);
-      cm->frame_bufs[i].ref_count = 0;
+    if (pool->frame_bufs[i].ref_count > 0 &&
+        pool->frame_bufs[i].raw_frame_buffer.data != NULL) {
+      pool->release_fb_cb(pool->cb_priv, &pool->frame_bufs[i].raw_frame_buffer);
+      pool->frame_bufs[i].ref_count = 0;
    }
  }

  vp9_free_frame_buffer(&cm->post_proc_buffer);
+}

-  free_mi(cm);
-
-  vpx_free(cm->last_frame_seg_map);
-  cm->last_frame_seg_map = NULL;
+void vp9_free_context_buffers(VP9_COMMON *cm) {
+  free_mi(cm, 1);
+  free_seg_map(cm);

  vpx_free(cm->above_context);
  cm->above_context = NULL;
@@ -125,25 +199,27 @@ void vp9_free_frame_buffers(VP9_COMMON *cm) {
 int vp9_resize_frame_buffers(VP9_COMMON *cm, int width, int height) {
  const int aligned_width = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2);
  const int aligned_height = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2);
+#if CONFIG_INTERNAL_STATS || CONFIG_VP9_POSTPROC
  const int ss_x = cm->subsampling_x;
  const int ss_y = cm->subsampling_y;

+  // TODO(agrange): this should be conditionally allocated.
  if (vp9_realloc_frame_buffer(&cm->post_proc_buffer, width, height, ss_x, ss_y,
                               VP9_DEC_BORDER_IN_PIXELS, NULL, NULL, NULL) < 0)
    goto fail;
+#endif

  set_mb_mi(cm, aligned_width, aligned_height);

-  free_mi(cm);
+  free_mi(cm, 0);
  if (alloc_mi(cm, cm->mi_stride * (cm->mi_rows + MI_BLOCK_SIZE)))
    goto fail;

  setup_mi(cm);

  // Create the segmentation map structure and set to 0.
-  vpx_free(cm->last_frame_seg_map);
-  cm->last_frame_seg_map = (uint8_t *)vpx_calloc(cm->mi_rows * cm->mi_cols, 1);
-  if (!cm->last_frame_seg_map)
+  free_seg_map(cm);
+  if (alloc_seg_map(cm, cm->mi_rows * cm->mi_cols))
    goto fail;

  vpx_free(cm->above_context);
@@ -165,36 +241,58 @@ int vp9_resize_frame_buffers(VP9_COMMON *cm, int width, int height) {

 fail:
  vp9_free_frame_buffers(cm);
+  vp9_free_context_buffers(cm);
  return 1;
 }

+static void init_frame_bufs(VP9_COMMON *cm) {
+  int i;
+  RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+
+  cm->new_fb_idx = FRAME_BUFFERS - 1;
+  frame_bufs[cm->new_fb_idx].ref_count = 1;
+
+  for (i = 0; i < REF_FRAMES; ++i) {
+    cm->ref_frame_map[i] = i;
+    frame_bufs[i].ref_count = 1;
+  }
+}
+
 int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) {
-  const int aligned_width = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2);
-  const int aligned_height = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2);
+  int i;
  const int ss_x = cm->subsampling_x;
  const int ss_y = cm->subsampling_y;
-  int i;
+  RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;

  vp9_free_frame_buffers(cm);

-  for (i = 0; i < FRAME_BUFFERS; i++) {
-    cm->frame_bufs[i].ref_count = 0;
-    if (vp9_alloc_frame_buffer(&cm->frame_bufs[i].buf, width, height,
+  for (i = 0; i < FRAME_BUFFERS; ++i) {
+    frame_bufs[i].ref_count = 0;
+    if (vp9_alloc_frame_buffer(&frame_bufs[i].buf, width, height,
                               ss_x, ss_y, VP9_ENC_BORDER_IN_PIXELS) < 0)
      goto fail;
  }

-  cm->new_fb_idx = FRAME_BUFFERS - 1;
-  cm->frame_bufs[cm->new_fb_idx].ref_count = 1;
-
-  for (i = 0; i < REF_FRAMES; i++) {
-    cm->ref_frame_map[i] = i;
-    cm->frame_bufs[i].ref_count = 1;
-  }
+  init_frame_bufs(cm);

+#if CONFIG_INTERNAL_STATS || CONFIG_VP9_POSTPROC
  if (vp9_alloc_frame_buffer(&cm->post_proc_buffer, width, height, ss_x, ss_y,
                             VP9_ENC_BORDER_IN_PIXELS) < 0)
    goto fail;
+#endif
+
+  return 0;
+
+ fail:
+  vp9_free_frame_buffers(cm);
+  return 1;
+}
+
+int vp9_alloc_context_buffers(VP9_COMMON *cm, int width, int height) {
+  const int aligned_width = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2);
+  const int aligned_height = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2);
+
+  vp9_free_context_buffers(cm);

  set_mb_mi(cm, aligned_width, aligned_height);

@@ -224,13 +322,13 @@ int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) {
  return 0;

 fail:
-  vp9_free_frame_buffers(cm);
+  vp9_free_context_buffers(cm);
  return 1;
 }

 void vp9_remove_common(VP9_COMMON *cm) {
  vp9_free_frame_buffers(cm);
-  vp9_free_internal_frame_buffers(&cm->int_frame_buffers);
+  vp9_free_context_buffers(cm);
 }

 void vp9_update_frame_size(VP9_COMMON *cm) {
@@ -241,13 +339,27 @@ void vp9_update_frame_size(VP9_COMMON *cm) {
  setup_mi(cm);

  // Initialize the previous frame segment map to 0.
-  if (cm->last_frame_seg_map)
-    vpx_memset(cm->last_frame_seg_map, 0, cm->mi_rows * cm->mi_cols);
+  if (cm->current_frame_seg_map)
+    vpx_memset(cm->current_frame_seg_map, 0, cm->mi_rows * cm->mi_cols);
 }

 void vp9_swap_mi_and_prev_mi(VP9_COMMON *cm) {
  // Swap indices.
  const int tmp = cm->mi_idx;
+
+  // Only used in frame parallel decode: Update the prev_mi buffer if
+  // needed. The worker that was accessing it must already finish decoding.
+  // So it can be resized safely now.
+  if (cm->update_prev_mi) {
+    const int mi_size = cm->mi_stride * (cm->mi_rows + MI_BLOCK_SIZE);
+    vpx_free(cm->mip_array[cm->prev_mi_idx]);
+    vpx_free(cm->mi_grid_base_array[cm->prev_mi_idx]);
+    cm->mip_array[cm->prev_mi_idx] = NULL;
+    cm->mi_grid_base_array[cm->prev_mi_idx] = NULL;
+    alloc_mi_array(cm, mi_size, cm->prev_mi_idx);
+    cm->update_prev_mi = 0;
+  }
+
  cm->mi_idx = cm->prev_mi_idx;
  cm->prev_mi_idx = tmp;

@@ -263,3 +375,13 @@ void vp9_swap_mi_and_prev_mi(VP9_COMMON *cm) {
  cm->mi_grid_visible = cm->mi_grid_base + cm->mi_stride + 1;
  cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mi_stride + 1;
 }
+
+void vp9_swap_current_and_last_seg_map(VP9_COMMON *cm) {
+  // Swap indices.
+  const int tmp = cm->seg_map_idx;
+  cm->seg_map_idx = cm->prev_seg_map_idx;
+  cm->prev_seg_map_idx = tmp;
+
+  cm->current_frame_seg_map = cm->seg_map_array[cm->seg_map_idx];
+  cm->last_frame_seg_map = cm->seg_map_array[cm->prev_seg_map_idx];
+}
--- a/vp9/common/vp9_alloccommon.h
+++ b/vp9/common/vp9_alloccommon.h
@@ -23,13 +23,19 @@ void vp9_remove_common(struct VP9Common *cm);
 int vp9_resize_frame_buffers(struct VP9Common *cm, int width, int height);

 int vp9_alloc_frame_buffers(struct VP9Common *cm, int width, int height);
+int vp9_alloc_state_buffers(struct VP9Common *cm, int width, int height);
+int vp9_alloc_context_buffers(struct VP9Common *cm, int width, int height);

 void vp9_free_frame_buffers(struct VP9Common *cm);
+void vp9_free_state_buffers(struct VP9Common *cm);
+void vp9_free_context_buffers(struct VP9Common *cm);

 void vp9_update_frame_size(struct VP9Common *cm);

 void vp9_swap_mi_and_prev_mi(struct VP9Common *cm);

+void vp9_swap_current_and_last_seg_map(struct VP9Common *cm);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
--- a/vp9/common/vp9_blockd.c
+++ b/vp9/common/vp9_blockd.c
@@ -44,7 +44,7 @@ void vp9_foreach_transformed_block_in_plane(
  // block and transform sizes, in number of 4x4 blocks log 2 ("*_b")
  // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8
  // transform size varies per plane, look it up in a common way.
-  const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi)
+  const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi, pd)
                                : mbmi->tx_size;
  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
  const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -270,18 +270,20 @@ static INLINE TX_TYPE get_tx_type_4x4(PLANE_TYPE plane_type,

 void vp9_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y);

-static INLINE TX_SIZE get_uv_tx_size_impl(TX_SIZE y_tx_size, BLOCK_SIZE bsize) {
+static INLINE TX_SIZE get_uv_tx_size_impl(TX_SIZE y_tx_size, BLOCK_SIZE bsize,
+                                          int xss, int yss) {
  if (bsize < BLOCK_8X8) {
    return TX_4X4;
  } else {
-    // TODO(dkovalev): Assuming YUV420 (ss_x == 1, ss_y == 1)
-    const BLOCK_SIZE plane_bsize = ss_size_lookup[bsize][1][1];
+    const BLOCK_SIZE plane_bsize = ss_size_lookup[bsize][xss][yss];
    return MIN(y_tx_size, max_txsize_lookup[plane_bsize]);
  }
 }

-static INLINE TX_SIZE get_uv_tx_size(const MB_MODE_INFO *mbmi) {
-  return get_uv_tx_size_impl(mbmi->tx_size, mbmi->sb_type);
+static INLINE TX_SIZE get_uv_tx_size(const MB_MODE_INFO *mbmi,
+                                     const struct macroblockd_plane *pd) {
+  return get_uv_tx_size_impl(mbmi->tx_size, mbmi->sb_type, pd->subsampling_x,
+                             pd->subsampling_y);
 }

 static INLINE BLOCK_SIZE get_plane_block_size(BLOCK_SIZE bsize,
--- a/vp9/common/vp9_convolve.c
+++ b/vp9/common/vp9_convolve.c
@@ -117,17 +117,25 @@ static void convolve(const uint8_t *src, ptrdiff_t src_stride,
                     const InterpKernel *const y_filters,
                     int y0_q4, int y_step_q4,
                     int w, int h) {
-  // Fixed size intermediate buffer places limits on parameters.
-  // Maximum intermediate_height is 324, for y_step_q4 == 80,
-  // h == 64, taps == 8.
-  // y_step_q4 of 80 allows for 1/10 scale for 5 layer svc
-  uint8_t temp[64 * 324];
+  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
+  // 2d filtering proceeds in 2 steps:
+  //   (1) Interpolate horizontally into an intermediate buffer, temp.
+  //   (2) Interpolate temp vertically to derive the sub-pixel result.
+  // Deriving the maximum number of rows in the temp buffer (135):
+  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+  // --Largest block size is 64x64 pixels.
+  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+  //   original frame (in 1/16th pixel units).
+  // --Must round-up because block may be located at sub-pixel position.
+  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
+  uint8_t temp[135 * 64];
  int intermediate_height = (((h - 1) * y_step_q4 + 15) >> 4) + SUBPEL_TAPS;

  assert(w <= 64);
  assert(h <= 64);
-  assert(y_step_q4 <= 80);
-  assert(x_step_q4 <= 80);
+  assert(y_step_q4 <= 32);
+  assert(x_step_q4 <= 32);

  if (intermediate_height < h)
    intermediate_height = h;
--- a/vp9/common/vp9_entropymode.c
+++ b/vp9/common/vp9_entropymode.c
@@ -439,9 +439,13 @@ void vp9_setup_past_independence(VP9_COMMON *cm) {
  int i;
  vp9_clearall_segfeatures(&cm->seg);
  cm->seg.abs_delta = SEGMENT_DELTADATA;
-  if (cm->last_frame_seg_map)
+
+  if (cm->last_frame_seg_map && !cm->frame_parallel_decode)
    vpx_memset(cm->last_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols));

+  if (cm->current_frame_seg_map)
+    vpx_memset(cm->current_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols));
+
  // Reset the mode ref deltas for loop filter
  vp9_zero(lf->last_ref_deltas);
  vp9_zero(lf->last_mode_deltas);
@@ -464,7 +468,7 @@ void vp9_setup_past_independence(VP9_COMMON *cm) {
    cm->frame_contexts[cm->frame_context_idx] = cm->fc;
  }

-  if (frame_is_intra_only(cm))
+  if (frame_is_intra_only(cm) && !cm->frame_parallel_decode)
    vpx_memset(cm->prev_mip, 0, cm->mi_stride * (cm->mi_rows + 1) *
                                    sizeof(*cm->prev_mip));

--- a/vp9/common/vp9_frame_buffers.c
+++ b/vp9/common/vp9_frame_buffers.c
@@ -76,6 +76,7 @@ int vp9_get_frame_buffer(void *cb_priv, size_t min_size,
 int vp9_release_frame_buffer(void *cb_priv, vpx_codec_frame_buffer_t *fb) {
  InternalFrameBuffer *const int_fb = (InternalFrameBuffer *)fb->priv;
  (void)cb_priv;
-  int_fb->in_use = 0;
+  if (int_fb)
+    int_fb->in_use = 0;
  return 0;
 }
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -502,7 +502,7 @@ static void build_masks(const loop_filter_info_n *const lfi_n,
  const MB_MODE_INFO *mbmi = &mi->mbmi;
  const BLOCK_SIZE block_size = mbmi->sb_type;
  const TX_SIZE tx_size_y = mbmi->tx_size;
-  const TX_SIZE tx_size_uv = get_uv_tx_size(mbmi);
+  const TX_SIZE tx_size_uv = get_uv_tx_size_impl(tx_size_y, block_size, 1, 1);
  const int filter_level = get_filter_level(lfi_n, mbmi);
  uint64_t *const left_y = &lfm->left_y[tx_size_y];
  uint64_t *const above_y = &lfm->above_y[tx_size_y];
@@ -939,7 +939,7 @@ static void filter_block_plane_non420(VP9_COMMON *cm,
          !(r & (num_8x8_blocks_high_lookup[sb_type] - 1)) : 1;
      const int skip_this_r = skip_this && !block_edge_above;
      const TX_SIZE tx_size = (plane->plane_type == PLANE_TYPE_UV)
-                            ? get_uv_tx_size(&mi[0].mbmi)
+                            ? get_uv_tx_size(&mi[0].mbmi, plane)
                            : mi[0].mbmi.tx_size;
      const int skip_border_4x4_c = ss_x && mi_col + c == cm->mi_cols - 1;
      const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;
--- a/vp9/common/vp9_mvref_common.c
+++ b/vp9/common/vp9_mvref_common.c
@@ -11,195 +11,18 @@

 #include "vp9/common/vp9_mvref_common.h"

-#define MVREF_NEIGHBOURS 8
-
-typedef struct position {
-  int row;
-  int col;
-} POSITION;
-
-typedef enum {
-  BOTH_ZERO = 0,
-  ZERO_PLUS_PREDICTED = 1,
-  BOTH_PREDICTED = 2,
-  NEW_PLUS_NON_INTRA = 3,
-  BOTH_NEW = 4,
-  INTRA_PLUS_NON_INTRA = 5,
-  BOTH_INTRA = 6,
-  INVALID_CASE = 9
-} motion_vector_context;
-
-// This is used to figure out a context for the ref blocks. The code flattens
-// an array that would have 3 possible counts (0, 1 & 2) for 3 choices by
-// adding 9 for each intra block, 3 for each zero mv and 1 for each new
-// motion vector. This single number is then converted into a context
-// with a single lookup ( counter_to_context ).
-static const int mode_2_counter[MB_MODE_COUNT] = {
-  9,  // DC_PRED
-  9,  // V_PRED
-  9,  // H_PRED
-  9,  // D45_PRED
-  9,  // D135_PRED
-  9,  // D117_PRED
-  9,  // D153_PRED
-  9,  // D207_PRED
-  9,  // D63_PRED
-  9,  // TM_PRED
-  0,  // NEARESTMV
-  0,  // NEARMV
-  3,  // ZEROMV
-  1,  // NEWMV
-};
-
-// There are 3^3 different combinations of 3 counts that can be either 0,1 or
-// 2. However the actual count can never be greater than 2 so the highest
-// counter we need is 18. 9 is an invalid counter that's never used.
-static const int counter_to_context[19] = {
-  BOTH_PREDICTED,  // 0
-  NEW_PLUS_NON_INTRA,  // 1
-  BOTH_NEW,  // 2
-  ZERO_PLUS_PREDICTED,  // 3
-  NEW_PLUS_NON_INTRA,  // 4
-  INVALID_CASE,  // 5
-  BOTH_ZERO,  // 6
-  INVALID_CASE,  // 7
-  INVALID_CASE,  // 8
-  INTRA_PLUS_NON_INTRA,  // 9
-  INTRA_PLUS_NON_INTRA,  // 10
-  INVALID_CASE,  // 11
-  INTRA_PLUS_NON_INTRA,  // 12
-  INVALID_CASE,  // 13
-  INVALID_CASE,  // 14
-  INVALID_CASE,  // 15
-  INVALID_CASE,  // 16
-  INVALID_CASE,  // 17
-  BOTH_INTRA  // 18
-};
-
-static const POSITION mv_ref_blocks[BLOCK_SIZES][MVREF_NEIGHBOURS] = {
-  // 4X4
-  {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}},
-  // 4X8
-  {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}},
-  // 8X4
-  {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}},
-  // 8X8
-  {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}},
-  // 8X16
-  {{0, -1}, {-1, 0}, {1, -1}, {-1, -1}, {0, -2}, {-2, 0}, {-2, -1}, {-1, -2}},
-  // 16X8
-  {{-1, 0}, {0, -1}, {-1, 1}, {-1, -1}, {-2, 0}, {0, -2}, {-1, -2}, {-2, -1}},
-  // 16X16
-  {{-1, 0}, {0, -1}, {-1, 1}, {1, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-3, -3}},
-  // 16X32
-  {{0, -1}, {-1, 0}, {2, -1}, {-1, -1}, {-1, 1}, {0, -3}, {-3, 0}, {-3, -3}},
-  // 32X16
-  {{-1, 0}, {0, -1}, {-1, 2}, {-1, -1}, {1, -1}, {-3, 0}, {0, -3}, {-3, -3}},
-  // 32X32
-  {{-1, 1}, {1, -1}, {-1, 2}, {2, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-3, -3}},
-  // 32X64
-  {{0, -1}, {-1, 0}, {4, -1}, {-1, 2}, {-1, -1}, {0, -3}, {-3, 0}, {2, -1}},
-  // 64X32
-  {{-1, 0}, {0, -1}, {-1, 4}, {2, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-1, 2}},
-  // 64X64
-  {{-1, 3}, {3, -1}, {-1, 4}, {4, -1}, {-1, -1}, {-1, 0}, {0, -1}, {-1, 6}}
-};
-
-static const int idx_n_column_to_subblock[4][2] = {
-  {1, 2},
-  {1, 3},
-  {3, 2},
-  {3, 3}
-};
-
-// clamp_mv_ref
-#define MV_BORDER (16 << 3)  // Allow 16 pels in 1/8th pel units
-
-static void clamp_mv_ref(MV *mv, const MACROBLOCKD *xd) {
-  clamp_mv(mv, xd->mb_to_left_edge - MV_BORDER,
-               xd->mb_to_right_edge + MV_BORDER,
-               xd->mb_to_top_edge - MV_BORDER,
-               xd->mb_to_bottom_edge + MV_BORDER);
-}
-
-// This function returns either the appropriate sub block or block's mv
-// on whether the block_size < 8x8 and we have check_sub_blocks set.
-static INLINE int_mv get_sub_block_mv(const MODE_INFO *candidate, int which_mv,
-                                      int search_col, int block_idx) {
-  return block_idx >= 0 && candidate->mbmi.sb_type < BLOCK_8X8
-          ? candidate->bmi[idx_n_column_to_subblock[block_idx][search_col == 0]]
-              .as_mv[which_mv]
-          : candidate->mbmi.mv[which_mv];
-}
-
-
-// Performs mv sign inversion if indicated by the reference frame combination.
-static INLINE int_mv scale_mv(const MB_MODE_INFO *mbmi, int ref,
-                              const MV_REFERENCE_FRAME this_ref_frame,
-                              const int *ref_sign_bias) {
-  int_mv mv = mbmi->mv[ref];
-  if (ref_sign_bias[mbmi->ref_frame[ref]] != ref_sign_bias[this_ref_frame]) {
-    mv.as_mv.row *= -1;
-    mv.as_mv.col *= -1;
-  }
-  return mv;
-}
-
-// This macro is used to add a motion vector mv_ref list if it isn't
-// already in the list.  If it's the second motion vector it will also
-// skip all additional processing and jump to done!
-#define ADD_MV_REF_LIST(mv) \
-  do { \
-    if (refmv_count) { \
-      if ((mv).as_int != mv_ref_list[0].as_int) { \
-        mv_ref_list[refmv_count] = (mv); \
-        goto Done; \
-      } \
-    } else { \
-      mv_ref_list[refmv_count++] = (mv); \
-    } \
-  } while (0)
-
-// If either reference frame is different, not INTRA, and they
-// are different from each other scale and add the mv to our list.
-#define IF_DIFF_REF_FRAME_ADD_MV(mbmi) \
-  do { \
-    if (is_inter_block(mbmi)) { \
-      if ((mbmi)->ref_frame[0] != ref_frame) \
-        ADD_MV_REF_LIST(scale_mv((mbmi), 0, ref_frame, ref_sign_bias)); \
-      if (has_second_ref(mbmi) && \
-          (mbmi)->ref_frame[1] != ref_frame && \
-          (mbmi)->mv[1].as_int != (mbmi)->mv[0].as_int) \
-        ADD_MV_REF_LIST(scale_mv((mbmi), 1, ref_frame, ref_sign_bias)); \
-    } \
-  } while (0)
-
-
-// Checks that the given mi_row, mi_col and search point
-// are inside the borders of the tile.
-static INLINE int is_inside(const TileInfo *const tile,
-                            int mi_col, int mi_row, int mi_rows,
-                            const POSITION *mi_pos) {
-  return !(mi_row + mi_pos->row < 0 ||
-           mi_col + mi_pos->col < tile->mi_col_start ||
-           mi_row + mi_pos->row >= mi_rows ||
-           mi_col + mi_pos->col >= tile->mi_col_end);
-}
-
 // This function searches the neighbourhood of a given MB/SB
 // to try and find candidate reference vectors.
 static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
                             const TileInfo *const tile,
                             MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
                             int_mv *mv_ref_list,
-                             int block, int mi_row, int mi_col) {
+                             int block, int mi_row, int mi_col,
+                             find_mv_refs_sync sync, void *const data) {
  const int *ref_sign_bias = cm->ref_frame_sign_bias;
  int i, refmv_count = 0;
-  const MODE_INFO *prev_mi = cm->coding_use_prev_mi && cm->prev_mi
-        ? cm->prev_mi_grid_visible[mi_row * xd->mi_stride + mi_col]
-        : NULL;
-  const MB_MODE_INFO *const prev_mbmi = prev_mi ? &prev_mi->mbmi : NULL;
-
+  MODE_INFO *prev_mi = NULL;
+  MB_MODE_INFO *prev_mbmi = NULL;

  const POSITION *const mv_ref_search = mv_ref_blocks[mi->mbmi.sb_type];

@@ -246,6 +69,14 @@ static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
    }
  }

+  // Synchronize here for frame parallel decode if sync function is provided.
+  if (sync != NULL) {
+    sync(data, mi_row);
+  }
+  prev_mi = cm->coding_use_prev_mi && cm->prev_mi ?
+            cm->prev_mi_grid_visible[mi_row * xd->mi_stride + mi_col] : NULL;
+  prev_mbmi = prev_mi ? &prev_mi->mbmi : NULL;
+
  // Check the last frame's mode and mv info.
  if (prev_mbmi) {
    if (prev_mbmi->ref_frame[0] == ref_frame)
@@ -284,12 +115,13 @@ static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
 }

 void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd,
-                                    const TileInfo *const tile,
-                                    MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
-                                    int_mv *mv_ref_list,
-                                    int mi_row, int mi_col) {
+                      const TileInfo *const tile,
+                      MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
+                      int_mv *mv_ref_list,
+                      int mi_row, int mi_col,
+                      find_mv_refs_sync sync, void *const data) {
  find_mv_refs_idx(cm, xd, tile, mi, ref_frame, mv_ref_list, -1,
-                   mi_row, mi_col);
+                   mi_row, mi_col, sync, data);
 }

 static void lower_mv_precision(MV *mv, int allow_hp) {
@@ -327,7 +159,7 @@ void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
  assert(MAX_MV_REF_CANDIDATES == 2);

  find_mv_refs_idx(cm, xd, tile, mi, mi->mbmi.ref_frame[ref], mv_list, block,
-                   mi_row, mi_col);
+                   mi_row, mi_col, NULL, NULL);

  near->as_int = 0;
  switch (block) {
--- a/vp9/common/vp9_mvref_common.h
+++ b/vp9/common/vp9_mvref_common.h
@@ -21,6 +21,181 @@ extern "C" {
 #define RIGHT_BOTTOM_MARGIN ((VP9_ENC_BORDER_IN_PIXELS -\
                                VP9_INTERP_EXTEND) << 3)

+#define MVREF_NEIGHBOURS 8
+
+typedef struct position {
+  int row;
+  int col;
+} POSITION;
+
+typedef enum {
+  BOTH_ZERO = 0,
+  ZERO_PLUS_PREDICTED = 1,
+  BOTH_PREDICTED = 2,
+  NEW_PLUS_NON_INTRA = 3,
+  BOTH_NEW = 4,
+  INTRA_PLUS_NON_INTRA = 5,
+  BOTH_INTRA = 6,
+  INVALID_CASE = 9
+} motion_vector_context;
+
+// This is used to figure out a context for the ref blocks. The code flattens
+// an array that would have 3 possible counts (0, 1 & 2) for 3 choices by
+// adding 9 for each intra block, 3 for each zero mv and 1 for each new
+// motion vector. This single number is then converted into a context
+// with a single lookup ( counter_to_context ).
+static const int mode_2_counter[MB_MODE_COUNT] = {
+  9,  // DC_PRED
+  9,  // V_PRED
+  9,  // H_PRED
+  9,  // D45_PRED
+  9,  // D135_PRED
+  9,  // D117_PRED
+  9,  // D153_PRED
+  9,  // D207_PRED
+  9,  // D63_PRED
+  9,  // TM_PRED
+  0,  // NEARESTMV
+  0,  // NEARMV
+  3,  // ZEROMV
+  1,  // NEWMV
+};
+
+// There are 3^3 different combinations of 3 counts that can be either 0,1 or
+// 2. However the actual count can never be greater than 2 so the highest
+// counter we need is 18. 9 is an invalid counter that's never used.
+static const int counter_to_context[19] = {
+  BOTH_PREDICTED,  // 0
+  NEW_PLUS_NON_INTRA,  // 1
+  BOTH_NEW,  // 2
+  ZERO_PLUS_PREDICTED,  // 3
+  NEW_PLUS_NON_INTRA,  // 4
+  INVALID_CASE,  // 5
+  BOTH_ZERO,  // 6
+  INVALID_CASE,  // 7
+  INVALID_CASE,  // 8
+  INTRA_PLUS_NON_INTRA,  // 9
+  INTRA_PLUS_NON_INTRA,  // 10
+  INVALID_CASE,  // 11
+  INTRA_PLUS_NON_INTRA,  // 12
+  INVALID_CASE,  // 13
+  INVALID_CASE,  // 14
+  INVALID_CASE,  // 15
+  INVALID_CASE,  // 16
+  INVALID_CASE,  // 17
+  BOTH_INTRA  // 18
+};
+
+static const POSITION mv_ref_blocks[BLOCK_SIZES][MVREF_NEIGHBOURS] = {
+  // 4X4
+  {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}},
+  // 4X8
+  {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}},
+  // 8X4
+  {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}},
+  // 8X8
+  {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}},
+  // 8X16
+  {{0, -1}, {-1, 0}, {1, -1}, {-1, -1}, {0, -2}, {-2, 0}, {-2, -1}, {-1, -2}},
+  // 16X8
+  {{-1, 0}, {0, -1}, {-1, 1}, {-1, -1}, {-2, 0}, {0, -2}, {-1, -2}, {-2, -1}},
+  // 16X16
+  {{-1, 0}, {0, -1}, {-1, 1}, {1, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-3, -3}},
+  // 16X32
+  {{0, -1}, {-1, 0}, {2, -1}, {-1, -1}, {-1, 1}, {0, -3}, {-3, 0}, {-3, -3}},
+  // 32X16
+  {{-1, 0}, {0, -1}, {-1, 2}, {-1, -1}, {1, -1}, {-3, 0}, {0, -3}, {-3, -3}},
+  // 32X32
+  {{-1, 1}, {1, -1}, {-1, 2}, {2, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-3, -3}},
+  // 32X64
+  {{0, -1}, {-1, 0}, {4, -1}, {-1, 2}, {-1, -1}, {0, -3}, {-3, 0}, {2, -1}},
+  // 64X32
+  {{-1, 0}, {0, -1}, {-1, 4}, {2, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-1, 2}},
+  // 64X64
+  {{-1, 3}, {3, -1}, {-1, 4}, {4, -1}, {-1, -1}, {-1, 0}, {0, -1}, {-1, 6}}
+};
+
+static const int idx_n_column_to_subblock[4][2] = {
+  {1, 2},
+  {1, 3},
+  {3, 2},
+  {3, 3}
+};
+
+// clamp_mv_ref
+#define MV_BORDER (16 << 3)  // Allow 16 pels in 1/8th pel units
+
+static void clamp_mv_ref(MV *mv, const MACROBLOCKD *xd) {
+  clamp_mv(mv, xd->mb_to_left_edge - MV_BORDER,
+               xd->mb_to_right_edge + MV_BORDER,
+               xd->mb_to_top_edge - MV_BORDER,
+               xd->mb_to_bottom_edge + MV_BORDER);
+}
+
+// This function returns either the appropriate sub block or block's mv
+// on whether the block_size < 8x8 and we have check_sub_blocks set.
+static INLINE int_mv get_sub_block_mv(const MODE_INFO *candidate, int which_mv,
+                                      int search_col, int block_idx) {
+  return block_idx >= 0 && candidate->mbmi.sb_type < BLOCK_8X8
+          ? candidate->bmi[idx_n_column_to_subblock[block_idx][search_col == 0]]
+              .as_mv[which_mv]
+          : candidate->mbmi.mv[which_mv];
+}
+
+
+// Performs mv sign inversion if indicated by the reference frame combination.
+static INLINE int_mv scale_mv(const MB_MODE_INFO *mbmi, int ref,
+                              const MV_REFERENCE_FRAME this_ref_frame,
+                              const int *ref_sign_bias) {
+  int_mv mv = mbmi->mv[ref];
+  if (ref_sign_bias[mbmi->ref_frame[ref]] != ref_sign_bias[this_ref_frame]) {
+    mv.as_mv.row *= -1;
+    mv.as_mv.col *= -1;
+  }
+  return mv;
+}
+
+// This macro is used to add a motion vector mv_ref list if it isn't
+// already in the list.  If it's the second motion vector it will also
+// skip all additional processing and jump to done!
+#define ADD_MV_REF_LIST(mv) \
+  do { \
+    if (refmv_count) { \
+      if ((mv).as_int != mv_ref_list[0].as_int) { \
+        mv_ref_list[refmv_count] = (mv); \
+        goto Done; \
+      } \
+    } else { \
+      mv_ref_list[refmv_count++] = (mv); \
+    } \
+  } while (0)
+
+// If either reference frame is different, not INTRA, and they
+// are different from each other scale and add the mv to our list.
+#define IF_DIFF_REF_FRAME_ADD_MV(mbmi) \
+  do { \
+    if (is_inter_block(mbmi)) { \
+      if ((mbmi)->ref_frame[0] != ref_frame) \
+        ADD_MV_REF_LIST(scale_mv((mbmi), 0, ref_frame, ref_sign_bias)); \
+      if (has_second_ref(mbmi) && \
+          (mbmi)->ref_frame[1] != ref_frame && \
+          (mbmi)->mv[1].as_int != (mbmi)->mv[0].as_int) \
+        ADD_MV_REF_LIST(scale_mv((mbmi), 1, ref_frame, ref_sign_bias)); \
+    } \
+  } while (0)
+
+
+// Checks that the given mi_row, mi_col and search point
+// are inside the borders of the tile.
+static INLINE int is_inside(const TileInfo *const tile,
+                            int mi_col, int mi_row, int mi_rows,
+                            const POSITION *mi_pos) {
+  return !(mi_row + mi_pos->row < 0 ||
+           mi_col + mi_pos->col < tile->mi_col_start ||
+           mi_row + mi_pos->row >= mi_rows ||
+           mi_col + mi_pos->col >= tile->mi_col_end);
+}
+
 // TODO(jingning): this mv clamping function should be block size dependent.
 static INLINE void clamp_mv2(MV *mv, const MACROBLOCKD *xd) {
  clamp_mv(mv, xd->mb_to_left_edge - LEFT_TOP_MARGIN,
@@ -29,10 +204,12 @@ static INLINE void clamp_mv2(MV *mv, const MACROBLOCKD *xd) {
               xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
 }

+typedef void (*find_mv_refs_sync)(void *const data, int mi_row);
 void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd,
                      const TileInfo *const tile,
                      MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
-                      int_mv *mv_ref_list, int mi_row, int mi_col);
+                      int_mv *mv_ref_list, int mi_row, int mi_col,
+                      find_mv_refs_sync sync, void *const data);

 // check a list of motion vectors by sad score using a number rows of pixels
 // above and a number cols of pixels in the left to select the one with best
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -20,6 +20,7 @@
 #include "vp9/common/vp9_entropymode.h"
 #include "vp9/common/vp9_frame_buffers.h"
 #include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_thread.h"
 #include "vp9/common/vp9_tile_common.h"

 #if CONFIG_VP9_POSTPROC
@@ -35,14 +36,19 @@ extern "C" {
 #define REF_FRAMES_LOG2 3
 #define REF_FRAMES (1 << REF_FRAMES_LOG2)

-// 1 scratch frame for the new frame, 3 for scaled references on the encoder
+// 4 scratch frames for the new frames to support a maximum of 4 cores decoding
+// in parallel, 3 for scaled references on the encoder.
+// TODO(hkuang): Add ondemand frame buffers instead of hardcoding the number
+// of framebuffers.
 // TODO(jkoleszar): These 3 extra references could probably come from the
 // normal reference pool.
-#define FRAME_BUFFERS (REF_FRAMES + 4)
+#define FRAME_BUFFERS (REF_FRAMES + 7)

 #define FRAME_CONTEXTS_LOG2 2
 #define FRAME_CONTEXTS (1 << FRAME_CONTEXTS_LOG2)

+#define NUM_PING_PONG_BUFFERS 2
+
 extern const struct {
  PARTITION_CONTEXT above;
  PARTITION_CONTEXT left;
@@ -61,8 +67,40 @@ typedef struct {
  int ref_count;
  vpx_codec_frame_buffer_t raw_frame_buffer;
  YV12_BUFFER_CONFIG buf;
+
+  // The Following variables will only be used in frame parallel decode.
+
+  // frame_worker_owner indicates which FrameWorker owns this buffer. NULL means
+  // that no FrameWorker owns, or is decoding, this buffer.
+  VP9Worker *frame_worker_owner;
+
+  // row and col indicate which position frame has been decoded to in real
+  // pixel unit. They are reset to -1 when decoding begins and set to INT_MAX
+  // when the frame is fully decoded.
+  int row;
+  int col;
 } RefCntBuffer;

+typedef struct {
+  // Protect BufferPool from being accessed by several FrameWorkers at
+  // the same time during frame parallel decode.
+  // TODO(hkuang): Try to use atomic variable instead of locking the whole pool.
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t pool_mutex;
+#endif
+
+  // Private data associated with the frame buffer callbacks.
+  void *cb_priv;
+
+  vpx_get_frame_buffer_cb_fn_t get_fb_cb;
+  vpx_release_frame_buffer_cb_fn_t release_fb_cb;
+
+  RefCntBuffer frame_bufs[FRAME_BUFFERS];
+
+  // Frame buffers allocated internally by the codec.
+  InternalFrameBufferList int_frame_buffers;
+} BufferPool;
+
 typedef struct VP9Common {
  struct vpx_internal_error_info  error;

@@ -89,10 +127,12 @@ typedef struct VP9Common {

  YV12_BUFFER_CONFIG *frame_to_show;

-  RefCntBuffer frame_bufs[FRAME_BUFFERS];
-
  int ref_frame_map[REF_FRAMES]; /* maps fb_idx to reference slot */

+  // Prepare ref_frame_map for the next frame.
+  // Only used in frame parallel decode.
+  int next_ref_frame_map[REF_FRAMES];
+
  // TODO(jkoleszar): could expand active_ref_idx to 4, with 0 as intra, and
  // roll new_fb_idx into it.

@@ -144,8 +184,8 @@ typedef struct VP9Common {

  int mi_idx;
  int prev_mi_idx;
-  MODE_INFO *mip_array[2];
-  MODE_INFO **mi_grid_base_array[2];
+  MODE_INFO *mip_array[NUM_PING_PONG_BUFFERS];
+  MODE_INFO **mi_grid_base_array[NUM_PING_PONG_BUFFERS];

  MODE_INFO *mip; /* Base of allocated array */
  MODE_INFO *mi;  /* Corresponds to upper left visible macroblock */
@@ -157,8 +197,16 @@ typedef struct VP9Common {
  MODE_INFO **prev_mi_grid_base;
  MODE_INFO **prev_mi_grid_visible;

+  // Used in frame parallel decode for delay resizing prev_mi.
+  int update_prev_mi;
+
  // Persistent mb segment id map used in prediction.
-  unsigned char *last_frame_seg_map;
+  int seg_map_idx;
+  int prev_seg_map_idx;
+
+  uint8_t *seg_map_array[NUM_PING_PONG_BUFFERS];
+  uint8_t *last_frame_seg_map;
+  uint8_t *current_frame_seg_map;

  INTERP_FILTER interp_filter;

@@ -171,6 +219,10 @@ typedef struct VP9Common {
  struct loopfilter lf;
  struct segmentation seg;

+  // TODO(hkuang): Remove this as it is the same as frame_parallel_decode
+  // in pbi.
+  int frame_parallel_decode;  // frame-based threading.
+
  // Context probabilities for reference frame prediction
  int allow_comp_inter_inter;
  MV_REFERENCE_FRAME comp_fixed_ref;
@@ -202,30 +254,34 @@ typedef struct VP9Common {

  int log2_tile_cols, log2_tile_rows;

-  // Private data associated with the frame buffer callbacks.
-  void *cb_priv;
-  vpx_get_frame_buffer_cb_fn_t get_fb_cb;
-  vpx_release_frame_buffer_cb_fn_t release_fb_cb;
-
-  // Handles memory for the codec.
-  InternalFrameBufferList int_frame_buffers;
+  // External BufferPool passed from outside.
+  BufferPool *buffer_pool;

  PARTITION_CONTEXT *above_seg_context;
  ENTROPY_CONTEXT *above_context;
 } VP9_COMMON;

+// TODO(hkuang): Don't need to lock the whole pool after implementing atomic
+// frame reference count.
+void lock_buffer_pool(BufferPool *const pool);
+void unlock_buffer_pool(BufferPool *const pool);
+
 static INLINE YV12_BUFFER_CONFIG *get_frame_new_buffer(VP9_COMMON *cm) {
-  return &cm->frame_bufs[cm->new_fb_idx].buf;
+  return &cm->buffer_pool->frame_bufs[cm->new_fb_idx].buf;
 }

 static INLINE int get_free_fb(VP9_COMMON *cm) {
+  RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
  int i;
-  for (i = 0; i < FRAME_BUFFERS; i++)
-    if (cm->frame_bufs[i].ref_count == 0)
+
+  lock_buffer_pool(cm->buffer_pool);
+  for (i = 0; i < FRAME_BUFFERS; ++i)
+    if (frame_bufs[i].ref_count == 0)
      break;

  assert(i < FRAME_BUFFERS);
-  cm->frame_bufs[i].ref_count = 1;
+  frame_bufs[i].ref_count = 1;
+  unlock_buffer_pool(cm->buffer_pool);
  return i;
 }

@@ -310,7 +366,6 @@ static INLINE void update_partition_context(MACROBLOCKD *xd,
  PARTITION_CONTEXT *const above_ctx = xd->above_seg_context + mi_col;
  PARTITION_CONTEXT *const left_ctx = xd->left_seg_context + (mi_row & MI_MASK);

-  // num_4x4_blocks_wide_lookup[bsize] / 2
  const int bs = num_8x8_blocks_wide_lookup[bsize];

  // update the partition context at the end notes. set partition bits
--- a/vp9/common/vp9_quant_common.c
+++ b/vp9/common/vp9_quant_common.c
@@ -12,7 +12,6 @@
 #include "vp9/common/vp9_quant_common.h"
 #include "vp9/common/vp9_seg_common.h"

-#if 1
 static const int16_t dc_qlookup[QINDEX_RANGE] = {
  4,       8,    8,    9,   10,   11,   12,   12,
  13,     14,   15,   16,   17,   18,   19,   19,
@@ -83,44 +82,6 @@ static const int16_t ac_qlookup[QINDEX_RANGE] = {
  1597, 1628, 1660, 1692, 1725, 1759, 1793, 1828,
 };

-void vp9_init_quant_tables(void) { }
-#else
-static int16_t dc_qlookup[QINDEX_RANGE];
-static int16_t ac_qlookup[QINDEX_RANGE];
-
-#define ACDC_MIN 8
-
-// TODO(dkovalev) move to common and reuse
-static double poly3(double a, double b, double c, double d, double x) {
-  return a*x*x*x + b*x*x + c*x + d;
-}
-
-void vp9_init_quant_tables() {
-  int i, val = 4;
-
-  // A "real" q of 1.0 forces lossless mode.
-  // In practice non lossless Q's between 1.0 and 2.0 (represented here by
-  // integer values from 5-7 give poor rd results (lower psnr and often
-  // larger size than the lossless encode. To block out those "not very useful"
-  // values we increment the ac and dc q lookup values by 4 after position 0.
-  ac_qlookup[0] = val;
-  dc_qlookup[0] = val;
-  val += 4;
-
-  for (i = 1; i < QINDEX_RANGE; i++) {
-    const int ac_val = val;
-
-    val = (int)(val * 1.01975);
-    if (val == ac_val)
-      ++val;
-
-    ac_qlookup[i] = (int16_t)ac_val;
-    dc_qlookup[i] = (int16_t)MAX(ACDC_MIN, poly3(0.000000305, -0.00065, 0.9,
-                                                 0.5, ac_val));
-  }
-}
-#endif
-
 int16_t vp9_dc_quant(int qindex, int delta) {
  return dc_qlookup[clamp(qindex + delta, 0, MAXQ)];
 }
--- a/vp9/common/vp9_quant_common.h
+++ b/vp9/common/vp9_quant_common.h
@@ -22,8 +22,6 @@ extern "C" {
 #define QINDEX_RANGE (MAXQ - MINQ + 1)
 #define QINDEX_BITS 8

-void vp9_init_quant_tables();
-
 int16_t vp9_dc_quant(int qindex, int delta);
 int16_t vp9_ac_quant(int qindex, int delta);

--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -20,50 +20,7 @@
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_reconintra.h"

-static void build_mc_border(const uint8_t *src, int src_stride,
-                            uint8_t *dst, int dst_stride,
-                            int x, int y, int b_w, int b_h, int w, int h) {
-  // Get a pointer to the start of the real data for this row.
-  const uint8_t *ref_row = src - x - y * src_stride;
-
-  if (y >= h)
-    ref_row += (h - 1) * src_stride;
-  else if (y > 0)
-    ref_row += y * src_stride;
-
-  do {
-    int right = 0, copy;
-    int left = x < 0 ? -x : 0;
-
-    if (left > b_w)
-      left = b_w;
-
-    if (x + b_w > w)
-      right = x + b_w - w;
-
-    if (right > b_w)
-      right = b_w;
-
-    copy = b_w - left - right;
-
-    if (left)
-      memset(dst, ref_row[0], left);
-
-    if (copy)
-      memcpy(dst + left, ref_row + x + left, copy);
-
-    if (right)
-      memset(dst + left + copy, ref_row[w - 1], right);
-
-    dst += dst_stride;
-    ++y;
-
-    if (y > 0 && y < h)
-      ref_row += src_stride;
-  } while (--b_h);
-}
-
-static void inter_predictor(const uint8_t *src, int src_stride,
+void inter_predictor(const uint8_t *src, int src_stride,
                            uint8_t *dst, int dst_stride,
                            const int subpel_x,
                            const int subpel_y,
@@ -113,6 +70,18 @@ static MV mi_mv_pred_q4(const MODE_INFO *mi, int idx) {
  return res;
 }

+static INLINE int round_mv_comp_q2(int value) {
+  return (value < 0 ? value - 1 : value + 1) / 2;
+}
+
+static MV mi_mv_pred_q2(const MODE_INFO *mi, int idx, int block0, int block1) {
+  MV res = { round_mv_comp_q2(mi->bmi[block0].as_mv[idx].as_mv.row +
+                              mi->bmi[block1].as_mv[idx].as_mv.row),
+             round_mv_comp_q2(mi->bmi[block0].as_mv[idx].as_mv.col +
+                              mi->bmi[block1].as_mv[idx].as_mv.col) };
+  return res;
+}
+
 // TODO(jkoleszar): yet another mv clamping function :-(
 MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, const MV *src_mv,
                             int bw, int bh, int ss_x, int ss_y) {
@@ -139,7 +108,30 @@ MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, const MV *src_mv,
  return clamped_mv;
 }

-static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
+MV average_split_mvs(const struct macroblockd_plane *pd, int plane,
+                            const MODE_INFO *mi, int ref, int block) {
+  const int ss_idx = ((pd->subsampling_x > 0) << 1) | (pd->subsampling_y > 0);
+  MV res = {0, 0};
+  switch (ss_idx) {
+    case 0:
+      res = mi->bmi[block].as_mv[ref].as_mv;
+      break;
+    case 1:
+      res = mi_mv_pred_q2(mi, ref, block, block + 2);
+      break;
+    case 2:
+      res = mi_mv_pred_q2(mi, ref, block, block + 1);
+      break;
+    case 3:
+      res = mi_mv_pred_q4(mi, ref);
+      break;
+    default:
+      assert(ss_idx <= 3 || ss_idx >= 0);
+  }
+  return res;
+}
+
+void build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
                                   int bw, int bh,
                                   int x, int y, int w, int h,
                                   int mi_x, int mi_y) {
@@ -154,14 +146,8 @@ static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
    struct buf_2d *const pre_buf = &pd->pre[ref];
    struct buf_2d *const dst_buf = &pd->dst;
    uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
-
-    // TODO(jkoleszar): All chroma MVs in SPLITMV mode are taken as the
-    // same MV (the average of the 4 luma MVs) but we could do something
-    // smarter for non-4:2:0. Just punt for now, pending the changes to get
-    // rid of SPLITMV mode entirely.
    const MV mv = mi->mbmi.sb_type < BLOCK_8X8
-               ? (plane == 0 ? mi->bmi[block].as_mv[ref].as_mv
-                             : mi_mv_pred_q4(mi, ref))
+               ? average_split_mvs(pd, plane, mi, ref, block)
               : mi->mbmi.mv[ref].as_mv;

    // TODO(jkoleszar): This clamping is done in the incorrect place for the
@@ -241,174 +227,6 @@ void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
                                    MAX_MB_PLANE - 1);
 }

-// TODO(jingning): This function serves as a placeholder for decoder prediction
-// using on demand border extension. It should be moved to /decoder/ directory.
-static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
-                                       int bw, int bh,
-                                       int x, int y, int w, int h,
-                                       int mi_x, int mi_y) {
-  struct macroblockd_plane *const pd = &xd->plane[plane];
-  const MODE_INFO *mi = xd->mi[0];
-  const int is_compound = has_second_ref(&mi->mbmi);
-  const InterpKernel *kernel = vp9_get_interp_kernel(mi->mbmi.interp_filter);
-  int ref;
-
-  for (ref = 0; ref < 1 + is_compound; ++ref) {
-    const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
-    struct buf_2d *const pre_buf = &pd->pre[ref];
-    struct buf_2d *const dst_buf = &pd->dst;
-    uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
-
-    // TODO(jkoleszar): All chroma MVs in SPLITMV mode are taken as the
-    // same MV (the average of the 4 luma MVs) but we could do something
-    // smarter for non-4:2:0. Just punt for now, pending the changes to get
-    // rid of SPLITMV mode entirely.
-    const MV mv = mi->mbmi.sb_type < BLOCK_8X8
-               ? (plane == 0 ? mi->bmi[block].as_mv[ref].as_mv
-                             : mi_mv_pred_q4(mi, ref))
-               : mi->mbmi.mv[ref].as_mv;
-
-    // TODO(jkoleszar): This clamping is done in the incorrect place for the
-    // scaling case. It needs to be done on the scaled MV, not the pre-scaling
-    // MV. Note however that it performs the subsampling aware scaling so
-    // that the result is always q4.
-    // mv_precision precision is MV_PRECISION_Q4.
-    const MV mv_q4 = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh,
-                                               pd->subsampling_x,
-                                               pd->subsampling_y);
-
-    MV32 scaled_mv;
-    int xs, ys, x0, y0, x0_16, y0_16, frame_width, frame_height, buf_stride,
-        subpel_x, subpel_y;
-    uint8_t *ref_frame, *buf_ptr;
-    const YV12_BUFFER_CONFIG *ref_buf = xd->block_refs[ref]->buf;
-
-    // Get reference frame pointer, width and height.
-    if (plane == 0) {
-      frame_width = ref_buf->y_crop_width;
-      frame_height = ref_buf->y_crop_height;
-      ref_frame = ref_buf->y_buffer;
-    } else {
-      frame_width = ref_buf->uv_crop_width;
-      frame_height = ref_buf->uv_crop_height;
-      ref_frame = plane == 1 ? ref_buf->u_buffer : ref_buf->v_buffer;
-    }
-
-    if (vp9_is_scaled(sf)) {
-      // Co-ordinate of containing block to pixel precision.
-      int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x));
-      int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y));
-
-      // Co-ordinate of the block to 1/16th pixel precision.
-      x0_16 = (x_start + x) << SUBPEL_BITS;
-      y0_16 = (y_start + y) << SUBPEL_BITS;
-
-      // Co-ordinate of current block in reference frame
-      // to 1/16th pixel precision.
-      x0_16 = sf->scale_value_x(x0_16, sf);
-      y0_16 = sf->scale_value_y(y0_16, sf);
-
-      // Map the top left corner of the block into the reference frame.
-      x0 = sf->scale_value_x(x_start + x, sf);
-      y0 = sf->scale_value_y(y_start + y, sf);
-
-      // Scale the MV and incorporate the sub-pixel offset of the block
-      // in the reference frame.
-      scaled_mv = vp9_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf);
-      xs = sf->x_step_q4;
-      ys = sf->y_step_q4;
-    } else {
-      // Co-ordinate of containing block to pixel precision.
-      x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x;
-      y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y;
-
-      // Co-ordinate of the block to 1/16th pixel precision.
-      x0_16 = x0 << SUBPEL_BITS;
-      y0_16 = y0 << SUBPEL_BITS;
-
-      scaled_mv.row = mv_q4.row;
-      scaled_mv.col = mv_q4.col;
-      xs = ys = 16;
-    }
-    subpel_x = scaled_mv.col & SUBPEL_MASK;
-    subpel_y = scaled_mv.row & SUBPEL_MASK;
-
-    // Calculate the top left corner of the best matching block in the reference frame.
-    x0 += scaled_mv.col >> SUBPEL_BITS;
-    y0 += scaled_mv.row >> SUBPEL_BITS;
-    x0_16 += scaled_mv.col;
-    y0_16 += scaled_mv.row;
-
-    // Get reference block pointer.
-    buf_ptr = ref_frame + y0 * pre_buf->stride + x0;
-    buf_stride = pre_buf->stride;
-
-    // Do border extension if there is motion or the
-    // width/height is not a multiple of 8 pixels.
-    if (scaled_mv.col || scaled_mv.row ||
-        (frame_width & 0x7) || (frame_height & 0x7)) {
-      // Get reference block bottom right coordinate.
-      int x1 = ((x0_16 + (w - 1) * xs) >> SUBPEL_BITS) + 1;
-      int y1 = ((y0_16 + (h - 1) * ys) >> SUBPEL_BITS) + 1;
-      int x_pad = 0, y_pad = 0;
-
-      if (subpel_x || (sf->x_step_q4 & SUBPEL_MASK)) {
-        x0 -= VP9_INTERP_EXTEND - 1;
-        x1 += VP9_INTERP_EXTEND;
-        x_pad = 1;
-      }
-
-      if (subpel_y || (sf->y_step_q4 & SUBPEL_MASK)) {
-        y0 -= VP9_INTERP_EXTEND - 1;
-        y1 += VP9_INTERP_EXTEND;
-        y_pad = 1;
-      }
-
-      // Skip border extension if block is inside the frame.
-      if (x0 < 0 || x0 > frame_width - 1 || x1 < 0 || x1 > frame_width ||
-          y0 < 0 || y0 > frame_height - 1 || y1 < 0 || y1 > frame_height - 1) {
-        uint8_t *buf_ptr1 = ref_frame + y0 * pre_buf->stride + x0;
-        // Extend the border.
-        build_mc_border(buf_ptr1, pre_buf->stride, xd->mc_buf, x1 - x0 + 1,
-                        x0, y0, x1 - x0 + 1, y1 - y0 + 1, frame_width,
-                        frame_height);
-        buf_stride = x1 - x0 + 1;
-        buf_ptr = xd->mc_buf + y_pad * 3 * buf_stride + x_pad * 3;
-      }
-    }
-
-    inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
-                    subpel_y, sf, w, h, ref, kernel, xs, ys);
-  }
-}
-
-void vp9_dec_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
-                                       BLOCK_SIZE bsize) {
-  int plane;
-  const int mi_x = mi_col * MI_SIZE;
-  const int mi_y = mi_row * MI_SIZE;
-  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
-    const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize,
-                                                        &xd->plane[plane]);
-    const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
-    const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
-    const int bw = 4 * num_4x4_w;
-    const int bh = 4 * num_4x4_h;
-
-    if (xd->mi[0]->mbmi.sb_type < BLOCK_8X8) {
-      int i = 0, x, y;
-      assert(bsize == BLOCK_8X8);
-      for (y = 0; y < num_4x4_h; ++y)
-        for (x = 0; x < num_4x4_w; ++x)
-          dec_build_inter_predictors(xd, plane, i++, bw, bh,
-                                     4 * x, 4 * y, 4, 4, mi_x, mi_y);
-    } else {
-      dec_build_inter_predictors(xd, plane, 0, bw, bh,
-                                 0, 0, bw, bh, mi_x, mi_y);
-    }
-  }
-}
-
 void vp9_setup_dst_planes(struct macroblockd_plane planes[MAX_MB_PLANE],
                          const YV12_BUFFER_CONFIG *src,
                          int mi_row, int mi_col) {
--- a/vp9/common/vp9_reconinter.h
+++ b/vp9/common/vp9_reconinter.h
@@ -18,6 +18,26 @@
 extern "C" {
 #endif

+void inter_predictor(const uint8_t *src, int src_stride,
+                            uint8_t *dst, int dst_stride,
+                            const int subpel_x,
+                            const int subpel_y,
+                            const struct scale_factors *sf,
+                            int w, int h, int ref,
+                            const InterpKernel *kernel,
+                            int xs, int ys);
+
+MV average_split_mvs(const struct macroblockd_plane *pd, int plane,
+                            const MODE_INFO *mi, int ref, int block);
+
+MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, const MV *src_mv,
+                             int bw, int bh, int ss_x, int ss_y);
+
+void build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
+                                   int bw, int bh,
+                                   int x, int y, int w, int h,
+                                   int mi_x, int mi_y);
+
 void vp9_build_inter_predictors_sby(MACROBLOCKD *xd, int mi_row, int mi_col,
                                    BLOCK_SIZE bsize);

@@ -27,9 +47,6 @@ void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd, int mi_row, int mi_col,
 void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
                                   BLOCK_SIZE bsize);

-void vp9_dec_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
-                                       BLOCK_SIZE bsize);
-
 void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
                               uint8_t *dst, int dst_stride,
                               const MV *mv_q3,
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -305,15 +305,15 @@ specialize qw/vp9_convolve_avg neon_asm dspr2/, "$sse2_x86inc";
 $vp9_convolve_avg_neon_asm=vp9_convolve_avg_neon;

 add_proto qw/void vp9_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8 sse2 ssse3 avx2 neon_asm dspr2/;
+specialize qw/vp9_convolve8 sse2 ssse3 neon_asm dspr2/;
 $vp9_convolve8_neon_asm=vp9_convolve8_neon;

 add_proto qw/void vp9_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8_horiz sse2 ssse3 avx2 neon_asm dspr2/;
+specialize qw/vp9_convolve8_horiz sse2 ssse3 neon_asm dspr2/;
 $vp9_convolve8_horiz_neon_asm=vp9_convolve8_horiz_neon;

 add_proto qw/void vp9_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8_vert sse2 ssse3 avx2 neon_asm dspr2/;
+specialize qw/vp9_convolve8_vert sse2 ssse3 neon_asm dspr2/;
 $vp9_convolve8_vert_neon_asm=vp9_convolve8_vert_neon;

 add_proto qw/void vp9_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
@@ -402,25 +402,25 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {

 # variance
 add_proto qw/unsigned int vp9_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance32x16/, "$sse2_x86inc", "$avx2_x86inc";
+specialize qw/vp9_variance32x16 avx2/, "$sse2_x86inc";

 add_proto qw/unsigned int vp9_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
 specialize qw/vp9_variance16x32/, "$sse2_x86inc";

 add_proto qw/unsigned int vp9_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance64x32/, "$sse2_x86inc", "$avx2_x86inc";
+specialize qw/vp9_variance64x32 avx2/, "$sse2_x86inc";

 add_proto qw/unsigned int vp9_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
 specialize qw/vp9_variance32x64/, "$sse2_x86inc";

 add_proto qw/unsigned int vp9_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance32x32/, "$sse2_x86inc", "$avx2_x86inc";
+specialize qw/vp9_variance32x32 avx2/, "$sse2_x86inc";

 add_proto qw/unsigned int vp9_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance64x64/, "$sse2_x86inc", "$avx2_x86inc";
+specialize qw/vp9_variance64x64 avx2/, "$sse2_x86inc";

 add_proto qw/unsigned int vp9_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance16x16 mmx/, "$sse2_x86inc", "$avx2_x86inc";
+specialize qw/vp9_variance16x16 mmx avx2/, "$sse2_x86inc";

 add_proto qw/unsigned int vp9_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
 specialize qw/vp9_variance16x8 mmx/, "$sse2_x86inc";
@@ -447,10 +447,10 @@ add_proto qw/unsigned int vp9_variance4x4/, "const uint8_t *src_ptr, int source_
 specialize qw/vp9_variance4x4 mmx/, "$sse2_x86inc";

 add_proto qw/unsigned int vp9_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance64x64 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vp9_sub_pixel_variance64x64/, "$sse2_x86inc", "$ssse3_x86inc";

 add_proto qw/unsigned int vp9_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance64x64 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vp9_sub_pixel_avg_variance64x64/, "$sse2_x86inc", "$ssse3_x86inc";

 add_proto qw/unsigned int vp9_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
 specialize qw/vp9_sub_pixel_variance32x64/, "$sse2_x86inc", "$ssse3_x86inc";
@@ -477,10 +477,10 @@ add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x32/, "const uint8_t *src_
 specialize qw/vp9_sub_pixel_avg_variance16x32/, "$sse2_x86inc", "$ssse3_x86inc";

 add_proto qw/unsigned int vp9_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance32x32 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vp9_sub_pixel_variance32x32/, "$sse2_x86inc", "$ssse3_x86inc";

 add_proto qw/unsigned int vp9_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance32x32 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vp9_sub_pixel_avg_variance32x32/, "$sse2_x86inc", "$ssse3_x86inc";

 add_proto qw/unsigned int vp9_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
 specialize qw/vp9_sub_pixel_variance16x16/, "$sse2_x86inc", "$ssse3_x86inc";
@@ -653,7 +653,7 @@ add_proto qw/void vp9_sad4x4x8/, "const uint8_t *src_ptr, int  src_stride, const
 specialize qw/vp9_sad4x4x8 sse4/;

 add_proto qw/void vp9_sad64x64x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad64x64x4d sse2 avx2/;
+specialize qw/vp9_sad64x64x4d sse2/;

 add_proto qw/void vp9_sad32x64x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
 specialize qw/vp9_sad32x64x4d sse2/;
@@ -668,7 +668,7 @@ add_proto qw/void vp9_sad16x32x4d/, "const uint8_t *src_ptr, int  src_stride, co
 specialize qw/vp9_sad16x32x4d sse2/;

 add_proto qw/void vp9_sad32x32x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad32x32x4d sse2 avx2/;
+specialize qw/vp9_sad32x32x4d sse2/;

 add_proto qw/void vp9_sad16x16x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
 specialize qw/vp9_sad16x16x4d sse2/;
@@ -693,7 +693,7 @@ add_proto qw/void vp9_sad4x4x4d/, "const uint8_t *src_ptr, int  src_stride, cons
 specialize qw/vp9_sad4x4x4d sse/;

 add_proto qw/unsigned int vp9_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-specialize qw/vp9_mse16x16 mmx/, "$sse2_x86inc", "$avx2_x86inc";
+specialize qw/vp9_mse16x16 mmx avx2/, "$sse2_x86inc";

 add_proto qw/unsigned int vp9_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
 specialize qw/vp9_mse8x16/;
@@ -714,6 +714,9 @@ specialize qw/vp9_block_error avx2/, "$sse2_x86inc";
 add_proto qw/void vp9_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
 specialize qw/vp9_subtract_block/, "$sse2_x86inc";

+add_proto qw/void vp9_quantize_fp/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+specialize qw/vp9_quantize_fp/, "$ssse3_x86_64";
+
 add_proto qw/void vp9_quantize_b/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
 specialize qw/vp9_quantize_b/, "$ssse3_x86_64";

@@ -739,19 +742,31 @@ add_proto qw/void vp9_fht8x8/, "const int16_t *input, int16_t *output, int strid
 specialize qw/vp9_fht8x8 sse2 avx2/;

 add_proto qw/void vp9_fht16x16/, "const int16_t *input, int16_t *output, int stride, int tx_type";
-specialize qw/vp9_fht16x16 sse2 avx2/;
+specialize qw/vp9_fht16x16 sse2/;

 add_proto qw/void vp9_fwht4x4/, "const int16_t *input, int16_t *output, int stride";
 specialize qw/vp9_fwht4x4/, "$mmx_x86inc";

+add_proto qw/void vp9_fdct4x4_1/, "const int16_t *input, int16_t *output, int stride";
+specialize qw/vp9_fdct4x4_1 sse2/;
+
 add_proto qw/void vp9_fdct4x4/, "const int16_t *input, int16_t *output, int stride";
 specialize qw/vp9_fdct4x4 sse2 avx2/;

+add_proto qw/void vp9_fdct8x8_1/, "const int16_t *input, int16_t *output, int stride";
+specialize qw/vp9_fdct8x8_1 sse2/;
+
 add_proto qw/void vp9_fdct8x8/, "const int16_t *input, int16_t *output, int stride";
 specialize qw/vp9_fdct8x8 sse2 avx2/, "$ssse3_x86_64";

+add_proto qw/void vp9_fdct16x16_1/, "const int16_t *input, int16_t *output, int stride";
+specialize qw/vp9_fdct16x16_1 sse2/;
+
 add_proto qw/void vp9_fdct16x16/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fdct16x16 sse2 avx2/;
+specialize qw/vp9_fdct16x16 sse2/;
+
+add_proto qw/void vp9_fdct32x32_1/, "const int16_t *input, int16_t *output, int stride";
+specialize qw/vp9_fdct32x32_1 sse2/;

 add_proto qw/void vp9_fdct32x32/, "const int16_t *input, int16_t *output, int stride";
 specialize qw/vp9_fdct32x32 sse2 avx2/;
--- a/vp9/common/vp9_scale.c
+++ b/vp9/common/vp9_scale.c
@@ -33,14 +33,6 @@ static int get_fixed_point_scale_factor(int other_size, int this_size) {
  return (other_size << REF_SCALE_SHIFT) / this_size;
 }

-static int check_scale_factors(int other_w, int other_h,
-                               int this_w, int this_h) {
-  return 2 * this_w >= other_w &&
-         2 * this_h >= other_h &&
-         this_w <= 16 * other_w &&
-         this_h <= 16 * other_h;
-}
-
 MV32 vp9_scale_mv(const MV *mv, int x, int y, const struct scale_factors *sf) {
  const int x_off_q4 = scaled_x(x << SUBPEL_BITS, sf) & SUBPEL_MASK;
  const int y_off_q4 = scaled_y(y << SUBPEL_BITS, sf) & SUBPEL_MASK;
@@ -54,7 +46,7 @@ MV32 vp9_scale_mv(const MV *mv, int x, int y, const struct scale_factors *sf) {
 void vp9_setup_scale_factors_for_frame(struct scale_factors *sf,
                                       int other_w, int other_h,
                                       int this_w, int this_h) {
-  if (!check_scale_factors(other_w, other_h, this_w, this_h)) {
+  if (!valid_ref_frame_size(other_w, other_h, this_w, this_h)) {
    sf->x_scale_fp = REF_INVALID_SCALE;
    sf->y_scale_fp = REF_INVALID_SCALE;
    return;
--- a/vp9/common/vp9_scale.h
+++ b/vp9/common/vp9_scale.h
@@ -46,8 +46,16 @@ static INLINE int vp9_is_valid_scale(const struct scale_factors *sf) {
 }

 static INLINE int vp9_is_scaled(const struct scale_factors *sf) {
-  return sf->x_scale_fp != REF_NO_SCALE ||
-         sf->y_scale_fp != REF_NO_SCALE;
+  return vp9_is_valid_scale(sf) &&
+         (sf->x_scale_fp != REF_NO_SCALE || sf->y_scale_fp != REF_NO_SCALE);
+}
+
+static INLINE int valid_ref_frame_size(int ref_width, int ref_height,
+                                      int this_width, int this_height) {
+  return 2 * this_width >= ref_width &&
+         2 * this_height >= ref_height &&
+         this_width <= 16 * ref_width &&
+         this_height <= 16 * ref_height;
 }

 #ifdef __cplusplus
--- a/vp9/common/vp9_thread.c
+++ b/vp9/common/vp9_thread.c
@@ -0,0 +1,183 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Multi-threaded worker
+//
+// Original source:
+//  http://git.chromium.org/webm/libwebp.git
+//  100644 blob 08ad4e1fecba302bf1247645e84a7d2779956bc3  src/utils/thread.c
+
+#include <assert.h>
+#include <string.h>   // for memset()
+#include "./vp9_thread.h"
+#include "vpx_mem/vpx_mem.h"
+
+#if CONFIG_MULTITHREAD
+
+struct VP9WorkerImpl {
+  pthread_mutex_t mutex_;
+  pthread_cond_t  condition_;
+  pthread_t       thread_;
+};
+
+//------------------------------------------------------------------------------
+
+static void execute(VP9Worker *const worker);  // Forward declaration.
+
+static THREADFN thread_loop(void *ptr) {
+  VP9Worker *const worker = (VP9Worker*)ptr;
+  int done = 0;
+  while (!done) {
+    pthread_mutex_lock(&worker->impl_->mutex_);
+    while (worker->status_ == OK) {   // wait in idling mode
+      pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_);
+    }
+    if (worker->status_ == WORK) {
+      execute(worker);
+      worker->status_ = OK;
+    } else if (worker->status_ == NOT_OK) {   // finish the worker
+      done = 1;
+    }
+    // signal to the main thread that we're done (for sync())
+    pthread_cond_signal(&worker->impl_->condition_);
+    pthread_mutex_unlock(&worker->impl_->mutex_);
+  }
+  return THREAD_RETURN(NULL);    // Thread is finished
+}
+
+// main thread state control
+static void change_state(VP9Worker *const worker,
+                         VP9WorkerStatus new_status) {
+  // No-op when attempting to change state on a thread that didn't come up.
+  // Checking status_ without acquiring the lock first would result in a data
+  // race.
+  if (worker->impl_ == NULL) return;
+
+  pthread_mutex_lock(&worker->impl_->mutex_);
+  if (worker->status_ >= OK) {
+    // wait for the worker to finish
+    while (worker->status_ != OK) {
+      pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_);
+    }
+    // assign new status and release the working thread if needed
+    if (new_status != OK) {
+      worker->status_ = new_status;
+      pthread_cond_signal(&worker->impl_->condition_);
+    }
+  }
+  pthread_mutex_unlock(&worker->impl_->mutex_);
+}
+
+#endif  // CONFIG_MULTITHREAD
+
+//------------------------------------------------------------------------------
+
+static void init(VP9Worker *const worker) {
+  memset(worker, 0, sizeof(*worker));
+  worker->status_ = NOT_OK;
+}
+
+static int sync(VP9Worker *const worker) {
+#if CONFIG_MULTITHREAD
+  change_state(worker, OK);
+#endif
+  assert(worker->status_ <= OK);
+  return !worker->had_error;
+}
+
+static int reset(VP9Worker *const worker) {
+  int ok = 1;
+  worker->had_error = 0;
+  if (worker->status_ < OK) {
+#if CONFIG_MULTITHREAD
+    worker->impl_ = (VP9WorkerImpl*)vpx_calloc(1, sizeof(*worker->impl_));
+    if (worker->impl_ == NULL) {
+      return 0;
+    }
+    if (pthread_mutex_init(&worker->impl_->mutex_, NULL)) {
+      goto Error;
+    }
+    if (pthread_cond_init(&worker->impl_->condition_, NULL)) {
+      pthread_mutex_destroy(&worker->impl_->mutex_);
+      goto Error;
+    }
+    pthread_mutex_lock(&worker->impl_->mutex_);
+    ok = !pthread_create(&worker->impl_->thread_, NULL, thread_loop, worker);
+    if (ok) worker->status_ = OK;
+    pthread_mutex_unlock(&worker->impl_->mutex_);
+    if (!ok) {
+      pthread_mutex_destroy(&worker->impl_->mutex_);
+      pthread_cond_destroy(&worker->impl_->condition_);
+ Error:
+      vpx_free(worker->impl_);
+      worker->impl_ = NULL;
+      return 0;
+    }
+#else
+    worker->status_ = OK;
+#endif
+  } else if (worker->status_ > OK) {
+    ok = sync(worker);
+  }
+  assert(!ok || (worker->status_ == OK));
+  return ok;
+}
+
+static void execute(VP9Worker *const worker) {
+  if (worker->hook != NULL) {
+    worker->had_error |= !worker->hook(worker->data1, worker->data2);
+  }
+}
+
+static void launch(VP9Worker *const worker) {
+#if CONFIG_MULTITHREAD
+  change_state(worker, WORK);
+#else
+  execute(worker);
+#endif
+}
+
+static void end(VP9Worker *const worker) {
+  if (worker->status_ >= OK) {
+#if CONFIG_MULTITHREAD
+    change_state(worker, NOT_OK);
+    pthread_join(worker->impl_->thread_, NULL);
+    pthread_mutex_destroy(&worker->impl_->mutex_);
+    pthread_cond_destroy(&worker->impl_->condition_);
+#else
+    worker->status_ = NOT_OK;
+#endif
+  }
+  vpx_free(worker->impl_);
+  worker->impl_ = NULL;
+  assert(worker->status_ == NOT_OK);
+}
+
+//------------------------------------------------------------------------------
+
+static VP9WorkerInterface g_worker_interface = {
+  init, reset, sync, launch, execute, end
+};
+
+int vp9_set_worker_interface(const VP9WorkerInterface* const winterface) {
+  if (winterface == NULL ||
+      winterface->init == NULL || winterface->reset == NULL ||
+      winterface->sync == NULL || winterface->launch == NULL ||
+      winterface->execute == NULL || winterface->end == NULL) {
+    return 0;
+  }
+  g_worker_interface = *winterface;
+  return 1;
+}
+
+const VP9WorkerInterface *vp9_get_worker_interface(void) {
+  return &g_worker_interface;
+}
+
+//------------------------------------------------------------------------------
--- a/vp9/decoder/vp9_thread.h
+++ b/vp9/decoder/vp9_thread.h
@@ -11,8 +11,7 @@
 //
 // Original source:
 //  http://git.chromium.org/webm/libwebp.git
-//  100644 blob 13a61a4c84194c3374080cbf03d881d3cd6af40d  src/utils/thread.h
-
+//  100644 blob 7bd451b124ae3b81596abfbcc823e3cb129d3a38  src/utils/thread.h

 #ifndef VP9_DECODER_VP9_THREAD_H_
 #define VP9_DECODER_VP9_THREAD_H_
@@ -163,40 +162,53 @@ typedef enum {
 // arguments (data1 and data2), and should return false in case of error.
 typedef int (*VP9WorkerHook)(void*, void*);

-// Synchronize object used to launch job in the worker thread
+// Platform-dependent implementation details for the worker.
+typedef struct VP9WorkerImpl VP9WorkerImpl;
+
+// Synchronization object used to launch job in the worker thread
 typedef struct {
-#if CONFIG_MULTITHREAD
-  pthread_mutex_t mutex_;
-  pthread_cond_t  condition_;
-  pthread_t       thread_;
-#endif
+  VP9WorkerImpl *impl_;
  VP9WorkerStatus status_;
  VP9WorkerHook hook;     // hook to call
-  void* data1;            // first argument passed to 'hook'
-  void* data2;            // second argument passed to 'hook'
+  void *data1;            // first argument passed to 'hook'
+  void *data2;            // second argument passed to 'hook'
  int had_error;          // return value of the last call to 'hook'
 } VP9Worker;

-// Must be called first, before any other method.
-void vp9_worker_init(VP9Worker* const worker);
-// Must be called to initialize the object and spawn the thread. Re-entrant.
-// Will potentially launch the thread. Returns false in case of error.
-int vp9_worker_reset(VP9Worker* const worker);
-// Makes sure the previous work is finished. Returns true if worker->had_error
-// was not set and no error condition was triggered by the working thread.
-int vp9_worker_sync(VP9Worker* const worker);
-// Triggers the thread to call hook() with data1 and data2 argument. These
-// hook/data1/data2 can be changed at any time before calling this function,
-// but not be changed afterward until the next call to vp9_worker_sync().
-void vp9_worker_launch(VP9Worker* const worker);
-// This function is similar to vp9_worker_launch() except that it calls the
-// hook directly instead of using a thread. Convenient to bypass the thread
-// mechanism while still using the VP9Worker structs. vp9_worker_sync() must
-// still be called afterward (for error reporting).
-void vp9_worker_execute(VP9Worker* const worker);
-// Kill the thread and terminate the object. To use the object again, one
-// must call vp9_worker_reset() again.
-void vp9_worker_end(VP9Worker* const worker);
+// The interface for all thread-worker related functions. All these functions
+// must be implemented.
+typedef struct {
+  // Must be called first, before any other method.
+  void (*init)(VP9Worker *const worker);
+  // Must be called to initialize the object and spawn the thread. Re-entrant.
+  // Will potentially launch the thread. Returns false in case of error.
+  int (*reset)(VP9Worker *const worker);
+  // Makes sure the previous work is finished. Returns true if worker->had_error
+  // was not set and no error condition was triggered by the working thread.
+  int (*sync)(VP9Worker *const worker);
+  // Triggers the thread to call hook() with data1 and data2 arguments. These
+  // hook/data1/data2 values can be changed at any time before calling this
+  // function, but not be changed afterward until the next call to Sync().
+  void (*launch)(VP9Worker *const worker);
+  // This function is similar to launch() except that it calls the
+  // hook directly instead of using a thread. Convenient to bypass the thread
+  // mechanism while still using the VP9Worker structs. sync() must
+  // still be called afterward (for error reporting).
+  void (*execute)(VP9Worker *const worker);
+  // Kill the thread and terminate the object. To use the object again, one
+  // must call reset() again.
+  void (*end)(VP9Worker *const worker);
+} VP9WorkerInterface;
+
+// Install a new set of threading functions, overriding the defaults. This
+// should be done before any workers are started, i.e., before any encoding or
+// decoding takes place. The contents of the interface struct are copied, it
+// is safe to free the corresponding memory after this call. This function is
+// not thread-safe. Return false in case of invalid pointer or methods.
+int vp9_set_worker_interface(const VP9WorkerInterface *const winterface);
+
+// Retrieve the currently set thread worker interface.
+const VP9WorkerInterface *vp9_get_worker_interface(void);

 //------------------------------------------------------------------------------

--- a/vp9/common/x86/vp9_postproc_mmx.asm
+++ b/vp9/common/x86/vp9_postproc_mmx.asm
@@ -464,7 +464,6 @@ sym(vp9_mbpost_proc_down_mmx):
 ;                            unsigned char whiteclamp[16],
 ;                            unsigned char bothclamp[16],
 ;                            unsigned int width, unsigned int height, int pitch)
-extern sym(rand)
 global sym(vp9_plane_add_noise_mmx) PRIVATE
 sym(vp9_plane_add_noise_mmx):
    push        rbp
@@ -476,7 +475,7 @@ sym(vp9_plane_add_noise_mmx):
    ; end prolog

 .addnoise_loop:
-    call sym(rand) WRT_PLT
+    call sym(LIBVPX_RAND) WRT_PLT
    mov     rcx, arg(1) ;noise
    and     rax, 0xff
    add     rcx, rax
--- a/vp9/common/x86/vp9_postproc_sse2.asm
+++ b/vp9/common/x86/vp9_postproc_sse2.asm
@@ -629,7 +629,6 @@ sym(vp9_mbpost_proc_across_ip_xmm):
 ;                            unsigned char whiteclamp[16],
 ;                            unsigned char bothclamp[16],
 ;                            unsigned int width, unsigned int height, int pitch)
-extern sym(rand)
 global sym(vp9_plane_add_noise_wmt) PRIVATE
 sym(vp9_plane_add_noise_wmt):
    push        rbp
@@ -641,7 +640,7 @@ sym(vp9_plane_add_noise_wmt):
    ; end prolog

 .addnoise_loop:
-    call sym(rand) WRT_PLT
+    call sym(LIBVPX_RAND) WRT_PLT
    mov     rcx, arg(1) ;noise
    and     rax, 0xff
    add     rcx, rax
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -28,6 +28,7 @@
 #include "vp9/common/vp9_reconintra.h"
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_seg_common.h"
+#include "vp9/common/vp9_thread.h"
 #include "vp9/common/vp9_tile_common.h"

 #include "vp9/decoder/vp9_decodeframe.h"
@@ -38,7 +39,6 @@
 #include "vp9/decoder/vp9_dthread.h"
 #include "vp9/decoder/vp9_read_bit_buffer.h"
 #include "vp9/decoder/vp9_reader.h"
-#include "vp9/decoder/vp9_thread.h"

 #define MAX_VP9_HEADER_SIZE 80

@@ -327,21 +327,24 @@ static void set_ref(VP9_COMMON *const cm, MACROBLOCKD *const xd,
  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
  RefBuffer *ref_buffer = &cm->frame_refs[mbmi->ref_frame[idx] - LAST_FRAME];
  xd->block_refs[idx] = ref_buffer;
+
  if (!vp9_is_valid_scale(&ref_buffer->sf))
    vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
                       "Invalid scale factors");
  vp9_setup_pre_planes(xd, idx, ref_buffer->buf, mi_row, mi_col,
                       &ref_buffer->sf);
-  xd->corrupted |= ref_buffer->buf->corrupted;
+  if (!cm->frame_parallel_decode)
+    xd->corrupted |= ref_buffer->buf->corrupted;
 }

-static void decode_block(VP9_COMMON *const cm, MACROBLOCKD *const xd,
+static void decode_block(VP9Decoder *const pbi, MACROBLOCKD *const xd,
                         const TileInfo *const tile,
                         int mi_row, int mi_col,
                         vp9_reader *r, BLOCK_SIZE bsize) {
+  VP9_COMMON *const cm = &pbi->common;
  const int less8x8 = bsize < BLOCK_8X8;
  MB_MODE_INFO *mbmi = set_offsets(cm, xd, tile, bsize, mi_row, mi_col);
-  vp9_read_mode_info(cm, xd, tile, mi_row, mi_col, r);
+  vp9_read_mode_info(pbi, xd, tile, mi_row, mi_col, r);

  if (less8x8)
    bsize = BLOCK_8X8;
@@ -365,7 +368,7 @@ static void decode_block(VP9_COMMON *const cm, MACROBLOCKD *const xd,
      set_ref(cm, xd, 1, mi_row, mi_col);

    // Prediction
-    vp9_dec_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
+    vp9_dec_build_inter_predictors_sb(pbi, xd, mi_row, mi_col, bsize);

    // Reconstruction
    if (!mbmi->skip) {
@@ -404,41 +407,46 @@ static PARTITION_TYPE read_partition(VP9_COMMON *cm, MACROBLOCKD *xd, int hbs,
  return p;
 }

-static void decode_partition(VP9_COMMON *const cm, MACROBLOCKD *const xd,
+static void decode_partition(VP9Decoder *const pbi, MACROBLOCKD *const xd,
                             const TileInfo *const tile,
                             int mi_row, int mi_col,
                             vp9_reader* r, BLOCK_SIZE bsize) {
+  VP9_COMMON *const cm = &pbi->common;
  const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
  PARTITION_TYPE partition;
-  BLOCK_SIZE subsize;
+  BLOCK_SIZE subsize, uv_subsize;

  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
    return;

  partition = read_partition(cm, xd, hbs, mi_row, mi_col, bsize, r);
  subsize = get_subsize(bsize, partition);
+  uv_subsize = ss_size_lookup[subsize][cm->subsampling_x][cm->subsampling_y];
+  if (subsize >= BLOCK_8X8 && uv_subsize == BLOCK_INVALID)
+    vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                       "Invalid block size.");
  if (subsize < BLOCK_8X8) {
-    decode_block(cm, xd, tile, mi_row, mi_col, r, subsize);
+    decode_block(pbi, xd, tile, mi_row, mi_col, r, subsize);
  } else {
    switch (partition) {
      case PARTITION_NONE:
-        decode_block(cm, xd, tile, mi_row, mi_col, r, subsize);
+        decode_block(pbi, xd, tile, mi_row, mi_col, r, subsize);
        break;
      case PARTITION_HORZ:
-        decode_block(cm, xd, tile, mi_row, mi_col, r, subsize);
+        decode_block(pbi, xd, tile, mi_row, mi_col, r, subsize);
        if (mi_row + hbs < cm->mi_rows)
-          decode_block(cm, xd, tile, mi_row + hbs, mi_col, r, subsize);
+          decode_block(pbi, xd, tile, mi_row + hbs, mi_col, r, subsize);
        break;
      case PARTITION_VERT:
-        decode_block(cm, xd, tile, mi_row, mi_col, r, subsize);
+        decode_block(pbi, xd, tile, mi_row, mi_col, r, subsize);
        if (mi_col + hbs < cm->mi_cols)
-          decode_block(cm, xd, tile, mi_row, mi_col + hbs, r, subsize);
+          decode_block(pbi, xd, tile, mi_row, mi_col + hbs, r, subsize);
        break;
      case PARTITION_SPLIT:
-        decode_partition(cm, xd, tile, mi_row,       mi_col,       r, subsize);
-        decode_partition(cm, xd, tile, mi_row,       mi_col + hbs, r, subsize);
-        decode_partition(cm, xd, tile, mi_row + hbs, mi_col,       r, subsize);
-        decode_partition(cm, xd, tile, mi_row + hbs, mi_col + hbs, r, subsize);
+        decode_partition(pbi, xd, tile, mi_row,       mi_col,       r, subsize);
+        decode_partition(pbi, xd, tile, mi_row,       mi_col + hbs, r, subsize);
+        decode_partition(pbi, xd, tile, mi_row + hbs, mi_col,       r, subsize);
+        decode_partition(pbi, xd, tile, mi_row + hbs, mi_col + hbs, r, subsize);
        break;
      default:
        assert(0 && "Invalid partition type");
@@ -617,6 +625,7 @@ static void setup_display_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
 }

 static void apply_frame_size(VP9_COMMON *cm, int width, int height) {
+  BufferPool *const pool = cm->buffer_pool;
  if (cm->width != width || cm->height != height) {
    // Change in frame size.
    // TODO(agrange) Don't test width/height, check overall size.
@@ -633,14 +642,17 @@ static void apply_frame_size(VP9_COMMON *cm, int width, int height) {
    vp9_update_frame_size(cm);
  }

+  lock_buffer_pool(pool);
  if (vp9_realloc_frame_buffer(
          get_frame_new_buffer(cm), cm->width, cm->height,
          cm->subsampling_x, cm->subsampling_y, VP9_DEC_BORDER_IN_PIXELS,
-          &cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer, cm->get_fb_cb,
-          cm->cb_priv)) {
+          &pool->frame_bufs[cm->new_fb_idx].raw_frame_buffer, pool->get_fb_cb,
+          pool->cb_priv)) {
+    unlock_buffer_pool(pool);
    vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                       "Failed to allocate frame buffer");
  }
+  unlock_buffer_pool(pool);
 }

 static void setup_frame_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
@@ -667,9 +679,17 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm,
  if (!found)
    read_frame_size(rb, &width, &height);

-  if (width <= 0 || height <= 0)
-    vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
-                       "Referenced frame with invalid size");
+  // Check that each of the frames that this frame references has valid
+  // dimensions.
+  for (i = 0; i < REFS_PER_FRAME; ++i) {
+    RefBuffer *const ref_frame = &cm->frame_refs[i];
+    const int ref_width = ref_frame->buf->y_width;
+    const int ref_height = ref_frame->buf->y_height;
+
+    if (!valid_ref_frame_size(ref_width, ref_height, width, height))
+      vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                         "Referenced frame has invalid size");
+  }

  apply_frame_size(cm, width, height);
  setup_display_size(cm, rb);
@@ -685,6 +705,10 @@ static void setup_tile_info(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
  while (max_ones-- && vp9_rb_read_bit(rb))
    cm->log2_tile_cols++;

+  if (cm->log2_tile_cols > 6)
+    vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                       "Invalid number of tile columns");
+
  // rows
  cm->log2_tile_rows = vp9_rb_read_bit(rb);
  if (cm->log2_tile_rows)
@@ -755,19 +779,20 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
                                   const uint8_t *data,
                                   const uint8_t *data_end) {
  VP9_COMMON *const cm = &pbi->common;
+  const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
  const int aligned_cols = mi_cols_aligned_to_sb(cm->mi_cols);
  const int tile_cols = 1 << cm->log2_tile_cols;
  const int tile_rows = 1 << cm->log2_tile_rows;
  TileBuffer tile_buffers[4][1 << 6];
  int tile_row, tile_col;
-  int mi_row, mi_col;
+  int mi_row = 0, mi_col = 0;
  TileData *tile_data = NULL;

  if (cm->lf.filter_level && pbi->lf_worker.data1 == NULL) {
    CHECK_MEM_ERROR(cm, pbi->lf_worker.data1,
                    vpx_memalign(32, sizeof(LFWorkerData)));
    pbi->lf_worker.hook = (VP9WorkerHook)vp9_loop_filter_worker;
-    if (pbi->max_threads > 1 && !vp9_worker_reset(&pbi->lf_worker)) {
+    if (pbi->max_threads > 1 && !winterface->reset(&pbi->lf_worker)) {
      vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
                         "Loop filter thread creation failed");
    }
@@ -780,7 +805,6 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
    vp9_copy(lf_data->planes, pbi->mb.plane);
    lf_data->stop = 0;
    lf_data->y_only = 0;
-    vp9_loop_filter_frame_init(cm, cm->lf.filter_level);
  }

  assert(tile_rows <= 4);
@@ -838,7 +862,7 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
        vp9_zero(tile_data->xd.left_seg_context);
        for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end;
             mi_col += MI_BLOCK_SIZE) {
-          decode_partition(tile_data->cm, &tile_data->xd, &tile, mi_row, mi_col,
+          decode_partition(pbi, &tile_data->xd, &tile, mi_row, mi_col,
                           &tile_data->bit_reader, BLOCK_64X64);
        }
      }
@@ -853,30 +877,38 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
        // decoding has completed: finish up the loop filter in this thread.
        if (mi_row + MI_BLOCK_SIZE >= cm->mi_rows) continue;

-        vp9_worker_sync(&pbi->lf_worker);
+        winterface->sync(&pbi->lf_worker);
        lf_data->start = lf_start;
        lf_data->stop = mi_row;
        if (pbi->max_threads > 1) {
-          vp9_worker_launch(&pbi->lf_worker);
+          winterface->launch(&pbi->lf_worker);
        } else {
-          vp9_worker_execute(&pbi->lf_worker);
+          winterface->execute(&pbi->lf_worker);
        }
      }
+      // After loopfiltering, the last 7 row pixels in each superblock row may
+      // still be changed by the longest loopfilter of the next superblock
+      // row.
+      if (pbi->frame_parallel_decode)
+        vp9_frameworker_broadcast(pbi->cur_buf,
+                                  mi_row << MI_BLOCK_SIZE_LOG2);
    }
  }

  // Loopfilter remaining rows in the frame.
  if (cm->lf.filter_level) {
    LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
-    vp9_worker_sync(&pbi->lf_worker);
+    winterface->sync(&pbi->lf_worker);
    lf_data->start = lf_data->stop;
    lf_data->stop = cm->mi_rows;
-    vp9_worker_execute(&pbi->lf_worker);
+    winterface->execute(&pbi->lf_worker);
  }

  // Get last tile data.
  tile_data = pbi->tile_data + tile_cols * tile_rows - 1;

+  if (pbi->frame_parallel_decode)
+    vp9_frameworker_broadcast(pbi->cur_buf, INT_MAX);
  return vp9_reader_find_end(&tile_data->bit_reader);
 }

@@ -891,7 +923,7 @@ static int tile_worker_hook(void *arg1, void *arg2) {
    vp9_zero(tile_data->xd.left_seg_context);
    for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
         mi_col += MI_BLOCK_SIZE) {
-      decode_partition(tile_data->cm, &tile_data->xd, tile,
+      decode_partition(tile_data->pbi, &tile_data->xd, tile,
                       mi_row, mi_col, &tile_data->bit_reader, BLOCK_64X64);
    }
  }
@@ -915,6 +947,7 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi,
                                      const uint8_t *data,
                                      const uint8_t *data_end) {
  VP9_COMMON *const cm = &pbi->common;
+  const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
  const uint8_t *bit_reader_end = NULL;
  const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
  const int tile_cols = 1 << cm->log2_tile_cols;
@@ -941,11 +974,11 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi,
      VP9Worker *const worker = &pbi->tile_workers[i];
      ++pbi->num_tile_workers;

-      vp9_worker_init(worker);
+      winterface->init(worker);
      CHECK_MEM_ERROR(cm, worker->data1,
                      vpx_memalign(32, sizeof(TileWorkerData)));
      CHECK_MEM_ERROR(cm, worker->data2, vpx_malloc(sizeof(TileInfo)));
-      if (i < num_threads - 1 && !vp9_worker_reset(worker)) {
+      if (i < num_threads - 1 && !winterface->reset(worker)) {
        vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
                           "Tile decoder thread creation failed");
      }
@@ -996,10 +1029,10 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi,
      TileInfo *const tile = (TileInfo*)worker->data2;
      TileBuffer *const buf = &tile_buffers[0][n];

-      tile_data->cm = cm;
+      tile_data->pbi = pbi;
      tile_data->xd = pbi->mb;
      tile_data->xd.corrupted = 0;
-      vp9_tile_init(tile, tile_data->cm, 0, buf->col);
+      vp9_tile_init(tile, &pbi->common, 0, buf->col);
      setup_token_decoder(buf->data, data_end, buf->size, &cm->error,
                          &tile_data->bit_reader, pbi->decrypt_cb,
                          pbi->decrypt_state);
@@ -1008,9 +1041,9 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi,

      worker->had_error = 0;
      if (i == num_workers - 1 || n == tile_cols - 1) {
-        vp9_worker_execute(worker);
+        winterface->execute(worker);
      } else {
-        vp9_worker_launch(worker);
+        winterface->launch(worker);
      }

      if (buf->col == tile_cols - 1) {
@@ -1022,7 +1055,7 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi,

    for (; i > 0; --i) {
      VP9Worker *const worker = &pbi->tile_workers[i - 1];
-      pbi->mb.corrupted |= !vp9_worker_sync(worker);
+      pbi->mb.corrupted |= !winterface->sync(worker);
    }
    if (final_worker > -1) {
      TileWorkerData *const tile_data =
@@ -1058,8 +1091,10 @@ static BITSTREAM_PROFILE read_profile(struct vp9_read_bit_buffer *rb) {
 static size_t read_uncompressed_header(VP9Decoder *pbi,
                                       struct vp9_read_bit_buffer *rb) {
  VP9_COMMON *const cm = &pbi->common;
+  RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+  BufferPool *const pool = pbi->common.buffer_pool;
+  int i, mask, ref_index = 0;
  size_t sz;
-  int i;

  cm->last_frame_type = cm->frame_type;

@@ -1076,16 +1111,22 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
  if (cm->show_existing_frame) {
    // Show an existing frame directly.
    const int frame_to_show = cm->ref_frame_map[vp9_rb_read_literal(rb, 3)];
-
-    if (cm->frame_bufs[frame_to_show].ref_count < 1)
+    lock_buffer_pool(pool);
+    if (frame_to_show < 0 || frame_bufs[frame_to_show].ref_count < 1)
      vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
                         "Buffer %d does not contain a decoded frame",
                         frame_to_show);

-    ref_cnt_fb(cm->frame_bufs, &cm->new_fb_idx, frame_to_show);
+    ref_cnt_fb(frame_bufs, &cm->new_fb_idx, frame_to_show);
+    unlock_buffer_pool(pool);
    pbi->refresh_frame_flags = 0;
    cm->lf.filter_level = 0;
    cm->show_frame = 1;
+
+    if (pbi->frame_parallel_decode) {
+      for (i = 0; i < REF_FRAMES; ++i)
+        cm->next_ref_frame_map[i] = cm->ref_frame_map[i];
+    }
    return 0;
  }

@@ -1125,6 +1166,10 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
    }

    setup_frame_size(cm, rb);
+    if (pbi->need_resync) {
+      vpx_memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
+      pbi->need_resync = 0;
+    }
  } else {
    cm->intra_only = cm->show_frame ? 0 : vp9_rb_read_bit(rb);

@@ -1136,17 +1181,20 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,

      pbi->refresh_frame_flags = vp9_rb_read_literal(rb, REF_FRAMES);
      setup_frame_size(cm, rb);
+      if (pbi->need_resync) {
+        vpx_memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
+        pbi->need_resync = 0;
+      }
    } else {
      pbi->refresh_frame_flags = vp9_rb_read_literal(rb, REF_FRAMES);
-
      for (i = 0; i < REFS_PER_FRAME; ++i) {
        const int ref = vp9_rb_read_literal(rb, REF_FRAMES_LOG2);
        const int idx = cm->ref_frame_map[ref];
-        cm->frame_refs[i].idx = idx;
-        cm->frame_refs[i].buf = &cm->frame_bufs[idx].buf;
+        RefBuffer *const ref_frame = &cm->frame_refs[i];
+        ref_frame->idx = idx;
+        ref_frame->buf = &frame_bufs[idx].buf;
        cm->ref_frame_sign_bias[LAST_FRAME + i] = vp9_rb_read_bit(rb);
      }
-
      setup_frame_size_with_refs(cm, rb);

      cm->allow_high_precision_mv = vp9_rb_read_bit(rb);
@@ -1164,6 +1212,12 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
    }
  }

+  if (pbi->need_resync) {
+    vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                       "Keyframe / intra-only frame required to reset decoder"
+                       " state");
+  }
+
  if (!cm->error_resilient_mode) {
    cm->coding_use_prev_mi = 1;
    cm->refresh_frame_context = vp9_rb_read_bit(rb);
@@ -1178,6 +1232,30 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
  // below, forcing the use of context 0 for those frame types.
  cm->frame_context_idx = vp9_rb_read_literal(rb, FRAME_CONTEXTS_LOG2);

+  // Generate next_ref_frame_map.
+  lock_buffer_pool(pool);
+  for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) {
+    if (mask & 1) {
+      cm->next_ref_frame_map[ref_index] = cm->new_fb_idx;
+      ++frame_bufs[cm->new_fb_idx].ref_count;
+    } else {
+      cm->next_ref_frame_map[ref_index] = cm->ref_frame_map[ref_index];
+    }
+    // Current thread holds the reference frame.
+    if (cm->ref_frame_map[ref_index] >= 0)
+      ++frame_bufs[cm->ref_frame_map[ref_index]].ref_count;
+    ++ref_index;
+  }
+
+  for (; ref_index < REF_FRAMES; ++ref_index) {
+    cm->next_ref_frame_map[ref_index] = cm->ref_frame_map[ref_index];
+    // Current thread holds the reference frame.
+    if (cm->ref_frame_map[ref_index] >= 0)
+      ++frame_bufs[cm->ref_frame_map[ref_index]].ref_count;
+  }
+  unlock_buffer_pool(pool);
+  pbi->hold_ref_buf = 1;
+
  if (frame_is_intra_only(cm) || cm->error_resilient_mode)
    vp9_setup_past_independence(cm);

@@ -1322,7 +1400,9 @@ void vp9_decode_frame(VP9Decoder *pbi,
                      const uint8_t **p_data_end) {
  VP9_COMMON *const cm = &pbi->common;
  MACROBLOCKD *const xd = &pbi->mb;
-  struct vp9_read_bit_buffer rb = { 0 };
+  struct vp9_read_bit_buffer rb = { NULL, NULL, 0, NULL, 0};
+  int context_updated = 0;
+
  uint8_t clear_data[MAX_VP9_HEADER_SIZE];
  const size_t first_partition_size = read_uncompressed_header(pbi,
      init_read_bit_buffer(pbi, &rb, data, data_end, clear_data));
@@ -1359,6 +1439,28 @@ void vp9_decode_frame(VP9Decoder *pbi,
  xd->corrupted = 0;
  new_fb->corrupted = read_compressed_header(pbi, data, first_partition_size);

+  if (cm->lf.filter_level) {
+    vp9_loop_filter_frame_init(cm, cm->lf.filter_level);
+  }
+
+  // If encoded in frame parallel mode, frame context is ready after decoding
+  // the frame header.
+  if (pbi->frame_parallel_decode && cm->frame_parallel_decoding_mode) {
+    VP9Worker *const worker = pbi->frame_worker_owner;
+    FrameWorkerData *const frame_worker_data = worker->data1;
+    if (cm->refresh_frame_context) {
+      context_updated = 1;
+      cm->frame_contexts[cm->frame_context_idx] = cm->fc;
+    }
+    vp9_frameworker_lock_stats(worker);
+    pbi->cur_buf->row = -1;
+    pbi->cur_buf->col = -1;
+    frame_worker_data->frame_context_ready = 1;
+    // Signal the main thread that context is ready.
+    vp9_frameworker_signal_stats(worker);
+    vp9_frameworker_unlock_stats(worker);
+  }
+
  // TODO(jzern): remove frame_parallel_decoding_mode restriction for
  // single-frame tile decoding.
  if (pbi->max_threads > 1 && tile_rows == 1 && tile_cols > 1 &&
@@ -1371,9 +1473,7 @@ void vp9_decode_frame(VP9Decoder *pbi,
    *p_data_end = decode_tiles(pbi, data + first_partition_size, data_end);
  }

-  new_fb->corrupted |= xd->corrupted;
-
-  if (!new_fb->corrupted) {
+  if (!xd->corrupted) {
    if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode) {
      vp9_adapt_coef_probs(cm);

@@ -1384,8 +1484,235 @@ void vp9_decode_frame(VP9Decoder *pbi,
    } else {
      debug_check_frame_counts(cm);
    }
+  } else {
+    vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                       "Decode failed. Frame data is corrupted.");
  }

-  if (cm->refresh_frame_context)
+  // Non frame parallel update frame context here.
+  if (cm->refresh_frame_context && !context_updated)
    cm->frame_contexts[cm->frame_context_idx] = cm->fc;
 }
+
+static void build_mc_border(const uint8_t *src, int src_stride,
+                            uint8_t *dst, int dst_stride,
+                            int x, int y, int b_w, int b_h, int w, int h) {
+  // Get a pointer to the start of the real data for this row.
+  const uint8_t *ref_row = src - x - y * src_stride;
+
+  if (y >= h)
+    ref_row += (h - 1) * src_stride;
+  else if (y > 0)
+    ref_row += y * src_stride;
+
+  do {
+    int right = 0, copy;
+    int left = x < 0 ? -x : 0;
+
+    if (left > b_w)
+      left = b_w;
+
+    if (x + b_w > w)
+      right = x + b_w - w;
+
+    if (right > b_w)
+      right = b_w;
+
+    copy = b_w - left - right;
+
+    if (left)
+      memset(dst, ref_row[0], left);
+
+    if (copy)
+      memcpy(dst + left, ref_row + x + left, copy);
+
+    if (right)
+      memset(dst + left + copy, ref_row[w - 1], right);
+
+    dst += dst_stride;
+    ++y;
+
+    if (y > 0 && y < h)
+      ref_row += src_stride;
+  } while (--b_h);
+}
+
+void dec_build_inter_predictors(VP9Decoder *const pbi, MACROBLOCKD *xd,
+                                int plane, int block, int bw, int bh, int x,
+                                int y, int w, int h, int mi_x, int mi_y) {
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  const MODE_INFO *mi = xd->mi[0];
+  const int is_compound = has_second_ref(&mi->mbmi);
+  const InterpKernel *kernel = vp9_get_interp_kernel(mi->mbmi.interp_filter);
+  int ref;
+
+  for (ref = 0; ref < 1 + is_compound; ++ref) {
+    const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
+    struct buf_2d *const pre_buf = &pd->pre[ref];
+    struct buf_2d *const dst_buf = &pd->dst;
+    uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
+    const MV mv = mi->mbmi.sb_type < BLOCK_8X8
+               ? average_split_mvs(pd, plane, mi, ref, block)
+               : mi->mbmi.mv[ref].as_mv;
+
+
+    // TODO(jkoleszar): This clamping is done in the incorrect place for the
+    // scaling case. It needs to be done on the scaled MV, not the pre-scaling
+    // MV. Note however that it performs the subsampling aware scaling so
+    // that the result is always q4.
+    // mv_precision precision is MV_PRECISION_Q4.
+    const MV mv_q4 = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh,
+                                               pd->subsampling_x,
+                                               pd->subsampling_y);
+
+    MV32 scaled_mv;
+    int xs, ys, x0, y0, x0_16, y0_16, y1, frame_width, frame_height,
+        buf_stride, subpel_x, subpel_y;
+    uint8_t *ref_frame, *buf_ptr;
+    const int idx = xd->block_refs[ref]->idx;
+    BufferPool *const pool = pbi->common.buffer_pool;
+    RefCntBuffer *const ref_frame_buf = &pool->frame_bufs[idx];
+
+    // Get reference frame pointer, width and height.
+    if (plane == 0) {
+      frame_width = ref_frame_buf->buf.y_crop_width;
+      frame_height = ref_frame_buf->buf.y_crop_height;
+      ref_frame = ref_frame_buf->buf.y_buffer;
+    } else {
+      frame_width = ref_frame_buf->buf.uv_crop_width;
+      frame_height = ref_frame_buf->buf.uv_crop_height;
+      ref_frame = plane == 1 ? ref_frame_buf->buf.u_buffer
+                           : ref_frame_buf->buf.v_buffer;
+    }
+
+    if (vp9_is_scaled(sf)) {
+      // Co-ordinate of containing block to pixel precision.
+      int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x));
+      int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y));
+
+      // Co-ordinate of the block to 1/16th pixel precision.
+      x0_16 = (x_start + x) << SUBPEL_BITS;
+      y0_16 = (y_start + y) << SUBPEL_BITS;
+
+      // Co-ordinate of current block in reference frame
+      // to 1/16th pixel precision.
+      x0_16 = sf->scale_value_x(x0_16, sf);
+      y0_16 = sf->scale_value_y(y0_16, sf);
+
+      // Map the top left corner of the block into the reference frame.
+      x0 = sf->scale_value_x(x_start + x, sf);
+      y0 = sf->scale_value_y(y_start + y, sf);
+
+      // Scale the MV and incorporate the sub-pixel offset of the block
+      // in the reference frame.
+      scaled_mv = vp9_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf);
+      xs = sf->x_step_q4;
+      ys = sf->y_step_q4;
+    } else {
+      // Co-ordinate of containing block to pixel precision.
+      x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x;
+      y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y;
+
+      // Co-ordinate of the block to 1/16th pixel precision.
+      x0_16 = x0 << SUBPEL_BITS;
+      y0_16 = y0 << SUBPEL_BITS;
+
+      scaled_mv.row = mv_q4.row;
+      scaled_mv.col = mv_q4.col;
+      xs = ys = 16;
+    }
+    subpel_x = scaled_mv.col & SUBPEL_MASK;
+    subpel_y = scaled_mv.row & SUBPEL_MASK;
+
+    // Calculate the top left corner of the best matching block in the
+    // reference frame.
+    x0 += scaled_mv.col >> SUBPEL_BITS;
+    y0 += scaled_mv.row >> SUBPEL_BITS;
+    x0_16 += scaled_mv.col;
+    y0_16 += scaled_mv.row;
+
+    // Get reference block pointer.
+    buf_ptr = ref_frame + y0 * pre_buf->stride + x0;
+    buf_stride = pre_buf->stride;
+
+    // Get reference block bottom right vertical coordinate.
+    y1 = ((y0_16 + (h - 1) * ys) >> SUBPEL_BITS) + 1;
+
+    // Do border extension if there is motion or the
+    // width/height is not a multiple of 8 pixels.
+    if (scaled_mv.col || scaled_mv.row ||
+        (frame_width & 0x7) || (frame_height & 0x7)) {
+      int x_pad = 0, y_pad = 0;
+
+      // Get reference block bottom right horizontal coordinate.
+      int x1 = ((x0_16 + (w - 1) * xs) >> SUBPEL_BITS) + 1;
+
+      if (subpel_x || (sf->x_step_q4 & SUBPEL_MASK)) {
+        x0 -= VP9_INTERP_EXTEND - 1;
+        x1 += VP9_INTERP_EXTEND;
+        x_pad = 1;
+      }
+
+      if (subpel_y || (sf->y_step_q4 & SUBPEL_MASK)) {
+        y0 -= VP9_INTERP_EXTEND - 1;
+        y1 += VP9_INTERP_EXTEND;
+        y_pad = 1;
+      }
+
+      // Wait until reference block is ready. Pad 7 more pixels as last 7
+      // pixels of each superblock row can be changed by next superblock row.
+       if (pbi->frame_parallel_decode)
+         vp9_frameworker_wait(pbi->frame_worker_owner, ref_frame_buf,
+                              (y1 + 7) << (plane == 0 ? 0 : 1));
+
+      // Skip border extension if block is inside the frame.
+      if (x0 < 0 || x0 > frame_width - 1 || x1 < 0 || x1 > frame_width ||
+          y0 < 0 || y0 > frame_height - 1 || y1 < 0 || y1 > frame_height - 1) {
+        uint8_t *buf_ptr1 = ref_frame + y0 * pre_buf->stride + x0;
+        // Extend the border.
+        build_mc_border(buf_ptr1, pre_buf->stride, xd->mc_buf, x1 - x0 + 1,
+                        x0, y0, x1 - x0 + 1, y1 - y0 + 1, frame_width,
+                        frame_height);
+        buf_stride = x1 - x0 + 1;
+        buf_ptr = xd->mc_buf + y_pad * 3 * buf_stride + x_pad * 3;
+      }
+    } else {
+      // Wait until reference block is ready. Pad 7 more pixels as last 7
+      // pixels of each superblock row can be changed by next superblock row.
+       if (pbi->frame_parallel_decode)
+         vp9_frameworker_wait(pbi->frame_worker_owner, ref_frame_buf,
+                              (y1 + 7) << (plane == 0 ? 0 : 1));
+    }
+
+    inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
+                    subpel_y, sf, w, h, ref, kernel, xs, ys);
+  }
+}
+
+void vp9_dec_build_inter_predictors_sb(VP9Decoder *const pbi, MACROBLOCKD *xd,
+                                       int mi_row, int mi_col,
+                                       BLOCK_SIZE bsize) {
+  int plane;
+  const int mi_x = mi_col * MI_SIZE;
+  const int mi_y = mi_row * MI_SIZE;
+  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize,
+                                                        &xd->plane[plane]);
+    const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
+    const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
+    const int bw = 4 * num_4x4_w;
+    const int bh = 4 * num_4x4_h;
+
+    if (xd->mi[0]->mbmi.sb_type < BLOCK_8X8) {
+      int i = 0, x, y;
+      assert(bsize == BLOCK_8X8);
+      for (y = 0; y < num_4x4_h; ++y)
+        for (x = 0; x < num_4x4_w; ++x)
+          dec_build_inter_predictors(pbi, xd, plane, i++, bw, bh,
+                                     4 * x, 4 * y, 4, 4, mi_x, mi_y);
+    } else {
+      dec_build_inter_predictors(pbi, xd, plane, 0, bw, bh,
+                                 0, 0, bw, bh, mi_x, mi_y);
+    }
+  }
+}
--- a/vp9/decoder/vp9_decodeframe.h
+++ b/vp9/decoder/vp9_decodeframe.h
@@ -25,6 +25,9 @@ void vp9_decode_frame(struct VP9Decoder *pbi,
                      const uint8_t *data, const uint8_t *data_end,
                      const uint8_t **p_data_end);

+void vp9_dec_build_inter_predictors_sb(struct VP9Decoder *const pbi,
+                                       MACROBLOCKD *xd, int mi_row, int mi_col,
+                                       BLOCK_SIZE bsize);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -96,7 +96,7 @@ static void set_segment_id(VP9_COMMON *cm, BLOCK_SIZE bsize,

  for (y = 0; y < ymis; y++)
    for (x = 0; x < xmis; x++)
-      cm->last_frame_seg_map[mi_offset + y * cm->mi_cols + x] = segment_id;
+      cm->current_frame_seg_map[mi_offset + y * cm->mi_cols + x] = segment_id;
 }

 static int read_intra_segment_id(VP9_COMMON *const cm, MACROBLOCKD *const xd,
@@ -129,8 +129,10 @@ static int read_inter_segment_id(VP9_COMMON *const cm, MACROBLOCKD *const xd,

  predicted_segment_id = vp9_get_segment_id(cm, cm->last_frame_seg_map,
                                            bsize, mi_row, mi_col);
-  if (!seg->update_map)
+  if (!seg->update_map) {
+    set_segment_id(cm, bsize, mi_row, mi_col, predicted_segment_id);
    return predicted_segment_id;
+  }

  if (seg->temporal_update) {
    const vp9_prob pred_prob = vp9_get_pred_prob_seg_id(seg, xd);
@@ -418,11 +420,18 @@ static int read_is_inter_block(VP9_COMMON *const cm, MACROBLOCKD *const xd,
  }
 }

-static void read_inter_block_mode_info(VP9_COMMON *const cm,
+static void fpm_sync(void *const data, int mi_row) {
+  VP9Decoder *const pbi = (VP9Decoder *)data;
+  vp9_frameworker_wait(pbi->frame_worker_owner, pbi->prev_buf,
+                       mi_row << MI_BLOCK_SIZE_LOG2);
+}
+
+static void read_inter_block_mode_info(VP9Decoder *const pbi,
                                       MACROBLOCKD *const xd,
                                       const TileInfo *const tile,
                                       MODE_INFO *const mi,
                                       int mi_row, int mi_col, vp9_reader *r) {
+  VP9_COMMON *const cm = &pbi->common;
  MB_MODE_INFO *const mbmi = &mi->mbmi;
  const BLOCK_SIZE bsize = mbmi->sb_type;
  const int allow_hp = cm->allow_high_precision_mv;
@@ -436,7 +445,7 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm,
  for (ref = 0; ref < 1 + is_compound; ++ref) {
    const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
    vp9_find_mv_refs(cm, xd, tile, mi, frame, mbmi->ref_mvs[frame],
-                     mi_row, mi_col);
+                     mi_row, mi_col, fpm_sync, (void *)pbi);
  }

  inter_mode_ctx = mbmi->mode_context[mbmi->ref_frame[0]];
@@ -510,10 +519,13 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm,
  }
 }

-static void read_inter_frame_mode_info(VP9_COMMON *const cm,
+// TODO(hkuang): Pass cm instead of pbi. This requires change in
+// vp9_frameworker_wait.
+static void read_inter_frame_mode_info(VP9Decoder *const pbi,
                                       MACROBLOCKD *const xd,
                                       const TileInfo *const tile,
                                       int mi_row, int mi_col, vp9_reader *r) {
+  VP9_COMMON *const cm = &pbi->common;
  MODE_INFO *const mi = xd->mi[0];
  MB_MODE_INFO *const mbmi = &mi->mbmi;
  int inter_block;
@@ -527,16 +539,17 @@ static void read_inter_frame_mode_info(VP9_COMMON *const cm,
                               !mbmi->skip || !inter_block, r);

  if (inter_block)
-    read_inter_block_mode_info(cm, xd, tile, mi, mi_row, mi_col, r);
+    read_inter_block_mode_info(pbi, xd, tile, mi, mi_row, mi_col, r);
  else
    read_intra_block_mode_info(cm, mi, r);
 }

-void vp9_read_mode_info(VP9_COMMON *cm, MACROBLOCKD *xd,
+void vp9_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd,
                        const TileInfo *const tile,
                        int mi_row, int mi_col, vp9_reader *r) {
+  VP9_COMMON *const cm = &pbi->common;
  if (frame_is_intra_only(cm))
    read_intra_frame_mode_info(cm, xd, mi_row, mi_col, r);
  else
-    read_inter_frame_mode_info(cm, xd, tile, mi_row, mi_col, r);
+    read_inter_frame_mode_info(pbi, xd, tile, mi_row, mi_col, r);
 }
--- a/vp9/decoder/vp9_decodemv.h
+++ b/vp9/decoder/vp9_decodemv.h
@@ -11,6 +11,7 @@
 #ifndef VP9_DECODER_VP9_DECODEMV_H_
 #define VP9_DECODER_VP9_DECODEMV_H_

+#include "vp9/decoder/vp9_decoder.h"
 #include "vp9/decoder/vp9_reader.h"

 #ifdef __cplusplus
@@ -19,7 +20,7 @@ extern "C" {

 struct TileInfo;

-void vp9_read_mode_info(VP9_COMMON *cm, MACROBLOCKD *xd,
+void vp9_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd,
                        const struct TileInfo *const tile,
                        int mi_row, int mi_col, vp9_reader *r);

--- a/vp9/decoder/vp9_decoder.c
+++ b/vp9/decoder/vp9_decoder.c
@@ -26,6 +26,7 @@
 #endif
 #include "vp9/common/vp9_quant_common.h"
 #include "vp9/common/vp9_systemdependent.h"
+#include "vp9/common/vp9_thread.h"

 #include "vp9/decoder/vp9_decodeframe.h"
 #include "vp9/decoder/vp9_decoder.h"
@@ -37,12 +38,11 @@ static void initialize_dec() {

  if (!init_done) {
    vp9_init_neighbors();
-    vp9_init_quant_tables();
    init_done = 1;
  }
 }

-VP9Decoder *vp9_decoder_create() {
+VP9Decoder *vp9_decoder_create(BufferPool *const pool) {
  VP9Decoder *const pbi = vpx_memalign(32, sizeof(*pbi));
  VP9_COMMON *const cm = pbi ? &pbi->common : NULL;

@@ -58,15 +58,18 @@ VP9Decoder *vp9_decoder_create() {
  }

  cm->error.setjmp = 1;
+  pbi->need_resync = 1;
  initialize_dec();

  vp9_rtcd();

  // Initialize the references to not point to any frame buffers.
  vpx_memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
+  vpx_memset(&cm->next_ref_frame_map, -1, sizeof(cm->next_ref_frame_map));

  cm->current_video_frame = 0;
  pbi->ready_for_new_data = 1;
+  pbi->common.buffer_pool = pool;

  // vp9_init_dequantizer() is first called here. Add check in
  // frame_init_dequantizer() to avoid unnecessary calling of
@@ -77,7 +80,7 @@ VP9Decoder *vp9_decoder_create() {

  cm->error.setjmp = 0;

-  vp9_worker_init(&pbi->lf_worker);
+  vp9_get_worker_interface()->init(&pbi->lf_worker);

  return pbi;
 }
@@ -87,12 +90,12 @@ void vp9_decoder_remove(VP9Decoder *pbi) {
  int i;

  vp9_remove_common(cm);
-  vp9_worker_end(&pbi->lf_worker);
+  vp9_get_worker_interface()->end(&pbi->lf_worker);
  vpx_free(pbi->lf_worker.data1);
  vpx_free(pbi->tile_data);
  for (i = 0; i < pbi->num_tile_workers; ++i) {
    VP9Worker *const worker = &pbi->tile_workers[i];
-    vp9_worker_end(worker);
+    vp9_get_worker_interface()->end(worker);
    vpx_free(worker->data1);
    vpx_free(worker->data2);
  }
@@ -125,7 +128,7 @@ vpx_codec_err_t vp9_copy_reference_dec(VP9Decoder *pbi,
   */
  if (ref_frame_flag == VP9_LAST_FLAG) {
    const YV12_BUFFER_CONFIG *const cfg =
-        &cm->frame_bufs[cm->ref_frame_map[0]].buf;
+        &cm->buffer_pool->frame_bufs[cm->ref_frame_map[0]].buf;
    if (!equal_dimensions(cfg, sd))
      vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
                         "Incorrect buffer dimensions");
@@ -144,6 +147,7 @@ vpx_codec_err_t vp9_set_reference_dec(VP9_COMMON *cm,
                                      VP9_REFFRAME ref_frame_flag,
                                      YV12_BUFFER_CONFIG *sd) {
  RefBuffer *ref_buf = NULL;
+  RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;

  // TODO(jkoleszar): The decoder doesn't have any real knowledge of what the
  // encoder is using the frame buffers for. This is just a stub to keep the
@@ -171,11 +175,11 @@ vpx_codec_err_t vp9_set_reference_dec(VP9_COMMON *cm,
    const int free_fb = get_free_fb(cm);
    // Decrease ref_count since it will be increased again in
    // ref_cnt_fb() below.
-    cm->frame_bufs[free_fb].ref_count--;
+    --frame_bufs[free_fb].ref_count;

    // Manage the reference counters and copy image.
-    ref_cnt_fb(cm->frame_bufs, ref_fb_ptr, free_fb);
-    ref_buf->buf = &cm->frame_bufs[*ref_fb_ptr].buf;
+    ref_cnt_fb(frame_bufs, ref_fb_ptr, free_fb);
+    ref_buf->buf = &frame_bufs[*ref_fb_ptr].buf;
    vp8_yv12_copy_frame(sd, ref_buf->buf);
  }

@@ -185,11 +189,12 @@ vpx_codec_err_t vp9_set_reference_dec(VP9_COMMON *cm,

 int vp9_get_reference_dec(VP9Decoder *pbi, int index, YV12_BUFFER_CONFIG **fb) {
  VP9_COMMON *cm = &pbi->common;
+  RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;

  if (index < 0 || index >= REF_FRAMES)
    return -1;

-  *fb = &cm->frame_bufs[cm->ref_frame_map[index]].buf;
+  *fb = &frame_bufs[cm->ref_frame_map[index]].buf;
  return 0;
 }

@@ -197,21 +202,38 @@ int vp9_get_reference_dec(VP9Decoder *pbi, int index, YV12_BUFFER_CONFIG **fb) {
 static void swap_frame_buffers(VP9Decoder *pbi) {
  int ref_index = 0, mask;
  VP9_COMMON *const cm = &pbi->common;
+  BufferPool *const pool = cm->buffer_pool;
+  RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;

+  lock_buffer_pool(pool);
  for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) {
-    if (mask & 1) {
-      const int old_idx = cm->ref_frame_map[ref_index];
-      ref_cnt_fb(cm->frame_bufs, &cm->ref_frame_map[ref_index],
-                 cm->new_fb_idx);
-      if (old_idx >= 0 && cm->frame_bufs[old_idx].ref_count == 0)
-        cm->release_fb_cb(cm->cb_priv,
-                          &cm->frame_bufs[old_idx].raw_frame_buffer);
+    const int old_idx = cm->ref_frame_map[ref_index];
+    // Current thread releases the holding of reference frame.
+    decrease_ref_count(old_idx, frame_bufs, pool);
+
+    // Release the reference frame in reference map.
+    if ((mask & 1) && old_idx >= 0) {
+      decrease_ref_count(old_idx, frame_bufs, pool);
    }
+    cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index];
    ++ref_index;
  }

+  // Current thread releases the holding of reference frame.
+  for (; ref_index < REF_FRAMES && !cm->show_existing_frame; ++ref_index) {
+    const int old_idx = cm->ref_frame_map[ref_index];
+    decrease_ref_count(old_idx, frame_bufs, pool);
+    cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index];
+  }
+  unlock_buffer_pool(pool);
+  pbi->hold_ref_buf = 0;
  cm->frame_to_show = get_frame_new_buffer(cm);
-  cm->frame_bufs[cm->new_fb_idx].ref_count--;
+
+  if (!pbi->frame_parallel_decode || !cm->show_frame) {
+    lock_buffer_pool(pool);
+    --frame_bufs[cm->new_fb_idx].ref_count;
+    unlock_buffer_pool(pool);
+  }

  // Invalidate these references until the next frame starts.
  for (ref_index = 0; ref_index < 3; ref_index++)
@@ -221,9 +243,10 @@ static void swap_frame_buffers(VP9Decoder *pbi) {
 int vp9_receive_compressed_data(VP9Decoder *pbi,
                                size_t size, const uint8_t **psource) {
  VP9_COMMON *const cm = &pbi->common;
+  BufferPool *const pool = cm->buffer_pool;
+  RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
  const uint8_t *source = *psource;
  int retcode = 0;
-
  cm->error.error_code = VPX_CODEC_OK;

  if (size == 0) {
@@ -239,27 +262,63 @@ int vp9_receive_compressed_data(VP9Decoder *pbi,
      cm->frame_refs[0].buf->corrupted = 1;
  }

+  pbi->ready_for_new_data = 0;
+
  // Check if the previous frame was a frame without any references to it.
-  if (cm->new_fb_idx >= 0 && cm->frame_bufs[cm->new_fb_idx].ref_count == 0)
-    cm->release_fb_cb(cm->cb_priv,
-                      &cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer);
+  // Release frame buffer if not decoding in frame parallel mode.
+  if (!pbi->frame_parallel_decode && cm->new_fb_idx >= 0
+      && frame_bufs[cm->new_fb_idx].ref_count == 0)
+    pool->release_fb_cb(pool->cb_priv,
+                        &frame_bufs[cm->new_fb_idx].raw_frame_buffer);
  cm->new_fb_idx = get_free_fb(cm);

+  pbi->hold_ref_buf = 0;
+  if (pbi->frame_parallel_decode) {
+    VP9Worker *const worker = pbi->frame_worker_owner;
+    vp9_frameworker_lock_stats(worker);
+    frame_bufs[cm->new_fb_idx].frame_worker_owner = worker;
+    // Reset decoding progress.
+    pbi->cur_buf = &frame_bufs[cm->new_fb_idx];
+    pbi->cur_buf->row = -1;
+    pbi->cur_buf->col = -1;
+    vp9_frameworker_unlock_stats(worker);
+  } else {
+    pbi->cur_buf = &frame_bufs[cm->new_fb_idx];
+  }
+
  if (setjmp(cm->error.jmp)) {
    cm->error.setjmp = 0;
+    pbi->ready_for_new_data = 1;

-    // We do not know if the missing frame(s) was supposed to update
-    // any of the reference buffers, but we act conservative and
-    // mark only the last buffer as corrupted.
-    //
-    // TODO(jkoleszar): Error concealment is undefined and non-normative
-    // at this point, but if it becomes so, [0] may not always be the correct
-    // thing to do here.
-    if (cm->frame_refs[0].idx != INT_MAX)
-      cm->frame_refs[0].buf->corrupted = 1;
+    lock_buffer_pool(pool);
+    // Release all the reference buffers if worker thread is holding them.
+    if (pbi->hold_ref_buf == 1) {
+      int ref_index = 0, mask;
+      VP9_COMMON *const cm = &pbi->common;
+      BufferPool *const pool = cm->buffer_pool;
+      RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+      for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) {
+        const int old_idx = cm->ref_frame_map[ref_index];
+        // Current thread releases the holding of reference frame.
+        decrease_ref_count(old_idx, frame_bufs, pool);

-    if (cm->frame_bufs[cm->new_fb_idx].ref_count > 0)
-      cm->frame_bufs[cm->new_fb_idx].ref_count--;
+        // Release the reference frame in reference map.
+        if ((mask & 1) && old_idx >= 0) {
+          decrease_ref_count(old_idx, frame_bufs, pool);
+        }
+        ++ref_index;
+      }
+
+      // Current thread releases the holding of reference frame.
+      for (; ref_index < REF_FRAMES && !cm->show_existing_frame; ++ref_index) {
+        const int old_idx = cm->ref_frame_map[ref_index];
+        decrease_ref_count(old_idx, frame_bufs, pool);
+      }
+      pbi->hold_ref_buf = 0;
+    }
+    // Release current frame.
+    decrease_ref_count(cm->new_fb_idx, frame_bufs, pool);
+    unlock_buffer_pool(pool);

    return -1;
  }
@@ -272,20 +331,39 @@ int vp9_receive_compressed_data(VP9Decoder *pbi,

  vp9_clear_system_state();

-  cm->last_width = cm->width;
-  cm->last_height = cm->height;
-
  if (!cm->show_existing_frame)
    cm->last_show_frame = cm->show_frame;
-  if (cm->show_frame) {
-    if (!cm->show_existing_frame)
-      vp9_swap_mi_and_prev_mi(cm);

-    cm->current_video_frame++;
+  // Update progress in frame parallel decode.
+  if (pbi->frame_parallel_decode) {
+    // Need to lock the mutex here as another thread may
+    // be accessing this buffer.
+    VP9Worker *const worker = pbi->frame_worker_owner;
+    FrameWorkerData *const frame_worker_data = worker->data1;
+    vp9_frameworker_lock_stats(worker);
+
+    if (cm->show_frame) {
+      if (!cm->show_existing_frame)
+        vp9_swap_mi_and_prev_mi(cm);
+      cm->current_video_frame++;
+    }
+    vp9_swap_current_and_last_seg_map(cm);
+    frame_worker_data->frame_decoded = 1;
+    frame_worker_data->frame_context_ready = 1;
+    vp9_frameworker_signal_stats(worker);
+    vp9_frameworker_unlock_stats(worker);
+  } else {
+    cm->last_width = cm->width;
+    cm->last_height = cm->height;
+    if (cm->show_frame) {
+      if (!cm->show_existing_frame)
+        vp9_swap_mi_and_prev_mi(cm);
+      cm->current_video_frame++;
+    }
+
+    vp9_swap_current_and_last_seg_map(cm);
  }

-  pbi->ready_for_new_data = 0;
-
  cm->error.setjmp = 0;
  return retcode;
 }
@@ -300,12 +378,12 @@ int vp9_get_raw_frame(VP9Decoder *pbi, YV12_BUFFER_CONFIG *sd,
  if (pbi->ready_for_new_data == 1)
    return ret;

+  pbi->ready_for_new_data = 1;
+
  /* no raw frame to show!!! */
  if (pbi->common.show_frame == 0)
    return ret;

-  pbi->ready_for_new_data = 1;
-
 #if CONFIG_VP9_POSTPROC
  ret = vp9_post_proc_frame(&pbi->common, sd, flags);
 #else
--- a/vp9/decoder/vp9_decoder.h
+++ b/vp9/decoder/vp9_decoder.h
@@ -18,10 +18,10 @@

 #include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/common/vp9_ppflags.h"
+#include "vp9/common/vp9_thread.h"

 #include "vp9/decoder/vp9_decoder.h"
 #include "vp9/decoder/vp9_dthread.h"
-#include "vp9/decoder/vp9_thread.h"

 #ifdef __cplusplus
 extern "C" {
@@ -43,6 +43,14 @@ typedef struct VP9Decoder {

  int refresh_frame_flags;

+  int frame_parallel_decode;  // frame-based threading.
+
+  // TODO(hkuang): Combine this with cur_buf in macroblockd as they are
+  // the same.
+  RefCntBuffer *cur_buf;   //  Current decoding frame buffer.
+  RefCntBuffer *prev_buf;  //  Previous decoding frame buffer.
+
+  VP9Worker *frame_worker_owner;   // frame_worker that owns this pbi.
  VP9Worker lf_worker;
  VP9Worker *tile_workers;
  int num_tile_workers;
@@ -57,6 +65,8 @@ typedef struct VP9Decoder {

  int max_threads;
  int inv_tile_order;
+  int need_resync;  // wait for key/intra-only frame.
+  int hold_ref_buf;  // hold the reference buffer.
 } VP9Decoder;

 int vp9_receive_compressed_data(struct VP9Decoder *pbi,
@@ -76,10 +86,25 @@ vpx_codec_err_t vp9_set_reference_dec(VP9_COMMON *cm,
 int vp9_get_reference_dec(struct VP9Decoder *pbi,
                          int index, YV12_BUFFER_CONFIG **fb);

-struct VP9Decoder *vp9_decoder_create();
+struct VP9Decoder *vp9_decoder_create(BufferPool *const pool);

 void vp9_decoder_remove(struct VP9Decoder *pbi);

+static INLINE void decrease_ref_count(int idx, RefCntBuffer *const frame_bufs,
+                                      BufferPool *const pool) {
+  if (idx >= 0) {
+    --frame_bufs[idx].ref_count;
+    // A worker may only get a free framebuffer index when calling get_free_fb.
+    // But the private buffer is not set up until finish decoding header.
+    // So any error happens during decoding header, the frame_bufs will not
+    // have valid priv buffer.
+    if (frame_bufs[idx].ref_count == 0 &&
+        frame_bufs[idx].raw_frame_buffer.priv) {
+      pool->release_fb_cb(pool->cb_priv, &frame_bufs[idx].raw_frame_buffer);
+    }
+  }
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
--- a/Show More
+++ b/Show More