Compare commits

...

385 Commits

Author SHA1 Message Date
hui su
be407e001b Adjust optimize_b RD parameters
Coding gain
lowres 0.51%
midres 0.36%

Change-Id: I1e9f2f9341bad12d9023f97c73d0e991ae5ec7f0
2016-05-06 09:56:59 -07:00
hui su
5af706210b Enable optimize_b for intra blocks
Coding gain
lowres 0.13%
midres 0.36%

Change-Id: Ia65d2d7513685dfcffbe3939fea473fb6d7036b9
2016-05-06 09:55:45 -07:00
Geza Lore
6d9bec4366 Configure tiles in tests when using row-tile.
With row-tile enabled, the encoder test driver needs to configure the
decoder in order to decode all tiles correctly.

Change-Id: I8b00a766cf5e41255625846f92fd71915c614ec1
2016-05-03 14:42:17 -07:00
Debargha Mukherjee
7a2934550b Merge "Quantization Profiles Strictly on Entropy Context" into nextgen 2016-05-02 19:15:13 +00:00
Brandon Young
43195061b7 Quantization Profiles Strictly on Entropy Context
Allow for 3 quant profiles from entropy context

Refactored dq_offset bands to allow for re-optimization based on number
of quantization profiles

Change-Id: Ib8d7e8854ad4e0bf8745038df28833d91efcfbea
2016-05-01 12:25:57 -07:00
Sarah Parker
b8bf8085be Merge "Search parameter space around gm parameters" into nextgen 2016-04-26 22:08:20 +00:00
Sarah Parker
31257e5406 Merge "Make feature based motion estimation the default" into nextgen 2016-04-22 01:21:09 +00:00
Sarah Parker
ba6423c564 Make feature based motion estimation the default
Optical flow parameters still need to be tweaked and it is much slower,
so feature-based should be the default for now.

Change-Id: Id6cafb5a245e329f728e9c66c89c0ed1018c347c
2016-04-21 11:50:05 -07:00
Sarah Parker
70266ab130 Increase precision for gm parameters
Change-Id: I151ca8ec9b0c8a920745f4b1665d369a258f6ccb
2016-04-21 11:48:13 -07:00
Sarah Parker
7fa7f4c14a Fix compiler warnings in opticalflow.c
Change-Id: I4561c2676d8a7793ade47e1995e026ba9d521fdd
2016-04-20 17:27:13 -07:00
Debargha Mukherjee
3e3f40acb9 Fix for libwebm upgrade
To allow runborgs to run on the nextgen branch

Change-Id: Icbd425c5e65dd4bcb061a3a1ed4dbe0393dc2c5d
2016-04-19 10:27:42 -07:00
Sarah Parker
de085787ec Search parameter space around gm parameters
For each global motion parameter, search some step
size to the left and right to try and minimize warp
mse and correct for error.

Change-Id: I1f0b464b0924d56b15460e509f89736af9b7e78f
2016-04-13 17:56:25 -07:00
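The search described in this commit amounts to a small coordinate descent around the initial global-motion parameters. A minimal Python sketch of the idea, assuming a generic error function; the names, step sizes, and iteration counts are illustrative and not the patch's actual code:

```python
def refine_params(params, error_fn, step=0.1, iters=3):
    """Probe each parameter a step to the left and right, keeping
    whichever value lowers the warp error; shrink the step each pass."""
    params = list(params)
    best = error_fn(params)
    for _ in range(iters):
        for i in range(len(params)):
            for delta in (-step, step):
                trial = list(params)
                trial[i] += delta
                err = error_fn(trial)
                if err < best:          # keep the probe only if it helps
                    best, params = err, trial
        step /= 2                       # narrow the search window
    return params, best

# Toy error surface standing in for warp MSE; minimum at (1.0, 0.5).
err = lambda p: (p[0] - 1.0) ** 2 + (p[1] - 0.5) ** 2
p, e = refine_params([0.8, 0.7], err)
```

In the real encoder the error function would be the warp MSE of the motion-compensated prediction, which is far more expensive than this toy surface.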
Brandon Young
50619bacfd Fix error with cumbins to allow multiple profiles
Change-Id: I23aadc8f7551771197b55465a3264250b40838ff
2016-04-08 13:16:34 -07:00
Debargha Mukherjee
72ee991598 Adds function to return superblock entropy context
For use with switchable quant profiles.

Change-Id: I2a4aa93204f086094457860d1646707d71f33f5e
2016-03-15 14:27:45 -07:00
Debargha Mukherjee
a399281daa Merge "Framework to incorporate switchable dequant levels" into nextgen 2016-03-14 16:34:04 +00:00
Debargha Mukherjee
b8a7d1fe02 Framework to incorporate switchable dequant levels
Changed experiment to allow switchable QUANT_PROFILES

Change-Id: I8e5e76239a4103273e9ef759d46400104ef55599
2016-03-11 23:07:26 -08:00
Geza Lore
d8739359a1 Make vpxenc --test-decode work with --enable-row-tile
Change-Id: I352c8ccdaa5faa4fd345f0cb0f7e614af669f0bc
2016-02-29 14:14:16 +00:00
Debargha Mukherjee
f90038c3a1 Remove key_frame_tile config option
Change-Id: I8ae22e9f706727ee258dc24780644b99087c8d23
2016-02-23 13:07:09 -08:00
Debargha Mukherjee
fb6bf251d4 Merge key_frame_tile experiment with row_tile
Since they are used together, there is no need to keep them separate.

Change-Id: Ia7b4bec06ca6924fffd806f772638224292fddb2
2016-02-23 09:57:25 -08:00
Yunqing Wang
efb93f518f Merge "Implement a tile copying method in large-scale tile coding" into nextgen 2016-02-05 23:10:08 +00:00
Yunqing Wang
6e4dff92a4 Implement a tile copying method in large-scale tile coding
A tile copy mode is introduced, which allows a tile to use
another tile's coded data directly at the bitstream level. This
largely reduces the bit rate in this use case. Our tests
showed that a 10% - 20% bit rate reduction was achieved.

Change-Id: Icf5ae00320e27193b15ce95297720f8b6f5e7fd9
2016-02-01 11:08:30 -08:00
Sarah Parker
36cbc27587 Merge "Optical flow for computing global motion params" into nextgen 2016-01-25 22:06:32 +00:00
Yunqing Wang
990e30b8a7 Merge "Make set_reference control API work in VP9" into nextgen 2016-01-25 19:27:42 +00:00
Yunqing Wang
bcdc6befd2 Make set_reference control API work in VP9
1. Made VP8_SET_REFERENCE vpx_codec_control API work in VP9 based
on Ryan Overbeck's patch. (Thanks.)
2. Added vp9cx_set_ref example, which demonstrated how to set
VP8_SET_REFERENCE vpx_codec_control in encoder and decoder. If
we only set_reference in the encoder, the encoder/decoder
mismatch would be observed.
3. Also updated test/cx_set_ref.sh.

Change-Id: I433a6220616947ca8c73e65a5fb3694751ea84b6
2016-01-25 10:37:15 -08:00
Sarah Parker
a7aab42915 Optical flow for computing global motion params
Addressing comments, fixing style issues

Change-Id: I91b72ab5cdf80d68476858f442616ab3af41e709
2016-01-22 11:29:33 -08:00
Debargha Mukherjee
b765dc8e39 Merge "Changes to CONFIG_NEW_QUANT experiment." into nextgen 2016-01-22 00:15:05 +00:00
Brandon Young
9c9beb1c40 Changes to CONFIG_NEW_QUANT experiment.
Added dq_off_index attribute to mbmi to allow for switching between
dequantization modes.
Reduced number of different dequantization modes from 5 to 3.
Changed dequant_val_nuq to allow for 3 dequant levels instead of 1.
Fixed lint errors

Change-Id: I7aee3548011aa4eee18adb09d835051c3108d2ee
2016-01-19 15:59:50 -08:00
Debargha Mukherjee
c20adb59b2 Merge "Rework wedge experiment to improve speed." into nextgen 2016-01-14 16:13:38 +00:00
Brandon Young
839ab586f6 Merge "Simplify new_quant experiment" into nextgen 2016-01-14 00:51:56 +00:00
Debargha Mukherjee
f93bdebcf9 Rework wedge experiment to improve speed.
Implements wedge mask generation by pre-computing arrays.
Improves encode speed by 15-20%.

Also consolidates the mask generation code for inter-inter
and inter-intra.

Change-Id: If1d4ae2babb2c05bd50cc30affce22785fd59f95
2016-01-12 15:23:54 -08:00
Yunqing Wang
15a90668a5 Adaptively determine the number of bytes used for tile-data size transmission
In large-scale tile coding, when the number of tiles is large and the tile
size is small, using a fixed number of bytes in the tile header to store
tile-data size information, as done in the current VP9 codec, would incur
high overhead for each tile. This patch implemented 2 ways to lower that
overhead and adaptively determine the number of bytes needed for tile-data
size transmission.

The test on a test clip having the tile size of 64x64 showed that the number
of bytes used for storing tile-data size was reduced from 4 to 1, which
substantially improved the compression ratio in large-scale tile coding.

Change-Id: Ia02fc43eda67fa252fbc2554321957790f53f6fd
2016-01-11 12:46:04 -08:00
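The byte-count arithmetic behind this adaptation can be sketched as follows. This is an illustrative model of the idea (minimal byte count chosen per tile-data size, little-endian here for simplicity), not the patch's actual bitstream syntax:

```python
def bytes_needed(size):
    """Smallest byte count (1..4) that can represent a tile-data size."""
    n = 1
    while size >= (1 << (8 * n)) and n < 4:
        n += 1
    return n

def write_tile_size(size, n):
    """Encode `size` in exactly n bytes (little-endian for illustration)."""
    return bytes((size >> (8 * i)) & 0xFF for i in range(n))

def read_tile_size(buf, n):
    """Decode an n-byte tile-data size."""
    return sum(buf[i] << (8 * i) for i in range(n))
```

For a 64x64 tile whose coded payload is, say, 200 bytes, one byte suffices instead of the fixed four, matching the 4-to-1 reduction reported above.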
Brandon Young
604b909e06 Simplify new_quant experiment
Changes number of non-uniform bins from 5 to 3.
Changed all references to 'KNOTES' to 'KNOTS'.

Change-Id: If39d7cef00dbe07307035e44cb29194d0783b475
2015-12-17 12:25:27 -08:00
Debargha Mukherjee
e6790e30c5 Replace DST1 with DST2 for ext-tx experiment
A small gain (0.1 - 0.2%) with this experiment on derflr/hevcmr.

The DST2 can be implemented very efficiently using sign flipping
of odd-indexed inputs, followed by a DCT, followed by reversal of
the output. This is how it is implemented in this patch.
SIMD optimization is pending.

Change-Id: Ic2fc211ce0e6b7c6702974d76d6573f55cc4da0e
2015-12-14 13:54:41 -08:00
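The sign-flip/DCT/reversal identity described in this commit can be checked numerically. A floating-point sketch with naive unnormalized transforms (the codec itself uses integer transforms, so this is only a demonstration of the identity):

```python
import math

def dct2(x):
    """Naive (unnormalized) DCT-II."""
    N = len(x)
    return [sum(x[n] * math.cos(math.pi * (2 * n + 1) * k / (2 * N))
                for n in range(N)) for k in range(N)]

def dst2(x):
    """Naive (unnormalized) DST-II, output indices k = 1..N."""
    N = len(x)
    return [sum(x[n] * math.sin(math.pi * (2 * n + 1) * k / (2 * N))
                for n in range(N)) for k in range(1, N + 1)]

def dst2_via_dct(x):
    """DST-II computed as: flip the sign of odd-indexed inputs,
    take the DCT-II, then reverse the output order."""
    flipped = [v if n % 2 == 0 else -v for n, v in enumerate(x)]
    return dct2(flipped)[::-1]
```

The two paths agree because sin(pi*(2n+1)*(N-k)/(2N)) = (-1)^n * cos(pi*(2n+1)*k/(2N)), so sign-flipping the odd inputs and reversing the DCT output reproduces the DST-II exactly.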
Debargha Mukherjee
d7eb423a72 Merge "SSSE3 optimisations of high bit depth masked variance functions" into nextgen 2015-12-08 21:54:03 +00:00
Peter de Rivaz
087a93eae5 Fix to avoid overflow in highbd_masked_variance64.
Change-Id: I98731c5f50c4821c604f53480f43110a75205d25
2015-12-08 11:12:35 -08:00
Julia Robson
711600e5f1 SSSE3 optimisations of high bit depth masked variance functions
Includes tests which compare output of new SSSE3 functions with
their C equivalents, and fixes to the C code to ensure these tests
pass.

Change-Id: Iec3980cce95a8ee6bf9421fa4793130e92c162e3
2015-12-04 11:59:30 -08:00
Julia Robson
997595e8a6 Code refactoring to remove duplication in masked sub pixel var fns
Change-Id: Idcc93ff94ed2c68367225b6bebfcae0b7952f074
2015-12-03 17:57:35 +00:00
Yunqing Wang
9227a372d3 Merge "Reduce the memset call in tile decoding" into nextgen 2015-12-02 00:24:06 +00:00
Julia Robson
ea167a5855 Adding SSSE3 accelerations of masked SAD functions
Includes tests of masked SAD function optimisations against C
versions

Change-Id: I42f198767a113b58ae9456841f4ec71075591720
2015-12-01 09:55:24 -08:00
Julia Robson
8e4d779137 SSSE3 optimisations of masked variance function (8bit ONLY)
Includes test which compares output of new SSSE3 functions with their
C equivalents

Change-Id: I4488cd7672cdb57efff93c0b3b8bff07f07ec544
2015-12-01 12:07:22 +00:00
Julia Robson
ef01ea152d Changes so other expts work with 128x128 coding unit size expt
Changes ensure wedge_partition, interintra and palette expts all
work alongside the ext_coding_unit_size experiment.

Change-Id: I18f17acb29071f6fc6784e815661c73cc21144d6
2015-11-26 17:43:00 +00:00
Yunqing Wang
7099feaeed Reduce the memset call in tile decoding
When the error resilient mode is on, the decoder resets the mode info
structure to zero once per frame. This makes the decoder about 10x slower
if we decode a single tile at a time. This patch resolves the issue by only
memsetting the mode info of the decoded tiles. Currently, to decode a frame,
tile decoding is less than 2x slower than frame decoding.

Change-Id: Ia3fd88d91a4e74e7bbbc6547d87c24d085a1533e
2015-11-25 15:28:03 -08:00
Julia Robson
d52cc33891 Changes to allow 128x128 to work with copy_mode
Change-Id: I1fff46e7733e5e1d15c012d6a204536243c6e090
2015-11-24 10:15:10 -08:00
Peter de Rivaz
d79850e60c Bug fix for high bitdepth using flipped transforms.
Fixes mismatch and performance drop.

Change-Id: Ib99711eb3b78257a8105073e2b6d7031459357bb
2015-11-23 12:04:55 -08:00
Geza Lore
85ab9d56cc Flip the result of the inv transform for FLIPADST.
This is a port of 4f5108090a6047d5d4d9ce1df302da23b2ef4bc5

This commit also fixes a bug where FLIPADST transforms, when combined
with a DST (that is, FLIPADST_DST and DST_FLIPADST), did not actually do
a flipped transform but a straight ADST instead. This was because the C
implementation they fell back on did not implement flipping. This is
now fixed as well, and FLIPADST_DST and DST_FLIPADST do what they are
supposed to do.

There are 3 functions in the SR_MODE experiment that should be updated,
but given that the build of SR_MODE is broken at the upstream tip of
nextgen, I could not test these, so I have put in assertions and FIXME
notes at the problematic places.

Change-Id: I5b8175b85f944f2369b183a26256e08d97f4bdef
2015-11-17 14:31:28 -08:00
Debargha Mukherjee
f1f3a8ab14 Increase extension for ext-coding-unit-size expt
Change-Id: Ifc7fe5df6831cb7190b9b6e84125de13b9e0146f
2015-11-16 16:57:50 -08:00
Debargha Mukherjee
484c8858f0 Remove last4-ref config option
Change-Id: I12ab2f8b039e328210ae8eadc663f751e19c0c09
2015-11-13 16:22:28 -08:00
Debargha Mukherjee
0c29af2767 Merge last4_ref with multi-ref experiment
Change-Id: I9778670ff6d02fb021099b17f292edadf87603e6
2015-11-13 14:54:38 -08:00
Debargha Mukherjee
ec6d31781d Merge "Remove dst1 config option and merge with ext-tx" into nextgen 2015-11-13 21:01:21 +00:00
Debargha Mukherjee
3a45a1edfd Remove dst1 config option and merge with ext-tx
Change-Id: I0152ed352ae2a0a725a508b5c209ef2c1dc2302d
2015-11-13 11:24:38 -08:00
Julia Robson
3d9133b2a5 SSE2 optim of vp9_subtract_block for 128x128 units
Extending the SSE2 implementation of vp9_subtract_block to work
with the 128x128 coding unit experiment

Change-Id: Ib3cc16bf5801ef2c7eecc19d3cc07a8c50631580
2015-11-13 11:12:56 -08:00
Debargha Mukherjee
3436acb347 Merge "Adding encoder support for 128x128 coding units" into nextgen 2015-11-13 18:52:09 +00:00
Debargha Mukherjee
a542190830 Merge "Eliminate copying for FLIPADST in fwd transforms." into nextgen 2015-11-13 18:23:49 +00:00
Geza Lore
177ad11981 Eliminate copying for FLIPADST in fwd transforms.
This is a port of 01bb4a318dc0f9069264b7fd5641bc3014f47f32

This commit also fixes a bug where FLIPADST transforms, when combined
with a DST (that is, FLIPADST_DST and DST_FLIPADST), did not actually do
a flipped transform but a straight ADST instead. This was because the C
implementation they fell back on did not implement flipping. This is
now fixed as well, and FLIPADST_DST and DST_FLIPADST do what they are
supposed to do.

Change-Id: I89c67ca1d5e06808a1567c51e7d6bec4998182bd
2015-11-13 09:34:26 -08:00
Debargha Mukherjee
59de0c0bc7 Adding encoder support for 128x128 coding units
Changes to allow the encoder to make use of 128x128 coding units.

Change-Id: I340bd38f9d9750cb6346d83885efb00443852910
2015-11-13 09:21:22 -08:00
Debargha Mukherjee
9d9962aec9 Some fixes on context size for the 128x128 expt
Change-Id: I56f050502e3a750ce74b196d033b780218df2c1f
2015-11-13 07:06:19 -08:00
Johann Koenig
a0c13e6757 Merge "Cherry pick the rest of 661802, the important part" into nextgen 2015-11-13 00:47:05 +00:00
Julia Robson
d90a3265f0 Changes to use defined constants rather than hard-coded numbers
Also fixes a valgrind error when optimizations are disabled.
Done in preparation for the work on the extended coding unit size
experiment.

Change-Id: Ib074c5a02c94ebed7dd61ff0465d26fa89834545
2015-11-12 15:42:32 -08:00
Johann
a877c6e9a6 Cherry pick the rest of 661802, the important part
Change-Id: I85f1d2c07b89c874ea6c30df32dda9ecaa8d2c3f
2015-11-12 15:41:24 -08:00
Debargha Mukherjee
bd7a34d5a3 Merge "Fixing issue with calculation of block_idx" into nextgen 2015-11-12 23:39:35 +00:00
Johann
26272dd366 Cherry pick 661802 to fix 64bit arm build
Remove default cortex-a8 tuning.

Probably not even the dominant platform the library is being built for.
Add --cpu= option description to help. The option already exists.

Don't allow passing just --cpu as a no-op.

BUG=826

Change-Id: Iaa3f4f693ec78b18927b159b480daafeba0549c0
2015-11-12 14:51:53 -08:00
Julia Robson
598a11d04a Fixing issue with calculation of block_idx
For tall rectangular blocks, the block_idx of the lower transform
block was being mis-calculated.

Does not affect results the way this function is being used now.

Change-Id: I470464d19be0bf0f42003d0cc29793bc42db8f52
2015-11-12 08:21:53 -08:00
Julia Robson
84a5403bab Added decoder support for 128x128 coding units
Change-Id: Icf3c6b64caaf8a86cd27231aa27baf5fd99c0fde
2015-11-02 16:03:30 +00:00
Julia Robson
2a1f8c74aa Changes to use defined constants rather than hard-coded numbers
These changes have been made in preparation for the work on the
extended coding unit size experiment.

Change-Id: I83f289812426bb9aba6c4a5fedd2b0e0a4fe17cb
2015-11-02 16:02:55 +00:00
Sarah Parker
42abac9374 Fixed final style issues
WIP.

Change-Id: Iafcbcfdc2139e77eb2c6849a52a9dc94ea498d66
2015-10-27 18:39:54 -07:00
Debargha Mukherjee
9ef4023569 Remove some unused variables
Change-Id: I3ab263b4c42cc3bfd598a1fc280fbaffba2d4461
2015-10-15 12:50:15 -07:00
Julia Robson
dff4e683fd Added extended coding unit size experiment
Change-Id: I45e2efe22c8e2f23e3305d00906bc08229a85c17
2015-10-07 16:54:10 +01:00
Debargha Mukherjee
9fc51184b7 Merge "Speed up of wedge search" into nextgen 2015-10-06 00:33:50 +00:00
Debargha Mukherjee
597204298a Speed up of wedge search
Speeds up wedge search by pre-calculating single predictions
before computing the wedge combination.

About 20% speed up achieved.

Change-Id: I72f76568559d1899c8ac0afaa133d433ba388e6d
2015-10-04 23:43:25 -07:00
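The speed-up described here comes from hoisting the two single predictions out of the per-wedge loop, so only a cheap blend runs per candidate mask. An illustrative sketch assuming a libvpx-style 6-bit (0..64) blend weight; the function names and flat pixel lists are simplifications, not the patch's code:

```python
def blend(p0, p1, mask):
    """Per-pixel 6-bit wedge blend: out = (m*p0 + (64-m)*p1 + 32) >> 6."""
    return [(m * a + (64 - m) * b + 32) >> 6
            for a, b, m in zip(p0, p1, mask)]

def wedge_search(p0, p1, src, masks):
    """p0 and p1 are computed once up front; the loop over candidate
    wedge masks only pays for the blend and the SSE."""
    best_idx, best_sse = None, float("inf")
    for idx, mask in enumerate(masks):
        pred = blend(p0, p1, mask)
        sse = sum((s - p) ** 2 for s, p in zip(src, pred))
        if sse < best_sse:
            best_idx, best_sse = idx, sse
    return best_idx, best_sse
```

Without the hoisting, each candidate mask would redo both motion-compensated predictions, which dominates the cost of the search.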
Debargha Mukherjee
ff6a66a0b2 Merge "tx64 prob tweaks." into nextgen 2015-10-03 17:59:54 +00:00
Debargha Mukherjee
b80a04bd63 tx64 prob tweaks.
A little improvement in results on hevchd

Change-Id: Ib71a57c4bec34bf688e1d53dbf73eb4525e7805b
2015-10-02 11:44:02 -07:00
Debargha Mukherjee
ff9aa146cb Reimplementation of dst1 for speed
Encoder with --enable-ext-tx --enable-dst1 is now 4 times faster.

Change-Id: Ia750ad3516698ce94da4ceb566b1c51539537a95
2015-10-02 11:06:55 -07:00
Debargha Mukherjee
4f371ac4f2 Merge "64x64 idct fix." into nextgen 2015-09-25 23:36:42 +00:00
Zoe Liu
90dede3f5b Merge "Merged LAST2 and LAST3 to one experiment MULTI_REF" into nextgen 2015-09-25 16:47:28 +00:00
Debargha Mukherjee
5a25d567d7 64x64 idct fix.
Change-Id: If0e0cd7cbe71e9586657f5e8ffa87dcdebc686ba
2015-09-25 05:54:23 -07:00
Debargha Mukherjee
1e2cbf515e Merge "tx64x64 experiment fix for high-bitdepth" into nextgen 2015-09-25 03:22:54 +00:00
Zoe Liu
e8e58402f0 Merged LAST2 and LAST3 to one experiment MULTI_REF
Change-Id: I220be17af317520dccb62fa6b19da5c7ce10652d
2015-09-24 17:08:15 -07:00
Zoe Liu
a9900eb2b1 Merge "Added another reference frame LAST4_FRAME" into nextgen 2015-09-24 15:58:22 +00:00
Debargha Mukherjee
cdcebfba29 tx64x64 experiment fix for high-bitdepth
Change-Id: Ia8d769b43bad0f9ad0684ecf6925e580339c7397
2015-09-24 05:45:03 -07:00
Zoe Liu
829dbf7a79 Added another reference frame LAST4_FRAME
Under the experiment of CONFIG_LAST4_REF. On derflr testset, using
highbitdepth (HBD), in average PSNR,

(1) LAST2+LAST3+LAST4 obtained +0.361% against LAST2+LAST3;
(2) LAST2+LAST3+LAST4 obtained +1.567% against baseline.

Change-Id: Ic8b14272de6a569df2b54418fa72b505e1ed3aad
2015-09-23 17:10:44 -07:00
hui su
71d0af90f8 Adjust rd calculation in choose_tx_size_from_rd
Change-Id: I3649f28196a87663b116b9fe6446b1fbe6eeab4a
2015-09-23 14:37:45 -07:00
Zoe Liu
411c490bc3 Improved LAST3's single ref prob context design a little
On the derflr testset, using 12-bit HighBitDepth mode, this CL obtained a
small gain of +0.031% by turning on LAST2+LAST3.

Change-Id: Ib6c9d595e56269634bf29d684eabcd806fc08cc9
2015-09-21 15:03:43 -07:00
Zoe Liu
9144967ca8 Fixed a couple of bugs for LAST3
Change-Id: I63126a844c255df4a447aac7f630ba54cc7d7d7a
2015-09-21 11:50:35 -07:00
Zoe Liu
c0889b9a8c Added a 3rd reference frame LAST3_FRAME
Under experiment CONFIG_LAST3_REF, which can only be turned on when
the experiment of CONFIG_MULTI_REF is on, i.e. LAST3_FRAME can only
be used when LAST2_FRAME is used. CONFIG_LAST3_REF would most likely
be combined with CONFIG_MULTI_REF once the performance improvement
is further confirmed.

On the testset of derflr, using Average PSNR metrics, with HighBitDepth
(HBD) on:

(1) LAST2 HBD obtained +0.579% against base HBD;
(2) LAST2 + LAST3 HBD obtained +0.591% against LAST2 HBD;
(3) LAST2 + LAST3 HBD obtained +1.173% against base HBD.

Change-Id: I1aa2b2e2d2c9834e5f8e61bf2d8818c7b1516669
2015-09-18 15:25:46 -07:00
Debargha Mukherjee
b6d5b0838b ext-tx extension to intra blocks
derflr: improves to 1.692%

Change-Id: Idf583216b40ed02526b9b39031eaf2fb72fed11d
2015-09-17 14:21:24 -07:00
Debargha Mukherjee
c104998b61 Merge "Redo DST1 in the ext-tx experiment" into nextgen 2015-09-16 18:38:03 +00:00
Debargha Mukherjee
4dbaf9a5ab Redo DST1 in the ext-tx experiment
Moved from nextgenv2 branch to test with other experiments.

derflr: +1.629%

Change-Id: Ie7c720053ed8b628177679c4351bb31b54716a71
2015-09-16 09:46:13 -07:00
Zoe Liu
9b0635fc75 Fixed a bug on the number of MAX_MODES in baseline
All the MAX_MODES numbers had been changed assuming
CONFIG_MULTI_REF. Correct numbers are now in place both with and
without the MULTI_REF experiment enabled.

Change-Id: I70ffe2f1a89fa572d612dd3d311d3af19fe3a632
2015-09-15 14:12:12 -07:00
Zoe Liu
f48a159430 Added more LAST2 modes for INTERINTRA
Turning on all the other experiments, compared the RD performance
between with and without the use of LAST2_FRAME, on derflr testset,
on Average PSNR:

8-bit: +0.653% (All positive except one,
max: mobile_cif: 2.019%; min: paris_cif: -0.081%)
12-bit HBD: +0.735% (All positive,
max: bridge_far_cif: 2.416%; min: bowing_cif: 0.132%)

Change-Id: Ia0a375667e228c8ba3d2e223abff608206f2f545
2015-09-15 11:57:18 -07:00
Zoe Liu
ec8864a8bf Added MACRO for reference frame encoding
This CL introduces a few macros plus code cleaning on the encoding of
the reference frames. Coding performance remains unchanged.

For the encoding of either the compound reference or the single reference
case, since each bit has different contexts, the tree structure cannot be
applied to treat the combined bits as one symbol. As a next step, we may
explore sharing the same context for all the bits so that a tree
structure can be used.

Change-Id: I6916ae53c66be1a0b23e6273811c0139515484df
2015-09-11 14:57:31 -07:00
Zoe Liu
897192be43 Added one more reference frame LAST2_FRAME
Under the experiment CONFIG_MULTI_REF. Current version shows
LAST2 vs base in nextgen on the testset of derflr:

(1) 8-bit: Average PSNR +0.53%
(worst: students_cif: -0.247%; best: mobile_cif: 1.902%)
(2) 12-bit HBD: Average PSNR +0.63%
(worst: pamphlet_cif: -0.213%, best: mobile_cif: 2.101%)

More tuning on the reference frame context design and default
probs is being conducted. This version is not guaranteed to
work with other experiments in nextgen. A separate CL will address
working with all other experiments.

Change-Id: I7f40d2522517afc26ca389c995bad56989587f65
2015-09-09 14:27:05 -07:00
Shunyao Li
2de18d1fd2 Super resolution mode (+CONFIG_SR_MODE)
CONFIG_SR_MODE=1, enable SR mode
USE_POST_F=1, enable SR post filter
SR_USE_MULTI_F=1, enable SR post filter family
Not compatible with other experiments yet

Change-Id: I116f1d898cc2ff7dd114d7379664304907afe0ec
2015-08-31 15:29:39 -07:00
Shunyao Li
c7d886d96a Add transform size rate for intra skip mode in rdopt
Change-Id: I81fedd99cd39c12b66b93b786cb43234c867b84b
2015-08-31 11:28:27 -07:00
Debargha Mukherjee
9c685602d0 Make tests work with new configurations
Disables some test vector tests when Vp8/Vp9 decoders are disabled
in configuration. Also moves some macros to the vpx level in
line with recent refactoring on the master branch.

Change-Id: Iaac8008992110398ae096c36b9726f723164c207
2015-08-27 14:05:59 -07:00
Debargha Mukherjee
4525961d80 Some tweaks to probabilities for ext-tx with dst1
derflr: up to 1.429% from a little less than 1.3% when
--enable-dst1 is also enabled with --enable-ext-tx.

Change-Id: I301229f2239b18acb96accc4fc44b64fa6927ace
2015-08-12 14:31:31 -07:00
hui su
d5eaca7fee code cleanup in encode_block_intra
Change-Id: I376b7e9b243178d79141a96e0aeafcbc15758e97
2015-07-30 15:28:51 -07:00
Cheng Chen
cc4d523d9f Resolve bug of DST1 in ext_tx experiment.
Change-Id: I828569e3596f9b9e8487aec7c4056e66cf1fc1f2
2015-07-28 10:58:14 -07:00
Debargha Mukherjee
23690fc5d1 Adds support for DST1 transforms for inter blocks
Adds an additional transform in the ext_tx experiment that
is a 2d DST1-DST1 combination.

To enable use --enable-ext-tx --enable-dst1.

This needs to be later extended to combine DST1 with DCT
or ADST.

Change-Id: I6d29f1b778ef8294bcfb6a512a78fc5eda20723b
2015-07-24 16:23:09 -07:00
Shunyao Li
188087202c Speed up of supertx
Limited the prediction extension to 8 pixels at each edge
Fixed a bug in the combination of wedge prediction and supertx

~10% speed up in decoder
derflr:     -0.004
derflr+hbd: +0.002
hevcmr:     +0.015

Change-Id: I777518896894a612c9704d3de0e7902bf498b0ea
2015-07-24 11:19:19 -07:00
Debargha Mukherjee
4b57a8b356 Add extended transforms for 32x32 and 64x64
Framework for alternate transforms for inter 32x32 and larger, based
on a dwt-dct hybrid, is implemented.
Further experiments are to be conducted with different
variations of hybrid dct/dwt or plain dwt, as well as a super-resolution
mode.

Change-Id: I9a2bf49ba317e7668002cf1499211d7da6fa14ad
2015-07-23 18:01:22 -07:00
Yunqing Wang
7959dd012c Nextgen branch cleanup: remove elevate_newmv_thresh feature
Code cleanup.

Change-Id: Idf00d5359e5f3d943ee39b4a00f8d40325c0f8b3
2015-07-14 14:28:56 -07:00
Yunqing Wang
527f88316c Nextgen branch cleanup: remove real-time speed features
Removed inter_mode_mask and max_intra_bsize.

Change-Id: I835d31f95af64a524b2fdb685d1d09a6b0ad5c9f
2015-07-14 14:26:17 -07:00
Yunqing Wang
6c2035744a Merge "Nextgen branch cleanup: add back some SVC functions" into nextgen 2015-07-08 23:26:35 +00:00
Yunqing Wang
da013eed3f Nextgen branch cleanup: add back some SVC functions
Several SVC functions were added back to ensure Borg test build.

Change-Id: I2cff676407fa74e2255606094ff12bd840287b6d
2015-07-08 14:13:57 -07:00
Zoe Liu
08effeaad5 A small code cleanup for the encoder workflow.
Change-Id: I543a69ad5a34cd9a0671b57131cafc541d595d9a
2015-07-08 12:12:47 -07:00
Alex Converse
7932eb014e Merge "intrabc: Allow odd pel displacement for non-444" into nextgen 2015-07-08 04:06:24 +00:00
Shunyao Li
e8885d4a50 Merge "Optimize bilateral filter to improve speed" into nextgen 2015-07-07 20:17:13 +00:00
Yunqing Wang
ca42973ea2 Nextgen branch cleanup: remove svc code
Spatial/temporal svc code was removed.

Change-Id: Ie25c7a58ee5feb662d4de526406d8cd834d19977
2015-07-06 21:19:05 -07:00
Yunqing Wang
020293588d Nextgen branch cleanup: remove unused functions
Removed unused real-time functions.

Change-Id: I0cbcee67fb8f495f87c6330e04ed6d56ed8e2625
2015-07-06 12:07:33 -07:00
Yunqing Wang
cc6dc0b7d9 Nextgen branch cleanup: remove nonrd code
Code cleanup.

Change-Id: I4d8d7fa2fc61a58b819c9a18bf25cda53e3fd88c
2015-07-06 10:03:23 -07:00
Alex Converse
5241acf6e2 intrabc: Allow odd pel displacement for non-444
intrabc:
screen_content: -0.618 derflr: +0.015

intrabc+tx_skip+palette:
screen_content: -0.124 derflr: -0.048

Change-Id: Iabea4be19dce2f6fdab8f639e585a424c90c81b4
2015-06-30 10:11:46 -07:00
Shunyao Li
dac589d6d4 Optimize bilateral filter to improve speed
Optimization of bilateral filter:
1) Pre-calculate the bilateral filters at all the
levels at the initialization.
2) Convert 1D matrix to 2D matrix, avoid too many
multiplications in the bilateral filter loop.
3) Fix a bug in "loop_bilateral_filter_highbd".
The right-shifted range index can be larger than 255.

Change-Id: I42f6177e896706948e2403bd9edce46e3eb5cbf8
2015-06-29 14:48:59 -07:00
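Point 1 above (pre-calculating the filters) and the fix in point 3 (a range index that can exceed 255) can be sketched like this. A hypothetical bilateral range-weight table; `sigma_r` and the table layout are illustrative assumptions, not the patch's data structures:

```python
import math

def build_bilateral_lut(sigma_r, max_diff=255):
    """Precompute range weights once at initialization instead of
    calling exp() per pixel; index is the absolute intensity diff."""
    return [math.exp(-(d * d) / (2.0 * sigma_r * sigma_r))
            for d in range(max_diff + 1)]

def range_weight(lut, diff):
    # Clamping mirrors the bug fix above: an index derived from
    # (possibly shifted) pixel differences must not run past the table.
    d = min(abs(diff), len(lut) - 1)
    return lut[d]
```

The filter loop then reduces to table lookups and multiplies, which is the bulk of the speed-up claimed in points 1 and 2.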
Alex Converse
a4f4ed60cf Merge "intrabc: displacement vector prediction" into nextgen 2015-06-25 20:55:21 +00:00
Alex Converse
93cd5cf3f4 intrabc: displacement vector prediction
Predict displacement vector with the same logic as NEARESTMV. If no
neighbors are available fall back to the current one block left or up
prediction.

vp9+intrabc+tx_skip+palette: -0.489
vp9+intrabc: -0.771

Change-Id: If67d08b54f1a3b847cf7ab8c7b800c55baa1a86b
2015-06-24 10:02:50 -07:00
Peter de Rivaz
fbfeb03f02 Fix for copy-mode with segmentation
Change-Id: Icfc97826e494b856aa80fbefe4811dff99f3a0b4
2015-06-24 09:25:23 +01:00
Debargha Mukherjee
567e4c754f Merge "Removing some compiler warnings" into nextgen 2015-06-23 19:13:09 +00:00
Peter de Rivaz
269f6e2cb1 Added support for extended partitions with copy_mode
Change-Id: I815374bd89e3faf2bac1b4155dbbe7366f0a1d29
2015-06-23 11:09:34 -07:00
hui su
ffcd76e328 Merge "Fix a bug induced by qctx-tprobs experiment" into nextgen 2015-06-23 17:54:35 +00:00
hui su
2f1325049a Merge "Bug fix in tx-skip" into nextgen 2015-06-23 17:42:41 +00:00
hui su
591b03a915 Fix a bug induced by qctx-tprobs experiment
Change-Id: I8d31b8e6037464c36b514fddccd5d7edee024a47
2015-06-23 10:20:27 -07:00
Debargha Mukherjee
3a18fc6784 Removing some compiler warnings
Change-Id: I572594a9c00ba5a2b8000d3b4c86ec5757aa586c
2015-06-22 15:26:25 -07:00
Peter de Rivaz
bad321d9e4 Added palette support to extended partitions
Change-Id: Ide11bd3fcc9fb26b27ae39cdf216a2c32b853348
2015-06-22 20:34:26 +01:00
Julia Robson
84d0da63d0 Palette high bit depth functionality
Changes to allow high bit depth and palette to be enabled at the
same time by using a 16-bit (instead of 8-bit) palette when high
bit depth is enabled and modifying related functions accordingly.

Change-Id: I97d30b4d9338d3a51db02c94bc568eba60a8905d
2015-06-22 18:53:34 +01:00
hui su
c96afa256f Merge "Add q-index as context for initial token probs" into nextgen 2015-06-22 17:44:26 +00:00
Debargha Mukherjee
1c96a0bb09 Merge "Fix for supertx decode with segments" into nextgen 2015-06-22 17:28:31 +00:00
Debargha Mukherjee
adc9ed5a87 Merge "Fix supertx HORZ and VERT partitioning." into nextgen 2015-06-22 17:11:39 +00:00
hui su
315351e9de Bug fix in tx-skip
This patch avoids using tx_size larger than 16x16 in lossless mode.
Big block quantization (32x32 or larger) is not lossless.

Change-Id: I69cd84d4f3fd06d641048d6096da1bfde18ad24e
2015-06-22 09:54:42 -07:00
hui su
5963fd35dd Add q-index as context for initial token probs
There are 4 entropy tables to select for initial entropy table,
depending on the frame base q-index. The entropy tables are
trained with derf, yt, and stdhd sets. About 0.2% gain on
the following test sets:

derflr       0.227%
yt           0.277%
stdhd        0.233%
hevclr       0.221%
hevcmr       0.155%
hevchr       0.182%

Change-Id: I3fde846c47fc020e80c814897690b4cda1da569c

Change-Id: I460408372586c823974f945ed9fd8dcb0360fbaf
2015-06-21 13:09:57 -07:00
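The selection described above can be modeled as a simple threshold lookup on the frame's base q-index. The thresholds below are made up for illustration; they are not the trained boundaries from this patch:

```python
# Hypothetical split points dividing the 0..255 q-index range into
# 4 contexts; the patch's trained boundaries are not reproduced here.
QCTX_THRESHOLDS = (64, 128, 192)

def qctx(base_qindex):
    """Map a frame's base q-index (0..255) to one of 4 entropy-table
    indices by scanning the threshold list."""
    for ctx, t in enumerate(QCTX_THRESHOLDS):
        if base_qindex < t:
            return ctx
    return len(QCTX_THRESHOLDS)
```

Each context would then select its own trained initial token-probability table at frame setup.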
Peter de Rivaz
3b09cc1945 Fix for supertx decode with segments
Change-Id: I5e07b0b9fafd2578ecfacf20fb70b07fd6977d98
2015-06-18 18:16:07 +00:00
Peter de Rivaz
bdbae2520a Fix supertx HORZ and VERT partitioning.
Change-Id: I9a44d70dc2514baf679025335badc2744eeb4b88
2015-06-17 22:02:58 -07:00
Peter de Rivaz
100fff09b6 Added ext_partitions to supertx
Change-Id: I0e2a0253e5a8dcdb5b765836a3301299409e9d0a
2015-06-17 20:28:32 +00:00
Peter de Rivaz
3a3fb8d100 Fix seg fault in count_segs_sb with ext_partitions
Change-Id: I609f429006e7d2b849fe0a509f5d9f5123ecf0ec
2015-06-17 08:25:58 +00:00
Pieter Kapsenberg
d0e9499f94 Fixing skip simplification with SuperTX experiment
The combination of MISC_ENTROPY and SUPERTX resulted in a compilation error

Change-Id: Ibcf28216712d93f00f38f3786fce8fcdca3f835e
2015-06-15 16:50:10 -07:00
hui su
0809dc09cb Merge "Fix tx-skip warnings on Windows" into nextgen 2015-06-15 19:41:24 +00:00
Alex Converse
a046bece1e Merge changes I9d80152b,I4e13da4b into nextgen
* changes:
  intrabc: Add a frame level flag
  intrabc: Remove NEWDV from the intra mode tree.
2015-06-15 16:57:05 +00:00
Debargha Mukherjee
ec8b957efe Merge "Added tests for high bit depth quantize functions" into nextgen 2015-06-12 20:59:16 +00:00
Debargha Mukherjee
dcb3d7f398 Bug fix in highbd psnr computation with postproc
Change-Id: I6944a5fc38d6a89e1bb739938c6ecae7ec731b8d
2015-06-12 11:57:40 -07:00
Peter de Rivaz
3bf31c4c98 Allow extended partition support to encode
Change-Id: I75246e2ee35a1b7c1ad46669c464e582e3a9961c
2015-06-11 11:52:45 -07:00
Debargha Mukherjee
e0617385d6 Merge "Add extended partition support to decode" into nextgen 2015-06-11 18:47:29 +00:00
hui su
c80386a5ab Fix tx-skip warnings on Windows
Change-Id: I2a515abe20e6989de233fc8ae2c31d8ee462add2
2015-06-11 11:20:16 -07:00
hui su
0c8bcb43a0 Merge "Fix palette build on windows" into nextgen 2015-06-11 17:56:26 +00:00
hui su
5d132c799e Fix palette build on windows
Change-Id: I2a90e235ab5bdf95eb83f684e46ef844553ba629
2015-06-11 10:44:05 -07:00
Peter de Rivaz
31d17053f3 Add extended partition support to decode
Change-Id: I65f7aed37d65dbba76999d2b9585129c48e70b11
2015-06-11 09:37:00 -07:00
Peter de Rivaz
999fabc183 Added extended partition experiment
Change-Id: Ife89c95f04212e9dceb4c4e663280a85b5718c13
2015-06-11 09:37:00 -07:00
Debargha Mukherjee
db90b74c1a Merge "Fix copy mode bug: searches made outside of the current tile" into nextgen 2015-06-11 16:30:23 +00:00
Jack Haughton
286983b8c6 Fix copy mode bug: searches made outside of the current tile
Change-Id: Ib3cf4bf4da0a575053493609956e133d1ce028a1
2015-06-11 09:23:32 +01:00
Debargha Mukherjee
ee5f9c7181 Merge "Fix to loopfilter tests" into nextgen 2015-06-11 05:30:50 +00:00
Debargha Mukherjee
278de7b73a Merge "Fix cost array overrun bug in search_bilateral_level()" into nextgen 2015-06-11 05:28:43 +00:00
Alex Converse
c3e884f5be intrabc: Add a frame level flag
For the combination of this and removing NEWDV from the tree:
derflr: -0.101 screen_content: +0.053

The bulk of the decline in screen content efficiency is from the liquify
clip. This should be recoverable by further entropy tweaks.

Change-Id: I9d80152b8492e60a0367c31797fb6932fb09bba9
2015-06-09 11:23:23 -07:00
Debargha Mukherjee
36eec6a9b1 Merge "Fix masked_variance overflow for non-highbitdepth" into nextgen 2015-06-09 17:07:13 +00:00
Jack Haughton
fa4b1ca714 Fix cost array overrun bug in search_bilateral_level()
Change-Id: I9b7b727cb45328040a82477a2ce8eb3d15feec77
2015-06-09 14:21:55 +01:00
Julia Robson
ece7479208 Fix to loopfilter tests
Ensuring the random numbers used by the loopfilter tests
are in a valid range

Change-Id: If8130145c413dbe7e29180046eca3e4f7d941706
2015-06-09 13:09:09 +01:00
Debargha Mukherjee
b433dd4443 Adds wavelet transforms + hybrid dct/dwt variants
The wavelets implemented are 2/6, 5/3 and 9/7, each with
a lifting-based scheme for even block sizes. The 9/7
one is currently a double-precision implementation.

This is to start experiments with:
1. Replacing large transforms (32x32 and 64x64) with wavelets
or wavelet-dct hybrids that can hopefully localize errors better
spatially. (Will also need alternate entropy coder)
2. Super-resolution modes where the higher sub-bands may be
selectively skipped from being conveyed, while a smart
reconstruction recovers the lost frequencies.

The current patch includes two types of 32x32 and 64x64
transforms: one where only wavelets are used, and another
where a single level wavelet decomposition is followed
by a lower resolution dct on the low-low band.

Change-Id: I2d6755c4e6c8ec9386a04633dacbe0de3b0043ec
2015-06-08 23:30:38 -07:00
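The lifting-based scheme mentioned in the commit above can be illustrated with a minimal 1-D integer 5/3 (LeGall) lifting step. This is a hedged sketch for illustration only, not the nextgen implementation: the function names `fwd_53`/`inv_53`, the `mirror` boundary helper, and the rounding choices are all assumptions.

```c
#include <assert.h>
#include <string.h>

/* Illustrative sketch only, not the nextgen implementation: a 1-D integer
 * 5/3 (LeGall) lifting step for an even-length signal. Boundaries use
 * whole-sample symmetric extension via index mirroring. Arithmetic right
 * shift on negative values is assumed. */
static int mirror(int i, int n) {
  if (i < 0) i = -i;              /* reflect off the left edge */
  if (i >= n) i = 2 * n - 2 - i;  /* reflect off the right edge */
  return i;
}

static void fwd_53(const int *x, int n, int *lo, int *hi) {
  const int half = n / 2;
  int i;
  for (i = 0; i < half; ++i)  /* predict: highpass = odd - avg of even neighbors */
    hi[i] = x[2 * i + 1] - ((x[mirror(2 * i, n)] + x[mirror(2 * i + 2, n)]) >> 1);
  for (i = 0; i < half; ++i)  /* update: lowpass = even + rounded correction */
    lo[i] = x[2 * i] + ((hi[mirror(i - 1, half)] + hi[i] + 2) >> 2);
}

static void inv_53(const int *lo, const int *hi, int n, int *x) {
  const int half = n / 2;
  int i;
  for (i = 0; i < half; ++i)  /* undo update to recover the even samples */
    x[2 * i] = lo[i] - ((hi[mirror(i - 1, half)] + hi[i] + 2) >> 2);
  for (i = 0; i < half; ++i)  /* undo predict to recover the odd samples */
    x[2 * i + 1] = hi[i] + ((x[mirror(2 * i, n)] + x[mirror(2 * i + 2, n)]) >> 1);
}
```

Because the inverse applies exactly the inverse lifting steps with the same integer rounding, reconstruction is bit-exact, which is what makes lifting attractive for in-loop coding.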
Jack Haughton
2799fad98c Fix masked_variance overflow for non-highbitdepth
Change-Id: I668b034111ccc877c70cf65035a9c748fe00aeba
2015-06-08 12:09:48 -07:00
Debargha Mukherjee
5a69abc66b Merge "Windows build fix" into nextgen 2015-06-05 17:40:06 +00:00
Debargha Mukherjee
ad2cf9b444 Windows build fix
Change-Id: I663e0d61c4115a5297baeb25f39743e1eea95fbf
2015-06-05 10:24:24 -07:00
hui su
507857786d Fix lossless mode in tx-skip experiment
Change-Id: I205fa8614a9079ce7e6ca87d7b08fe3602bbd376
2015-06-04 15:13:37 -07:00
Julia Robson
33199783ea Added tests for high bit depth quantize functions
This was originally part of change-id:Idef18f90b111a0d0c9546543d3347e551908fd78
but the rest of that patch has previously been incorporated into nextgen
without these tests.

Change-Id: I6ac491ed1cfc153e0ed3cb999c76feac9d5e74e3
2015-06-04 16:00:39 +01:00
Debargha Mukherjee
b8a793458a Fix lossless mode with new_quant experiment
Change-Id: Ia0b00746e3f7aacf8d9488db3522ab72d9dc22fd
2015-06-04 02:45:54 -07:00
Alex Converse
1f537e3818 intrabc: Remove NEWDV from the intra mode tree.
Change-Id: I4e13da4bf70386f6f70ea733fb9adeb41682c735
2015-06-03 13:26:03 -07:00
Alex Converse
f35fb46c48 intrabc: Fix mismatch introduced by 85a7485.
Change-Id: I1ea51ebbd1e3cb60a13a30e3252253493c28baec
2015-06-03 13:07:49 -07:00
Alex Converse
783d18d22a Merge "intrabc: Include NEWDV in mode_2_counter" into nextgen 2015-06-03 16:09:36 +00:00
Debargha Mukherjee
d6ff3b3b22 Merge "Implied skip is 0 for intra blocks" into nextgen 2015-06-02 21:10:18 +00:00
Alex Converse
7944249b80 Merge "intrabc: Add odd-pel support for 444." into nextgen 2015-06-02 19:03:15 +00:00
Alex Converse
8cbed23cdf intrabc: Include NEWDV in mode_2_counter
This fixes a failure to decode as VP9 with frame level signalling
patched in and removed to implicitly force it off.

This is primarily a correctness fix, but there were small coding gains.
0.074% smaller on derflr
0.180% smaller on screen_content

Change-Id: I4cc264bf4d9016201924be198a0c5b8374020fd9
2015-06-02 11:01:00 -07:00
Pieter Kapsenberg
ec1e91847c Implied skip is 0 for intra blocks
Skip flag is removed from the bitstream for all blocks coded in intra
mode. Very minor coding gain in derf and stdhd sets (0.048% and 0.1%)

Change-Id: I79f03300f16d6fa84ce54405cafecab8a021cd7d
2015-06-01 18:27:34 -07:00
Jingning Han
b71176b477 Fix the upper limit of row/column tile index
If the decoder is configured to decode a tile indexed above the
upper limit, the internal codec will clip the index to the upper
limit.

Change-Id: Icbc1bb7b14069ac009e0a2042dd66a46d6f76679
2015-05-29 09:12:05 -07:00
Jingning Han
f5660af165 Allow vpxdec to produce target tile reconstruction
This commit allows vpxdec to produce the target tile
reconstruction as its output. When the provided tile index is -1,
all of the corresponding row/column tiles will be reconstructed.
If the tile index is over the upper limit, the decoder will decode
the left-most/bottom-most tile.

Change-Id: I4c18bdb32099f736f99b8842f7f177a32b3fee09
2015-05-28 23:20:24 -07:00
Jingning Han
89747d09cc Enable selective key frame tile decoding
This commit allows the decoder to decode selective tiles according
to decoder configuration settings. To decode a single tile out of
the provided key frame bit-stream (test_kf.webm), set compiler
configuration:

--enable-experimental --enable-row-tile --enable-key-frame-tile

use the command:

vpxdec -o test_dec.y4m test_kf.webm --tile-row=1 --tile-column=2

where the tile's row and column indexes are 1 and 2, respectively.

To decode all row tiles inside a provided column index, use:
--tile-row=-1 --tile-column=2

To decode all column tiles inside a provided row index, use:
--tile-row=2 --tile-column=-1

Change-Id: Ib73c266414dcee7eaab5d741b90d0058970dae56
2015-05-28 16:10:13 -07:00
Alex Converse
85a7485137 intrabc: Add odd-pel support for 444.
Odd pel still needs more testing on 4:2:0.

0.2% smaller on the PNG set. 4095 better, 1324 worse.

Change-Id: Idf2f108a96462c7696e63d533605e4a926a8b203
2015-05-28 15:44:15 -07:00
Debargha Mukherjee
fe79548bc8 Merge "Intrabc high bit depth functionality" into nextgen 2015-05-28 22:40:38 +00:00
Debargha Mukherjee
8f82bc172c Merge "Turn on global rotation-zoom warping prediction" into nextgen 2015-05-28 22:07:29 +00:00
Julia Robson
153d824207 Intrabc high bit depth functionality
Fixing compile errors when intrabc and vp9_highbitdepth are both enabled.
The error arose because intrabc uses vp9_setup_scale_factors_for_frame()
which takes an additional argument (use_highbitdepth) when
vp9_highbitdepth is enabled.

Change-Id: I6a15b09dcea0d35525f4c25efb6848804ae5cfab
2015-05-28 12:05:37 +01:00
Alex Converse
f80879d452 Merge "palette: don't repeat calc_dist for j = 0" into nextgen 2015-05-28 01:55:05 +00:00
Jingning Han
30fb5831da Merge "Fix a memory overflow issue in large scale tile use case" into nextgen 2015-05-28 01:53:56 +00:00
Jingning Han
d8d30cfb9b Merge "Configure selective tile decoding" into nextgen 2015-05-28 01:53:38 +00:00
Alex Converse
3df4a571a4 palette: don't repeat calc_dist for j = 0
Change-Id: I043c3b9696dbfa76436763ef871600cb54ade212
2015-05-27 11:45:00 -07:00
Debargha Mukherjee
5930a90cc9 Turn on global rotation-zoom warping prediction
With this patch, the ZEROMV mode is overloaded to represent
a single global dominant motion using one of three models:
1. True zero translation motion (as before)
2. A translation motion different from 0
3. A Rotation-zoom affine model where the predictor is warped
The actual model used is indicated at the frame level for
each reference frame.
A metric that computes the ratio of the error with a global
non-zero model to the error for zero motion, is used to
determine on the encoder side whether to use one of the two
non-zero models or not.

Change-Id: I1f3d235b8860e543191237024a89041ff3aad689
2015-05-27 11:33:33 -07:00
Jingning Han
ceda343d0b Fix a memory overflow issue in large scale tile use case
Increase the tok_count array size defined in VP9_COMP to avoid
out of boundary memory access.

Change-Id: Ib277c1102cb5f8355b1d69991532a94e24a2242e
2015-05-27 11:18:06 -07:00
hui su
c4f04808eb Merge "Fixed a bug in palette expt for 444 video format" into nextgen 2015-05-27 18:14:16 +00:00
hui su
f3f2645f24 Fixed a bug in palette expt for 444 video format
The bug was that the palette color map was not correctly recorded.

Change-Id: I237494d1d6449c53c5d24eed8c25dcf2f8d21e66
2015-05-27 10:53:17 -07:00
Jingning Han
3b6901730f Merge "Make the tile coding syntax support large scale tile decoding" into nextgen 2015-05-27 16:40:49 +00:00
Jingning Han
f00bf10de0 Configure selective tile decoding
Add decoder control to extract single tile from key frame.

Change-Id: Id018bf267d748bfd96a6261055de4194f632190c
2015-05-27 09:22:34 -07:00
Alex Converse
57fefc1d73 Merge "intrabc: Fix probs of illegal modes." into nextgen 2015-05-27 01:03:02 +00:00
Jingning Han
f7a39d7f08 Make the tile coding syntax support large scale tile decoding
This commit makes the bit-stream syntax support fast selective tile
decoding in a large scale tile array. It reduces the computational
complexity of computing the target tile offset in the bit-stream
from quadratic to linear scale, while maintaining a relatively small
stack space requirement (on the order of 1024 bytes instead of 1M
bytes). The overhead cost due to tile separation remains identical.

Change-Id: Id60c6915733d33a627f49e167c57d2534e70aa96
2015-05-26 12:54:00 -07:00
Alex Converse
1748a59f83 intrabc: Fix probs of illegal modes.
ibc: derflr -0.069 screen_content -0.085
ibc+tx+pal: derflr +0.017 screen_content -0.029

Change-Id: I79c0ca0a5c38c53afd209286dbcd78873e070098
2015-05-26 12:51:53 -07:00
Jingning Han
7099fd12bd Fix assertion conditions for tile coding
Allow up to 1024 row/col tiles instead of 4 row tiles and 64
column tiles.

Change-Id: I1702254e241401821f3aaab1645f6df6dbc48b8d
2015-05-26 16:38:15 +00:00
Debargha Mukherjee
a14e3616b3 Merge "Loop_postfilter high bit depth functionality" into nextgen 2015-05-25 08:04:04 +00:00
Zoe Liu
98f6f54d93 Merge "Refined the mv ref candidate selection" into nextgen 2015-05-23 00:00:15 +00:00
Zoe Liu
e84c1d24bc Refined the mv ref candidate selection
This is ongoing work on reworking the motion vector
references for all inter coding modes. Currently the implementation
for sub8x8 and BLOCK_8X8 is done. More work will be added along the way.

Essential ideas include:

(1) Added new nearestmv algorithm through adaptive median search, out of
the four nearest neighbors: TOP, LEFT, TOPLEFT, and TOPRIGHT;

(2) Added a new scheme for sub8x8 to obtain the mv ref candidates;
specifically, mv ref candidates are obtained depending on the sub8x8 mode
of the current block as well as the sub8x8 mode of the neighboring
block;

(3) Added the top right corner mv ref candidate whenever it is available.
Adding the top right mv ref has shown potential in helping such
video clips as bridge_far_cif.

Change-Id: I573c04cd346ed7010f4ad87a6eaa6bab6e2caf9c
2015-05-22 16:06:45 -07:00
Julia Robson
650e12dd63 Loop_postfilter high bit depth functionality
Adding a high bit depth version of the function loop_bilateral_filter()
and ensuring the appropriate version of this function is called when
vp9_highbitdepth is enabled.

Change-Id: I7a78029c733a9a79b3b2f39af0de0824239ad5a3
2015-05-22 10:34:44 -07:00
hui su
8806e25215 Code cleanup in tx-skip expt
No effect on performance

Change-Id: I493da0d4d8fb3a7d95f07574b800cb6c85705229
2015-05-22 10:20:58 -07:00
Pieter Kapsenberg
70ddd92d9a Removing skip inference with all 0 TX
If a non-skipped block has all transform blocks with only 0 data, then
the decoder infers the skip flag. This affects the loopfilter. No real
encoder would do this though, so it is pointless. Also, it causes headaches
in HW implementations as the loop filter cannot proceed until all TX blocks
in the block have been checked. There could be up to 768 of them in
64x64 4:4:4 with 4x4 transform.

Change-Id: I45a021d1f27ca7feefed2242605777e70ce7cabd
2015-05-21 23:03:16 +00:00
Jingning Han
c238d6cd72 Merge "Enable arbitrary tile size support" into nextgen 2015-05-21 22:56:59 +00:00
Alex Converse
feb3a14e44 Merge "intrabc: Cleanup." into nextgen 2015-05-21 22:27:56 +00:00
Alex Converse
afea20e94c Merge "intrabc: Fix costing for tx_skip." into nextgen 2015-05-21 22:27:47 +00:00
Alex Converse
57f6a6d225 Merge "intrabc: Don't double account for skip" into nextgen 2015-05-21 22:27:40 +00:00
Jingning Han
bae6229884 Merge "Make internal codec support arbitrary tile size" into nextgen 2015-05-21 22:24:28 +00:00
Jingning Han
73ca749ddb Merge "Refactor internal tile_info variables to support 1024 tiles" into nextgen 2015-05-21 22:24:05 +00:00
Jingning Han
ad5ede6e36 Merge "Support up to 64 row tile coding" into nextgen 2015-05-21 22:23:42 +00:00
Jingning Han
c604b9a86c Enable arbitrary tile size support
This commit allows the encoder to process tile coding per 64x64
block. The supported upper limit of tile resolution is the minimum
of frame size and 4096 in each dimension. To turn on, set
--experiment --row-tile
and compile.

It overwrites the old --tile-columns and --tile-rows configurations.
These two parameters now tell the encoder the width and height of
a tile in units of 64x64 blocks. For example,
--tile-columns=1 --tile-rows=1
will make each tile contain a single 64x64 block.

Change-Id: Id515749a05cfeb9e9d008291b76bdfb720de0948
2015-05-21 14:57:05 -07:00
Jingning Han
a0d950345b Merge "Support up to 1024 row/column tiles in bit-stream syntax" into nextgen 2015-05-21 19:50:12 +00:00
Jingning Han
a3a3fee09f Merge "Set row and col tile number upper limit as 1024" into nextgen 2015-05-21 19:49:59 +00:00
Alex Converse
1110f09385 intrabc: Cleanup.
Change-Id: I71ad705e579a62563af8282846dbda522c3c971e
2015-05-21 12:00:05 -07:00
Alex Converse
32bec3f0ec intrabc: Fix costing for tx_skip.
ibc+tx+pal: derflr: -0.033 screen_content: -0.145

Change-Id: I446ef1c890eb9afa12454e53b24dc0ef6a80b02b
2015-05-21 12:00:05 -07:00
Alex Converse
5e6ab9935e intrabc: Don't double account for skip
ibc: derflr +0.007 screen_content -0.025
ibc+tx+pal: derflr -0.208 screen_content +0.003

Change-Id: I5e46e84fd545f3810ae7a5d13dc3618e1b077f76
2015-05-21 12:00:05 -07:00
Jingning Han
98339250c1 Make internal codec support arbitrary tile size
This commit allows the internal codec to handle arbitrary tile sizes
in units of 64x64 pixel blocks.

Change-Id: I3ad24de392064645bebab887c94e1db957794916
2015-05-21 09:20:31 -07:00
Debargha Mukherjee
d43544137b Rename NEAR_FORNEW to NEW2
Change-Id: I2928b0d28dcbf9c6b705d3ebf20550aeec9b99b3
2015-05-20 17:31:20 -07:00
Jingning Han
225097a7ed Refactor internal tile_info variables to support 1024 tiles
Move the 2D tile info arrays to global variables. This resolves
the local function stack overflow issue due to excessively large
tile info variables. This allows the internal operation to support
up to 1024 row and column tiles.

Change-Id: I6644cc929e5d3a778a5c03a712ebfc0b8729f576
2015-05-20 10:19:01 -07:00
Jingning Han
8e3099aa2b Support up to 64 row tile coding
This commit allows the codec to use up to 64 row tiles (optionally
in combination with up to 64 column tiles per row tile). The
minimum tile size is set to a 256x256 pixel block.

Change-Id: I811ca93f0c5eba41e190f6c7c0f064d1083f530f
2015-05-19 17:27:05 -07:00
Jingning Han
97adfb32bd Support up to 1024 row/column tiles in bit-stream syntax
Add syntax support to allow up to 1024 row/column tiles
respectively.

Change-Id: I84278589364b658d6c4dd05f0771795f732ad68f
2015-05-19 17:20:36 -07:00
Jingning Han
46afda0908 Set row and col tile number upper limit as 1024
Increase the supported row and col tile numbers from 4 and 64,
respectively, to 1024 each.

Change-Id: Ic07435ff6783940a466a549a69a11ab3faf3247a
2015-05-19 17:20:36 -07:00
Jingning Han
f5445e124c Rename variables in tile info decoding
The max and min tile number reference should be used to support
both row and column tiles. This commit renames the previous col
prefix to avoid confusion.

Change-Id: I487bea43701af946b79023597a9a9a0516480380
2015-05-19 17:20:36 -07:00
Zoe Liu
2bbddcf0a5 Removed the unused modes newmvref and compound-modes
Change-Id: Ia51913d48a09a7ea7502e8c49ee0159492e58b96
2015-05-19 16:20:37 -07:00
Zoe Liu
6437c3cb6d Combined two experiments of NEWMVREF and COMPOUND_MODES to NEW_INTER
Runborgs results on derflr show consistent results between NEW_INTER
and the previous combination of NEWMVREF and COMPOUND_MODES.

Change-Id: Ieba239c4faa7f93bc5c05ad656a7a3b818b4fbfc
2015-05-19 14:04:22 -07:00
Julia Robson
0a1c529b26 Merge "Fix mismatch in handling of 8x4/4x8 blocks with supertx" into nextgen 2015-05-19 16:06:26 +00:00
Pieter Kapsenberg
418956690e Removing unnecessary syntax remapping for mc filter type
Change-Id: Iba067eb2bcf519dc5776976c3ab0a694ff3feb12
2015-05-18 13:56:56 -07:00
Julia Robson
8718262437 Fix mismatch in handling of 8x4/4x8 blocks with supertx
Test VP9/EndToEndTestLarge.EndtoEndPSNRTest/1 (422 stream) failed when
supertx enabled. This was because 4x8 and 8x4 blocks were not being
split into 4x4s during tokenization in the encoder. This patch
uses vp9_foreach_transformed_block() to fix this.

Change-Id: I1f1cb27474eb9e04347067f5c4aff1942bbea8d9
2015-05-18 11:59:51 +01:00
hui su
10834f4373 Merge "Fix a bug in tx-skip experiment" into nextgen 2015-05-18 04:31:27 +00:00
Debargha Mukherjee
e82fffd416 Row tile fix
Fixes mismatch with intrabc experiment.

Change-Id: I1d83a8aa5584fb35396351f7fae7f9365598d00f
2015-05-15 08:01:21 -07:00
Debargha Mukherjee
520e861e03 Merge "Global motion enhancements" into nextgen 2015-05-15 05:48:26 +00:00
Debargha Mukherjee
fb093a337f Global motion enhancements
Adds warping functions. Also includes some refactoring.

Change-Id: I909830650f29046edf108ddaddceb1a5e7c6c61c
2015-05-14 16:33:01 -07:00
Zoe Liu
987d44f5d0 Added a new experiment "NEW_INTER"
Down the road this experiment will combine and replace both NEWMVREF
and COMPOUND_MODES.

Change-Id: I383291f94f3e80e5cbbabab45e31c4b48669f2e5
2015-05-14 14:39:45 -07:00
hui su
4fcb86251d Fix a bug in tx-skip experiment
Fix potential mismatch.

Change-Id: I51f9241c73d70c0c38fd7e2c15f4381350b60388
2015-05-12 09:38:14 -07:00
Julia Robson
b4d8b235dd Changed tokenize to correct tx_size for 440 input
Change-Id: I41995cfa03038ec45b5f0d6c68195cfa36d58ec8
2015-05-12 09:34:04 -07:00
Zoe Liu
60b71edf2b Merge "Removed unused macro for INTERINTRA" into nextgen 2015-05-12 00:25:01 +00:00
Zoe Liu
d29384192c Removed unused macro for INTERINTRA
Change-Id: Ibdf4da969c17d4b1dff14a777ccb405763a62b75
2015-05-11 13:01:25 -07:00
Jingning Han
7a2f9bbda4 Add row tile coding support in bit-stream
Fix the row tile boundary detection issues. This allows the use of
more resources for parallel encoding/decoding when available.

Change-Id: Ifda9f66d1d7c2567dd4e0a572a99a83f179b55f9
2015-05-11 12:30:03 -07:00
Zoe Liu
9e0466d0fd Cleaned mv search code and added a few fixes on the experiments
Besides code cleanup, this patch contains the following fixes:
(1) Fixed the COMPOUND_MODES for the NEW_NEWMV mode;
(2) Fixed the joint search when the NEAR_FORNEWMV mode (in NEWMVREF)
is being evaluated;
(3) Fixed the WEDGE_PARTITION when the NEAR_FORNEWMV mode (in NEWMVREF)
is being evaluated.
(4) Adjusted the entropy probability value for NEAR_FORNEW mode.

On derflr turning on all 14 experiments (except for global-motion), the
average gain w.r.t. PSNR is +0.07%:
Maximum on bridge_far_cif: +1.02%
Minimum on hallmonitor_cif: -0.16%

Change-Id: I4c9c6ee24a981af7e655a629580641d9f9745f91
2015-05-10 23:38:44 -07:00
hui su
bada9f0b87 Merge "Optimize entropy coding of non-transform tokens" into nextgen 2015-05-08 18:18:49 +00:00
Alex Converse
d28a10870f Merge "Try non-traditional intra prediction even when spatial isn't good." into nextgen 2015-05-08 01:27:13 +00:00
hui su
00c793ee5f Optimize entropy coding of non-transform tokens
Use separate token probabilities and counters for non-transform
blocks (pixel domain). Initial probabilities are trained with screen_content
clips. On screen_content, it improves coding performance by about
2% (from +16.4% to +18.45%).

The initial probabilities are not optimized for natural videos, so this
should not be used for them. Set FOR_SCREEN_CONTENT to 0/1 to specify
whether or not to enable this patch.

Change-Id: Ifa361c94bb62aa4b783cbfa50de08c3fecae0984
2015-05-07 07:58:19 -07:00
Debargha Mukherjee
e6889b28e9 Merge "Fix a bug in copy_mode experiment" into nextgen 2015-05-07 05:10:01 +00:00
Debargha Mukherjee
5e7bc81128 Merge "Global motion continued" into nextgen 2015-05-07 05:09:34 +00:00
Yaowu Xu
d1f04fb5b2 Fix a bug in copy_mode experiment
Change-Id: I1cf7d51ba99e5b6f5cf7e0d1a5d86ce4f19046e5
2015-05-06 17:03:32 -07:00
Peter de Rivaz
d6153aa447 Added highbitdepth sse2 acceleration for quantize and block error
This is a partial cherry-pick of db7192e

Change-Id: Idef18f90b111a0d0c9546543d3347e551908fd78
2015-05-06 15:14:01 -07:00
Debargha Mukherjee
caae13d54f Global motion continued
Implements a first version of global motion where the
existing ZEROMV mode is converted to a translation only
global motion mode.
A lot of the code for supporting a rotation-zoom affine
model is also incorporated.
WIP.

Change-Id: Ia1288a8dfe82f89484d4e291780288388e56d91b
2015-05-06 14:59:38 -07:00
Alex Converse
47cd96fb49 Try non-traditional intra prediction even when spatial isn't good.
Change-Id: I3a9b94d52cc0e962d91827a9b7ca8b65e82130ca
2015-05-06 10:23:22 -07:00
Peter de Rivaz
16add99f0d Corrected optimization of 8x8 DCT code
The 8x8 DCT uses a fast version whenever possible.
There was a mistake in the checking code which
meant sometimes the fast version was used when it
was not safe to do so.

Change-Id: I154c84c9e2d836764768a11082947ca30f4b5ab7
2015-05-06 10:10:19 -07:00
Peter de Rivaz
6aed50370c Added tests for high bitdepth variance sse2 functions
Change-Id: I72735e2e07464a0f7e44968fb14a195c84a58992
2015-05-06 10:10:19 -07:00
Peter de Rivaz
ecf677ede6 Fixed idct16x16_10 highbitdepth transform
In the case when there are only non-zero coefficients
in the first 4x4 block a special routine is called.
The highbitdepth optimized version of this routine
examined the wrong positions when deciding whether
to call an assembler or C inverse transform.

Change-Id: I62da663ca11775dadb66e402e42f4a1cb1927893
2015-05-06 10:10:18 -07:00
Deb Mukherjee
963393321c Iadst transforms to use internal low precision
Change-Id: I266777d40c300bc53b45b205144520b85b0d6e58
2015-05-06 10:10:18 -07:00
Peter de Rivaz
2dad1a7c8e Added high bitdepth sse2 transform functions
Change-Id: If359f0e9a71bca9c2ba685a87a355873536bb282
2015-05-06 10:10:18 -07:00
Peter de Rivaz
2189a51891 Added sse2 acceleration for highbitdepth variance
This is a combination of:
  4a19fa6 Added sse2 acceleration for highbitdepth variance
  c6f5d3b Fix high bit depth assembly function bugs

Change-Id: I446bdf3a405e4e9d2aa633d6281d66ea0cdfd79f
2015-05-06 10:04:08 -07:00
Peter de Rivaz
41973e0e3e Refactored idct routines and headers
This change is made in preparation for a
subsequent patch which adds acceleration
for the highbitdepth transform functions.

The highbitdepth transform functions attempt
to use 16/32bit sse instructions where possible,
but fallback to using the C implementations if
potential overflow is detected.  For this reason
the dct routines are made global so they can be
called from the acceleration functions in the
subsequent patch.

Change-Id: Ia921f191bf6936ccba4f13e8461624b120c1f665
2015-05-06 09:59:20 -07:00
Peter de Rivaz
0e82cba628 Added highbitdepth sse2 SAD acceleration and tests
Change-Id: I9f09e404e3136951e5cc15bf40b915c1fe10b620
2015-05-06 09:00:53 -07:00
Yaowu Xu
846396ddda Enable build with vs2013
Change-Id: I0592b9e92c3ca45e0a81d9ce49a9f2381bec3e39
2015-05-04 14:08:52 -07:00
Zoe Liu
9b083e8271 Changed nearmv for one of the sub8x8 partitions
It is a minor change, but the essential idea is to use the mv of the
top right block as the nearmv for the bottom left partition in the
sub8x8 block. The change is under the experiment of NEWMVREF.

When all 13 experiments are on (except for INTRABC), the gain is +0.05%:
Worse on bowing_cif: -0.17%
Best on foreman_cif: +0.42%; and bridge_far_cif: +0.40%
The total 13 experiments achieved a gain of +6.97% against base.

Change-Id: I3a51d9e28b34b0943fe16a984d62bfb38304ebca
2015-04-30 22:59:32 -07:00
Alex Converse
9b638cded6 tx_skip: Avoid undefined shift behavior.
vp9_quantize_rect did illegal shifts but didn't use the results.
The shift |a << b| is unfortunately undefined if |a < 0|, but the
more verbose |a * (1 << b)| generates the same machine code.

Change-Id: I7ceac66fa20a700630cf8ed008949146b161dab4
2015-04-30 12:56:27 -07:00
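The pattern described in the commit above can be demonstrated with a small hedged sketch (the helper name `shift_left_safe` is hypothetical, not from the codebase): replacing `a << b` with `a * (1 << b)` avoids undefined behavior for negative `a` while typically compiling to the same shift instruction.

```c
#include <assert.h>

/* Hypothetical helper illustrating the fix described above: for a < 0,
 * (a << b) is undefined behavior in C, but a * (1 << b) is well defined
 * (absent overflow), and compilers emit the same shift instruction. */
static int shift_left_safe(int a, int b) {
  return a * (1 << b);
}
```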
Alex Converse
aaa50de4ca Fix integer overflows in TX skipping
Change-Id: Ic1fc0f1271065180cffcbd2906e8faac6d07d08d
2015-04-30 11:42:31 -07:00
Debargha Mukherjee
4e3cf9bdfd Merge "Consolidate common count updates" into nextgen 2015-04-29 22:08:26 +00:00
Debargha Mukherjee
735360f70e Consolidate common count updates
Cleanup - does not have any change in RD performance.

Change-Id: Iaca9c7378b294bd8c780958f5e33e697690eebfa
2015-04-29 14:12:03 -07:00
Yaowu Xu
dfa47eb64b Merge "Add a necessary include file" into nextgen 2015-04-28 23:48:50 +00:00
Yaowu Xu
9d5035ce38 Add a necessary include file
The include is necessary for --enable-coefficient-range-checking

Change-Id: I5cf7b2fca0ddd610815398b1e77f17df42f9785c
2015-04-28 15:44:16 -07:00
Yaowu Xu
5652851391 Merge changes I33c72799,I3ee8974a,I38a42ed1,Iaad3dc3c into nextgen
* changes:
  usage.dox: fix doxygen warnings in 1.8.x
  Fix doxygen warning with usage.dox
  Remove obsolete doxygen tags
  Fix comments for doxygen
2015-04-28 21:16:10 +00:00
Alex Converse
8d9c600d44 Merge "palette: Add missing consts" into nextgen 2015-04-28 19:51:16 +00:00
James Zern
368f7c806a usage.dox: fix doxygen warnings in 1.8.x
use \li to denote list items with \if.

fixes the following likely visible in <1.8.3:
usage.dox: warning: Invalid list item found
usage.dox: warning: End of list marker found without any preceding list items

Change-Id: I33c72799edf9f8866596ac8f79247050b8c75681
2015-04-28 12:07:56 -07:00
Yaowu Xu
4674152287 Fix doxygen warning with usage.dox
Change-Id: I3ee8974a66f186fb0eb15b1078a3c7b9cbf5ec80
2015-04-28 12:07:56 -07:00
Yaowu Xu
856e549027 Remove obsolete doxygen tags
Change-Id: I38a42ed1d0be4fbfce6c9f3f5d021055107933d7
2015-04-28 12:07:56 -07:00
Alex Converse
cb437f800f Merge "Refactor 4:4:4 palette selection." into nextgen 2015-04-28 19:07:41 +00:00
Yaowu Xu
b1f93f56a1 Fix comments for doxygen
Change-Id: Iaad3dc3cdd25275939b475706eb7d136a5a10174
2015-04-28 12:06:56 -07:00
Alex Converse
7ae0b65f32 palette: Add missing consts
Change-Id: I83a2e57dc5dbc328c7bfea421ffbaeb83b7ca3bd
2015-04-28 11:35:17 -07:00
Alex Converse
d0cb4e75bc Refactor 4:4:4 palette selection.
Move 444 palette selection out of vp9_rd_pick_intra_mode_sb and into
a subfunction.

Change-Id: Ib323b740318626e2a68cd3d106dbd27c8f4652a6
2015-04-28 11:21:04 -07:00
hui su
1f7b49f7cd Use uniform quantization settings for non-transform blocks
Do not treat first element (dc) differently.

on screen_content
tx-skip only: +16.4% (was +15.45%)

no significant impact on natural videos

Change-Id: I79415a9e948ebbb4a69109311c10126d8a0b96ab
2015-04-28 07:54:16 -07:00
hui su
761cd0b010 Fix bugs in palette and intrabc expt
palette expt: correctly update color buffer
intrabc expt: update zcoeff_blk so that residue coding will not
              be mistakenly skipped

Change-Id: I870f5b742c2ac394f4c871aa65e6591e293d8ef6
2015-04-27 15:27:11 -07:00
Alex Converse
98d4f09a7a Replace vp9_get_bit_depth with vp9_ceil_log2.
The current name is confusing with regard to high bit depth buffers.

Change-Id: Ieacd55ec22c81bd2f013f2e3d73a095affc93689
2015-04-23 10:26:57 -07:00
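As a hedged sketch of the semantics the rename above suggests (this is not the actual codebase function; the signature and name here are assumptions), a ceil-log2 helper returns the smallest `k` with `(1 << k) >= n`, which is unrelated to a buffer's bit depth:

```c
#include <assert.h>

/* Illustrative only: smallest k such that (1 << k) >= n. The real
 * vp9_ceil_log2() in the codebase may differ in signature and edge cases. */
static int ceil_log2(unsigned int n) {
  int k = 0;
  while ((1u << k) < n) ++k;  /* grow k until 2^k covers n */
  return k;
}
```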
Debargha Mukherjee
96213ac5e7 Merge "Some minor improvements in bilateral filter expt." into nextgen 2015-04-23 01:14:45 +00:00
Debargha Mukherjee
425a45a45c Some minor improvements in bilateral filter expt.
Changes include:

* Uses double for RD cost computation to guard against overflow
for large resolution frames.
* Use previous frame's filter level to code the level better.
* Change precision of the filter parameters.
* Allow spatial variance for x and y to be different

Change-Id: I1669f65eb0ab1e8519962954c92d59e04f1277b7
derflr: +0.556% (a little up from before)
2015-04-22 18:09:42 -07:00
hui su
9e0750c2d2 Modify scan order for non-transform coding blocks
Use raster scan order for non-transform blocks

+15.45% (+2.1%) on screen_content
no significant change on natural videos

Change-Id: I0e264cb69e8624540639302d131f7de9c31c3ba7
2015-04-21 14:23:52 -07:00
hui su
33207fb170 Remove unused variable in new-quant expt
remove dequant_val_nuq in macroblock_plane

Change-Id: I4b4070ae2d01c2403c781433030204d6e95c3750
2015-04-20 11:14:51 -07:00
Alex Converse
90b3838fca Merge "Don't use old uv scores for NEWDV and cleanup mbmi saving." into nextgen 2015-04-17 12:47:23 -07:00
hui su
ebd3666940 Merge "Add high bit depth support for tx-skip expt" into nextgen 2015-04-17 11:37:39 -07:00
Alex Converse
e45e18593b Don't use old uv scores for NEWDV and cleanup mbmi saving.
Change-Id: Ic0fae1b348ad7659e4a41db29d075ae5eb6cdc82
2015-04-17 10:54:33 -07:00
Debargha Mukherjee
fb001c2e2f Merge "Simplify bilateral filter search for speed" into nextgen 2015-04-16 18:58:03 -07:00
Debargha Mukherjee
017baf9f4b Simplify bilateral filter search for speed
Adds an internal buffer in the encoder to store the deblocked
result to help speed up the search for the best bilateral filter.

Very small change in performance but a lot faster:
derflr: +0.518%

Change-Id: I5d37e016088e559c16317789cfb1c2f49334b2b9
2015-04-16 15:33:34 -07:00
hui su
8c00c7a9cd Fix palette expt asan failure
Account for 422 video format.

Change-Id: Ic5af661720fc5fa7142210d907dd25e1e79ff653
2015-04-16 15:08:06 -07:00
hui su
b69152db79 Add high bit depth support for tx-skip expt
+0.3% on 10-bit
+0.3% on 12-bit

With other high bit compatible experiments on 12-bit
+12.44% (+0.17) over 8-bit baseline

Change-Id: I40b4c382fa54ba4640d08d9d01950ea8c1200bc9
2015-04-16 14:54:39 -07:00
hui su
871c51b30a Fix a bug in tx_skip expt
tx_skip is not enabled for sub8x8 blocks.

Change-Id: I3797238735f85fb2bd07b50ca2845611b198bff6
2015-04-14 11:25:55 -07:00
hui su
294159d41e Merge "refactoring in tx_skip experiment" into nextgen 2015-04-14 08:12:08 -07:00
hui su
261a9bac5a refactoring in tx_skip experiment
simplify code logic

Change-Id: Ifafc712f3f85abafadb429a04e295cf8cbb185d2
2015-04-13 17:14:05 -07:00
Debargha Mukherjee
febb434356 Merge "High bit-depth support for wedge partition expt" into nextgen 2015-04-13 10:09:51 -07:00
Debargha Mukherjee
343c092e2e High bit-depth support for wedge partition expt
Change-Id: Idbd27e66d4f4a7953f888137d5752856215a6760
2015-04-13 09:28:15 -07:00
hui su
359bf925ce Fix a mismatch bug
In the tx_skip experiment, dpcm is not used for 64x64 blocks.

Change-Id: Iafbed3b5e411954b80e898f10def3757f2cf44c1
2015-04-11 09:38:49 -07:00
Debargha Mukherjee
8fa0b12cf7 Merge "An experiment introducing a bilateral loop filter" into nextgen 2015-04-10 16:46:16 -07:00
Debargha Mukherjee
fe4b6ac652 An experiment introducing a bilateral loop filter
Adds a framework to incorporate a parameterized loop
postfilter in the coding loop after the application of the
standard deblocking loop filter.

The first version uses a straight bilateral filter
where the parameters conveyed are just spatial and
intensity gaussian variances.

Results on derflr:
+0.523% (only with this experiment)
+6.714% (with all expts other than intrabc)

Change-Id: I20d47285b4d25b8c6386ff8af2a75ff88ac2b69b
2015-04-10 16:05:00 -07:00
hui su
bfc27bb614 tx-skip experiment: improve entropy coding of coeff tokens
This patch allows the prediction residues of tx-skipped blocks
to use probs that are different from regular transform
coefficients for token entropy coding. Prediction residues are
assumed to be in band 6.

The initial value of probs is obtained with stats from limited
tests. The statistic model for constrained token nodes has not
been optimized. The probs for token extra bits have not been
optimized. These can be future work.

Certain coding improvement is observed:
derflr with all experiments:                +6.26%  (+0.10%)
screen_content with palette:               +22.48%  (+1.28%)

Change-Id: I1c0d78178ee9f3655febb6f30cdaef8ee9f8e3cc
2015-04-10 11:33:42 -07:00
Debargha Mukherjee
b19df73ce8 Preparation for end to end test turn on
Change-Id: Ibc53bf50c8164b413749aaa8baa08e2f6e928e1b
2015-04-03 16:44:40 -07:00
Alex Converse
16e5e713fa Add an intra block copy mode (NEWDV).
Change-Id: I82b261c54ac9db33706bb057613dcbe66fc71387
2015-04-03 11:59:57 -07:00
Zoe Liu
e1cae5eebf Clean the COMPOUND_MODES mv initialization for sub8x8
Change-Id: I04f4ad41c002c761d55093432d6c437c25e5bddd
2015-04-02 16:30:48 -07:00
Zoe Liu
2ae3d4f266 Add a new PREDICTION mode using NEARMV as ref mv
This experiment, referred to as NEWMVREF, has been merged with
NEWMVREF_SUB8X8, and the latter has been removed. Runborg results show that:

(1) Turning on this experiment only, compared against the base:
derflf: Average PSNR 0.40%; Overall PSNR 0.40%; SSIM 0.35%
(2) Turning on all the experiments including this feature, compared
against all experiments without it, in the 12-bit highbitdepth case:
derflf: Average PSNR 0.33%; Overall PSNR 0.32%; SSIM 0.30%.

Now for highbitdepth using 12-bit, compared against base:
derflf: Average PSNR 11.12%; Overall PSNR 11.07%; SSIM 20.27%.

Change-Id: Ie61dbfd5a19b8652920d2c602201a25a018a87a6
2015-04-02 14:37:22 -07:00
hui su
9eada94a3e palette experiment: remove run-length coding
Change-Id: I1e52475d0179cf019841d09a53b3b7fc53c79336
2015-03-31 11:09:30 -07:00
hui su
65d39f9fae Merge "Palette experiment: encode color indices based on context" into nextgen 2015-03-26 18:34:43 -07:00
hui su
a3af20f56e Merge "Palette experiment: adaptively update probs" into nextgen 2015-03-26 18:34:28 -07:00
hui su
6ad18db24f Palette experiment: encode color indices based on context
The basic idea is to use a pixel’s neighboring colors as
context to predict its own color. Up to 4 neighbors are
considered here: left, left-above, above, right-above.
To reduce the number of contexts, any combination of 4
(or fewer) colors is mapped to a reduced set of
patterns. For example, 1111, 2222, 3333, …, can be mapped
to the same pattern: AAAA. Similarly, 1122, 1133, 2233, …,
can be mapped to the pattern AABB. In this way, the total
number of color contexts is reduced to 16.

This almost doubles the gain of palette coding on screen
content videos.

on screen_content
--enable-palette                                  +14.2%
--enable-palette --enable-tx-skip                 +21.2%

on derflr
--enable-palette                                  +0.12%
with all other experiments                        +6.16%

Change-Id: I560306dae216f2ac11a9214968c2ad2319fa1718
2015-03-26 15:48:08 -07:00
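The neighbor-pattern canonicalization described in the commit above can be sketched as follows. This is a hypothetical Python illustration of the idea, not the libvpx C implementation: neighborhoods that differ only by a permutation of colors collapse to one pattern, which is how 1111 and 2222 come to share a context.

```python
def canonical_pattern(neighbors):
    """Relabel neighbor color indices by order of first appearance,
    so neighborhoods that differ only by a color permutation map to
    the same pattern (1111 -> AAAA; 1122 and 2233 -> AABB)."""
    mapping = {}
    pattern = []
    for color in neighbors:
        if color not in mapping:
            mapping[color] = len(mapping)  # next unused label
        pattern.append(mapping[color])
    return tuple(pattern)
```

For example, `canonical_pattern((1, 1, 2, 2))` and `canonical_pattern((2, 2, 3, 3))` produce the same pattern, so both neighborhoods select the same color context.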
Debargha Mukherjee
3d965883f4 Merge "Add palette coding mode for inter frames" into nextgen 2015-03-26 11:59:36 -07:00
hui su
e18b104462 Palette experiment: adaptively update probs
Also make changes to transmit palette-enabled flag using
neighbor blocks as context.

on screen_content
--enable-palette                            +7.35%

on derflr
with all other experiments                  +6.05%

Change-Id: Id6c2f726d21913d54a3f86ecfea474a4044c27f6
2015-03-25 09:12:57 -07:00
Zoe Liu
2a5648dca5 Cleaned the code in handle_inter_mode
Borg runs show results consistent with those before this patch

Change-Id: I3d21623cb03ea169a031328e9dde9c26ba1bd016
2015-03-23 15:33:25 -07:00
hui su
070d635657 Add palette coding mode for inter frames
on screen_content
--enable-palette                                    +6.74%

on derflr
with all other experiments                          +6.02%
(--enable-supertx --enable-copy-mode
 --enable-ext-tx --enable-filterintra
 --enable-tx64x64 --enable-tx-skip
 --enable-interintra --enable-wedge-partition
 --enable-compound-modes --enable-new-quant
 --enable-palette)

Change-Id: Ib85049b4c3fcf52bf95efbc9d6aecf53d53ca1a3
2015-03-23 08:41:51 -07:00
Deb Mukherjee
73dcd41b72 Merge "Make interintra experiment work with highbitdepth" into nextgen 2015-03-22 22:48:56 -07:00
Deb Mukherjee
c082df2359 Make interintra experiment work with highbitdepth
Also includes some adjustments to the algorithm.
All stats look good.

Change-Id: I824ef8ecf25b34f3feb358623d14fe375c3e4eb7
2015-03-21 07:35:40 -07:00
Deb Mukherjee
8c5ac79e66 Some build fixes with highbitdepth and new quant
Highbitdepth performance about the same as 8-bit.

Change-Id: If737962d8588dd190083edae4383b731f9d22873
2015-03-21 06:53:58 -07:00
Deb Mukherjee
c8ed36432e Non-uniform quantization experiment
This framework allows lower quantization bins to be shrunk down or
expanded to match closer the source distribution (assuming a generalized
gaussian-like central peaky model for the coefficients) in an
entropy-constrained sense. Specifically, the width of the bins 0-4 are
modified as a factor of the nominal quantization step size and from 5
onwards all bins become the same as the nominal quantization step size.
Further, different bin width profiles as well as reconstruction values
can be used based on the coefficient band as well as the quantization step
size divided into 5 ranges.

A small gain currently on derflr of about 0.16% is observed with the
same parameters for all q values.
Optimizing the parameters based on qstep value is left as a TODO for now.

Results on derflr with all expts on is +6.08% (up from 5.88%).

Experiments are in progress to tune the parameters for different
coefficient bands and quantization step ranges.

Change-Id: I88429d8cb0777021bfbb689ef69b764eafb3a1de
2015-03-17 21:42:55 -07:00
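The bin-width scheme in the commit above can be sketched in Python. This is an illustrative toy, not the libvpx quantizer: the first few bins get widths that are a factor of the nominal step size, every later bin is exactly one step wide, and the width factors used here are made-up example values.

```python
def quantize_nonuniform(x, q, widths=(0.75, 1.0, 1.0, 1.0, 1.0)):
    """Toy non-uniform quantizer: bins 0..len(widths)-1 have widths
    widths[i] * q; all later bins are exactly q wide. Returns the
    signed bin index for x. The width profile is a made-up example."""
    sign = -1 if x < 0 else 1
    x = abs(x)
    edge = 0.0  # running lower edge of the current bin
    i = 0
    while True:
        w = widths[i] * q if i < len(widths) else q
        if x < edge + w:
            return sign * i
        edge += w
        i += 1
```

With a shrunken zero bin (factor 0.75 of a step of 10), the first bin covers [0, 7.5), so a coefficient of 7.6 already lands in bin 1 rather than being quantized to zero.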
Alex Converse
9a92891ac4 interintra: wedge: Get the correct wedge params.
Fixes an asan issue.

Change-Id: I671ffc382c77c2b38673e0b148f54e7bce2ce9c2
2015-03-17 10:49:22 -07:00
Alex Converse
7ca745a2df palette: Fix an illegal read
Change-Id: I71649f0a85d98b96efd08c8a9e3ee7372fd7d327
2015-03-16 17:13:15 -07:00
Deb Mukherjee
961fe77e70 Merge "Misc changes to support high-bitdepth with supertx" into nextgen 2015-03-12 17:42:20 -07:00
Deb Mukherjee
35d38646ec Misc changes to support high-bitdepth with supertx
Change-Id: I0331646d1c55deb6e4631e64bd6b092fb892a43e
2015-03-12 16:52:25 -07:00
Zoe Liu
0337ae5218 Fixed a bug & a build warning for newmvref-sub8x8
Change-Id: I799fc3fb4c1201da14c97bf66e06dec655f6a620
2015-03-12 11:31:33 -07:00
hui su
7621c779e5 Add palette coding mode for UV
For 444 videos, a single palette of 3-d colors is
generated for YUV. For 420 videos, there may be two
palettes, one for Y, and the other for UV.

Also fixed a bug when palette and tx-skip are both on.

on derflr
--enable-palette                    +0.00%
with all experiments                +5.87% (was +5.93%)

on screen_content
--enable-palette                    +6.00%
--enable-palette --enable-tx_skip   +15.3%

on screen_content 444 version
--enable-palette                    +6.76%
--enable-palette --enable-tx_skip   +19.5%

Change-Id: I7287090aecc90eebcd4335d132a8c2c3895dfdd4
2015-03-10 13:38:19 -07:00
Debargha Mukherjee
ee2f0bdfcd Merge "Global motion work continued" into nextgen 2015-03-10 11:27:04 -07:00
Spencer Egart
faaaf85fc3 Global motion work continued
Interface change for the global Mv functions.

Change-Id: Ie4722faa638ac557f99743f7b33ff46c3d29e9a1
2015-03-10 10:47:12 -07:00
Deb Mukherjee
d835821fa1 Merge "Make filterintra experiment work with highbitdepth" into nextgen 2015-03-10 10:42:58 -07:00
Deb Mukherjee
78bcc48756 Make filterintra experiment work with highbitdepth
All stats look fine.
derflr: +0.912% with respect to 10-bit internal baseline
               (was +0.747% w.r.t. 8-bit)
        +5.545% with respect to 8-bit baseline

Change-Id: I3c14fd17718a640ea2f6bd39534e0b5cbe04fb66
2015-03-10 07:59:59 -07:00
Alex Converse
efe2b3cbc6 Fix tx_skip debug build.
Change-Id: I20a3e4e34f10485aa5e6723cd33b0311bdbf1320
2015-03-09 14:57:13 -07:00
Yaowu Xu
41eb20d1e9 Add encoder control for setting color space
This commit adds encoder side control for vp9 to set color space info
in the output compressed bitstream.

It also amends the "vp9_encoder_params_get_to_decoder" test to verify
the correct color space information is passed from the encoder end to
decoder end.

Change-Id: Ibf5fba2edcb2a8dc37557f6fae5c7816efa52650
(cherry picked from commit e94b415c3479129944a69fafbeacf550fb9237b7)
2015-03-06 16:42:09 -08:00
Yaowu Xu
63537df4c0 Enable decoder to pass through color space info
This commit adds a field to vpx_image_t for indicating color space;
the field is also added to YUV_BUFFER_CONFIG. This allows the color
space information to pass through the decoder from the input stream
to the output buffer.

The commit also updates the compare_img() function with added
verification of matching color space, to ensure the color space
information is correctly passed from encoder to decoder in
compressed vp9 streams.

Change-Id: I412776ec83defd8a09d76759aeb057b8fa690371
(cherry picked from commit 6b223fcb588c865ae6f5abfd3d9de3ba2ae0540f)
2015-03-06 15:57:12 -08:00
Yaowu Xu
b7913deb7e Added plumbing for setting color space
Change-Id: If64052cc6e404abc8a64a889f42930d14fad21d3
(cherry picked from commit ce52b0f8d347224cd526dc1ba55597a2aa79c341)
2015-03-06 14:51:49 -08:00
James Zern
cd152849ea vp9_highbd_tm_predictor_16x16: fix win64
by saving xmm8; cglobal's xmm reg arg is 0-based

Change-Id: Ic8426ec9ac59ab4478716aa812452a6406794dcb
(cherry picked from commit 923cc0bf51a24d05c6aa68f80e09d485a54431f6)
2015-03-06 13:29:00 -08:00
Yaowu Xu
6fcaa06911 Correct the miscalculation in uv dimensions
The calculation of the required extension used in the HBD case was
wrong due to rounding for UV when the y dimension is odd. This
commit replaces the computation with the correct version.

This fixes a crash caused by writing beyond the buffer boundary.

Change-Id: Ic7c9afeb7388cd1341ec4974a611dacfb74ac6b6
(cherry picked from commit 4bca73b609bfe9a5cb86fc25b69c6128d9041504)
2015-03-06 09:47:57 -08:00
Yaowu Xu
7b4ff46175 Prevent VP8 encoding crash
This commit changes the value of highbitdepth flag to avoid conflict
with vp8 refresh_last_frame flag.

Change-Id: Idcff2cf44f0a200bd935b326f785c0cf32d7228a
(cherry picked from commit dd27307cac5ac7f346888d219877041563eea0a2)
2015-03-06 09:47:56 -08:00
Zoe Liu
cf004a0845 Merge "Fixed the build for newmvref-sub8x8" into nextgen 2015-03-06 08:54:11 -08:00
Zoe Liu
6c7b4ed355 Fixed the build for newmvref-sub8x8
Change-Id: Ie042001e69d80b1fc52368f29617ad2e51a5e250
2015-03-06 00:01:30 -08:00
Zoe Liu
756a18663c Merge "Cleaned code further for newmvref-sub8x8 and compound-modes" into nextgen 2015-03-05 15:41:28 -08:00
Zoe Liu
1192f99976 Cleaned code further for newmvref-sub8x8 and compound-modes
Change-Id: I9f869c37566aaf81910d1d6ab1b2ac0d2a2f0a23
2015-03-05 15:40:36 -08:00
Zoe Liu
e222d46658 Merge "Changed mv ref to nearestmv always for sub8x8" into nextgen 2015-03-05 11:57:21 -08:00
Zoe Liu
b359952c2c Changed mv ref to nearestmv always for sub8x8
For all sub8x8 partitions, the mv ref has been changed to each block's
own nearest_mv instead of the nearest_mv of the super 8x8 block:

--enable-newmvref-sub8x8: ~0.1% gain for derflr

Besides the above new experiment, the sub8x8 motion search code has
been cleaned up substantially, so that the mv ref can easily be
changed to use other options, e.g., the next step's global motion effort.

Change-Id: I8e3f4aaa8553ba8c445369692e079db5ce282593
2015-03-05 11:09:58 -08:00
Zoe Liu
a9040ea40c Corrected comment in mode_2_counter for compound
Change-Id: I5459550e62cff79182fa0ac5a24edb56cb64d608
2015-03-04 10:16:25 -08:00
Zoe Liu
30720adf6e Fixed the build for denoiser in new compound mode
Change-Id: Ifb824ce87db9c00c8c87f52d9b8718669fb39d05
2015-03-04 09:29:05 -08:00
Deb Mukherjee
4c1049d7a8 Some fixes to quantization and compiler warnings
The bugs affected only the tx64x64 experiment. This patch fixes
them.

Also fixes some compiler warnings

Change-Id: I4cf1e24a4f1fa59831bf29750cf6daa8373e8e8f
2015-03-03 14:07:17 -08:00
Deb Mukherjee
9909903229 Adds translation model to global motion estimation
Change-Id: Ie1791ab7cfdd3f7a1b03404627246c923f1bd51f
2015-02-25 09:57:33 -08:00
hui su
caccf06acd Palette-based coding experiment
Added palette coding option for Y channel only, key frame only.
Palette colors are obtained with k-means clustering algorithm.
Run-length coding is used to compress the color indices.

On screen_content:
--enable-palette                            + 5.75%
--enable-palette --enable-tx_skip           +15.04% (was 13.3%)

On derflr:
--enable-palette                            - 0.03%
with all the other experiments                + 5.95% (was 5.98%)

Change-Id: I6d1cf45c889be764d14083170fdf14a424bd31b5
2015-02-23 09:23:50 -08:00
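The run-length coding of color indices mentioned above (later removed in favor of context-based coding, per a subsequent commit) can be illustrated with a minimal sketch. This is hypothetical Python, not the actual bitstream coding.

```python
def run_length_encode(indices):
    """Collapse a sequence of palette color indices into
    (index, run_length) pairs, the basic idea behind the
    run-length coding step described above."""
    runs = []
    for idx in indices:
        if runs and runs[-1][0] == idx:
            runs[-1][1] += 1  # extend the current run
        else:
            runs.append([idx, 1])  # start a new run
    return [tuple(r) for r in runs]
```

A row of indices like `[0, 0, 0, 1, 1, 2]` becomes three (index, length) pairs, which is compact when large flat regions map to a single palette color.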
Zoe Liu
7d8840a082 Removed redundant src_mi in mv ref candidates
Change-Id: I4eb144e7c9d2c8568388ae58001cc7b7498967e4
2015-02-19 09:44:05 -08:00
Spencer Egart
d8983f0dd7 Global Motion block-based
Uses block-based motion fields with ransac to find transformation between
frames.

Change-Id: I6293fbb690cdad854a1140fb6af76b326abfe964
2015-02-18 11:52:39 -08:00
Spencer Egart
edffe3f956 Global motion experiment
Added a function to compute a motion field for a pair of buffers, for use in
finding an affine transform or homography.

Change-Id: Id5169cc811a61037e877dfd57fccaca89d93936f
2015-02-10 11:15:39 -08:00
Deb Mukherjee
2aef964519 Merge "Adds code for corner detection and ransac" into nextgen 2015-02-10 08:18:03 -08:00
Deb Mukherjee
1fc70e47cd Adds code for corner detection and ransac
This code is to start experiments with global motion models.

The corner detection can be either fast_9 or Harris.
Corner matching is currently based on normalized correlation.
Three flavors of ransac are used to estimate either a
homography (8-param), or an affine model (6-param) or a
rotation-zoom only affine model (4-param).

The highest level API for the library is in vp9_global_motion.h,
where there are two functions - one for computing a single model
and another for computing multiple models up to a maximum number
provided or until a desired inlier probability is achieved.

Change-Id: I3f9788ec2dc0635cbc65f5c66c6ea8853cfcf2dd
2015-02-10 00:48:56 -08:00
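A minimal RANSAC loop of the kind described above can be sketched for the simplest possible model, a pure translation; the library's 4/6/8-parameter fits work analogously but solve for more parameters per hypothesis. This Python sketch is illustrative only, with made-up iteration count and inlier threshold.

```python
import random

def ransac_translation(matches, iters=200, thresh=2.0, seed=0):
    """Minimal RANSAC for a pure-translation model: pick one
    correspondence as the hypothesis, count how many matches it
    explains within `thresh`, and keep the best hypothesis.
    `matches` is a list of ((x0, y0), (x1, y1)) point pairs."""
    rng = random.Random(seed)
    best_model, best_inliers = None, -1
    for _ in range(iters):
        (x0, y0), (x1, y1) = rng.choice(matches)
        dx, dy = x1 - x0, y1 - y0  # hypothesized global translation
        inliers = sum(
            1 for (ax, ay), (bx, by) in matches
            if abs(ax + dx - bx) <= thresh and abs(ay + dy - by) <= thresh
        )
        if inliers > best_inliers:
            best_model, best_inliers = (dx, dy), inliers
    return best_model, best_inliers
```

Given mostly-consistent correspondences plus one gross outlier, the loop recovers the dominant translation and reports the inlier count, which maps to the "desired inlier probability" stopping criterion mentioned above.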
Zoe Liu
48da7bf004 Cleaned up the COMPOUND_MODES code in sub8x8
Change-Id: I571ff39799c3257b5c6db82660364ce412c7f6e2
2015-02-09 11:58:54 -08:00
hui su
2118f66a91 Enable tx_skip for inter blocks
Enable tx_skip for inter blocks whose q-index is smaller than 64.

tx_skip experiment alone:
derflr: +0.233% (was +0.038%)

all experiments:
derflr: +5.98% (was +5.94%)

Change-Id: I35245a476ff7039fec8468a92a6aa63feaf74831
2015-02-08 17:57:51 -08:00
hui su
1ae3989558 Fix a bug in filterintra experiment
All expts together: +5.946% (up about 0.3%)

Change-Id: If686e5e97209b4cc4a8115f920e1f4f35ded5a1d
2015-01-31 00:43:06 -08:00
Yue Chen
2ed2f29d21 Fix a bug in COPY_MODE + EXT_TX experiment
Remove the duplicated step counting the overhead for ext_tx, because
it has been counted in super_block_yrd().

Change-Id: I50bc01d8166572bc0847305565cf33c14ee516e6
2015-01-27 12:02:54 -08:00
Jingning Han
50cab76f12 Removal of legacy zbin_extra / zbin_oq_value.
Change-Id: I07f77a63aa98087626e45c4e87aa5dcafc0b0b07
(cherry picked from commit d0f237702745c4bfc0297d24f9465f960fb988ed)
2015-01-21 20:37:19 -08:00
Deb Mukherjee
7a993f850e Adds inter-intra combination with wedge partitions
Also fixes some build issues with certain experimental combinations.

Results on derflr with all experiments on: +5.516%

Change-Id: I9b492f3d3556bd1f057005571dc9bee63167dd95
2015-01-21 15:45:22 -08:00
Deb Mukherjee
760219d0fd Some build fixes
Related zbin boost removal.

Change-Id: I0f34d31038c29d5cb78bc5d110159df01e41e328
2015-01-21 13:37:24 -08:00
Paul Wilkins
dfed541f21 Remove mode dependent zbin boost.
Initial patch to remove get_zbin_mode_boost() and
cpi->zbin_mode_boost.

For now sets a dummy value of 0 for zbin extra pending
a further clean up patch.

Change-Id: I64a1e1eca2d39baa8ffb0871b515a0be05c9a6af
(cherry picked from commit 60e9b731cf8945403dbcf149a0f6dc745e5cabe1)
2015-01-21 11:05:55 -08:00
Spencer Egart
347c31b48f Created COMPOUND_MODES experiment.
The COMPOUND_MODES experiment encodes a separate MV mode for each frame
in a compound reference prediction. Added modes: NEAREST_NEARESTMV,
ZERO_ZEROMV, NEW_NEWMV, NEAREST_NEARMV, NEAR_NEARESTMV, NEW_NEARESTMV,
NEAR_NEWMV, NEW_NEARMV, and NEAREST_NEWMV.

Also enhances the wedge-partition expt to work better with compound
modes.

Results:
derflr +0.227%
All experiments on: derflr +5.218%

Change-Id: I719e8a34826bf1f1fe3988dac5733a845a89ef2b
2015-01-16 14:48:56 -08:00
Deb Mukherjee
db5dd49996 Adds wedge-partitions for compound prediction
Results with this experiment only: +0.642% on derflr.
With other experiments: +4.733%

Change-Id: Ieb2022f8e49ac38a7e7129e261a6bf69ae9666b9
2015-01-15 15:59:33 -08:00
punksu
5f0093bb98 Merge "dpcm intra prediction for tx_skip" into nextgen 2015-01-15 11:53:06 -08:00
punksu
571fdbb05f dpcm intra prediction for tx_skip
Implements vertical, horizontal, and tm dpcm intra prediction for
blocks in tx_skip mode. Typical coding gain on screen content video
is 2%~5%.

Change-Id: Idd5bd84ac59daa586ec0cd724680cef695981651
2015-01-14 14:54:09 -08:00
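Vertical DPCM of the kind described above predicts each row of a tx-skip block from the reconstructed row directly above it, so only row-to-row differences are coded. A hypothetical Python sketch, shown for the lossless case where reconstruction equals source (the actual codec also implements horizontal and TM variants):

```python
def dpcm_vertical(block, top_row):
    """Vertical DPCM residues for a tx-skip block: each row is
    predicted from the reconstructed row above it. In the lossless
    case sketched here, the reconstruction equals the source row."""
    residues = []
    pred = list(top_row)  # reconstructed row above the block
    for row in block:
        residues.append([x - p for x, p in zip(row, pred)])
        pred = row  # lossless: reconstructed row == source row
    return residues
```

For a 2x2 block `[[5, 5], [7, 6]]` under a top row `[4, 4]`, the coded residues are the per-row differences rather than differences against a single flat predictor.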
Zoe Liu
a56ff2d323 Fixed a bug in the experiment EXT_TX.
Change-Id: I1adf0e916f9414e6dfe1d07cefc0693bc63b7e97
2015-01-14 14:27:51 -08:00
hui su
f60b844974 Merge "Turn on tx_skip for inter modes" into nextgen 2015-01-14 08:57:41 -08:00
Deb Mukherjee
3960e92c8b Merge "Some fixes in interintra expt" into nextgen 2015-01-13 12:34:24 -08:00
hui su
2c68b07d7d Turn on tx_skip for inter modes
Fixed the mismatch issue in lossless case

Change-Id: Ia2fd98865b4687e664fdc75f62b402e8289a1968
2015-01-13 11:28:08 -08:00
Deb Mukherjee
cb165354e7 Some fixes in interintra expt
Removes some redundant information when supertx and interintra
both are used.

Change-Id: I40b0fddbc1ba54607c6e8068bde128c21c14d997
2015-01-13 00:52:25 -08:00
Zoe Liu
919692d6c5 Cleaned up the code a little for the CONFIG_EXT_TX experiment on the
16x16 and 8x8 transform sizes, both for the regular case and the high
bit depth (CONFIG_VP9_HIGHBITDEPTH) case.

Change-Id: I34a9d3c73c3687f967105194ce4def48c3ec435c
2015-01-12 12:11:37 -08:00
Deb Mukherjee
b43beffc3c Merge "Adds an inter-intra combination mode" into nextgen 2015-01-12 09:22:21 -08:00
Deb Mukherjee
2dba1221b4 Adds an inter-intra combination mode
A smooth weighting scheme is used to put more weight
on the intra predictor samples near the left/top boundaries
and decaying it to favor the inter predictor samples more as
we move away from these boundaries in the direction of
prediction.

Results:
derflr: +0.609% with only this experiment
derflr: +3.901% with all experiments

Change-Id: Ic9dbe599ad6162fb05900059cbd6fc88b203a09c
2015-01-12 00:28:01 -08:00
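The smooth inter-intra weighting described above can be sketched as follows. The geometric decay factor and the purely row-wise weighting (i.e. the vertical-prediction case) are illustrative assumptions here, not the codec's actual weight tables.

```python
def blend_interintra(intra, inter, decay=0.5):
    """Blend an intra and an inter predictor: full weight on the
    intra samples at the top boundary, decaying toward the inter
    predictor with distance (vertical-prediction case; the decay
    factor is a made-up example value)."""
    out = []
    w = 1.0  # intra weight, maximal at the boundary
    for intra_row, inter_row in zip(intra, inter):
        out.append([w * a + (1 - w) * b
                    for a, b in zip(intra_row, inter_row)])
        w *= decay  # favor the inter predictor further from the edge
    return out
```

The first row of the blend equals the intra predictor exactly, and later rows converge toward the inter predictor, matching the boundary-weighted scheme described above.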
Zoe Liu
b342c8405b Merge "Fixed a bug in the CONFIG_EXT_TX experiment for the forward 16x16 transform size when CONFIG_VP9_HIGHBITDEPTH is enabled." into nextgen 2015-01-09 13:32:53 -08:00
Deb Mukherjee
f61bcfbafa Merge "Segment id fix in tx-skip/supertx experiment" into nextgen 2015-01-09 10:37:59 -08:00
punksu
9100fe3944 Merge "Fix conflict b/w tx_skip and copy_mode in lossless mode" into nextgen 2015-01-08 17:06:41 -08:00
Zoe Liu
d7d5c75c69 Fixed a bug in the CONFIG_EXT_TX experiment for the forward 16x16
transform size when CONFIG_VP9_HIGHBITDEPTH is enabled.

Change-Id: I475f64eac2c31094a47bc0d557968940cad01c86
2015-01-08 09:03:53 -08:00
Deb Mukherjee
28258c5fe2 Segment id fix in tx-skip/supertx experiment
Change-Id: Ice9e833ad7f60e943acc011260dc3121ad3b136f
2015-01-08 02:05:11 -08:00
Deb Mukherjee
9a19ef0add Turn off tx-skip for intermodes temporarily
Will be re-enabled when a mismatch issue is resolved.

Change-Id: I303d80effa69eeb0ebf1921bbb4eaeeb34497e4b
2015-01-06 18:14:00 -08:00
punksu
42dfd9a728 Fix conflict b/w tx_skip and copy_mode in lossless mode
Change-Id: Idb12344adda99dddde8d2cd010ffffe6425aca62
2015-01-06 20:51:46 +08:00
Yue Chen
695c4bc321 Fix mismatch in SUPERTX experiment
Disable SUPERTX mode when lossless mode is enabled because the largest
lossless transform size is 4X4.

Change-Id: I4167959a282728d62354119ced99ca29febabfd1
2014-12-28 02:27:16 -08:00
punksu
b0ef621f84 Merge "Allow large tx_size in lossless coding with transform skipping" into nextgen 2014-12-27 20:52:36 -08:00
Deb Mukherjee
2b959d4ee8 Fix frame-parallel modes
This fix is already incorporated in master.

Change-Id: I7b0c048ebef4d92f041cc8727bc276843f17d50d
2014-12-27 01:55:27 -08:00
Deb Mukherjee
9aa76fdb69 Adds a copy mode experiment
Experiment to copy motion and mode from block neighbors.

Results:
--------
--enable-experimental --enable-copy-mode:
derflr: +0.727%

With other expts:
--enable-experimental --enable-copy-mode --enable-supertx
--enable-ext-tx --enable-filterintra --enable-tx-skip --enable-tx64x64
derflr: +3.43%

Change-Id: Iae74b9a855a1a690bf76131b42d247bbc54dec17
2014-12-24 17:52:55 -08:00
punksu
8a41d18bd0 Allow large tx_size in lossless coding with transform skipping
In lossless coding, tx_size can be larger than 4x4 when transform
skipping is activated. Compared to regular vp9 lossless coding,
performance improvement for derf is about 5%; gain is larger
for screen content videos.

Change-Id: Ib20ece7e117f29fb91543612757302a2400110b4
2014-12-22 22:56:52 +08:00
Debargha Mukherjee
2f7de8c887 Bug fix in rdopt
Change-Id: Ieab80902755456e97583636f13d1417269b332b5
2014-12-18 02:43:27 -08:00
Deb Mukherjee
7ec9792f96 Adds forward prob updates for supertx expt
Also refactors the supertx prob models, and includes other
cleanups.

Change-Id: I74de6c01d872ae09bf6d43a31f53d43283b6b226
2014-12-15 21:55:58 -08:00
hui su
5de9280ae9 tx_skip mode for lossy coding
This patch improves the non-transform coding mode. At this
point, the coding gain on screen content videos is about
12% for lossless, and 15% for the lossy case.

1. Encode tx_skip flags with context. Y tx_skip flag context is
whether the prediction mode is inter or intra. UV flag context
is Y flag.

2. Transform skipping is less helpful when the Q-index is high,
so it is enabled only when the Q-index is smaller than a
threshold. Currently the threshold is set to 255 for intra blocks,
and 0 for inter blocks.

3. The shift applied to the prediction residues, when copying them
to the coeff buffer, is 3 when the Q-index is larger than a
threshold (currently set to 0), and 2 otherwise.

Change-Id: I372973c7518cf385f6e542b22d0f803016e693b0
2014-12-15 10:46:41 -08:00
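Point 3 above, the Q-index-dependent residue shift, can be sketched directly. This is an illustrative stand-in for the copy into the coeff buffer, not the libvpx code; `q_thresh` mirrors the threshold described (currently 0).

```python
def residue_to_coeff(residue, q_index, q_thresh=0):
    """Tx-skip path: prediction residues are copied into the coeff
    buffer with a left shift of 3 when q_index > q_thresh, and a
    shift of 2 otherwise (per the commit message above)."""
    shift = 3 if q_index > q_thresh else 2
    return [r << shift for r in residue]
```

The larger shift at higher Q-index keeps more residue precision through the shared quantization path.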
Deb Mukherjee
625c0961ff Adds supertx experiment adopted from playground
Reimplements the supertx experiment from the playground branch.
Makes it work with other experiments.

Results:
With --enable-supertx
derflr: +0.958%

With --enable-supertx --enable-ext-tx
derflr: +2.25%

With --enable-supertx --enable-ext-tx --enable-filterintra
derflr: +2.73%

Change-Id: I5012418ef2556bf2758146d90c4e2fb8a14610c7
2014-12-11 16:06:23 -08:00
Deb Mukherjee
1fdc274900 Extends ext_tx expt to include all hybrid variants
Extends the ext-tx experiment to include all 9 DST/DCT
variants.

Results with the ext_tx experiment:
derflr: +1.338 (improved from +1.12)

Change-Id: I24b5564f96bce6ccaa13d88ca6cb9d0c57000597
2014-11-21 07:21:38 -08:00
Deb Mukherjee
c82de3bede Extending ext_tx expt to include dst variants
Extends the ext-tx experiment to include regular and flipped
DST variants. A total of 9 transforms are thus possible for
each inter block with transform size <= 16x16.

In this patch currently only the four ADST_ADST variants
(flipped or non-flipped in both dimensions) are enabled
for inter blocks.

The gain with the ext-tx experiment grows to +1.12 on derflr.
Further experiments are underway.

Change-Id: Ia2ed19a334face6135b064748f727fdc9db278ec
2014-11-20 14:25:40 -08:00
hui su
d97fd3eef6 Non transform coding experiment
The non-transform option is enabled in both intra and inter modes.
In the lossless case, the average coding gain on screen content
clips is 11.3% in my test.

Change-Id: I2e8de515fb39e74c61bb86ce0f682d5f79e15188
2014-11-19 21:20:21 -08:00
Deb Mukherjee
e2a8e90874 Misc cleanups on ext_tx expt.
Change-Id: I599d9fb00a2accdf24def65df52bef56d014a0b6
2014-11-16 00:03:18 -08:00
Deb Mukherjee
52e4dfe1b1 Merge "Adds missing EXT_TX checks" into nextgen 2014-11-13 20:13:40 -08:00
Deb Mukherjee
f62d19752d Adds missing EXT_TX checks
Change-Id: Ia0ca39c9041f17da0b7f522e55868498f31de848
2014-11-13 15:59:50 -08:00
Spencer Egart
83e50d6412 Refactoring in nonrd_ methods in vp9_encodeframe
Refactored some of the more repetitive code in the nonrd methods in
vp9_encodeframe.c.

Change-Id: Ib71dcd14f74fb6e5fd123b3ffea8f253aaa04b1a
2014-11-13 13:00:15 -08:00
hui su
cf7dc66e34 Fix lossless mismatch in EXT_TX
In lossless mode, transform should not be altered.

Change-Id: I216d1700963b4d1c35e059cd7ff7b0cefaf46133
2014-11-13 05:07:37 -08:00
Spencer Egart
1c562aebd8 Migrate EXT_TX from playground to nextgen.
Change-Id: I1cb0584104323fb3781e66bb65d44ebbe853c9c8
2014-11-11 07:55:49 -08:00
Spencer Egart
e55808d9a7 Migrated FILTERINTRA from playground to nextgen
derf +0.385
derflr +0.411
with --enable-tx64x64, derflr +0.433

Added fix for integration with TX64X64

Change-Id: Ie24825de06021d007ba215ac7f6de40638c57e12
2014-11-10 10:13:32 -08:00
Deb Mukherjee
613ea21ae7 Tx64x64 bug fixes
Change-Id: Iac595a3ae8f1466d8485c3627219b4b623efc8ad
2014-11-05 20:19:43 -08:00
Deb Mukherjee
a137e3c3ba Fixes in 64x64 transform
Extends quantizer range and some other fixes.

Change-Id: Ia9adf7848e772783365d6501efd07585bff80c15
stdhd: +0.166%, now slightly positive
derf: -0.036% (very few blocks use the 64x64 mode)
2014-11-03 17:00:43 -08:00
Deb Mukherjee
8bdf4cebb9 Merge "Adding a 64x64 transform mode" into nextgen 2014-10-30 00:51:35 -07:00
Deb Mukherjee
0c7a94f49b Adding a 64x64 transform mode
Preliminary 64x64 transform implementation.
Includes all code changes.
All mismatches resolved.

Coding results for derf and stdhd are within noise. stdhd is slightly
higher, derf is slightly lower.

To be further refined.

Change-Id: I091c183f62b156d23ed6f648202eb96c82e69b4b
2014-10-30 00:45:57 -07:00
208 changed files with 74246 additions and 16440 deletions


@ -511,9 +511,8 @@ process_common_cmdline() {
;;
--force-target=*) toolchain="${toolchain:-${optval}}"; enable_feature force_toolchain
;;
--cpu)
;;
--cpu=*) tune_cpu="$optval"
--cpu=*)
tune_cpu="$optval"
;;
--extra-cflags=*)
extra_cflags="${optval}"
@ -863,10 +862,6 @@ EOF
check_add_cflags -mfpu=neon #-ftree-vectorize
check_add_asflags -mfpu=neon
fi
if [ -z "${tune_cpu}" ]; then
tune_cpu=cortex-a8
fi
else
check_add_cflags -march=${tgt_isa}
check_add_asflags -march=${tgt_isa}

25
configure vendored

@ -26,6 +26,7 @@ Advanced options:
${toggle_unit_tests} unit tests
${toggle_decode_perf_tests} build decoder perf tests with unit tests
${toggle_encode_perf_tests} build encoder perf tests with unit tests
--cpu=CPU tune for the specified CPU (ARM: cortex-a8, X86: sse3)
--libc=PATH path to alternate libc
--size-limit=WxH max size to allow in the decoder
--as={yasm|nasm|auto} use specified assembler [auto, yasm preferred]
@ -282,6 +283,30 @@ EXPERIMENT_LIST="
vp9_temporal_denoising
fp_mb_stats
emulate_hardware
tx64x64
filterintra
ext_tx
tx_skip
supertx
copy_mode
interintra
wedge_partition
global_motion
palette
new_quant
intrabc
loop_postfilter
row_tile
new_inter
bitstream_fixes
newmvref
misc_entropy
wavelets
ext_partition
qctx_tprobs
sr_mode
multi_ref
ext_coding_unit_size
"
CONFIG_LIST="
external_build


@ -35,20 +35,30 @@ LIBYUV_SRCS += third_party/libyuv/include/libyuv/basic_types.h \
third_party/libyuv/source/scale_posix.cc \
third_party/libyuv/source/scale_win.cc \
LIBWEBM_MUXER_SRCS += third_party/libwebm/mkvmuxer.cpp \
third_party/libwebm/mkvmuxerutil.cpp \
third_party/libwebm/mkvwriter.cpp \
third_party/libwebm/mkvmuxer.hpp \
third_party/libwebm/mkvmuxertypes.hpp \
third_party/libwebm/mkvmuxerutil.hpp \
third_party/libwebm/mkvparser.hpp \
third_party/libwebm/mkvwriter.hpp \
third_party/libwebm/webmids.hpp
LIBWEBM_COMMON_SRCS += third_party/libwebm/common/hdr_util.cc \
third_party/libwebm/common/hdr_util.h \
third_party/libwebm/common/webmids.h
LIBWEBM_PARSER_SRCS = third_party/libwebm/mkvparser.cpp \
third_party/libwebm/mkvreader.cpp \
third_party/libwebm/mkvparser.hpp \
third_party/libwebm/mkvreader.hpp
LIBWEBM_MUXER_SRCS += third_party/libwebm/mkvmuxer/mkvmuxer.cc \
third_party/libwebm/mkvmuxer/mkvmuxerutil.cc \
third_party/libwebm/mkvmuxer/mkvwriter.cc \
third_party/libwebm/mkvmuxer/mkvmuxer.h \
third_party/libwebm/mkvmuxer/mkvmuxertypes.h \
third_party/libwebm/mkvmuxer/mkvmuxerutil.h \
third_party/libwebm/mkvparser/mkvparser.h \
third_party/libwebm/mkvmuxer/mkvwriter.h
LIBWEBM_PARSER_SRCS = third_party/libwebm/mkvparser/mkvparser.cc \
third_party/libwebm/mkvparser/mkvreader.cc \
third_party/libwebm/mkvparser/mkvparser.h \
third_party/libwebm/mkvparser/mkvreader.h
# Add compile flags and include path for libwebm sources.
ifeq ($(CONFIG_WEBM_IO),yes)
CXXFLAGS += -D__STDC_CONSTANT_MACROS -D__STDC_LIMIT_MACROS
CXXFLAGS += -I$(SRC_PATH_BARE)/third_party/libwebm
INC_PATH-yes += $(SRC_PATH_BARE)/third_party/libwebm
endif
# List of examples to build. UTILS are tools meant for distribution
# while EXAMPLES demonstrate specific portions of the API.
@ -66,6 +76,8 @@ ifeq ($(CONFIG_LIBYUV),yes)
vpxdec.SRCS += $(LIBYUV_SRCS)
endif
ifeq ($(CONFIG_WEBM_IO),yes)
vpxdec.SRCS += $(LIBWEBM_COMMON_SRCS)
vpxdec.SRCS += $(LIBWEBM_MUXER_SRCS)
vpxdec.SRCS += $(LIBWEBM_PARSER_SRCS)
vpxdec.SRCS += webmdec.cc webmdec.h
endif
@ -86,34 +98,18 @@ ifeq ($(CONFIG_LIBYUV),yes)
vpxenc.SRCS += $(LIBYUV_SRCS)
endif
ifeq ($(CONFIG_WEBM_IO),yes)
vpxenc.SRCS += $(LIBWEBM_COMMON_SRCS)
vpxenc.SRCS += $(LIBWEBM_MUXER_SRCS)
vpxenc.SRCS += $(LIBWEBM_PARSER_SRCS)
vpxenc.SRCS += webmenc.cc webmenc.h
endif
vpxenc.GUID = 548DEC74-7A15-4B2B-AFC3-AA102E7C25C1
vpxenc.DESCRIPTION = Full featured encoder
ifeq ($(CONFIG_SPATIAL_SVC),yes)
EXAMPLES-$(CONFIG_VP9_ENCODER) += vp9_spatial_svc_encoder.c
vp9_spatial_svc_encoder.SRCS += args.c args.h
vp9_spatial_svc_encoder.SRCS += ivfenc.c ivfenc.h
vp9_spatial_svc_encoder.SRCS += tools_common.c tools_common.h
vp9_spatial_svc_encoder.SRCS += video_common.h
vp9_spatial_svc_encoder.SRCS += video_writer.h video_writer.c
vp9_spatial_svc_encoder.SRCS += vpxstats.c vpxstats.h
vp9_spatial_svc_encoder.GUID = 4A38598D-627D-4505-9C7B-D4020C84100D
vp9_spatial_svc_encoder.DESCRIPTION = VP9 Spatial SVC Encoder
endif
ifneq ($(CONFIG_SHARED),yes)
EXAMPLES-$(CONFIG_VP9_ENCODER) += resize_util.c
endif
EXAMPLES-$(CONFIG_ENCODERS) += vpx_temporal_svc_encoder.c
vpx_temporal_svc_encoder.SRCS += ivfenc.c ivfenc.h
vpx_temporal_svc_encoder.SRCS += tools_common.c tools_common.h
vpx_temporal_svc_encoder.SRCS += video_common.h
vpx_temporal_svc_encoder.SRCS += video_writer.h video_writer.c
vpx_temporal_svc_encoder.GUID = B18C08F2-A439-4502-A78E-849BE3D60947
vpx_temporal_svc_encoder.DESCRIPTION = Temporal SVC Encoder
EXAMPLES-$(CONFIG_DECODERS) += simple_decoder.c
simple_decoder.GUID = D3BBF1E9-2427-450D-BBFF-B2843C1D44CC
simple_decoder.SRCS += ivfdec.h ivfdec.c
@ -186,7 +182,13 @@ vp8cx_set_ref.SRCS += video_common.h
vp8cx_set_ref.SRCS += video_writer.h video_writer.c
vp8cx_set_ref.GUID = C5E31F7F-96F6-48BD-BD3E-10EBF6E8057A
vp8cx_set_ref.DESCRIPTION = VP8 set encoder reference frame
EXAMPLES-$(CONFIG_VP9_ENCODER) += vp9cx_set_ref.c
vp9cx_set_ref.SRCS += ivfenc.h ivfenc.c
vp9cx_set_ref.SRCS += tools_common.h tools_common.c
vp9cx_set_ref.SRCS += video_common.h
vp9cx_set_ref.SRCS += video_writer.h video_writer.c
vp9cx_set_ref.GUID = 65D7F14A-2EE6-4293-B958-AB5107A03B55
vp9cx_set_ref.DESCRIPTION = VP9 set encoder reference frame
ifeq ($(CONFIG_MULTI_RES_ENCODING),yes)
ifeq ($(CONFIG_LIBYUV),yes)


@ -1,439 +0,0 @@
/*
* Copyright (c) 2012 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
/*
* This is an example demonstrating how to implement a multi-layer
* VP9 encoding scheme based on spatial scalability for video applications
* that benefit from a scalable bitstream.
*/
#include <stdarg.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include "./args.h"
#include "./tools_common.h"
#include "./video_writer.h"
#include "vpx/svc_context.h"
#include "vpx/vp8cx.h"
#include "vpx/vpx_encoder.h"
#include "./vpxstats.h"
static const arg_def_t skip_frames_arg =
ARG_DEF("s", "skip-frames", 1, "input frames to skip");
static const arg_def_t frames_arg =
ARG_DEF("f", "frames", 1, "number of frames to encode");
static const arg_def_t width_arg = ARG_DEF("w", "width", 1, "source width");
static const arg_def_t height_arg = ARG_DEF("h", "height", 1, "source height");
static const arg_def_t timebase_arg =
ARG_DEF("t", "timebase", 1, "timebase (num/den)");
static const arg_def_t bitrate_arg = ARG_DEF(
"b", "target-bitrate", 1, "encoding bitrate, in kilobits per second");
static const arg_def_t spatial_layers_arg =
ARG_DEF("sl", "spatial-layers", 1, "number of spatial SVC layers");
static const arg_def_t temporal_layers_arg =
ARG_DEF("tl", "temporal-layers", 1, "number of temporal SVC layers");
static const arg_def_t kf_dist_arg =
ARG_DEF("k", "kf-dist", 1, "number of frames between keyframes");
static const arg_def_t scale_factors_arg =
ARG_DEF("r", "scale-factors", 1, "scale factors (lowest to highest layer)");
static const arg_def_t passes_arg =
ARG_DEF("p", "passes", 1, "Number of passes (1/2)");
static const arg_def_t pass_arg =
ARG_DEF(NULL, "pass", 1, "Pass to execute (1/2)");
static const arg_def_t fpf_name_arg =
ARG_DEF(NULL, "fpf", 1, "First pass statistics file name");
static const arg_def_t min_q_arg =
ARG_DEF(NULL, "min-q", 1, "Minimum quantizer");
static const arg_def_t max_q_arg =
ARG_DEF(NULL, "max-q", 1, "Maximum quantizer");
static const arg_def_t min_bitrate_arg =
ARG_DEF(NULL, "min-bitrate", 1, "Minimum bitrate");
static const arg_def_t max_bitrate_arg =
ARG_DEF(NULL, "max-bitrate", 1, "Maximum bitrate");
#if CONFIG_VP9_HIGHBITDEPTH
static const struct arg_enum_list bitdepth_enum[] = {
{"8", VPX_BITS_8},
{"10", VPX_BITS_10},
{"12", VPX_BITS_12},
{NULL, 0}
};
static const arg_def_t bitdepth_arg =
ARG_DEF_ENUM("d", "bit-depth", 1, "Bit depth for codec 8, 10 or 12. ",
bitdepth_enum);
#endif // CONFIG_VP9_HIGHBITDEPTH
static const arg_def_t *svc_args[] = {
&frames_arg, &width_arg, &height_arg,
&timebase_arg, &bitrate_arg, &skip_frames_arg, &spatial_layers_arg,
&kf_dist_arg, &scale_factors_arg, &passes_arg, &pass_arg,
&fpf_name_arg, &min_q_arg, &max_q_arg, &min_bitrate_arg,
&max_bitrate_arg, &temporal_layers_arg,
#if CONFIG_VP9_HIGHBITDEPTH
&bitdepth_arg,
#endif
NULL
};
static const uint32_t default_frames_to_skip = 0;
static const uint32_t default_frames_to_code = 60 * 60;
static const uint32_t default_width = 1920;
static const uint32_t default_height = 1080;
static const uint32_t default_timebase_num = 1;
static const uint32_t default_timebase_den = 60;
static const uint32_t default_bitrate = 1000;
static const uint32_t default_spatial_layers = 5;
static const uint32_t default_temporal_layers = 1;
static const uint32_t default_kf_dist = 100;
typedef struct {
const char *input_filename;
const char *output_filename;
uint32_t frames_to_code;
uint32_t frames_to_skip;
struct VpxInputContext input_ctx;
stats_io_t rc_stats;
int passes;
int pass;
} AppInput;
static const char *exec_name;
void usage_exit() {
fprintf(stderr, "Usage: %s <options> input_filename output_filename\n",
exec_name);
fprintf(stderr, "Options:\n");
arg_show_usage(stderr, svc_args);
exit(EXIT_FAILURE);
}
static void parse_command_line(int argc, const char **argv_,
AppInput *app_input, SvcContext *svc_ctx,
vpx_codec_enc_cfg_t *enc_cfg) {
struct arg arg = {0};
char **argv = NULL;
char **argi = NULL;
char **argj = NULL;
vpx_codec_err_t res;
int passes = 0;
int pass = 0;
const char *fpf_file_name = NULL;
unsigned int min_bitrate = 0;
unsigned int max_bitrate = 0;
char string_options[1024] = {0};
// initialize SvcContext with parameters that will be passed to vpx_svc_init
svc_ctx->log_level = SVC_LOG_DEBUG;
svc_ctx->spatial_layers = default_spatial_layers;
svc_ctx->temporal_layers = default_temporal_layers;
// start with default encoder configuration
res = vpx_codec_enc_config_default(vpx_codec_vp9_cx(), enc_cfg, 0);
if (res) {
die("Failed to get config: %s\n", vpx_codec_err_to_string(res));
}
// update enc_cfg with app default values
enc_cfg->g_w = default_width;
enc_cfg->g_h = default_height;
enc_cfg->g_timebase.num = default_timebase_num;
enc_cfg->g_timebase.den = default_timebase_den;
enc_cfg->rc_target_bitrate = default_bitrate;
enc_cfg->kf_min_dist = default_kf_dist;
enc_cfg->kf_max_dist = default_kf_dist;
enc_cfg->rc_end_usage = VPX_CQ;
// initialize AppInput with default values
app_input->frames_to_code = default_frames_to_code;
app_input->frames_to_skip = default_frames_to_skip;
// process command line options
argv = argv_dup(argc - 1, argv_ + 1);
for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) {
arg.argv_step = 1;
if (arg_match(&arg, &frames_arg, argi)) {
app_input->frames_to_code = arg_parse_uint(&arg);
} else if (arg_match(&arg, &width_arg, argi)) {
enc_cfg->g_w = arg_parse_uint(&arg);
} else if (arg_match(&arg, &height_arg, argi)) {
enc_cfg->g_h = arg_parse_uint(&arg);
} else if (arg_match(&arg, &timebase_arg, argi)) {
enc_cfg->g_timebase = arg_parse_rational(&arg);
} else if (arg_match(&arg, &bitrate_arg, argi)) {
enc_cfg->rc_target_bitrate = arg_parse_uint(&arg);
} else if (arg_match(&arg, &skip_frames_arg, argi)) {
app_input->frames_to_skip = arg_parse_uint(&arg);
} else if (arg_match(&arg, &spatial_layers_arg, argi)) {
svc_ctx->spatial_layers = arg_parse_uint(&arg);
} else if (arg_match(&arg, &temporal_layers_arg, argi)) {
svc_ctx->temporal_layers = arg_parse_uint(&arg);
} else if (arg_match(&arg, &kf_dist_arg, argi)) {
enc_cfg->kf_min_dist = arg_parse_uint(&arg);
enc_cfg->kf_max_dist = enc_cfg->kf_min_dist;
} else if (arg_match(&arg, &scale_factors_arg, argi)) {
// Append to string_options; snprintf must not read its own destination.
const size_t len = strlen(string_options);
snprintf(string_options + len, sizeof(string_options) - len,
" scale-factors=%s", arg.val);
} else if (arg_match(&arg, &passes_arg, argi)) {
passes = arg_parse_uint(&arg);
if (passes < 1 || passes > 2) {
die("Error: Invalid number of passes (%d)\n", passes);
}
} else if (arg_match(&arg, &pass_arg, argi)) {
pass = arg_parse_uint(&arg);
if (pass < 1 || pass > 2) {
die("Error: Invalid pass selected (%d)\n", pass);
}
} else if (arg_match(&arg, &fpf_name_arg, argi)) {
fpf_file_name = arg.val;
} else if (arg_match(&arg, &min_q_arg, argi)) {
// Append to string_options; snprintf must not read its own destination.
const size_t len = strlen(string_options);
snprintf(string_options + len, sizeof(string_options) - len,
" min-quantizers=%s", arg.val);
} else if (arg_match(&arg, &max_q_arg, argi)) {
const size_t len = strlen(string_options);
snprintf(string_options + len, sizeof(string_options) - len,
" max-quantizers=%s", arg.val);
} else if (arg_match(&arg, &min_bitrate_arg, argi)) {
min_bitrate = arg_parse_uint(&arg);
} else if (arg_match(&arg, &max_bitrate_arg, argi)) {
max_bitrate = arg_parse_uint(&arg);
#if CONFIG_VP9_HIGHBITDEPTH
} else if (arg_match(&arg, &bitdepth_arg, argi)) {
enc_cfg->g_bit_depth = arg_parse_enum_or_int(&arg);
switch (enc_cfg->g_bit_depth) {
case VPX_BITS_8:
enc_cfg->g_input_bit_depth = 8;
enc_cfg->g_profile = 0;
break;
case VPX_BITS_10:
enc_cfg->g_input_bit_depth = 10;
enc_cfg->g_profile = 2;
break;
case VPX_BITS_12:
enc_cfg->g_input_bit_depth = 12;
enc_cfg->g_profile = 2;
break;
default:
die("Error: Invalid bit depth selected (%d)\n", enc_cfg->g_bit_depth);
break;
}
#endif // CONFIG_VP9_HIGHBITDEPTH
} else {
++argj;
}
}
// There will be a space in front of the string options
if (strlen(string_options) > 0)
vpx_svc_set_options(svc_ctx, string_options + 1);
if (passes == 0 || passes == 1) {
if (pass) {
fprintf(stderr, "pass is ignored since there's only one pass\n");
}
enc_cfg->g_pass = VPX_RC_ONE_PASS;
} else {
if (pass == 0) {
die("pass must be specified when passes is 2\n");
}
if (fpf_file_name == NULL) {
die("fpf must be specified when passes is 2\n");
}
if (pass == 1) {
enc_cfg->g_pass = VPX_RC_FIRST_PASS;
if (!stats_open_file(&app_input->rc_stats, fpf_file_name, 0)) {
fatal("Failed to open statistics store");
}
} else {
enc_cfg->g_pass = VPX_RC_LAST_PASS;
if (!stats_open_file(&app_input->rc_stats, fpf_file_name, 1)) {
fatal("Failed to open statistics store");
}
enc_cfg->rc_twopass_stats_in = stats_get(&app_input->rc_stats);
}
app_input->passes = passes;
app_input->pass = pass;
}
if (enc_cfg->rc_target_bitrate > 0) {
if (min_bitrate > 0) {
enc_cfg->rc_2pass_vbr_minsection_pct =
min_bitrate * 100 / enc_cfg->rc_target_bitrate;
}
if (max_bitrate > 0) {
enc_cfg->rc_2pass_vbr_maxsection_pct =
max_bitrate * 100 / enc_cfg->rc_target_bitrate;
}
}
// Check for unrecognized options
for (argi = argv; *argi; ++argi)
if (argi[0][0] == '-' && strlen(argi[0]) > 1)
die("Error: Unrecognized option %s\n", *argi);
if (argv[0] == NULL || argv[1] == NULL) {
usage_exit();
}
app_input->input_filename = argv[0];
app_input->output_filename = argv[1];
free(argv);
if (enc_cfg->g_w < 16 || enc_cfg->g_w % 2 || enc_cfg->g_h < 16 ||
enc_cfg->g_h % 2)
die("Invalid resolution: %d x %d\n", enc_cfg->g_w, enc_cfg->g_h);
printf(
"Codec %s\nframes: %d, skip: %d\n"
"layers: %d\n"
"width %d, height: %d,\n"
"num: %d, den: %d, bitrate: %d,\n"
"gop size: %d\n",
vpx_codec_iface_name(vpx_codec_vp9_cx()), app_input->frames_to_code,
app_input->frames_to_skip,
svc_ctx->spatial_layers, enc_cfg->g_w, enc_cfg->g_h,
enc_cfg->g_timebase.num, enc_cfg->g_timebase.den,
enc_cfg->rc_target_bitrate, enc_cfg->kf_max_dist);
}
int main(int argc, const char **argv) {
AppInput app_input = {0};
VpxVideoWriter *writer = NULL;
VpxVideoInfo info = {0};
vpx_codec_ctx_t codec;
vpx_codec_enc_cfg_t enc_cfg;
SvcContext svc_ctx;
uint32_t i;
uint32_t frame_cnt = 0;
vpx_image_t raw;
vpx_codec_err_t res;
int pts = 0; /* PTS starts at 0 */
int frame_duration = 1; /* 1 timebase tick per frame */
FILE *infile = NULL;
int end_of_stream = 0;
int frames_received = 0;
memset(&svc_ctx, 0, sizeof(svc_ctx));
svc_ctx.log_print = 1;
exec_name = argv[0];
parse_command_line(argc, argv, &app_input, &svc_ctx, &enc_cfg);
// Allocate image buffer
#if CONFIG_VP9_HIGHBITDEPTH
if (!vpx_img_alloc(&raw, enc_cfg.g_input_bit_depth == 8 ?
VPX_IMG_FMT_I420 : VPX_IMG_FMT_I42016,
enc_cfg.g_w, enc_cfg.g_h, 32)) {
die("Failed to allocate image %dx%d\n", enc_cfg.g_w, enc_cfg.g_h);
}
#else
if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, enc_cfg.g_w, enc_cfg.g_h, 32)) {
die("Failed to allocate image %dx%d\n", enc_cfg.g_w, enc_cfg.g_h);
}
#endif // CONFIG_VP9_HIGHBITDEPTH
if (!(infile = fopen(app_input.input_filename, "rb")))
die("Failed to open %s for reading\n", app_input.input_filename);
// Initialize codec
if (vpx_svc_init(&svc_ctx, &codec, vpx_codec_vp9_cx(), &enc_cfg) !=
VPX_CODEC_OK)
die("Failed to initialize encoder\n");
info.codec_fourcc = VP9_FOURCC;
info.time_base.numerator = enc_cfg.g_timebase.num;
info.time_base.denominator = enc_cfg.g_timebase.den;
if (!(app_input.passes == 2 && app_input.pass == 1)) {
// We don't save the bitstream for the 1st pass on two pass rate control
writer = vpx_video_writer_open(app_input.output_filename, kContainerIVF,
&info);
if (!writer)
die("Failed to open %s for writing\n", app_input.output_filename);
}
// skip initial frames
for (i = 0; i < app_input.frames_to_skip; ++i)
vpx_img_read(&raw, infile);
// Encode frames
while (!end_of_stream) {
vpx_codec_iter_t iter = NULL;
const vpx_codec_cx_pkt_t *cx_pkt;
if (frame_cnt >= app_input.frames_to_code || !vpx_img_read(&raw, infile)) {
// We need one extra vpx_svc_encode call at end of stream to flush
// encoder and get remaining data
end_of_stream = 1;
}
res = vpx_svc_encode(&svc_ctx, &codec, (end_of_stream ? NULL : &raw),
pts, frame_duration, VPX_DL_GOOD_QUALITY);
printf("%s", vpx_svc_get_message(&svc_ctx));
if (res != VPX_CODEC_OK) {
die_codec(&codec, "Failed to encode frame");
}
while ((cx_pkt = vpx_codec_get_cx_data(&codec, &iter)) != NULL) {
switch (cx_pkt->kind) {
case VPX_CODEC_CX_FRAME_PKT: {
if (cx_pkt->data.frame.sz > 0)
vpx_video_writer_write_frame(writer,
cx_pkt->data.frame.buf,
cx_pkt->data.frame.sz,
cx_pkt->data.frame.pts);
printf("SVC frame: %d, kf: %d, size: %d, pts: %d\n", frames_received,
!!(cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY),
(int)cx_pkt->data.frame.sz, (int)cx_pkt->data.frame.pts);
++frames_received;
break;
}
case VPX_CODEC_STATS_PKT: {
stats_write(&app_input.rc_stats,
cx_pkt->data.twopass_stats.buf,
cx_pkt->data.twopass_stats.sz);
break;
}
default: {
break;
}
}
}
if (!end_of_stream) {
++frame_cnt;
pts += frame_duration;
}
}
printf("Processed %d frames\n", frame_cnt);
fclose(infile);
if (vpx_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec");
if (app_input.passes == 2)
stats_close(&app_input.rc_stats, 1);
if (writer) {
vpx_video_writer_close(writer);
}
vpx_img_free(&raw);
// display average size, psnr
printf("%s", vpx_svc_dump_statistics(&svc_ctx));
vpx_svc_release(&svc_ctx);
return EXIT_SUCCESS;
}

examples/vp9cx_set_ref.c (new file, 443 lines)

@@ -0,0 +1,443 @@
/*
* Copyright (c) 2016 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
// VP9 Set Reference Frame
// =======================
//
// This is an example demonstrating how to overwrite the VP9 encoder's
// internal reference frame. In the sample we set the last-frame reference
// to the current frame. This technique could be used to bounce between two
// cameras.
//
// The decoder must set the same reference frame on the same frame number,
// or the video will become corrupt. The 'test_decode' variable is set to 1
// in this example to verify that the encoder and decoder results match.
//
// Usage
// -----
// This example encodes a raw video; the last argument specifies the frame
// number on which to update the reference frame.
//
// Extra Variables
// ---------------
// This example keeps the frame number passed on the command line in the
// `update_frame_num` variable.
//
// Configuration
// -------------
// The reference frame is updated on the frame specified on the command
// line.
//
// Observing The Effects
// ---------------------
// The encoder and decoder results should match when the same reference
// frame setting operation is done in both the encoder and the decoder;
// otherwise an encoder/decoder mismatch is reported.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "vpx/vp8cx.h"
#include "vpx/vpx_decoder.h"
#include "vpx/vpx_encoder.h"
#include "./tools_common.h"
#include "./video_writer.h"
static const char *exec_name;
void usage_exit() {
fprintf(stderr, "Usage: %s <width> <height> <infile> <outfile> "
"<frame>\n",
exec_name);
exit(EXIT_FAILURE);
}
static int compare_img(const vpx_image_t *const img1,
const vpx_image_t *const img2) {
uint32_t l_w = img1->d_w;
uint32_t c_w =
(img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift;
const uint32_t c_h =
(img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift;
uint32_t i;
int match = 1;
match &= (img1->fmt == img2->fmt);
match &= (img1->d_w == img2->d_w);
match &= (img1->d_h == img2->d_h);
for (i = 0; i < img1->d_h; ++i)
match &= (memcmp(img1->planes[VPX_PLANE_Y] + i * img1->stride[VPX_PLANE_Y],
img2->planes[VPX_PLANE_Y] + i * img2->stride[VPX_PLANE_Y],
l_w) == 0);
for (i = 0; i < c_h; ++i)
match &= (memcmp(img1->planes[VPX_PLANE_U] + i * img1->stride[VPX_PLANE_U],
img2->planes[VPX_PLANE_U] + i * img2->stride[VPX_PLANE_U],
c_w) == 0);
for (i = 0; i < c_h; ++i)
match &= (memcmp(img1->planes[VPX_PLANE_V] + i * img1->stride[VPX_PLANE_V],
img2->planes[VPX_PLANE_V] + i * img2->stride[VPX_PLANE_V],
c_w) == 0);
return match;
}
#define mmin(a, b) ((a) < (b) ? (a) : (b))
static void find_mismatch(const vpx_image_t *const img1,
const vpx_image_t *const img2,
int yloc[4], int uloc[4], int vloc[4]) {
const uint32_t bsize = 64;
const uint32_t bsizey = bsize >> img1->y_chroma_shift;
const uint32_t bsizex = bsize >> img1->x_chroma_shift;
const uint32_t c_w =
(img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift;
const uint32_t c_h =
(img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift;
int match = 1;
uint32_t i, j;
yloc[0] = yloc[1] = yloc[2] = yloc[3] = -1;
for (i = 0, match = 1; match && i < img1->d_h; i += bsize) {
for (j = 0; match && j < img1->d_w; j += bsize) {
int k, l;
const int si = mmin(i + bsize, img1->d_h) - i;
const int sj = mmin(j + bsize, img1->d_w) - j;
for (k = 0; match && k < si; ++k) {
for (l = 0; match && l < sj; ++l) {
if (*(img1->planes[VPX_PLANE_Y] +
(i + k) * img1->stride[VPX_PLANE_Y] + j + l) !=
*(img2->planes[VPX_PLANE_Y] +
(i + k) * img2->stride[VPX_PLANE_Y] + j + l)) {
yloc[0] = i + k;
yloc[1] = j + l;
yloc[2] = *(img1->planes[VPX_PLANE_Y] +
(i + k) * img1->stride[VPX_PLANE_Y] + j + l);
yloc[3] = *(img2->planes[VPX_PLANE_Y] +
(i + k) * img2->stride[VPX_PLANE_Y] + j + l);
match = 0;
break;
}
}
}
}
}
uloc[0] = uloc[1] = uloc[2] = uloc[3] = -1;
for (i = 0, match = 1; match && i < c_h; i += bsizey) {
for (j = 0; match && j < c_w; j += bsizex) {
int k, l;
const int si = mmin(i + bsizey, c_h) - i;
const int sj = mmin(j + bsizex, c_w) - j;
for (k = 0; match && k < si; ++k) {
for (l = 0; match && l < sj; ++l) {
if (*(img1->planes[VPX_PLANE_U] +
(i + k) * img1->stride[VPX_PLANE_U] + j + l) !=
*(img2->planes[VPX_PLANE_U] +
(i + k) * img2->stride[VPX_PLANE_U] + j + l)) {
uloc[0] = i + k;
uloc[1] = j + l;
uloc[2] = *(img1->planes[VPX_PLANE_U] +
(i + k) * img1->stride[VPX_PLANE_U] + j + l);
uloc[3] = *(img2->planes[VPX_PLANE_U] +
(i + k) * img2->stride[VPX_PLANE_U] + j + l);
match = 0;
break;
}
}
}
}
}
vloc[0] = vloc[1] = vloc[2] = vloc[3] = -1;
for (i = 0, match = 1; match && i < c_h; i += bsizey) {
for (j = 0; match && j < c_w; j += bsizex) {
int k, l;
const int si = mmin(i + bsizey, c_h) - i;
const int sj = mmin(j + bsizex, c_w) - j;
for (k = 0; match && k < si; ++k) {
for (l = 0; match && l < sj; ++l) {
if (*(img1->planes[VPX_PLANE_V] +
(i + k) * img1->stride[VPX_PLANE_V] + j + l) !=
*(img2->planes[VPX_PLANE_V] +
(i + k) * img2->stride[VPX_PLANE_V] + j + l)) {
vloc[0] = i + k;
vloc[1] = j + l;
vloc[2] = *(img1->planes[VPX_PLANE_V] +
(i + k) * img1->stride[VPX_PLANE_V] + j + l);
vloc[3] = *(img2->planes[VPX_PLANE_V] +
(i + k) * img2->stride[VPX_PLANE_V] + j + l);
match = 0;
break;
}
}
}
}
}
}
static void testing_decode(vpx_codec_ctx_t *encoder,
vpx_codec_ctx_t *decoder,
vpx_codec_enc_cfg_t *cfg,
unsigned int frame_out,
int *mismatch_seen) {
vpx_image_t enc_img, dec_img;
struct vp9_ref_frame ref_enc, ref_dec;
if (*mismatch_seen)
return;
ref_enc.idx = 0;
ref_dec.idx = 0;
if (vpx_codec_control(encoder, VP9_GET_REFERENCE, &ref_enc))
die_codec(encoder, "Failed to get encoder reference frame");
enc_img = ref_enc.img;
if (vpx_codec_control(decoder, VP9_GET_REFERENCE, &ref_dec))
die_codec(decoder, "Failed to get decoder reference frame");
dec_img = ref_dec.img;
if (!compare_img(&enc_img, &dec_img)) {
int y[4], u[4], v[4];
*mismatch_seen = 1;
find_mismatch(&enc_img, &dec_img, y, u, v);
printf("Encode/decode mismatch on frame %d at"
" Y[%d, %d] {%d/%d},"
" U[%d, %d] {%d/%d},"
" V[%d, %d] {%d/%d}",
frame_out,
y[0], y[1], y[2], y[3],
u[0], u[1], u[2], u[3],
v[0], v[1], v[2], v[3]);
}
vpx_img_free(&enc_img);
vpx_img_free(&dec_img);
}
static int encode_frame(vpx_codec_ctx_t *ecodec,
vpx_codec_enc_cfg_t *cfg,
vpx_image_t *img,
unsigned int frame_in,
VpxVideoWriter *writer,
int test_decode,
vpx_codec_ctx_t *dcodec,
unsigned int *frame_out,
int *mismatch_seen) {
int got_pkts = 0;
vpx_codec_iter_t iter = NULL;
const vpx_codec_cx_pkt_t *pkt = NULL;
int got_data;
const vpx_codec_err_t res = vpx_codec_encode(ecodec, img, frame_in, 1,
0, VPX_DL_GOOD_QUALITY);
if (res != VPX_CODEC_OK)
die_codec(ecodec, "Failed to encode frame");
got_data = 0;
while ((pkt = vpx_codec_get_cx_data(ecodec, &iter)) != NULL) {
got_pkts = 1;
if (pkt->kind == VPX_CODEC_CX_FRAME_PKT) {
const int keyframe = (pkt->data.frame.flags & VPX_FRAME_IS_KEY) != 0;
if (!(pkt->data.frame.flags & VPX_FRAME_IS_FRAGMENT)) {
*frame_out += 1;
}
if (!vpx_video_writer_write_frame(writer,
pkt->data.frame.buf,
pkt->data.frame.sz,
pkt->data.frame.pts)) {
die_codec(ecodec, "Failed to write compressed frame");
}
printf(keyframe ? "K" : ".");
fflush(stdout);
got_data = 1;
// Decode 1 frame.
if (test_decode) {
if (vpx_codec_decode(dcodec, pkt->data.frame.buf,
(unsigned int)pkt->data.frame.sz, NULL, 0))
die_codec(dcodec, "Failed to decode frame.");
}
}
}
// Mismatch checking
if (got_data && test_decode) {
testing_decode(ecodec, dcodec, cfg, *frame_out, mismatch_seen);
}
return got_pkts;
}
int main(int argc, char **argv) {
FILE *infile = NULL;
// Encoder
vpx_codec_ctx_t ecodec = {0};
vpx_codec_enc_cfg_t cfg = {0};
unsigned int frame_in = 0;
vpx_image_t raw;
vpx_codec_err_t res;
VpxVideoInfo info = {0};
VpxVideoWriter *writer = NULL;
const VpxInterface *encoder = NULL;
// Test encoder/decoder mismatch.
int test_decode = 1;
// Decoder
vpx_codec_ctx_t dcodec;
unsigned int frame_out = 0;
// The frame number to set reference frame on
int update_frame_num = 0;
int mismatch_seen = 0;
const int fps = 30;
const int bitrate = 500;
const char *codec_used = "vp9";
const char *width_arg = NULL;
const char *height_arg = NULL;
const char *infile_arg = NULL;
const char *outfile_arg = NULL;
exec_name = argv[0];
if (argc != 6)
die("Invalid number of arguments");
width_arg = argv[1];
height_arg = argv[2];
infile_arg = argv[3];
outfile_arg = argv[4];
encoder = get_vpx_encoder_by_name(codec_used);
if (!encoder)
die("Unsupported codec.");
update_frame_num = atoi(argv[5]);
if (update_frame_num <= 0)
die("Couldn't parse frame number '%s'\n", argv[5]);
info.codec_fourcc = encoder->fourcc;
info.frame_width = strtol(width_arg, NULL, 0);
info.frame_height = strtol(height_arg, NULL, 0);
info.time_base.numerator = 1;
info.time_base.denominator = fps;
if (info.frame_width <= 0 ||
info.frame_height <= 0 ||
(info.frame_width % 2) != 0 ||
(info.frame_height % 2) != 0) {
die("Invalid frame size: %dx%d", info.frame_width, info.frame_height);
}
if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, info.frame_width,
info.frame_height, 1)) {
die("Failed to allocate image.");
}
printf("Using %s\n", vpx_codec_iface_name(encoder->codec_interface()));
res = vpx_codec_enc_config_default(encoder->codec_interface(), &cfg, 0);
if (res)
die_codec(&ecodec, "Failed to get default codec config.");
cfg.g_w = info.frame_width;
cfg.g_h = info.frame_height;
cfg.g_timebase.num = info.time_base.numerator;
cfg.g_timebase.den = info.time_base.denominator;
cfg.rc_target_bitrate = bitrate;
cfg.g_lag_in_frames = 25;
writer = vpx_video_writer_open(outfile_arg, kContainerIVF, &info);
if (!writer)
die("Failed to open %s for writing.", outfile_arg);
if (!(infile = fopen(infile_arg, "rb")))
die("Failed to open %s for reading.", infile_arg);
if (vpx_codec_enc_init(&ecodec, encoder->codec_interface(), &cfg, 0))
die_codec(&ecodec, "Failed to initialize encoder");
// Disable alt_ref.
if (vpx_codec_control(&ecodec, VP8E_SET_ENABLEAUTOALTREF, 0))
die_codec(&ecodec, "Failed to set enable auto alt ref");
if (test_decode) {
const VpxInterface *decoder = get_vpx_decoder_by_name(codec_used);
if (vpx_codec_dec_init(&dcodec, decoder->codec_interface(), NULL, 0))
die_codec(&dcodec, "Failed to initialize decoder.");
}
// Encode frames.
while (vpx_img_read(&raw, infile)) {
// In VP9, the reference buffers (cm->frame_buffs[i].buf) are allocated
// while calling vpx_codec_encode(), thus, setting reference for 1st frame
// isn't supported.
if (update_frame_num > 1 && frame_out + 1 == update_frame_num) {
vpx_ref_frame_t ref;
ref.frame_type = VP8_LAST_FRAME;
ref.img = raw;
// Set reference frame in encoder.
if (vpx_codec_control(&ecodec, VP8_SET_REFERENCE, &ref))
die_codec(&ecodec, "Failed to set reference frame");
// If set_reference in decoder is commented out, the enc/dec mismatch
// would be seen.
if (test_decode) {
if (vpx_codec_control(&dcodec, VP8_SET_REFERENCE, &ref))
die_codec(&dcodec, "Failed to set reference frame");
}
}
encode_frame(&ecodec, &cfg, &raw, frame_in, writer, test_decode,
&dcodec, &frame_out, &mismatch_seen);
frame_in++;
if (mismatch_seen)
break;
}
// Flush encoder.
if (!mismatch_seen)
while (encode_frame(&ecodec, &cfg, NULL, frame_in, writer, test_decode,
&dcodec, &frame_out, &mismatch_seen)) {};
printf("\n");
fclose(infile);
printf("Processed %d frames.\n", frame_out);
if (test_decode) {
if (!mismatch_seen)
printf("Encoder/decoder results are matching.\n");
else
printf("Encoder/decoder results are NOT matching.\n");
}
if (test_decode)
if (vpx_codec_destroy(&dcodec))
die_codec(&dcodec, "Failed to destroy decoder");
vpx_img_free(&raw);
if (vpx_codec_destroy(&ecodec))
die_codec(&ecodec, "Failed to destroy encoder.");
vpx_video_writer_close(writer);
return EXIT_SUCCESS;
}


@@ -1,733 +0,0 @@
/*
* Copyright (c) 2012 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
// This is an example demonstrating how to implement a multi-layer VPx
// encoding scheme based on temporal scalability for video applications
// that benefit from a scalable bitstream.
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "./vpx_config.h"
#include "vpx_ports/vpx_timer.h"
#include "vpx/vp8cx.h"
#include "vpx/vpx_encoder.h"
#include "./tools_common.h"
#include "./video_writer.h"
static const char *exec_name;
void usage_exit() {
exit(EXIT_FAILURE);
}
// Denoiser states, for temporal denoising.
enum denoiserState {
kDenoiserOff,
kDenoiserOnYOnly,
kDenoiserOnYUV,
kDenoiserOnYUVAggressive,
kDenoiserOnAdaptive
};
static int mode_to_num_layers[12] = {1, 2, 2, 3, 3, 3, 3, 5, 2, 3, 3, 3};
// For rate control encoding stats.
struct RateControlMetrics {
// Number of input frames per layer.
int layer_input_frames[VPX_TS_MAX_LAYERS];
// Total (cumulative) number of encoded frames per layer.
int layer_tot_enc_frames[VPX_TS_MAX_LAYERS];
// Number of encoded non-key frames per layer.
int layer_enc_frames[VPX_TS_MAX_LAYERS];
// Framerate per layer (cumulative).
double layer_framerate[VPX_TS_MAX_LAYERS];
// Target average frame size per layer (per-frame-bandwidth per layer).
double layer_pfb[VPX_TS_MAX_LAYERS];
// Actual average frame size per layer.
double layer_avg_frame_size[VPX_TS_MAX_LAYERS];
// Average rate mismatch per layer (|target - actual| / target).
double layer_avg_rate_mismatch[VPX_TS_MAX_LAYERS];
// Actual encoding bitrate per layer (cumulative).
double layer_encoding_bitrate[VPX_TS_MAX_LAYERS];
};
// Note: these rate control metrics assume only 1 key frame in the
// sequence (i.e., first frame only). So for temporal pattern# 7
// (which has key frame for every frame on base layer), the metrics
// computation will be off/wrong.
// TODO(marpan): Update these metrics to account for multiple key frames
// in the stream.
static void set_rate_control_metrics(struct RateControlMetrics *rc,
vpx_codec_enc_cfg_t *cfg) {
unsigned int i = 0;
// Set the layer (cumulative) framerate and the target layer (non-cumulative)
// per-frame-bandwidth, for the rate control encoding stats below.
const double framerate = cfg->g_timebase.den / cfg->g_timebase.num;
rc->layer_framerate[0] = framerate / cfg->ts_rate_decimator[0];
rc->layer_pfb[0] = 1000.0 * cfg->ts_target_bitrate[0] /
rc->layer_framerate[0];
for (i = 0; i < cfg->ts_number_layers; ++i) {
if (i > 0) {
rc->layer_framerate[i] = framerate / cfg->ts_rate_decimator[i];
rc->layer_pfb[i] = 1000.0 *
(cfg->ts_target_bitrate[i] - cfg->ts_target_bitrate[i - 1]) /
(rc->layer_framerate[i] - rc->layer_framerate[i - 1]);
}
rc->layer_input_frames[i] = 0;
rc->layer_enc_frames[i] = 0;
rc->layer_tot_enc_frames[i] = 0;
rc->layer_encoding_bitrate[i] = 0.0;
rc->layer_avg_frame_size[i] = 0.0;
rc->layer_avg_rate_mismatch[i] = 0.0;
}
}
static void printout_rate_control_summary(struct RateControlMetrics *rc,
vpx_codec_enc_cfg_t *cfg,
int frame_cnt) {
unsigned int i = 0;
int tot_num_frames = 0;
printf("Total number of processed frames: %d\n\n", frame_cnt - 1);
printf("Rate control layer stats for %d layer(s):\n\n",
cfg->ts_number_layers);
for (i = 0; i < cfg->ts_number_layers; ++i) {
const int num_dropped = (i > 0) ?
(rc->layer_input_frames[i] - rc->layer_enc_frames[i]) :
(rc->layer_input_frames[i] - rc->layer_enc_frames[i] - 1);
tot_num_frames += rc->layer_input_frames[i];
rc->layer_encoding_bitrate[i] = 0.001 * rc->layer_framerate[i] *
rc->layer_encoding_bitrate[i] / tot_num_frames;
rc->layer_avg_frame_size[i] = rc->layer_avg_frame_size[i] /
rc->layer_enc_frames[i];
rc->layer_avg_rate_mismatch[i] = 100.0 * rc->layer_avg_rate_mismatch[i] /
rc->layer_enc_frames[i];
printf("For layer#: %d \n", i);
printf("Bitrate (target vs actual): %d %f \n", cfg->ts_target_bitrate[i],
rc->layer_encoding_bitrate[i]);
printf("Average frame size (target vs actual): %f %f \n", rc->layer_pfb[i],
rc->layer_avg_frame_size[i]);
printf("Average rate_mismatch: %f \n", rc->layer_avg_rate_mismatch[i]);
printf("Number of input frames, encoded (non-key) frames, "
"and perc dropped frames: %d %d %f \n", rc->layer_input_frames[i],
rc->layer_enc_frames[i],
100.0 * num_dropped / rc->layer_input_frames[i]);
printf("\n");
}
if ((frame_cnt - 1) != tot_num_frames)
die("Error: Number of input frames not equal to output! \n");
}
// Temporal scaling parameters:
// NOTE: The 3 prediction frames cannot be used interchangeably due to
// differences in the way they are handled throughout the code. The
// frames should be allocated to layers in the order LAST, GF, ARF.
// Other combinations work, but may produce slightly inferior results.
static void set_temporal_layer_pattern(int layering_mode,
vpx_codec_enc_cfg_t *cfg,
int *layer_flags,
int *flag_periodicity) {
switch (layering_mode) {
case 0: {
// 1-layer.
int ids[1] = {0};
cfg->ts_periodicity = 1;
*flag_periodicity = 1;
cfg->ts_number_layers = 1;
cfg->ts_rate_decimator[0] = 1;
memcpy(cfg->ts_layer_id, ids, sizeof(ids));
// Update L only.
layer_flags[0] = VPX_EFLAG_FORCE_KF | VP8_EFLAG_NO_UPD_GF |
VP8_EFLAG_NO_UPD_ARF;
break;
}
case 1: {
// 2-layers, 2-frame period.
int ids[2] = {0, 1};
cfg->ts_periodicity = 2;
*flag_periodicity = 2;
cfg->ts_number_layers = 2;
cfg->ts_rate_decimator[0] = 2;
cfg->ts_rate_decimator[1] = 1;
memcpy(cfg->ts_layer_id, ids, sizeof(ids));
#if 1
// 0=L, 1=GF, Intra-layer prediction enabled.
layer_flags[0] = VPX_EFLAG_FORCE_KF | VP8_EFLAG_NO_UPD_GF |
VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF;
layer_flags[1] = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST |
VP8_EFLAG_NO_REF_ARF;
#else
// 0=L, 1=GF, Intra-layer prediction disabled.
layer_flags[0] = VPX_EFLAG_FORCE_KF | VP8_EFLAG_NO_UPD_GF |
VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF;
layer_flags[1] = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST |
VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_REF_LAST;
#endif
break;
}
case 2: {
// 2-layers, 3-frame period.
int ids[3] = {0, 1, 1};
cfg->ts_periodicity = 3;
*flag_periodicity = 3;
cfg->ts_number_layers = 2;
cfg->ts_rate_decimator[0] = 3;
cfg->ts_rate_decimator[1] = 1;
memcpy(cfg->ts_layer_id, ids, sizeof(ids));
// 0=L, 1=GF, Intra-layer prediction enabled.
layer_flags[0] = VPX_EFLAG_FORCE_KF | VP8_EFLAG_NO_REF_GF |
VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF;
layer_flags[1] =
layer_flags[2] = VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF |
VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST;
break;
}
case 3: {
// 3-layers, 6-frame period.
int ids[6] = {0, 2, 2, 1, 2, 2};
cfg->ts_periodicity = 6;
*flag_periodicity = 6;
cfg->ts_number_layers = 3;
cfg->ts_rate_decimator[0] = 6;
cfg->ts_rate_decimator[1] = 3;
cfg->ts_rate_decimator[2] = 1;
memcpy(cfg->ts_layer_id, ids, sizeof(ids));
// 0=L, 1=GF, 2=ARF, Intra-layer prediction enabled.
layer_flags[0] = VPX_EFLAG_FORCE_KF | VP8_EFLAG_NO_REF_GF |
VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF;
layer_flags[3] = VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_ARF |
VP8_EFLAG_NO_UPD_LAST;
layer_flags[1] =
layer_flags[2] =
layer_flags[4] =
layer_flags[5] = VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_LAST;
break;
}
case 4: {
// 3-layers, 4-frame period.
int ids[4] = {0, 2, 1, 2};
cfg->ts_periodicity = 4;
*flag_periodicity = 4;
cfg->ts_number_layers = 3;
cfg->ts_rate_decimator[0] = 4;
cfg->ts_rate_decimator[1] = 2;
cfg->ts_rate_decimator[2] = 1;
memcpy(cfg->ts_layer_id, ids, sizeof(ids));
// 0=L, 1=GF, 2=ARF, Intra-layer prediction disabled.
layer_flags[0] = VPX_EFLAG_FORCE_KF | VP8_EFLAG_NO_REF_GF |
VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF;
layer_flags[2] = VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF |
VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST;
layer_flags[1] =
layer_flags[3] = VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_LAST |
VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF;
break;
}
case 5: {
// 3-layers, 4-frame period.
int ids[4] = {0, 2, 1, 2};
cfg->ts_periodicity = 4;
*flag_periodicity = 4;
cfg->ts_number_layers = 3;
cfg->ts_rate_decimator[0] = 4;
cfg->ts_rate_decimator[1] = 2;
cfg->ts_rate_decimator[2] = 1;
memcpy(cfg->ts_layer_id, ids, sizeof(ids));
// 0=L, 1=GF, 2=ARF, Intra-layer prediction enabled in layer 1, disabled
// in layer 2.
layer_flags[0] = VPX_EFLAG_FORCE_KF | VP8_EFLAG_NO_REF_GF |
VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF;
layer_flags[2] = VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_LAST |
VP8_EFLAG_NO_UPD_ARF;
layer_flags[1] =
layer_flags[3] = VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_LAST |
VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF;
break;
}
case 6: {
// 3-layers, 4-frame period.
int ids[4] = {0, 2, 1, 2};
cfg->ts_periodicity = 4;
*flag_periodicity = 4;
cfg->ts_number_layers = 3;
cfg->ts_rate_decimator[0] = 4;
cfg->ts_rate_decimator[1] = 2;
cfg->ts_rate_decimator[2] = 1;
memcpy(cfg->ts_layer_id, ids, sizeof(ids));
// 0=L, 1=GF, 2=ARF, Intra-layer prediction enabled.
layer_flags[0] = VPX_EFLAG_FORCE_KF | VP8_EFLAG_NO_REF_GF |
VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF;
layer_flags[2] = VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_LAST |
VP8_EFLAG_NO_UPD_ARF;
layer_flags[1] =
layer_flags[3] = VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF;
break;
}
case 7: {
// NOTE: Probably of academic interest only.
// 5-layers, 16-frame period.
int ids[16] = {0, 4, 3, 4, 2, 4, 3, 4, 1, 4, 3, 4, 2, 4, 3, 4};
cfg->ts_periodicity = 16;
*flag_periodicity = 16;
cfg->ts_number_layers = 5;
cfg->ts_rate_decimator[0] = 16;
cfg->ts_rate_decimator[1] = 8;
cfg->ts_rate_decimator[2] = 4;
cfg->ts_rate_decimator[3] = 2;
cfg->ts_rate_decimator[4] = 1;
memcpy(cfg->ts_layer_id, ids, sizeof(ids));
layer_flags[0] = VPX_EFLAG_FORCE_KF;
layer_flags[1] =
layer_flags[3] =
layer_flags[5] =
layer_flags[7] =
layer_flags[9] =
layer_flags[11] =
layer_flags[13] =
layer_flags[15] = VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF |
VP8_EFLAG_NO_UPD_ARF;
layer_flags[2] =
layer_flags[6] =
layer_flags[10] =
layer_flags[14] = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_GF;
layer_flags[4] =
layer_flags[12] = VP8_EFLAG_NO_REF_LAST | VP8_EFLAG_NO_UPD_ARF;
layer_flags[8] = VP8_EFLAG_NO_REF_LAST | VP8_EFLAG_NO_REF_GF;
break;
}
case 8: {
// 2-layers, with sync point at first frame of layer 1.
int ids[2] = {0, 1};
cfg->ts_periodicity = 2;
*flag_periodicity = 8;
cfg->ts_number_layers = 2;
cfg->ts_rate_decimator[0] = 2;
cfg->ts_rate_decimator[1] = 1;
memcpy(cfg->ts_layer_id, ids, sizeof(ids));
// 0=L, 1=GF.
// ARF is used as predictor for all frames, and is only updated on
// key frame. Sync point every 8 frames.
// Layer 0: predict from L and ARF, update L and G.
layer_flags[0] = VPX_EFLAG_FORCE_KF | VP8_EFLAG_NO_REF_GF |
VP8_EFLAG_NO_UPD_ARF;
// Layer 1: sync point: predict from L and ARF, and update G.
layer_flags[1] = VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_UPD_LAST |
VP8_EFLAG_NO_UPD_ARF;
// Layer 0, predict from L and ARF, update L.
layer_flags[2] = VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_UPD_GF |
VP8_EFLAG_NO_UPD_ARF;
// Layer 1: predict from L, G and ARF, and update G.
layer_flags[3] = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST |
VP8_EFLAG_NO_UPD_ENTROPY;
// Layer 0.
layer_flags[4] = layer_flags[2];
// Layer 1.
layer_flags[5] = layer_flags[3];
// Layer 0.
layer_flags[6] = layer_flags[4];
// Layer 1.
layer_flags[7] = layer_flags[5];
break;
}
case 9: {
// 3-layers: Sync points for layer 1 and 2 every 8 frames.
int ids[4] = {0, 2, 1, 2};
cfg->ts_periodicity = 4;
*flag_periodicity = 8;
cfg->ts_number_layers = 3;
cfg->ts_rate_decimator[0] = 4;
cfg->ts_rate_decimator[1] = 2;
cfg->ts_rate_decimator[2] = 1;
memcpy(cfg->ts_layer_id, ids, sizeof(ids));
// 0=L, 1=GF, 2=ARF.
layer_flags[0] = VPX_EFLAG_FORCE_KF | VP8_EFLAG_NO_REF_GF |
VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF;
layer_flags[1] = VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF |
VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF;
layer_flags[2] = VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF |
VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_ARF;
layer_flags[3] =
layer_flags[5] = VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF;
layer_flags[4] = VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF |
VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF;
layer_flags[6] = VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_LAST |
VP8_EFLAG_NO_UPD_ARF;
layer_flags[7] = VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF |
VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_ENTROPY;
break;
}
case 10: {
// 3-layers structure where ARF is used as predictor for all frames,
// and is only updated on key frame.
// Sync points for layer 1 and 2 every 8 frames.
int ids[4] = {0, 2, 1, 2};
cfg->ts_periodicity = 4;
*flag_periodicity = 8;
cfg->ts_number_layers = 3;
cfg->ts_rate_decimator[0] = 4;
cfg->ts_rate_decimator[1] = 2;
cfg->ts_rate_decimator[2] = 1;
memcpy(cfg->ts_layer_id, ids, sizeof(ids));
// 0=L, 1=GF, 2=ARF.
// Layer 0: predict from L and ARF; update L and G.
layer_flags[0] = VPX_EFLAG_FORCE_KF | VP8_EFLAG_NO_UPD_ARF |
VP8_EFLAG_NO_REF_GF;
// Layer 2: sync point: predict from L and ARF; update none.
layer_flags[1] = VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_UPD_GF |
VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST |
VP8_EFLAG_NO_UPD_ENTROPY;
// Layer 1: sync point: predict from L and ARF; update G.
layer_flags[2] = VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_UPD_ARF |
VP8_EFLAG_NO_UPD_LAST;
// Layer 2: predict from L, G, ARF; update none.
layer_flags[3] = VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF |
VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_ENTROPY;
// Layer 0: predict from L and ARF; update L.
layer_flags[4] = VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF |
VP8_EFLAG_NO_REF_GF;
// Layer 2: predict from L, G, ARF; update none.
layer_flags[5] = layer_flags[3];
// Layer 1: predict from L, G, ARF; update G.
layer_flags[6] = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST;
// Layer 2: predict from L, G, ARF; update none.
layer_flags[7] = layer_flags[3];
break;
}
case 11:
default: {
// 3-layers structure as in case 10, but no sync/refresh points for
// layer 1 and 2.
int ids[4] = {0, 2, 1, 2};
cfg->ts_periodicity = 4;
*flag_periodicity = 8;
cfg->ts_number_layers = 3;
cfg->ts_rate_decimator[0] = 4;
cfg->ts_rate_decimator[1] = 2;
cfg->ts_rate_decimator[2] = 1;
memcpy(cfg->ts_layer_id, ids, sizeof(ids));
// 0=L, 1=GF, 2=ARF.
// Layer 0: predict from L and ARF; update L.
layer_flags[0] = VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF |
VP8_EFLAG_NO_REF_GF;
layer_flags[4] = layer_flags[0];
// Layer 1: predict from L, G, ARF; update G.
layer_flags[2] = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST;
layer_flags[6] = layer_flags[2];
// Layer 2: predict from L, G, ARF; update none.
layer_flags[1] = VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF |
VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_ENTROPY;
layer_flags[3] = layer_flags[1];
layer_flags[5] = layer_flags[1];
layer_flags[7] = layer_flags[1];
break;
}
}
}
int main(int argc, char **argv) {
VpxVideoWriter *outfile[VPX_TS_MAX_LAYERS] = {NULL};
vpx_codec_ctx_t codec;
vpx_codec_enc_cfg_t cfg;
int frame_cnt = 0;
vpx_image_t raw;
vpx_codec_err_t res;
unsigned int width;
unsigned int height;
int speed;
int frame_avail;
int got_data = 0;  // Nothing is pending before the first encode call.
int flags = 0;
unsigned int i;
int pts = 0; // PTS starts at 0.
int frame_duration = 1; // 1 timebase tick per frame.
int layering_mode = 0;
int layer_flags[VPX_TS_MAX_PERIODICITY] = {0};
int flag_periodicity = 1;
vpx_svc_layer_id_t layer_id = {0, 0};
const VpxInterface *encoder = NULL;
FILE *infile = NULL;
struct RateControlMetrics rc;
int64_t cx_time = 0;
const int min_args_base = 11;
#if CONFIG_VP9_HIGHBITDEPTH
vpx_bit_depth_t bit_depth = VPX_BITS_8;
int input_bit_depth = 8;
const int min_args = min_args_base + 1;
#else
const int min_args = min_args_base;
#endif // CONFIG_VP9_HIGHBITDEPTH
exec_name = argv[0];
// Check usage and arguments.
if (argc < min_args) {
#if CONFIG_VP9_HIGHBITDEPTH
die("Usage: %s <infile> <outfile> <codec_type(vp8/vp9)> <width> <height> "
"<rate_num> <rate_den> <speed> <frame_drop_threshold> <mode> "
"<Rate_0> ... <Rate_nlayers-1> <bit-depth> \n", argv[0]);
#else
die("Usage: %s <infile> <outfile> <codec_type(vp8/vp9)> <width> <height> "
"<rate_num> <rate_den> <speed> <frame_drop_threshold> <mode> "
"<Rate_0> ... <Rate_nlayers-1> \n", argv[0]);
#endif // CONFIG_VP9_HIGHBITDEPTH
}
encoder = get_vpx_encoder_by_name(argv[3]);
if (!encoder)
die("Unsupported codec.");
printf("Using %s\n", vpx_codec_iface_name(encoder->codec_interface()));
width = strtol(argv[4], NULL, 0);
height = strtol(argv[5], NULL, 0);
if (width < 16 || width % 2 || height < 16 || height % 2) {
die("Invalid resolution: %u x %u", width, height);
}
layering_mode = strtol(argv[10], NULL, 0);
if (layering_mode < 0 || layering_mode > 12) {
die("Invalid layering mode (0..12) %s", argv[10]);
}
if (argc != min_args + mode_to_num_layers[layering_mode]) {
die("Invalid number of arguments");
}
#if CONFIG_VP9_HIGHBITDEPTH
switch (strtol(argv[argc-1], NULL, 0)) {
case 8:
bit_depth = VPX_BITS_8;
input_bit_depth = 8;
break;
case 10:
bit_depth = VPX_BITS_10;
input_bit_depth = 10;
break;
case 12:
bit_depth = VPX_BITS_12;
input_bit_depth = 12;
break;
default:
die("Invalid bit depth (8, 10, 12) %s", argv[argc-1]);
}
if (!vpx_img_alloc(&raw,
bit_depth == VPX_BITS_8 ? VPX_IMG_FMT_I420 :
VPX_IMG_FMT_I42016,
width, height, 32)) {
die("Failed to allocate image: %ux%u", width, height);
}
#else
if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, width, height, 32)) {
die("Failed to allocate image: %ux%u", width, height);
}
#endif // CONFIG_VP9_HIGHBITDEPTH
// Populate encoder configuration.
res = vpx_codec_enc_config_default(encoder->codec_interface(), &cfg, 0);
if (res) {
printf("Failed to get config: %s\n", vpx_codec_err_to_string(res));
return EXIT_FAILURE;
}
// Update the default configuration with our settings.
cfg.g_w = width;
cfg.g_h = height;
#if CONFIG_VP9_HIGHBITDEPTH
if (bit_depth != VPX_BITS_8) {
cfg.g_bit_depth = bit_depth;
cfg.g_input_bit_depth = input_bit_depth;
cfg.g_profile = 2;
}
#endif // CONFIG_VP9_HIGHBITDEPTH
// Timebase format, e.g. for 30 fps: numerator = 1, denominator = 30.
cfg.g_timebase.num = strtol(argv[6], NULL, 0);
cfg.g_timebase.den = strtol(argv[7], NULL, 0);
speed = strtol(argv[8], NULL, 0);
if (speed < 0) {
die("Invalid speed setting: must be non-negative");
}
for (i = min_args_base;
(int)i < min_args_base + mode_to_num_layers[layering_mode];
++i) {
cfg.ts_target_bitrate[i - min_args_base] = strtol(argv[i], NULL, 0);
}
// Real time parameters.
cfg.rc_dropframe_thresh = strtol(argv[9], NULL, 0);
cfg.rc_end_usage = VPX_CBR;
cfg.rc_resize_allowed = 0;
cfg.rc_min_quantizer = 2;
cfg.rc_max_quantizer = 56;
cfg.rc_undershoot_pct = 50;
cfg.rc_overshoot_pct = 50;
cfg.rc_buf_initial_sz = 500;
cfg.rc_buf_optimal_sz = 600;
cfg.rc_buf_sz = 1000;
// Enable error resilient mode.
cfg.g_error_resilient = 1;
cfg.g_lag_in_frames = 0;
cfg.kf_mode = VPX_KF_AUTO;
// Use a very large keyframe interval so that, in effect, no periodic
// keyframes are placed beyond those the layer pattern forces.
cfg.kf_min_dist = cfg.kf_max_dist = 3000;
set_temporal_layer_pattern(layering_mode,
&cfg,
layer_flags,
&flag_periodicity);
set_rate_control_metrics(&rc, &cfg);
// Target bandwidth for the whole stream.
// Set to ts_target_bitrate for highest layer (total bitrate).
cfg.rc_target_bitrate = cfg.ts_target_bitrate[cfg.ts_number_layers - 1];
// Open input file.
if (!(infile = fopen(argv[1], "rb"))) {
die("Failed to open %s for reading", argv[1]);
}
// Open an output file for each stream.
for (i = 0; i < cfg.ts_number_layers; ++i) {
char file_name[PATH_MAX];
VpxVideoInfo info;
info.codec_fourcc = encoder->fourcc;
info.frame_width = cfg.g_w;
info.frame_height = cfg.g_h;
info.time_base.numerator = cfg.g_timebase.num;
info.time_base.denominator = cfg.g_timebase.den;
snprintf(file_name, sizeof(file_name), "%s_%d.ivf", argv[2], i);
outfile[i] = vpx_video_writer_open(file_name, kContainerIVF, &info);
if (!outfile[i])
die("Failed to open %s for writing", file_name);
assert(outfile[i] != NULL);
}
// No spatial layers in this encoder.
cfg.ss_number_layers = 1;
// Initialize codec.
#if CONFIG_VP9_HIGHBITDEPTH
if (vpx_codec_enc_init(
&codec, encoder->codec_interface(), &cfg,
bit_depth == VPX_BITS_8 ? 0 : VPX_CODEC_USE_HIGHBITDEPTH))
#else
if (vpx_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0))
#endif // CONFIG_VP9_HIGHBITDEPTH
die_codec(&codec, "Failed to initialize encoder");
if (strncmp(encoder->name, "vp8", 3) == 0) {
vpx_codec_control(&codec, VP8E_SET_CPUUSED, -speed);
vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kDenoiserOnYOnly);
} else if (strncmp(encoder->name, "vp9", 3) == 0) {
vpx_codec_control(&codec, VP8E_SET_CPUUSED, speed);
vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 3);
vpx_codec_control(&codec, VP9E_SET_FRAME_PERIODIC_BOOST, 0);
vpx_codec_control(&codec, VP9E_SET_NOISE_SENSITIVITY, 0);
if (vpx_codec_control(&codec, VP9E_SET_SVC, 1)) {
die_codec(&codec, "Failed to set SVC");
}
}
vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1);
vpx_codec_control(&codec, VP8E_SET_TOKEN_PARTITIONS, 1);
// This controls the maximum target size of the key frame.
// For generating smaller key frames, use a smaller max_intra_size_pct
// value, like 100 or 200.
{
const int max_intra_size_pct = 200;
vpx_codec_control(&codec, VP8E_SET_MAX_INTRA_BITRATE_PCT,
max_intra_size_pct);
}
frame_avail = 1;
while (frame_avail || got_data) {
struct vpx_usec_timer timer;
vpx_codec_iter_t iter = NULL;
const vpx_codec_cx_pkt_t *pkt;
// Update the temporal layer_id. No spatial layers in this test.
layer_id.spatial_layer_id = 0;
layer_id.temporal_layer_id =
cfg.ts_layer_id[frame_cnt % cfg.ts_periodicity];
if (strncmp(encoder->name, "vp9", 3) == 0) {
vpx_codec_control(&codec, VP9E_SET_SVC_LAYER_ID, &layer_id);
}
flags = layer_flags[frame_cnt % flag_periodicity];
frame_avail = vpx_img_read(&raw, infile);
if (frame_avail)
++rc.layer_input_frames[layer_id.temporal_layer_id];
vpx_usec_timer_start(&timer);
if (vpx_codec_encode(&codec, frame_avail ? &raw : NULL, pts, 1, flags,
VPX_DL_REALTIME)) {
die_codec(&codec, "Failed to encode frame");
}
vpx_usec_timer_mark(&timer);
cx_time += vpx_usec_timer_elapsed(&timer);
// Reset KF flag.
if (layering_mode != 7) {
layer_flags[0] &= ~VPX_EFLAG_FORCE_KF;
}
got_data = 0;
while ((pkt = vpx_codec_get_cx_data(&codec, &iter))) {
got_data = 1;
switch (pkt->kind) {
case VPX_CODEC_CX_FRAME_PKT:
for (i = cfg.ts_layer_id[frame_cnt % cfg.ts_periodicity];
i < cfg.ts_number_layers; ++i) {
vpx_video_writer_write_frame(outfile[i], pkt->data.frame.buf,
pkt->data.frame.sz, pts);
++rc.layer_tot_enc_frames[i];
rc.layer_encoding_bitrate[i] += 8.0 * pkt->data.frame.sz;
// Keep count of rate control stats per layer (for non-key frames).
if (i == cfg.ts_layer_id[frame_cnt % cfg.ts_periodicity] &&
!(pkt->data.frame.flags & VPX_FRAME_IS_KEY)) {
rc.layer_avg_frame_size[i] += 8.0 * pkt->data.frame.sz;
rc.layer_avg_rate_mismatch[i] +=
fabs(8.0 * pkt->data.frame.sz - rc.layer_pfb[i]) /
rc.layer_pfb[i];
++rc.layer_enc_frames[i];
}
}
break;
default:
break;
}
}
++frame_cnt;
pts += frame_duration;
}
fclose(infile);
printout_rate_control_summary(&rc, &cfg, frame_cnt);
printf("\n");
printf("Frame cnt and encoding time/FPS stats for encoding: %d %f %f \n",
frame_cnt,
1000 * (float)cx_time / (double)(frame_cnt * 1000000),
1000000 * (double)frame_cnt / (double)cx_time);
if (vpx_codec_destroy(&codec))
die_codec(&codec, "Failed to destroy codec");
// Try to rewrite the output file headers with the actual frame count.
for (i = 0; i < cfg.ts_number_layers; ++i)
vpx_video_writer_close(outfile[i]);
vpx_img_free(&raw);
return EXIT_SUCCESS;
}
@ -415,12 +415,6 @@ MAX_INITIALIZER_LINES = 30
SHOW_USED_FILES = YES
# If the sources in your project are distributed over multiple directories
# then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy
# in the documentation. The default is NO.
SHOW_DIRECTORIES = NO
# The FILE_VERSION_FILTER tag can be used to specify a program or script that
# doxygen should invoke to get the current version for each file (typically from the
# version control system). Doxygen will invoke the program by executing (via
@ -715,12 +709,6 @@ HTML_FOOTER =
HTML_STYLESHEET =
# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes,
# files or namespaces will be aligned in HTML using tables. If set to
# NO a bullet list will be used.
HTML_ALIGN_MEMBERS = YES
# If the GENERATE_HTMLHELP tag is set to YES, additional index files
# will be generated that can be used as input for tools like the
# Microsoft HTML help workshop to generate a compressed HTML help file (.chm)

@ -115,7 +115,6 @@ ifeq ($(CONFIG_VP9_ENCODER),yes)
CODEC_EXPORTS-yes += $(addprefix $(VP9_PREFIX),$(VP9_CX_EXPORTS))
CODEC_SRCS-yes += $(VP9_PREFIX)vp9cx.mk vpx/vp8.h vpx/vp8cx.h
INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8cx.h
INSTALL-LIBS-$(CONFIG_SPATIAL_SVC) += include/vpx/svc_context.h
INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP9_PREFIX)/%
CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8cx.h
CODEC_DOC_SECTIONS += vp9 vp9_encoder
@ -386,6 +385,12 @@ $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm
$(shell $(SRC_PATH_BARE)/build/make/version.sh "$(SRC_PATH_BARE)" $(BUILD_PFX)vpx_version.h)
CLEAN-OBJS += $(BUILD_PFX)vpx_version.h
#
# Add include path for libwebm sources.
#
ifeq ($(CONFIG_WEBM_IO),yes)
CXXFLAGS += -I$(SRC_PATH_BARE)/third_party/libwebm
endif
##
## libvpx test directives
@ -459,6 +464,7 @@ test_libvpx.$(VCPROJ_SFX): $(LIBVPX_TEST_SRCS) vpx.$(VCPROJ_SFX) gtest.$(VCPROJ_
$(if $(CONFIG_STATIC_MSVCRT),--static-crt) \
--out=$@ $(INTERNAL_CFLAGS) $(CFLAGS) \
-I. -I"$(SRC_PATH_BARE)/third_party/googletest/src/include" \
$(if $(CONFIG_WEBM_IO),-I"$(SRC_PATH_BARE)/third_party/libwebm") \
-L. -l$(CODEC_LIB) -l$(GTEST_LIB) $^
PROJECTS-$(CONFIG_MSVS) += test_libvpx.$(VCPROJ_SFX)

@ -8,30 +8,27 @@
## in the file PATENTS. All contributing project authors may
## be found in the AUTHORS file in the root of the source tree.
##
## This file tests the libvpx vp8cx_set_ref example. To add new tests to this
## This file tests the libvpx cx_set_ref example. To add new tests to this
## file, do the following:
## 1. Write a shell function (this is your test).
## 2. Add the function to vp8cx_set_ref_tests (on a new line).
## 2. Add the function to cx_set_ref_tests (on a new line).
##
. $(dirname $0)/tools_common.sh
# Environment check: $YUV_RAW_INPUT is required.
vp8cx_set_ref_verify_environment() {
cx_set_ref_verify_environment() {
if [ ! -e "${YUV_RAW_INPUT}" ]; then
echo "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
return 1
fi
}
# Runs vp8cx_set_ref and updates the reference frame before encoding frame 90.
# $1 is the codec name, which vp8cx_set_ref does not support at present: It's
# currently used only to name the output file.
# TODO(tomfinegan): Pass the codec param once the example is updated to support
# VP9.
# Runs cx_set_ref and updates the reference frame before encoding frame 90.
# $1 is the codec name.
vpx_set_ref() {
local encoder="${LIBVPX_BIN_PATH}/vp8cx_set_ref${VPX_TEST_EXE_SUFFIX}"
local codec="$1"
local output_file="${VPX_TEST_OUTPUT_DIR}/vp8cx_set_ref_${codec}.ivf"
local encoder="${LIBVPX_BIN_PATH}/${codec}cx_set_ref${VPX_TEST_EXE_SUFFIX}"
local output_file="${VPX_TEST_OUTPUT_DIR}/${codec}cx_set_ref_${codec}.ivf"
local ref_frame_num=90
if [ ! -x "${encoder}" ]; then
@ -46,12 +43,18 @@ vpx_set_ref() {
[ -e "${output_file}" ] || return 1
}
vp8cx_set_ref_vp8() {
cx_set_ref_vp8() {
if [ "$(vp8_encode_available)" = "yes" ]; then
vpx_set_ref vp8 || return 1
fi
}
vp8cx_set_ref_tests="vp8cx_set_ref_vp8"
cx_set_ref_vp9() {
if [ "$(vp9_encode_available)" = "yes" ]; then
vpx_set_ref vp9 || return 1
fi
}
run_tests vp8cx_set_ref_verify_environment "${vp8cx_set_ref_tests}"
cx_set_ref_tests="cx_set_ref_vp8 cx_set_ref_vp9"
run_tests cx_set_ref_verify_environment "${cx_set_ref_tests}"

@ -218,442 +218,5 @@ TEST_P(DatarateTestLarge, ChangingDropFrameThresh) {
}
}
class DatarateTestVP9Large : public ::libvpx_test::EncoderTest,
public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
public:
DatarateTestVP9Large() : EncoderTest(GET_PARAM(0)) {}
protected:
virtual ~DatarateTestVP9Large() {}
virtual void SetUp() {
InitializeConfig();
SetMode(GET_PARAM(1));
set_cpu_used_ = GET_PARAM(2);
ResetModel();
}
virtual void ResetModel() {
last_pts_ = 0;
bits_in_buffer_model_ = cfg_.rc_target_bitrate * cfg_.rc_buf_initial_sz;
frame_number_ = 0;
tot_frame_number_ = 0;
first_drop_ = 0;
num_drops_ = 0;
// Denoiser is off by default.
denoiser_on_ = 0;
// For testing up to 3 layers.
for (int i = 0; i < 3; ++i) {
bits_total_[i] = 0;
}
}
//
// Frame flags and layer id for temporal layers.
//
// For two layers, test pattern is:
// 1 3
// 0 2 .....
// For three layers, test pattern is:
// 1 3 5 7
// 2 6
// 0 4 ....
// LAST is always updated on the base layer (layer 0); GOLDEN is updated on
// layer 1. In this 3-layer example, the second enhancement layer (layer 2)
// does not update any reference frames.
int SetFrameFlags(int frame_num, int num_temp_layers) {
int frame_flags = 0;
if (num_temp_layers == 2) {
if (frame_num % 2 == 0) {
// Layer 0: predict from L and ARF, update L.
frame_flags = VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_UPD_GF |
VP8_EFLAG_NO_UPD_ARF;
} else {
// Layer 1: predict from L, G and ARF, and update G.
frame_flags = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST |
VP8_EFLAG_NO_UPD_ENTROPY;
}
} else if (num_temp_layers == 3) {
if (frame_num % 4 == 0) {
// Layer 0: predict from L and ARF; update L.
frame_flags = VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF |
VP8_EFLAG_NO_REF_GF;
} else if ((frame_num - 2) % 4 == 0) {
// Layer 1: predict from L, G, ARF; update G.
frame_flags = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST;
} else if ((frame_num - 1) % 2 == 0) {
// Layer 2: predict from L, G, ARF; update none.
frame_flags = VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF |
VP8_EFLAG_NO_UPD_LAST;
}
}
return frame_flags;
}
int SetLayerId(int frame_num, int num_temp_layers) {
int layer_id = 0;
if (num_temp_layers == 2) {
if (frame_num % 2 == 0) {
layer_id = 0;
} else {
layer_id = 1;
}
} else if (num_temp_layers == 3) {
if (frame_num % 4 == 0) {
layer_id = 0;
} else if ((frame_num - 2) % 4 == 0) {
layer_id = 1;
} else if ((frame_num - 1) % 2 == 0) {
layer_id = 2;
}
}
return layer_id;
}
virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
::libvpx_test::Encoder *encoder) {
if (video->frame() == 1) {
encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
encoder->Control(VP9E_SET_NOISE_SENSITIVITY, denoiser_on_);
}
if (cfg_.ts_number_layers > 1) {
if (video->frame() == 1) {
encoder->Control(VP9E_SET_SVC, 1);
}
vpx_svc_layer_id_t layer_id = {0, 0};
layer_id.spatial_layer_id = 0;
frame_flags_ = SetFrameFlags(video->frame(), cfg_.ts_number_layers);
layer_id.temporal_layer_id = SetLayerId(video->frame(),
cfg_.ts_number_layers);
if (video->frame() > 0) {
encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id);
}
}
const vpx_rational_t tb = video->timebase();
timebase_ = static_cast<double>(tb.num) / tb.den;
duration_ = 0;
}
virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
// Time since last timestamp = duration.
vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_;
if (duration > 1) {
// If first drop not set and we have a drop set it to this time.
if (!first_drop_)
first_drop_ = last_pts_ + 1;
// Update the number of frame drops.
num_drops_ += static_cast<int>(duration - 1);
// Update counter for total number of frames (#frames input to encoder).
// Needed for setting the proper layer_id below.
tot_frame_number_ += static_cast<int>(duration - 1);
}
int layer = SetLayerId(tot_frame_number_, cfg_.ts_number_layers);
// Add to the buffer the bits we'd expect from a constant bitrate server.
bits_in_buffer_model_ += static_cast<int64_t>(
duration * timebase_ * cfg_.rc_target_bitrate * 1000);
// Buffer should not go negative.
ASSERT_GE(bits_in_buffer_model_, 0) << "Buffer Underrun at frame "
<< pkt->data.frame.pts;
const size_t frame_size_in_bits = pkt->data.frame.sz * 8;
// Update the total encoded bits. For temporal layers, update the cumulative
// encoded bits per layer.
for (int i = layer; i < static_cast<int>(cfg_.ts_number_layers); ++i) {
bits_total_[i] += frame_size_in_bits;
}
// Update the most recent pts.
last_pts_ = pkt->data.frame.pts;
++frame_number_;
++tot_frame_number_;
}
virtual void EndPassHook(void) {
for (int layer = 0; layer < static_cast<int>(cfg_.ts_number_layers);
++layer) {
duration_ = (last_pts_ + 1) * timebase_;
if (bits_total_[layer]) {
// Effective file datarate:
effective_datarate_[layer] = (bits_total_[layer] / 1000.0) / duration_;
}
}
}
vpx_codec_pts_t last_pts_;
double timebase_;
int frame_number_; // Counter for number of non-dropped/encoded frames.
int tot_frame_number_; // Counter for total number of input frames.
int64_t bits_total_[3];
double duration_;
double effective_datarate_[3];
int set_cpu_used_;
int64_t bits_in_buffer_model_;
vpx_codec_pts_t first_drop_;
int num_drops_;
int denoiser_on_;
};
// Check basic rate targeting.
TEST_P(DatarateTestVP9Large, BasicRateTargeting) {
cfg_.rc_buf_initial_sz = 500;
cfg_.rc_buf_optimal_sz = 500;
cfg_.rc_buf_sz = 1000;
cfg_.rc_dropframe_thresh = 1;
cfg_.rc_min_quantizer = 0;
cfg_.rc_max_quantizer = 63;
cfg_.rc_end_usage = VPX_CBR;
cfg_.g_lag_in_frames = 0;
::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
30, 1, 0, 140);
for (int i = 150; i < 800; i += 200) {
cfg_.rc_target_bitrate = i;
ResetModel();
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
<< " The datarate for the file is lower than target by too much!";
ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15)
<< " The datarate for the file is greater than target by too much!";
}
}
// Check basic rate targeting.
TEST_P(DatarateTestVP9Large, BasicRateTargeting444) {
::libvpx_test::Y4mVideoSource video("rush_hour_444.y4m", 0, 140);
cfg_.g_profile = 1;
cfg_.g_timebase = video.timebase();
cfg_.rc_buf_initial_sz = 500;
cfg_.rc_buf_optimal_sz = 500;
cfg_.rc_buf_sz = 1000;
cfg_.rc_dropframe_thresh = 1;
cfg_.rc_min_quantizer = 0;
cfg_.rc_max_quantizer = 63;
cfg_.rc_end_usage = VPX_CBR;
for (int i = 250; i < 900; i += 200) {
cfg_.rc_target_bitrate = i;
ResetModel();
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
ASSERT_GE(static_cast<double>(cfg_.rc_target_bitrate),
effective_datarate_[0] * 0.85)
<< " The datarate for the file exceeds the target by too much!";
ASSERT_LE(static_cast<double>(cfg_.rc_target_bitrate),
effective_datarate_[0] * 1.15)
<< " The datarate for the file missed the target!"
<< cfg_.rc_target_bitrate << " " << effective_datarate_[0];
}
}
// Check that (1) the first dropped frame gets earlier and earlier
// as the drop frame threshold is increased, and (2) that the total number of
// frame drops does not decrease as we increase frame drop threshold.
// Use a lower qp-max to force some frame drops.
TEST_P(DatarateTestVP9Large, ChangingDropFrameThresh) {
cfg_.rc_buf_initial_sz = 500;
cfg_.rc_buf_optimal_sz = 500;
cfg_.rc_buf_sz = 1000;
cfg_.rc_undershoot_pct = 20;
cfg_.rc_dropframe_thresh = 10;
cfg_.rc_min_quantizer = 0;
cfg_.rc_max_quantizer = 50;
cfg_.rc_end_usage = VPX_CBR;
cfg_.rc_target_bitrate = 200;
cfg_.g_lag_in_frames = 0;
::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
30, 1, 0, 140);
const int kDropFrameThreshTestStep = 30;
vpx_codec_pts_t last_drop = 140;
int last_num_drops = 0;
for (int i = 10; i < 100; i += kDropFrameThreshTestStep) {
cfg_.rc_dropframe_thresh = i;
ResetModel();
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
<< " The datarate for the file is lower than target by too much!";
ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15)
<< " The datarate for the file is greater than target by too much!";
ASSERT_LE(first_drop_, last_drop)
<< " The first dropped frame for drop_thresh " << i
<< " > first dropped frame for drop_thresh "
<< i - kDropFrameThreshTestStep;
ASSERT_GE(num_drops_, last_num_drops)
<< " The number of dropped frames for drop_thresh " << i
<< " < number of dropped frames for drop_thresh "
<< i - kDropFrameThreshTestStep;
last_drop = first_drop_;
last_num_drops = num_drops_;
}
}
// Check basic rate targeting for 2 temporal layers.
TEST_P(DatarateTestVP9Large, BasicRateTargeting2TemporalLayers) {
cfg_.rc_buf_initial_sz = 500;
cfg_.rc_buf_optimal_sz = 500;
cfg_.rc_buf_sz = 1000;
cfg_.rc_dropframe_thresh = 1;
cfg_.rc_min_quantizer = 0;
cfg_.rc_max_quantizer = 63;
cfg_.rc_end_usage = VPX_CBR;
cfg_.g_lag_in_frames = 0;
// 2 Temporal layers, no spatial layers: Framerate decimation (2, 1).
cfg_.ss_number_layers = 1;
cfg_.ts_number_layers = 2;
cfg_.ts_rate_decimator[0] = 2;
cfg_.ts_rate_decimator[1] = 1;
::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
30, 1, 0, 200);
for (int i = 200; i <= 800; i += 200) {
cfg_.rc_target_bitrate = i;
ResetModel();
// 60-40 bitrate allocation for 2 temporal layers.
cfg_.ts_target_bitrate[0] = 60 * cfg_.rc_target_bitrate / 100;
cfg_.ts_target_bitrate[1] = cfg_.rc_target_bitrate;
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
for (int j = 0; j < static_cast<int>(cfg_.ts_number_layers); ++j) {
ASSERT_GE(effective_datarate_[j], cfg_.ts_target_bitrate[j] * 0.85)
<< " The datarate for the file is lower than target by too much, "
"for layer: " << j;
ASSERT_LE(effective_datarate_[j], cfg_.ts_target_bitrate[j] * 1.15)
<< " The datarate for the file is greater than target by too much, "
"for layer: " << j;
}
}
}
// Check basic rate targeting for 3 temporal layers.
TEST_P(DatarateTestVP9Large, BasicRateTargeting3TemporalLayers) {
cfg_.rc_buf_initial_sz = 500;
cfg_.rc_buf_optimal_sz = 500;
cfg_.rc_buf_sz = 1000;
cfg_.rc_dropframe_thresh = 1;
cfg_.rc_min_quantizer = 0;
cfg_.rc_max_quantizer = 63;
cfg_.rc_end_usage = VPX_CBR;
cfg_.g_lag_in_frames = 0;
// 3 Temporal layers, no spatial layers: Framerate decimation (4, 2, 1).
cfg_.ss_number_layers = 1;
cfg_.ts_number_layers = 3;
cfg_.ts_rate_decimator[0] = 4;
cfg_.ts_rate_decimator[1] = 2;
cfg_.ts_rate_decimator[2] = 1;
::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
30, 1, 0, 200);
for (int i = 200; i <= 800; i += 200) {
cfg_.rc_target_bitrate = i;
ResetModel();
// 40-20-40 bitrate allocation for 3 temporal layers.
cfg_.ts_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100;
cfg_.ts_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100;
cfg_.ts_target_bitrate[2] = cfg_.rc_target_bitrate;
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
for (int j = 0; j < static_cast<int>(cfg_.ts_number_layers); ++j) {
// TODO(yaowu): Work out a more stable rc control strategy and
// adjust the thresholds to be tighter than 0.75.
ASSERT_GE(effective_datarate_[j], cfg_.ts_target_bitrate[j] * 0.75)
<< " The datarate for the file is lower than target by too much, "
"for layer: " << j;
// TODO(yaowu): Work out a more stable rc control strategy and adjust
// the thresholds to be tighter than 1.25.
ASSERT_LE(effective_datarate_[j], cfg_.ts_target_bitrate[j] * 1.25)
<< " The datarate for the file is greater than target by too much, "
"for layer: " << j;
}
}
}
// Check basic rate targeting for 3 temporal layers, with frame dropping.
// Only for one (low) bitrate with lower max_quantizer, and somewhat higher
// frame drop threshold, to force frame dropping.
TEST_P(DatarateTestVP9Large, BasicRateTargeting3TemporalLayersFrameDropping) {
cfg_.rc_buf_initial_sz = 500;
cfg_.rc_buf_optimal_sz = 500;
cfg_.rc_buf_sz = 1000;
// Set frame drop threshold and rc_max_quantizer to force some frame drops.
cfg_.rc_dropframe_thresh = 20;
cfg_.rc_max_quantizer = 45;
cfg_.rc_min_quantizer = 0;
cfg_.rc_end_usage = VPX_CBR;
cfg_.g_lag_in_frames = 0;
// 3 Temporal layers, no spatial layers: Framerate decimation (4, 2, 1).
cfg_.ss_number_layers = 1;
cfg_.ts_number_layers = 3;
cfg_.ts_rate_decimator[0] = 4;
cfg_.ts_rate_decimator[1] = 2;
cfg_.ts_rate_decimator[2] = 1;
::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
30, 1, 0, 200);
cfg_.rc_target_bitrate = 200;
ResetModel();
// 40-20-40 bitrate allocation for 3 temporal layers.
cfg_.ts_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100;
cfg_.ts_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100;
cfg_.ts_target_bitrate[2] = cfg_.rc_target_bitrate;
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
for (int j = 0; j < static_cast<int>(cfg_.ts_number_layers); ++j) {
ASSERT_GE(effective_datarate_[j], cfg_.ts_target_bitrate[j] * 0.85)
<< " The datarate for the file is lower than target by too much, "
"for layer: " << j;
ASSERT_LE(effective_datarate_[j], cfg_.ts_target_bitrate[j] * 1.15)
<< " The datarate for the file is greater than target by too much, "
"for layer: " << j;
// Expect some frame drops in this test: for this 200-frame test,
// expect at least 10% (20 frames) and at most about 65% (130 frames) drops.
ASSERT_GE(num_drops_, 20);
ASSERT_LE(num_drops_, 130);
}
}
#if CONFIG_VP9_TEMPORAL_DENOISING
// Check basic datarate targeting, for a single bitrate, when denoiser is on.
TEST_P(DatarateTestVP9Large, DenoiserLevels) {
cfg_.rc_buf_initial_sz = 500;
cfg_.rc_buf_optimal_sz = 500;
cfg_.rc_buf_sz = 1000;
cfg_.rc_dropframe_thresh = 1;
cfg_.rc_min_quantizer = 2;
cfg_.rc_max_quantizer = 56;
cfg_.rc_end_usage = VPX_CBR;
cfg_.g_lag_in_frames = 0;
::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
30, 1, 0, 140);
// For the temporal denoiser (#if CONFIG_VP9_TEMPORAL_DENOISING),
// there is currently only one denoiser mode: denoiserYonly (which is 1),
// but more modes may be added in the future.
cfg_.rc_target_bitrate = 300;
ResetModel();
// Turn on the denoiser.
denoiser_on_ = 1;
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
<< " The datarate for the file is lower than target by too much!";
ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15)
<< " The datarate for the file is greater than target by too much!";
}
#endif // CONFIG_VP9_TEMPORAL_DENOISING
VP8_INSTANTIATE_TEST_CASE(DatarateTestLarge, ALL_TEST_MODES);
VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9Large,
::testing::Values(::libvpx_test::kOnePassGood,
::libvpx_test::kRealTime),
::testing::Range(2, 7));
} // namespace


@ -264,6 +264,8 @@ typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
typedef std::tr1::tuple<FdctFunc, IdctFunc, int, vpx_bit_depth_t> Dct16x16Param;
typedef std::tr1::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t> Ht16x16Param;
typedef std::tr1::tuple<IdctFunc, IdctFunc, int, vpx_bit_depth_t>
Idct16x16Param;
void fdct16x16_ref(const int16_t *in, tran_low_t *out, int stride,
int /*tx_type*/) {
@ -311,6 +313,32 @@ void iht16x16_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
void iht16x16_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
vp9_highbd_iht16x16_256_add_c(in, out, stride, tx_type, 12);
}
void idct16x16_10_add_10_c(const tran_low_t *in, uint8_t *out, int stride) {
vp9_highbd_idct16x16_10_add_c(in, out, stride, 10);
}
void idct16x16_10_add_12_c(const tran_low_t *in, uint8_t *out, int stride) {
vp9_highbd_idct16x16_10_add_c(in, out, stride, 12);
}
#if HAVE_SSE2
void idct16x16_256_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
vp9_highbd_idct16x16_256_add_sse2(in, out, stride, 10);
}
void idct16x16_256_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
vp9_highbd_idct16x16_256_add_sse2(in, out, stride, 12);
}
void idct16x16_10_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
vp9_highbd_idct16x16_10_add_sse2(in, out, stride, 10);
}
void idct16x16_10_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
vp9_highbd_idct16x16_10_add_sse2(in, out, stride, 12);
}
#endif
#endif
class Trans16x16TestBase {
@ -540,7 +568,7 @@ class Trans16x16TestBase {
reference_16x16_dct_2d(in, out_r);
for (int j = 0; j < kNumCoeffs; ++j)
coeff[j] = round(out_r[j]);
coeff[j] = static_cast<tran_low_t>(round(out_r[j]));
if (bit_depth_ == VPX_BITS_8) {
ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, 16));
@ -565,6 +593,62 @@ class Trans16x16TestBase {
}
}
}
void CompareInvReference(IdctFunc ref_txfm, int thresh) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
const int count_test_block = 10000;
const int eob = 10;
const int16_t *scan = vp9_default_scan_orders[TX_16X16].scan;
DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, uint8_t, ref, kNumCoeffs);
#if CONFIG_VP9_HIGHBITDEPTH
DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, uint16_t, ref16, kNumCoeffs);
#endif
for (int i = 0; i < count_test_block; ++i) {
for (int j = 0; j < kNumCoeffs; ++j) {
if (j < eob) {
// Random values below the threshold; the sign alternates with the
// block index.
coeff[scan[j]] = rnd(thresh) * (1 - 2 * (i % 2));
} else {
coeff[scan[j]] = 0;
}
if (bit_depth_ == VPX_BITS_8) {
dst[j] = 0;
ref[j] = 0;
#if CONFIG_VP9_HIGHBITDEPTH
} else {
dst16[j] = 0;
ref16[j] = 0;
#endif
}
}
if (bit_depth_ == VPX_BITS_8) {
ref_txfm(coeff, ref, pitch_);
ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
} else {
#if CONFIG_VP9_HIGHBITDEPTH
ref_txfm(coeff, CONVERT_TO_BYTEPTR(ref16), pitch_);
ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16),
pitch_));
#endif
}
for (int j = 0; j < kNumCoeffs; ++j) {
#if CONFIG_VP9_HIGHBITDEPTH
const uint32_t diff =
bit_depth_ == VPX_BITS_8 ? dst[j] - ref[j] : dst16[j] - ref16[j];
#else
const uint32_t diff = dst[j] - ref[j];
#endif
const uint32_t error = diff * diff;
EXPECT_EQ(0u, error)
<< "Error: 16x16 IDCT Comparison has error " << error
<< " at index " << j;
}
}
}
int pitch_;
int tx_type_;
vpx_bit_depth_t bit_depth_;
@ -590,10 +674,10 @@ class Trans16x16DCT
mask_ = (1 << bit_depth_) - 1;
#if CONFIG_VP9_HIGHBITDEPTH
switch (bit_depth_) {
case 10:
case VPX_BITS_10:
inv_txfm_ref = idct16x16_10_ref;
break;
case 12:
case VPX_BITS_12:
inv_txfm_ref = idct16x16_12_ref;
break;
default:
@ -703,6 +787,37 @@ TEST_P(Trans16x16HT, QuantCheck) {
RunQuantCheck(429, 729);
}
class InvTrans16x16DCT
: public Trans16x16TestBase,
public ::testing::TestWithParam<Idct16x16Param> {
public:
virtual ~InvTrans16x16DCT() {}
virtual void SetUp() {
ref_txfm_ = GET_PARAM(0);
inv_txfm_ = GET_PARAM(1);
thresh_ = GET_PARAM(2);
bit_depth_ = GET_PARAM(3);
pitch_ = 16;
mask_ = (1 << bit_depth_) - 1;
}
virtual void TearDown() { libvpx_test::ClearSystemState(); }
protected:
void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {}
void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
inv_txfm_(out, dst, stride);
}
IdctFunc ref_txfm_;
IdctFunc inv_txfm_;
int thresh_;
};
TEST_P(InvTrans16x16DCT, CompareReference) {
CompareInvReference(ref_txfm_, thresh_);
}
using std::tr1::make_tuple;
#if CONFIG_VP9_HIGHBITDEPTH
@ -772,6 +887,51 @@ INSTANTIATE_TEST_CASE_P(
VPX_BITS_8)));
#endif
#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(
SSE2, Trans16x16DCT,
::testing::Values(
make_tuple(&vp9_highbd_fdct16x16_sse2,
&idct16x16_10, 0, VPX_BITS_10),
make_tuple(&vp9_highbd_fdct16x16_c,
&idct16x16_256_add_10_sse2, 0, VPX_BITS_10),
make_tuple(&vp9_highbd_fdct16x16_sse2,
&idct16x16_12, 0, VPX_BITS_12),
make_tuple(&vp9_highbd_fdct16x16_c,
&idct16x16_256_add_12_sse2, 0, VPX_BITS_12),
make_tuple(&vp9_fdct16x16_sse2,
&vp9_idct16x16_256_add_c, 0, VPX_BITS_8)));
INSTANTIATE_TEST_CASE_P(
SSE2, Trans16x16HT,
::testing::Values(
make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_10, 0, VPX_BITS_10),
make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_10, 1, VPX_BITS_10),
make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_10, 2, VPX_BITS_10),
make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_10, 3, VPX_BITS_10),
make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_12, 0, VPX_BITS_12),
make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_12, 1, VPX_BITS_12),
make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_12, 2, VPX_BITS_12),
make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_12, 3, VPX_BITS_12),
make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_c, 0, VPX_BITS_8),
make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8),
make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8),
make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_c, 3,
VPX_BITS_8)));
// Optimizations take effect at a threshold of 3155, so we use a value close to
// that to test both branches.
INSTANTIATE_TEST_CASE_P(
SSE2, InvTrans16x16DCT,
::testing::Values(
make_tuple(&idct16x16_10_add_10_c,
&idct16x16_10_add_10_sse2, 3167, VPX_BITS_10),
make_tuple(&idct16x16_10,
&idct16x16_256_add_10_sse2, 3167, VPX_BITS_10),
make_tuple(&idct16x16_10_add_12_c,
&idct16x16_10_add_12_sse2, 3167, VPX_BITS_12),
make_tuple(&idct16x16_12,
&idct16x16_256_add_12_sse2, 3167, VPX_BITS_12)));
#endif
#if HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(
SSSE3, Trans16x16DCT,


@ -79,6 +79,10 @@ typedef std::tr1::tuple<FwdTxfmFunc, InvTxfmFunc, int, vpx_bit_depth_t>
Trans32x32Param;
#if CONFIG_VP9_HIGHBITDEPTH
void idct32x32_8(const tran_low_t *in, uint8_t *out, int stride) {
vp9_highbd_idct32x32_1024_add_c(in, out, stride, 8);
}
void idct32x32_10(const tran_low_t *in, uint8_t *out, int stride) {
vp9_highbd_idct32x32_1024_add_c(in, out, stride, 10);
}
@ -114,7 +118,7 @@ TEST_P(Trans32x32Test, AccuracyCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
uint32_t max_error = 0;
int64_t total_error = 0;
const int count_test_block = 1000;
const int count_test_block = 10000;
DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, test_temp_block, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
@ -127,7 +131,7 @@ TEST_P(Trans32x32Test, AccuracyCheck) {
for (int i = 0; i < count_test_block; ++i) {
// Initialize a test block with input range [-mask_, mask_].
for (int j = 0; j < kNumCoeffs; ++j) {
if (bit_depth_ == 8) {
if (bit_depth_ == VPX_BITS_8) {
src[j] = rnd.Rand8();
dst[j] = rnd.Rand8();
test_input_block[j] = src[j] - dst[j];
@ -282,7 +286,7 @@ TEST_P(Trans32x32Test, InverseAccuracy) {
reference_32x32_dct_2d(in, out_r);
for (int j = 0; j < kNumCoeffs; ++j)
coeff[j] = round(out_r[j]);
coeff[j] = static_cast<tran_low_t>(round(out_r[j]));
if (bit_depth_ == VPX_BITS_8) {
ASM_REGISTER_STATE_CHECK(inv_txfm_(coeff, dst, 32));
#if CONFIG_VP9_HIGHBITDEPTH
@ -353,6 +357,22 @@ INSTANTIATE_TEST_CASE_P(
&vp9_idct32x32_1024_add_sse2, 1, VPX_BITS_8)));
#endif
#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(
SSE2, Trans32x32Test,
::testing::Values(
make_tuple(&vp9_highbd_fdct32x32_sse2, &idct32x32_10, 0, VPX_BITS_10),
make_tuple(&vp9_highbd_fdct32x32_rd_sse2, &idct32x32_10, 1,
VPX_BITS_10),
make_tuple(&vp9_highbd_fdct32x32_sse2, &idct32x32_12, 0, VPX_BITS_12),
make_tuple(&vp9_highbd_fdct32x32_rd_sse2, &idct32x32_12, 1,
VPX_BITS_12),
make_tuple(&vp9_fdct32x32_sse2, &vp9_idct32x32_1024_add_c, 0,
VPX_BITS_8),
make_tuple(&vp9_fdct32x32_rd_sse2, &vp9_idct32x32_1024_add_c, 1,
VPX_BITS_8)));
#endif
#if HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(
AVX2, Trans32x32Test,


@ -110,6 +110,7 @@ void EncoderTest::SetMode(TestMode mode) {
static bool compare_img(const vpx_image_t *img1,
const vpx_image_t *img2) {
bool match = (img1->fmt == img2->fmt) &&
(img1->cs == img2->cs) &&
(img1->d_w == img2->d_w) &&
(img1->d_h == img2->d_h);
@ -141,6 +142,12 @@ void EncoderTest::MismatchHook(const vpx_image_t* /*img1*/,
void EncoderTest::RunLoop(VideoSource *video) {
vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t();
#if CONFIG_ROW_TILE
// Decode all tiles.
dec_cfg.tile_col = -1;
dec_cfg.tile_row = -1;
#endif // CONFIG_ROW_TILE
stats_.Reset();
ASSERT_TRUE(passes_ == 1 || passes_ == 2);


@ -126,11 +126,6 @@ class Encoder {
ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
}
void Control(int ctrl_id, struct vpx_svc_layer_id *arg) {
const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
}
#if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER
void Control(int ctrl_id, vpx_active_map_t *arg) {
const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);

test/error_block_test.cc Normal file

@ -0,0 +1,146 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <cmath>
#include <cstdlib>
#include <string>
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "test/acm_random.h"
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
#include "test/util.h"
#include "./vpx_config.h"
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_entropy.h"
#include "vpx/vpx_integer.h"
using libvpx_test::ACMRandom;
namespace {
#if CONFIG_VP9_HIGHBITDEPTH
const int number_of_iterations = 1000;
typedef int64_t (*ErrorBlockFunc)(const tran_low_t *coeff,
const tran_low_t *dqcoeff, intptr_t block_size,
int64_t *ssz, int bps);
typedef std::tr1::tuple<ErrorBlockFunc, ErrorBlockFunc, vpx_bit_depth_t>
ErrorBlockParam;
class ErrorBlockTest
: public ::testing::TestWithParam<ErrorBlockParam> {
public:
virtual ~ErrorBlockTest() {}
virtual void SetUp() {
error_block_op_ = GET_PARAM(0);
ref_error_block_op_ = GET_PARAM(1);
bit_depth_ = GET_PARAM(2);
}
virtual void TearDown() { libvpx_test::ClearSystemState(); }
protected:
vpx_bit_depth_t bit_depth_;
ErrorBlockFunc error_block_op_;
ErrorBlockFunc ref_error_block_op_;
};
TEST_P(ErrorBlockTest, OperationCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff, 4096);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, dqcoeff, 4096);
int err_count_total = 0;
int first_failure = -1;
intptr_t block_size;
int64_t ssz;
int64_t ret;
int64_t ref_ssz;
int64_t ref_ret;
for (int i = 0; i < number_of_iterations; ++i) {
int err_count = 0;
block_size = 16 << (i % 9); // All block sizes from 4x4, 8x4, ..., 64x64
for (int j = 0; j < block_size; j++) {
coeff[j] = rnd(2 << 20) - (1 << 20);
dqcoeff[j] = rnd(2 << 20) - (1 << 20);
}
ref_ret = ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz,
bit_depth_);
ASM_REGISTER_STATE_CHECK(ret = error_block_op_(coeff, dqcoeff, block_size,
&ssz, bit_depth_));
err_count += (ref_ret != ret) | (ref_ssz != ssz);
if (err_count && !err_count_total) {
first_failure = i;
}
err_count_total += err_count;
}
EXPECT_EQ(0, err_count_total)
<< "Error: Error Block Test, C output doesn't match SSE2 output. "
<< "First failed at test case " << first_failure;
}
TEST_P(ErrorBlockTest, ExtremeValues) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff, 4096);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, dqcoeff, 4096);
int err_count_total = 0;
int first_failure = -1;
intptr_t block_size;
int64_t ssz;
int64_t ret;
int64_t ref_ssz;
int64_t ref_ret;
int max_val = (1 << 20) - 1;
for (int i = 0; i < number_of_iterations; ++i) {
int err_count = 0;
int k = (i / 9) % 5;
// Change the maximum coeff value to test different bit boundaries.
if (k == 4 && (i % 9) == 0) {
max_val >>= 1;
}
block_size = 16 << (i % 9); // All block sizes from 4x4, 8x4, ..., 64x64
for (int j = 0; j < block_size; j++) {
if (k < 4) { // Test at maximum values
coeff[j] = k % 2 ? max_val : -max_val;
dqcoeff[j] = (k >> 1) % 2 ? max_val : -max_val;
} else {
coeff[j] = rnd(2 << 14) - (1 << 14);
dqcoeff[j] = rnd(2 << 14) - (1 << 14);
}
}
ref_ret = ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz,
bit_depth_);
ASM_REGISTER_STATE_CHECK(ret = error_block_op_(coeff, dqcoeff, block_size,
&ssz, bit_depth_));
err_count += (ref_ret != ret) | (ref_ssz != ssz);
if (err_count && !err_count_total) {
first_failure = i;
}
err_count_total += err_count;
}
EXPECT_EQ(0, err_count_total)
<< "Error: Error Block Test, C output doesn't match SSE2 output. "
<< "First failed at test case " << first_failure;
}
using std::tr1::make_tuple;
#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P(
SSE2_C_COMPARE, ErrorBlockTest,
::testing::Values(
make_tuple(&vp9_highbd_block_error_sse2,
&vp9_highbd_block_error_c, VPX_BITS_10),
make_tuple(&vp9_highbd_block_error_sse2,
&vp9_highbd_block_error_c, VPX_BITS_12),
make_tuple(&vp9_highbd_block_error_sse2,
&vp9_highbd_block_error_c, VPX_BITS_8)));
#endif // HAVE_SSE2
#endif // CONFIG_VP9_HIGHBITDEPTH
} // namespace


@ -75,6 +75,16 @@ void iwht4x4_10(const tran_low_t *in, uint8_t *out, int stride) {
void iwht4x4_12(const tran_low_t *in, uint8_t *out, int stride) {
vp9_highbd_iwht4x4_16_add_c(in, out, stride, 12);
}
#if HAVE_SSE2
void idct4x4_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
vp9_highbd_idct4x4_16_add_sse2(in, out, stride, 10);
}
void idct4x4_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
vp9_highbd_idct4x4_16_add_sse2(in, out, stride, 12);
}
#endif
#endif
class Trans4x4TestBase {
@ -496,4 +506,31 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 3, VPX_BITS_8)));
#endif
#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(
SSE2, Trans4x4DCT,
::testing::Values(
make_tuple(&vp9_highbd_fdct4x4_c, &idct4x4_10_sse2, 0, VPX_BITS_10),
make_tuple(&vp9_highbd_fdct4x4_sse2, &idct4x4_10_sse2, 0, VPX_BITS_10),
make_tuple(&vp9_highbd_fdct4x4_c, &idct4x4_12_sse2, 0, VPX_BITS_12),
make_tuple(&vp9_highbd_fdct4x4_sse2, &idct4x4_12_sse2, 0, VPX_BITS_12),
make_tuple(&vp9_fdct4x4_sse2, &vp9_idct4x4_16_add_c, 0,
VPX_BITS_8)));
INSTANTIATE_TEST_CASE_P(
SSE2, Trans4x4HT,
::testing::Values(
make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_10, 0, VPX_BITS_10),
make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_10, 1, VPX_BITS_10),
make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_10, 2, VPX_BITS_10),
make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_10, 3, VPX_BITS_10),
make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_12, 0, VPX_BITS_12),
make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_12, 1, VPX_BITS_12),
make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_12, 2, VPX_BITS_12),
make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_12, 3, VPX_BITS_12),
make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8),
make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8),
make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8),
make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8)));
#endif
} // namespace


@ -71,6 +71,7 @@ typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
typedef std::tr1::tuple<FdctFunc, IdctFunc, int, vpx_bit_depth_t> Dct8x8Param;
typedef std::tr1::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t> Ht8x8Param;
typedef std::tr1::tuple<IdctFunc, IdctFunc, int, vpx_bit_depth_t> Idct8x8Param;
void fdct8x8_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) {
vp9_fdct8x8_c(in, out, stride);
@ -96,6 +97,32 @@ void iht8x8_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
void iht8x8_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
vp9_highbd_iht8x8_64_add_c(in, out, stride, tx_type, 12);
}
void idct8x8_10_add_10_c(const tran_low_t *in, uint8_t *out, int stride) {
vp9_highbd_idct8x8_10_add_c(in, out, stride, 10);
}
void idct8x8_10_add_12_c(const tran_low_t *in, uint8_t *out, int stride) {
vp9_highbd_idct8x8_10_add_c(in, out, stride, 12);
}
#if HAVE_SSE2
void idct8x8_10_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
vp9_highbd_idct8x8_10_add_sse2(in, out, stride, 10);
}
void idct8x8_10_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
vp9_highbd_idct8x8_10_add_sse2(in, out, stride, 12);
}
void idct8x8_64_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
vp9_highbd_idct8x8_64_add_sse2(in, out, stride, 10);
}
void idct8x8_64_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
vp9_highbd_idct8x8_64_add_sse2(in, out, stride, 12);
}
#endif
#endif
class FwdTrans8x8TestBase {
@ -146,9 +173,10 @@ class FwdTrans8x8TestBase {
memset(count_sign_block, 0, sizeof(count_sign_block));
for (int i = 0; i < count_test_block; ++i) {
// Initialize a test block with input range [-15, 15].
// Initialize a test block with input range [-mask_/16, mask_/16].
for (int j = 0; j < 64; ++j)
test_input_block[j] = (rnd.Rand8() >> 4) - (rnd.Rand8() >> 4);
test_input_block[j] = ((rnd.Rand16() & mask_) >> 4) -
((rnd.Rand16() & mask_) >> 4);
ASM_REGISTER_STATE_CHECK(
RunFwdTxfm(test_input_block, test_output_block, pitch_));
@ -188,7 +216,7 @@ class FwdTrans8x8TestBase {
#endif
for (int i = 0; i < count_test_block; ++i) {
// Initialize a test block with input range [-255, 255].
// Initialize a test block with input range [-mask_, mask_].
for (int j = 0; j < 64; ++j) {
if (bit_depth_ == VPX_BITS_8) {
src[j] = rnd.Rand8();
@ -427,6 +455,63 @@ class FwdTrans8x8TestBase {
}
}
}
void CompareInvReference(IdctFunc ref_txfm, int thresh) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
const int count_test_block = 10000;
const int eob = 12;
DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, uint8_t, ref, kNumCoeffs);
#if CONFIG_VP9_HIGHBITDEPTH
DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, uint16_t, ref16, kNumCoeffs);
#endif
const int16_t *scan = vp9_default_scan_orders[TX_8X8].scan;
for (int i = 0; i < count_test_block; ++i) {
for (int j = 0; j < kNumCoeffs; ++j) {
if (j < eob) {
// Random values below the threshold; the sign alternates with the
// block index.
coeff[scan[j]] = rnd(thresh) * (1 - 2 * (i % 2));
} else {
coeff[scan[j]] = 0;
}
if (bit_depth_ == VPX_BITS_8) {
dst[j] = 0;
ref[j] = 0;
#if CONFIG_VP9_HIGHBITDEPTH
} else {
dst16[j] = 0;
ref16[j] = 0;
#endif
}
}
if (bit_depth_ == VPX_BITS_8) {
ref_txfm(coeff, ref, pitch_);
ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
#if CONFIG_VP9_HIGHBITDEPTH
} else {
ref_txfm(coeff, CONVERT_TO_BYTEPTR(ref16), pitch_);
ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16),
pitch_));
#endif
}
for (int j = 0; j < kNumCoeffs; ++j) {
#if CONFIG_VP9_HIGHBITDEPTH
const uint32_t diff =
bit_depth_ == VPX_BITS_8 ? dst[j] - ref[j] : dst16[j] - ref16[j];
#else
const uint32_t diff = dst[j] - ref[j];
#endif
const uint32_t error = diff * diff;
EXPECT_EQ(0u, error)
<< "Error: 8x8 IDCT has error " << error
<< " at index " << j;
}
}
}
int pitch_;
int tx_type_;
FhtFunc fwd_txfm_ref;
@ -526,6 +611,38 @@ TEST_P(FwdTrans8x8HT, ExtremalCheck) {
RunExtremalCheck();
}
class InvTrans8x8DCT
: public FwdTrans8x8TestBase,
public ::testing::TestWithParam<Idct8x8Param> {
public:
virtual ~InvTrans8x8DCT() {}
virtual void SetUp() {
ref_txfm_ = GET_PARAM(0);
inv_txfm_ = GET_PARAM(1);
thresh_ = GET_PARAM(2);
pitch_ = 8;
bit_depth_ = GET_PARAM(3);
mask_ = (1 << bit_depth_) - 1;
}
virtual void TearDown() { libvpx_test::ClearSystemState(); }
protected:
void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
inv_txfm_(out, dst, stride);
}
void RunFwdTxfm(int16_t *out, tran_low_t *dst, int stride) {}
IdctFunc ref_txfm_;
IdctFunc inv_txfm_;
int thresh_;
};
TEST_P(InvTrans8x8DCT, CompareReference) {
CompareInvReference(ref_txfm_, thresh_);
}
using std::tr1::make_tuple;
#if CONFIG_VP9_HIGHBITDEPTH
@ -598,6 +715,45 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 3, VPX_BITS_8)));
#endif
#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(
SSE2, FwdTrans8x8DCT,
::testing::Values(
make_tuple(&vp9_highbd_fdct8x8_c,
&idct8x8_64_add_10_sse2, 12, VPX_BITS_10),
make_tuple(&vp9_highbd_fdct8x8_sse2,
&idct8x8_64_add_10_sse2, 12, VPX_BITS_10),
make_tuple(&vp9_highbd_fdct8x8_c,
&idct8x8_64_add_12_sse2, 12, VPX_BITS_12),
make_tuple(&vp9_highbd_fdct8x8_sse2,
&idct8x8_64_add_12_sse2, 12, VPX_BITS_12),
make_tuple(&vp9_fdct8x8_sse2, &vp9_idct8x8_64_add_c, 0, VPX_BITS_8)));
INSTANTIATE_TEST_CASE_P(
SSE2, FwdTrans8x8HT,
::testing::Values(
make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 0, VPX_BITS_8),
make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 1, VPX_BITS_8),
make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 2, VPX_BITS_8),
make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 3, VPX_BITS_8)));
// Optimizations take effect at a threshold of 6201, so we use a value close to
// that to test both branches.
INSTANTIATE_TEST_CASE_P(
SSE2, InvTrans8x8DCT,
::testing::Values(
make_tuple(&idct8x8_10_add_10_c,
&idct8x8_10_add_10_sse2, 6225, VPX_BITS_10),
make_tuple(&idct8x8_10,
&idct8x8_64_add_10_sse2, 6225, VPX_BITS_10),
make_tuple(&idct8x8_10_add_12_c,
&idct8x8_10_add_12_sse2, 6225, VPX_BITS_12),
make_tuple(&idct8x8_12,
&idct8x8_64_add_12_sse2, 6225, VPX_BITS_12)));
#endif
#if HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_VP9_HIGHBITDEPTH && \
!CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(


@ -23,6 +23,8 @@
#include "vp9/common/vp9_entropy.h"
#include "vpx/vpx_integer.h"
#define MAX_LOOP_FILTER 63
using libvpx_test::ACMRandom;
namespace {
@ -160,11 +162,18 @@ TEST_P(Loop8Test6Param, OperationCheck) {
for (int i = 0; i < count_test_block; ++i) {
int err_count = 0;
uint8_t tmp = rnd.Rand8();
// mblim <= 3 * MAX_LOOP_FILTER + 4
while (tmp > 3 * MAX_LOOP_FILTER + 4) {
tmp = rnd.Rand8();
}
DECLARE_ALIGNED(16, const uint8_t, blimit[16]) = {
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
};
tmp = rnd.Rand8();
while (tmp > MAX_LOOP_FILTER) { // lim <= MAX_LOOP_FILTER
tmp = rnd.Rand8();
}
DECLARE_ALIGNED(16, const uint8_t, limit[16]) = {
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
@ -246,11 +255,18 @@ TEST_P(Loop8Test6Param, ValueCheck) {
for (int i = 0; i < count_test_block; ++i) {
int err_count = 0;
uint8_t tmp = rnd.Rand8();
// mblim <= 3 * MAX_LOOP_FILTER + 4
while (tmp > 3 * MAX_LOOP_FILTER + 4) {
tmp = rnd.Rand8();
}
DECLARE_ALIGNED(16, const uint8_t, blimit[16]) = {
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
};
tmp = rnd.Rand8();
while (tmp > MAX_LOOP_FILTER) { // lim <= MAX_LOOP_FILTER
tmp = rnd.Rand8();
}
DECLARE_ALIGNED(16, const uint8_t, limit[16]) = {
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
@ -305,11 +321,19 @@ TEST_P(Loop8Test9Param, OperationCheck) {
for (int i = 0; i < count_test_block; ++i) {
int err_count = 0;
uint8_t tmp = rnd.Rand8();
// mblim <= 3 * MAX_LOOP_FILTER + 4
while (tmp > 3 * MAX_LOOP_FILTER + 4) {
tmp = rnd.Rand8();
}
DECLARE_ALIGNED(16, const uint8_t, blimit0[16]) = {
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
};
tmp = rnd.Rand8();
// lim <= MAX_LOOP_FILTER
while (tmp > MAX_LOOP_FILTER) {
tmp = rnd.Rand8();
}
DECLARE_ALIGNED(16, const uint8_t, limit0[16]) = {
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
@ -320,11 +344,18 @@ TEST_P(Loop8Test9Param, OperationCheck) {
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
};
tmp = rnd.Rand8();
// mblim <= 3 * MAX_LOOP_FILTER + 4
while (tmp > 3 * MAX_LOOP_FILTER + 4) {
tmp = rnd.Rand8();
}
DECLARE_ALIGNED(16, const uint8_t, blimit1[16]) = {
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
};
tmp = rnd.Rand8();
while (tmp > MAX_LOOP_FILTER) { // lim <= MAX_LOOP_FILTER
tmp = rnd.Rand8();
}
DECLARE_ALIGNED(16, const uint8_t, limit1[16]) = {
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
@ -407,11 +438,18 @@ TEST_P(Loop8Test9Param, ValueCheck) {
for (int i = 0; i < count_test_block; ++i) {
int err_count = 0;
uint8_t tmp = rnd.Rand8();
// mblim <= 3 * MAX_LOOP_FILTER + 4
while (tmp > 3 * MAX_LOOP_FILTER + 4) {
tmp = rnd.Rand8();
}
DECLARE_ALIGNED(16, const uint8_t, blimit0[16]) = {
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
};
tmp = rnd.Rand8();
while (tmp > MAX_LOOP_FILTER) { // lim <= MAX_LOOP_FILTER
tmp = rnd.Rand8();
}
DECLARE_ALIGNED(16, const uint8_t, limit0[16]) = {
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
@ -422,11 +460,18 @@ TEST_P(Loop8Test9Param, ValueCheck) {
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
};
tmp = rnd.Rand8();
// mblim <= 3 * MAX_LOOP_FILTER + 4
while (tmp > 3 * MAX_LOOP_FILTER + 4) {
tmp = rnd.Rand8();
}
DECLARE_ALIGNED(16, const uint8_t, blimit1[16]) = {
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
};
tmp = rnd.Rand8();
while (tmp > MAX_LOOP_FILTER) { // lim <= MAX_LOOP_FILTER
tmp = rnd.Rand8();
}
DECLARE_ALIGNED(16, const uint8_t, limit1[16]) = {
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp

test/masked_sad_test.cc Normal file

@ -0,0 +1,209 @@
/*
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "test/acm_random.h"
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
#include "test/util.h"
#include "./vpx_config.h"
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_entropy.h"
#include "vpx/vpx_integer.h"
using libvpx_test::ACMRandom;
namespace {
const int number_of_iterations = 500;
typedef unsigned int (*MaskedSADFunc)(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
const uint8_t *m, int m_stride);
typedef std::tr1::tuple<MaskedSADFunc, MaskedSADFunc> MaskedSADParam;
class MaskedSADTest : public ::testing::TestWithParam<MaskedSADParam> {
public:
virtual ~MaskedSADTest() {}
virtual void SetUp() {
maskedSAD_op_ = GET_PARAM(0);
ref_maskedSAD_op_ = GET_PARAM(1);
}
virtual void TearDown() { libvpx_test::ClearSystemState(); }
protected:
MaskedSADFunc maskedSAD_op_;
MaskedSADFunc ref_maskedSAD_op_;
};
TEST_P(MaskedSADTest, OperationCheck) {
unsigned int ref_ret, ret;
ACMRandom rnd(ACMRandom::DeterministicSeed());
DECLARE_ALIGNED_ARRAY(16, uint8_t, src_ptr, 4096);
DECLARE_ALIGNED_ARRAY(16, uint8_t, ref_ptr, 4096);
DECLARE_ALIGNED_ARRAY(16, uint8_t, msk_ptr, 4096);
int err_count = 0;
int first_failure = -1;
int src_stride = 64;
int ref_stride = 64;
int msk_stride = 64;
for (int i = 0; i < number_of_iterations; ++i) {
for (int j = 0; j < 4096; j++) {
src_ptr[j] = rnd.Rand8();
ref_ptr[j] = rnd.Rand8();
msk_ptr[j] = ((rnd.Rand8() & 0x7f) > 64) ? rnd.Rand8() & 0x3f : 64;
}
ref_ret = ref_maskedSAD_op_(src_ptr, src_stride, ref_ptr, ref_stride,
msk_ptr, msk_stride);
ASM_REGISTER_STATE_CHECK(ret = maskedSAD_op_(src_ptr, src_stride,
ref_ptr, ref_stride,
msk_ptr, msk_stride));
if (ret != ref_ret) {
err_count++;
if (first_failure == -1)
first_failure = i;
}
}
EXPECT_EQ(0, err_count)
<< "Error: Masked SAD Test, C output doesn't match SSSE3 output. "
<< "First failed at test case " << first_failure;
}
#if CONFIG_VP9_HIGHBITDEPTH
typedef unsigned int (*HighbdMaskedSADFunc)(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
const uint8_t *m, int m_stride);
typedef std::tr1::tuple<HighbdMaskedSADFunc, HighbdMaskedSADFunc>
HighbdMaskedSADParam;
class HighbdMaskedSADTest : public ::testing::
TestWithParam<HighbdMaskedSADParam> {
public:
virtual ~HighbdMaskedSADTest() {}
virtual void SetUp() {
maskedSAD_op_ = GET_PARAM(0);
ref_maskedSAD_op_ = GET_PARAM(1);
}
virtual void TearDown() { libvpx_test::ClearSystemState(); }
protected:
HighbdMaskedSADFunc maskedSAD_op_;
HighbdMaskedSADFunc ref_maskedSAD_op_;
};
TEST_P(HighbdMaskedSADTest, OperationCheck) {
unsigned int ref_ret, ret;
ACMRandom rnd(ACMRandom::DeterministicSeed());
DECLARE_ALIGNED_ARRAY(16, uint16_t, src_ptr, 4096);
DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_ptr, 4096);
DECLARE_ALIGNED_ARRAY(16, uint8_t, msk_ptr, 4096);
uint8_t* src8_ptr = CONVERT_TO_BYTEPTR(src_ptr);
uint8_t* ref8_ptr = CONVERT_TO_BYTEPTR(ref_ptr);
int err_count = 0;
int first_failure = -1;
int src_stride = 64;
int ref_stride = 64;
int msk_stride = 64;
for (int i = 0; i < number_of_iterations; ++i) {
for (int j = 0; j < 4096; j++) {
src_ptr[j] = rnd.Rand16()&0xfff;
ref_ptr[j] = rnd.Rand16()&0xfff;
msk_ptr[j] = ((rnd.Rand8()&0x7f) > 64) ? rnd.Rand8()&0x3f : 64;
}
ref_ret = ref_maskedSAD_op_(src8_ptr, src_stride, ref8_ptr, ref_stride,
msk_ptr, msk_stride);
ASM_REGISTER_STATE_CHECK(ret = maskedSAD_op_(src8_ptr, src_stride,
ref8_ptr, ref_stride,
msk_ptr, msk_stride));
if (ret != ref_ret) {
err_count++;
if (first_failure == -1)
first_failure = i;
}
}
EXPECT_EQ(0, err_count)
<< "Error: High BD Masked SAD Test, C output doesn't match SSSE3 output. "
<< "First failed at test case " << first_failure;
}
#endif // CONFIG_VP9_HIGHBITDEPTH
using std::tr1::make_tuple;
#if HAVE_SSSE3
INSTANTIATE_TEST_CASE_P(
SSSE3_C_COMPARE, MaskedSADTest,
::testing::Values(
make_tuple(&vp9_masked_sad64x64_ssse3,
&vp9_masked_sad64x64_c),
make_tuple(&vp9_masked_sad64x32_ssse3,
&vp9_masked_sad64x32_c),
make_tuple(&vp9_masked_sad32x64_ssse3,
&vp9_masked_sad32x64_c),
make_tuple(&vp9_masked_sad32x32_ssse3,
&vp9_masked_sad32x32_c),
make_tuple(&vp9_masked_sad32x16_ssse3,
&vp9_masked_sad32x16_c),
make_tuple(&vp9_masked_sad16x32_ssse3,
&vp9_masked_sad16x32_c),
make_tuple(&vp9_masked_sad16x16_ssse3,
&vp9_masked_sad16x16_c),
make_tuple(&vp9_masked_sad16x8_ssse3,
&vp9_masked_sad16x8_c),
make_tuple(&vp9_masked_sad8x16_ssse3,
&vp9_masked_sad8x16_c),
make_tuple(&vp9_masked_sad8x8_ssse3,
&vp9_masked_sad8x8_c),
make_tuple(&vp9_masked_sad8x4_ssse3,
&vp9_masked_sad8x4_c),
make_tuple(&vp9_masked_sad4x8_ssse3,
&vp9_masked_sad4x8_c),
make_tuple(&vp9_masked_sad4x4_ssse3,
&vp9_masked_sad4x4_c)));
#if CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(
SSSE3_C_COMPARE, HighbdMaskedSADTest,
::testing::Values(
make_tuple(&vp9_highbd_masked_sad64x64_ssse3,
&vp9_highbd_masked_sad64x64_c),
make_tuple(&vp9_highbd_masked_sad64x32_ssse3,
&vp9_highbd_masked_sad64x32_c),
make_tuple(&vp9_highbd_masked_sad32x64_ssse3,
&vp9_highbd_masked_sad32x64_c),
make_tuple(&vp9_highbd_masked_sad32x32_ssse3,
&vp9_highbd_masked_sad32x32_c),
make_tuple(&vp9_highbd_masked_sad32x16_ssse3,
&vp9_highbd_masked_sad32x16_c),
make_tuple(&vp9_highbd_masked_sad16x32_ssse3,
&vp9_highbd_masked_sad16x32_c),
make_tuple(&vp9_highbd_masked_sad16x16_ssse3,
&vp9_highbd_masked_sad16x16_c),
make_tuple(&vp9_highbd_masked_sad16x8_ssse3,
&vp9_highbd_masked_sad16x8_c),
make_tuple(&vp9_highbd_masked_sad8x16_ssse3,
&vp9_highbd_masked_sad8x16_c),
make_tuple(&vp9_highbd_masked_sad8x8_ssse3,
&vp9_highbd_masked_sad8x8_c),
make_tuple(&vp9_highbd_masked_sad8x4_ssse3,
&vp9_highbd_masked_sad8x4_c),
make_tuple(&vp9_highbd_masked_sad4x8_ssse3,
&vp9_highbd_masked_sad4x8_c),
make_tuple(&vp9_highbd_masked_sad4x4_ssse3,
&vp9_highbd_masked_sad4x4_c)));
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // HAVE_SSSE3
} // namespace
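The file above compares each SSSE3 masked SAD kernel against its C reference. As a standalone illustration of the arithmetic under test, here is a hypothetical scalar sketch. It assumes, as the mask generation above suggests, that each absolute difference is weighted by a 6-bit mask value in [0, 64] and the accumulated total is rounded back down by 6 bits; the shipped `vp9_masked_sad*_c` functions remain the authoritative definition.

```cpp
#include <cassert>
#include <cstdint>
#include <cstdlib>

// Hypothetical scalar reference for a masked SAD. Assumption: each
// |a - b| is weighted by a 6-bit mask value (0..64) and the accumulated
// total is rounded back down by 6 bits. The library's vp9_masked_sad*_c
// functions are the authoritative definition.
unsigned int masked_sad_sketch(const uint8_t *a, int a_stride,
                               const uint8_t *b, int b_stride,
                               const uint8_t *m, int m_stride,
                               int width, int height) {
  unsigned int sad = 0;
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x)
      sad += m[x] * static_cast<unsigned int>(abs(a[x] - b[x]));
    a += a_stride;
    b += b_stride;
    m += m_stride;
  }
  return (sad + 31) >> 6;  // undo the 6-bit mask scaling, rounding
}
```

A fully weighted pixel (m = 64) contributes its whole absolute difference, while m = 0 removes the pixel from the sum, which is exactly the range the random masks above probe.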

@@ -0,0 +1,753 @@
/*
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "test/acm_random.h"
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
#include "test/util.h"
#include "./vpx_config.h"
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_entropy.h"
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_filter.h"
#define MAX_SIZE 64
using libvpx_test::ACMRandom;
namespace {
const int number_of_iterations = 500;
typedef unsigned int (*MaskedVarianceFunc)(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
const uint8_t *m, int m_stride,
unsigned int *sse);
typedef std::tr1::tuple<MaskedVarianceFunc,
MaskedVarianceFunc> MaskedVarianceParam;
class MaskedVarianceTest :
public ::testing::TestWithParam<MaskedVarianceParam> {
public:
virtual ~MaskedVarianceTest() {}
virtual void SetUp() {
opt_func_ = GET_PARAM(0);
ref_func_ = GET_PARAM(1);
}
virtual void TearDown() { libvpx_test::ClearSystemState(); }
protected:
MaskedVarianceFunc opt_func_;
MaskedVarianceFunc ref_func_;
};
TEST_P(MaskedVarianceTest, OperationCheck) {
unsigned int ref_ret, opt_ret;
unsigned int ref_sse, opt_sse;
ACMRandom rnd(ACMRandom::DeterministicSeed());
DECLARE_ALIGNED_ARRAY(16, uint8_t, src_ptr, MAX_SIZE*MAX_SIZE);
DECLARE_ALIGNED_ARRAY(16, uint8_t, ref_ptr, MAX_SIZE*MAX_SIZE);
DECLARE_ALIGNED_ARRAY(16, uint8_t, msk_ptr, MAX_SIZE*MAX_SIZE);
int err_count = 0;
int first_failure = -1;
int src_stride = MAX_SIZE;
int ref_stride = MAX_SIZE;
int msk_stride = MAX_SIZE;
for (int i = 0; i < number_of_iterations; ++i) {
for (int j = 0; j < MAX_SIZE*MAX_SIZE; j++) {
src_ptr[j] = rnd.Rand8();
ref_ptr[j] = rnd.Rand8();
msk_ptr[j] = rnd(65);
}
ref_ret = ref_func_(src_ptr, src_stride,
ref_ptr, ref_stride,
msk_ptr, msk_stride,
&ref_sse);
ASM_REGISTER_STATE_CHECK(opt_ret = opt_func_(src_ptr, src_stride,
ref_ptr, ref_stride,
msk_ptr, msk_stride,
&opt_sse));
if (opt_ret != ref_ret || opt_sse != ref_sse) {
err_count++;
if (first_failure == -1)
first_failure = i;
}
}
EXPECT_EQ(0, err_count)
<< "Error: Masked Variance Test OperationCheck, "
<< "C output doesn't match SSSE3 output. "
<< "First failed at test case " << first_failure;
}
TEST_P(MaskedVarianceTest, ExtremeValues) {
unsigned int ref_ret, opt_ret;
unsigned int ref_sse, opt_sse;
ACMRandom rnd(ACMRandom::DeterministicSeed());
DECLARE_ALIGNED_ARRAY(16, uint8_t, src_ptr, MAX_SIZE*MAX_SIZE);
DECLARE_ALIGNED_ARRAY(16, uint8_t, ref_ptr, MAX_SIZE*MAX_SIZE);
DECLARE_ALIGNED_ARRAY(16, uint8_t, msk_ptr, MAX_SIZE*MAX_SIZE);
int err_count = 0;
int first_failure = -1;
int src_stride = MAX_SIZE;
int ref_stride = MAX_SIZE;
int msk_stride = MAX_SIZE;
for (int i = 0; i < 8; ++i) {
memset(src_ptr, (i & 0x1) ? 255 : 0, MAX_SIZE*MAX_SIZE);
memset(ref_ptr, (i & 0x2) ? 255 : 0, MAX_SIZE*MAX_SIZE);
memset(msk_ptr, (i & 0x4) ? 64 : 0, MAX_SIZE*MAX_SIZE);
ref_ret = ref_func_(src_ptr, src_stride,
ref_ptr, ref_stride,
msk_ptr, msk_stride,
&ref_sse);
ASM_REGISTER_STATE_CHECK(opt_ret = opt_func_(src_ptr, src_stride,
ref_ptr, ref_stride,
msk_ptr, msk_stride,
&opt_sse));
if (opt_ret != ref_ret || opt_sse != ref_sse) {
err_count++;
if (first_failure == -1)
first_failure = i;
}
}
EXPECT_EQ(0, err_count)
<< "Error: Masked Variance Test ExtremeValues, "
<< "C output doesn't match SSSE3 output. "
<< "First failed at test case " << first_failure;
}
typedef unsigned int (*MaskedSubPixelVarianceFunc)(
const uint8_t *a, int a_stride,
int xoffset, int yoffset,
const uint8_t *b, int b_stride,
const uint8_t *m, int m_stride,
unsigned int *sse);
typedef std::tr1::tuple<MaskedSubPixelVarianceFunc,
MaskedSubPixelVarianceFunc> MaskedSubPixelVarianceParam;
class MaskedSubPixelVarianceTest :
public ::testing::TestWithParam<MaskedSubPixelVarianceParam> {
public:
virtual ~MaskedSubPixelVarianceTest() {}
virtual void SetUp() {
opt_func_ = GET_PARAM(0);
ref_func_ = GET_PARAM(1);
}
virtual void TearDown() { libvpx_test::ClearSystemState(); }
protected:
MaskedSubPixelVarianceFunc opt_func_;
MaskedSubPixelVarianceFunc ref_func_;
};
TEST_P(MaskedSubPixelVarianceTest, OperationCheck) {
unsigned int ref_ret, opt_ret;
unsigned int ref_sse, opt_sse;
ACMRandom rnd(ACMRandom::DeterministicSeed());
DECLARE_ALIGNED_ARRAY(16, uint8_t, src_ptr, (MAX_SIZE+1)*(MAX_SIZE+1));
DECLARE_ALIGNED_ARRAY(16, uint8_t, ref_ptr, (MAX_SIZE+1)*(MAX_SIZE+1));
DECLARE_ALIGNED_ARRAY(16, uint8_t, msk_ptr, (MAX_SIZE+1)*(MAX_SIZE+1));
int err_count = 0;
int first_failure = -1;
int src_stride = (MAX_SIZE+1);
int ref_stride = (MAX_SIZE+1);
int msk_stride = (MAX_SIZE+1);
int xoffset;
int yoffset;
for (int i = 0; i < number_of_iterations; ++i) {
int xoffsets[] = {0, 8, rnd(SUBPEL_SHIFTS)};
int yoffsets[] = {0, 8, rnd(SUBPEL_SHIFTS)};
for (int j = 0; j < (MAX_SIZE+1)*(MAX_SIZE+1); j++) {
src_ptr[j] = rnd.Rand8();
ref_ptr[j] = rnd.Rand8();
msk_ptr[j] = rnd(65);
}
for (int k = 0; k < 3; k++) {
for (int l = 0; l < 3; l++) {
xoffset = xoffsets[k];
yoffset = yoffsets[l];
ref_ret = ref_func_(src_ptr, src_stride,
xoffset, yoffset,
ref_ptr, ref_stride,
msk_ptr, msk_stride,
&ref_sse);
ASM_REGISTER_STATE_CHECK(opt_ret = opt_func_(src_ptr, src_stride,
xoffset, yoffset,
ref_ptr, ref_stride,
msk_ptr, msk_stride,
&opt_sse));
if (opt_ret != ref_ret || opt_sse != ref_sse) {
err_count++;
if (first_failure == -1)
first_failure = i;
}
}
}
}
EXPECT_EQ(0, err_count)
<< "Error: Masked Sub Pixel Variance Test OperationCheck, "
<< "C output doesn't match SSSE3 output. "
<< "First failed at test case " << first_failure;
}
TEST_P(MaskedSubPixelVarianceTest, ExtremeValues) {
unsigned int ref_ret, opt_ret;
unsigned int ref_sse, opt_sse;
ACMRandom rnd(ACMRandom::DeterministicSeed());
DECLARE_ALIGNED_ARRAY(16, uint8_t, src_ptr, (MAX_SIZE+1)*(MAX_SIZE+1));
DECLARE_ALIGNED_ARRAY(16, uint8_t, ref_ptr, (MAX_SIZE+1)*(MAX_SIZE+1));
DECLARE_ALIGNED_ARRAY(16, uint8_t, msk_ptr, (MAX_SIZE+1)*(MAX_SIZE+1));
int first_failure_x = -1;
int first_failure_y = -1;
int err_count = 0;
int first_failure = -1;
int src_stride = (MAX_SIZE+1);
int ref_stride = (MAX_SIZE+1);
int msk_stride = (MAX_SIZE+1);
for (int xoffset = 0 ; xoffset < SUBPEL_SHIFTS ; xoffset++) {
for (int yoffset = 0 ; yoffset < SUBPEL_SHIFTS ; yoffset++) {
for (int i = 0; i < 8; ++i) {
memset(src_ptr, (i & 0x1) ? 255 : 0, (MAX_SIZE+1)*(MAX_SIZE+1));
memset(ref_ptr, (i & 0x2) ? 255 : 0, (MAX_SIZE+1)*(MAX_SIZE+1));
memset(msk_ptr, (i & 0x4) ? 64 : 0, (MAX_SIZE+1)*(MAX_SIZE+1));
ref_ret = ref_func_(src_ptr, src_stride,
xoffset, yoffset,
ref_ptr, ref_stride,
msk_ptr, msk_stride,
&ref_sse);
ASM_REGISTER_STATE_CHECK(opt_ret = opt_func_(src_ptr, src_stride,
xoffset, yoffset,
ref_ptr, ref_stride,
msk_ptr, msk_stride,
&opt_sse));
if (opt_ret != ref_ret || opt_sse != ref_sse) {
err_count++;
if (first_failure == -1) {
first_failure = i;
first_failure_x = xoffset;
first_failure_y = yoffset;
}
}
}
}
}
EXPECT_EQ(0, err_count)
<< "Error: Masked Sub Pixel Variance Test ExtremeValues, "
<< "C output doesn't match SSSE3 output. "
<< "First failed at test case " << first_failure
<< " x_offset = " << first_failure_x
<< " y_offset = " << first_failure_y;
}
#if CONFIG_VP9_HIGHBITDEPTH
typedef std::tr1::tuple<MaskedVarianceFunc,
MaskedVarianceFunc,
vpx_bit_depth_t> HighbdMaskedVarianceParam;
class HighbdMaskedVarianceTest :
public ::testing::TestWithParam<HighbdMaskedVarianceParam> {
public:
virtual ~HighbdMaskedVarianceTest() {}
virtual void SetUp() {
opt_func_ = GET_PARAM(0);
ref_func_ = GET_PARAM(1);
bit_depth_ = GET_PARAM(2);
}
virtual void TearDown() { libvpx_test::ClearSystemState(); }
protected:
MaskedVarianceFunc opt_func_;
MaskedVarianceFunc ref_func_;
vpx_bit_depth_t bit_depth_;
};
TEST_P(HighbdMaskedVarianceTest, OperationCheck) {
unsigned int ref_ret, opt_ret;
unsigned int ref_sse, opt_sse;
ACMRandom rnd(ACMRandom::DeterministicSeed());
DECLARE_ALIGNED_ARRAY(16, uint16_t, src_ptr, MAX_SIZE*MAX_SIZE);
DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_ptr, MAX_SIZE*MAX_SIZE);
DECLARE_ALIGNED_ARRAY(16, uint8_t, msk_ptr, MAX_SIZE*MAX_SIZE);
uint8_t* src8_ptr = CONVERT_TO_BYTEPTR(src_ptr);
uint8_t* ref8_ptr = CONVERT_TO_BYTEPTR(ref_ptr);
int err_count = 0;
int first_failure = -1;
int src_stride = MAX_SIZE;
int ref_stride = MAX_SIZE;
int msk_stride = MAX_SIZE;
for (int i = 0; i < number_of_iterations; ++i) {
for (int j = 0; j < MAX_SIZE*MAX_SIZE; j++) {
src_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1);
ref_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1);
msk_ptr[j] = rnd(65);
}
ref_ret = ref_func_(src8_ptr, src_stride,
ref8_ptr, ref_stride,
msk_ptr, msk_stride,
&ref_sse);
ASM_REGISTER_STATE_CHECK(opt_ret = opt_func_(src8_ptr, src_stride,
ref8_ptr, ref_stride,
msk_ptr, msk_stride,
&opt_sse));
if (opt_ret != ref_ret || opt_sse != ref_sse) {
err_count++;
if (first_failure == -1)
first_failure = i;
}
}
EXPECT_EQ(0, err_count)
<< "Error: High BD Masked Variance Test OperationCheck, "
<< "C output doesn't match SSSE3 output. "
<< "First failed at test case " << first_failure;
}
TEST_P(HighbdMaskedVarianceTest, ExtremeValues) {
unsigned int ref_ret, opt_ret;
unsigned int ref_sse, opt_sse;
ACMRandom rnd(ACMRandom::DeterministicSeed());
DECLARE_ALIGNED_ARRAY(16, uint16_t, src_ptr, MAX_SIZE*MAX_SIZE);
DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_ptr, MAX_SIZE*MAX_SIZE);
DECLARE_ALIGNED_ARRAY(16, uint8_t, msk_ptr, MAX_SIZE*MAX_SIZE);
uint8_t* src8_ptr = CONVERT_TO_BYTEPTR(src_ptr);
uint8_t* ref8_ptr = CONVERT_TO_BYTEPTR(ref_ptr);
int err_count = 0;
int first_failure = -1;
int src_stride = MAX_SIZE;
int ref_stride = MAX_SIZE;
int msk_stride = MAX_SIZE;
for (int i = 0; i < 8; ++i) {
vpx_memset16(src_ptr, (i & 0x1) ? ((1 << bit_depth_) - 1) : 0,
MAX_SIZE*MAX_SIZE);
vpx_memset16(ref_ptr, (i & 0x2) ? ((1 << bit_depth_) - 1) : 0,
MAX_SIZE*MAX_SIZE);
memset(msk_ptr, (i & 0x4) ? 64 : 0, MAX_SIZE*MAX_SIZE);
ref_ret = ref_func_(src8_ptr, src_stride,
ref8_ptr, ref_stride,
msk_ptr, msk_stride,
&ref_sse);
ASM_REGISTER_STATE_CHECK(opt_ret = opt_func_(src8_ptr, src_stride,
ref8_ptr, ref_stride,
msk_ptr, msk_stride,
&opt_sse));
if (opt_ret != ref_ret || opt_sse != ref_sse) {
err_count++;
if (first_failure == -1)
first_failure = i;
}
}
EXPECT_EQ(0, err_count)
<< "Error: High BD Masked Variance Test ExtremeValues, "
<< "C output doesn't match SSSE3 output. "
<< "First failed at test case " << first_failure;
}
typedef std::tr1::tuple<MaskedSubPixelVarianceFunc,
MaskedSubPixelVarianceFunc,
vpx_bit_depth_t> HighbdMaskedSubPixelVarianceParam;
class HighbdMaskedSubPixelVarianceTest :
public ::testing::TestWithParam<HighbdMaskedSubPixelVarianceParam> {
public:
virtual ~HighbdMaskedSubPixelVarianceTest() {}
virtual void SetUp() {
opt_func_ = GET_PARAM(0);
ref_func_ = GET_PARAM(1);
bit_depth_ = GET_PARAM(2);
}
virtual void TearDown() { libvpx_test::ClearSystemState(); }
protected:
MaskedSubPixelVarianceFunc opt_func_;
MaskedSubPixelVarianceFunc ref_func_;
vpx_bit_depth_t bit_depth_;
};
TEST_P(HighbdMaskedSubPixelVarianceTest, OperationCheck) {
unsigned int ref_ret, opt_ret;
unsigned int ref_sse, opt_sse;
ACMRandom rnd(ACMRandom::DeterministicSeed());
DECLARE_ALIGNED_ARRAY(16, uint16_t, src_ptr, (MAX_SIZE+1)*(MAX_SIZE+1));
DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_ptr, (MAX_SIZE+1)*(MAX_SIZE+1));
DECLARE_ALIGNED_ARRAY(16, uint8_t, msk_ptr, (MAX_SIZE+1)*(MAX_SIZE+1));
uint8_t* src8_ptr = CONVERT_TO_BYTEPTR(src_ptr);
uint8_t* ref8_ptr = CONVERT_TO_BYTEPTR(ref_ptr);
int err_count = 0;
int first_failure = -1;
int first_failure_x = -1;
int first_failure_y = -1;
int src_stride = (MAX_SIZE+1);
int ref_stride = (MAX_SIZE+1);
int msk_stride = (MAX_SIZE+1);
int xoffset, yoffset;
for (int i = 0; i < number_of_iterations; ++i) {
for (xoffset = 0; xoffset < SUBPEL_SHIFTS; xoffset++) {
for (yoffset = 0; yoffset < SUBPEL_SHIFTS; yoffset++) {
for (int j = 0; j < (MAX_SIZE+1)*(MAX_SIZE+1); j++) {
src_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1);
ref_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1);
msk_ptr[j] = rnd(65);
}
ref_ret = ref_func_(src8_ptr, src_stride,
xoffset, yoffset,
ref8_ptr, ref_stride,
msk_ptr, msk_stride,
&ref_sse);
ASM_REGISTER_STATE_CHECK(opt_ret = opt_func_(src8_ptr, src_stride,
xoffset, yoffset,
ref8_ptr, ref_stride,
msk_ptr, msk_stride,
&opt_sse));
if (opt_ret != ref_ret || opt_sse != ref_sse) {
err_count++;
if (first_failure == -1) {
first_failure = i;
first_failure_x = xoffset;
first_failure_y = yoffset;
}
}
}
}
}
EXPECT_EQ(0, err_count)
<< "Error: High BD Masked Sub Pixel Variance Test OperationCheck, "
<< "C output doesn't match SSSE3 output. "
<< "First failed at test case " << first_failure
<< " x_offset = " << first_failure_x
<< " y_offset = " << first_failure_y;
}
TEST_P(HighbdMaskedSubPixelVarianceTest, ExtremeValues) {
unsigned int ref_ret, opt_ret;
unsigned int ref_sse, opt_sse;
ACMRandom rnd(ACMRandom::DeterministicSeed());
DECLARE_ALIGNED_ARRAY(16, uint16_t, src_ptr, (MAX_SIZE+1)*(MAX_SIZE+1));
DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_ptr, (MAX_SIZE+1)*(MAX_SIZE+1));
DECLARE_ALIGNED_ARRAY(16, uint8_t, msk_ptr, (MAX_SIZE+1)*(MAX_SIZE+1));
uint8_t* src8_ptr = CONVERT_TO_BYTEPTR(src_ptr);
uint8_t* ref8_ptr = CONVERT_TO_BYTEPTR(ref_ptr);
int first_failure_x = -1;
int first_failure_y = -1;
int err_count = 0;
int first_failure = -1;
int src_stride = (MAX_SIZE+1);
int ref_stride = (MAX_SIZE+1);
int msk_stride = (MAX_SIZE+1);
for (int xoffset = 0 ; xoffset < SUBPEL_SHIFTS ; xoffset++) {
for (int yoffset = 0 ; yoffset < SUBPEL_SHIFTS ; yoffset++) {
for (int i = 0; i < 8; ++i) {
vpx_memset16(src_ptr, (i & 0x1) ? ((1 << bit_depth_) - 1) : 0,
(MAX_SIZE+1)*(MAX_SIZE+1));
vpx_memset16(ref_ptr, (i & 0x2) ? ((1 << bit_depth_) - 1) : 0,
(MAX_SIZE+1)*(MAX_SIZE+1));
memset(msk_ptr, (i & 0x4) ? 64 : 0, (MAX_SIZE+1)*(MAX_SIZE+1));
ref_ret = ref_func_(src8_ptr, src_stride,
xoffset, yoffset,
ref8_ptr, ref_stride,
msk_ptr, msk_stride,
&ref_sse);
ASM_REGISTER_STATE_CHECK(opt_ret = opt_func_(src8_ptr, src_stride,
xoffset, yoffset,
ref8_ptr, ref_stride,
msk_ptr, msk_stride,
&opt_sse));
if (opt_ret != ref_ret || opt_sse != ref_sse) {
err_count++;
if (first_failure == -1) {
first_failure = i;
first_failure_x = xoffset;
first_failure_y = yoffset;
}
}
}
}
}
EXPECT_EQ(0, err_count)
<< "Error: High BD Masked Sub Pixel Variance Test ExtremeValues, "
<< "C output doesn't match SSSE3 output. "
<< "First failed at test case " << first_failure
<< " x_offset = " << first_failure_x
<< " y_offset = " << first_failure_y;
}
#endif // CONFIG_VP9_HIGHBITDEPTH
using std::tr1::make_tuple;
#if HAVE_SSSE3
INSTANTIATE_TEST_CASE_P(
SSSE3_C_COMPARE, MaskedVarianceTest,
::testing::Values(
make_tuple(&vp9_masked_variance64x64_ssse3,
&vp9_masked_variance64x64_c),
make_tuple(&vp9_masked_variance64x32_ssse3,
&vp9_masked_variance64x32_c),
make_tuple(&vp9_masked_variance32x64_ssse3,
&vp9_masked_variance32x64_c),
make_tuple(&vp9_masked_variance32x32_ssse3,
&vp9_masked_variance32x32_c),
make_tuple(&vp9_masked_variance32x16_ssse3,
&vp9_masked_variance32x16_c),
make_tuple(&vp9_masked_variance16x32_ssse3,
&vp9_masked_variance16x32_c),
make_tuple(&vp9_masked_variance16x16_ssse3,
&vp9_masked_variance16x16_c),
make_tuple(&vp9_masked_variance16x8_ssse3,
&vp9_masked_variance16x8_c),
make_tuple(&vp9_masked_variance8x16_ssse3,
&vp9_masked_variance8x16_c),
make_tuple(&vp9_masked_variance8x8_ssse3,
&vp9_masked_variance8x8_c),
make_tuple(&vp9_masked_variance8x4_ssse3,
&vp9_masked_variance8x4_c),
make_tuple(&vp9_masked_variance4x8_ssse3,
&vp9_masked_variance4x8_c),
make_tuple(&vp9_masked_variance4x4_ssse3,
&vp9_masked_variance4x4_c)));
INSTANTIATE_TEST_CASE_P(
SSSE3_C_COMPARE, MaskedSubPixelVarianceTest,
::testing::Values(
make_tuple(&vp9_masked_sub_pixel_variance64x64_ssse3,
&vp9_masked_sub_pixel_variance64x64_c),
make_tuple(&vp9_masked_sub_pixel_variance64x32_ssse3,
&vp9_masked_sub_pixel_variance64x32_c),
make_tuple(&vp9_masked_sub_pixel_variance32x64_ssse3,
&vp9_masked_sub_pixel_variance32x64_c),
make_tuple(&vp9_masked_sub_pixel_variance32x32_ssse3,
&vp9_masked_sub_pixel_variance32x32_c),
make_tuple(&vp9_masked_sub_pixel_variance32x16_ssse3,
&vp9_masked_sub_pixel_variance32x16_c),
make_tuple(&vp9_masked_sub_pixel_variance16x32_ssse3,
&vp9_masked_sub_pixel_variance16x32_c),
make_tuple(&vp9_masked_sub_pixel_variance16x16_ssse3,
&vp9_masked_sub_pixel_variance16x16_c),
make_tuple(&vp9_masked_sub_pixel_variance16x8_ssse3,
&vp9_masked_sub_pixel_variance16x8_c),
make_tuple(&vp9_masked_sub_pixel_variance8x16_ssse3,
&vp9_masked_sub_pixel_variance8x16_c),
make_tuple(&vp9_masked_sub_pixel_variance8x8_ssse3,
&vp9_masked_sub_pixel_variance8x8_c),
make_tuple(&vp9_masked_sub_pixel_variance8x4_ssse3,
&vp9_masked_sub_pixel_variance8x4_c),
make_tuple(&vp9_masked_sub_pixel_variance4x8_ssse3,
&vp9_masked_sub_pixel_variance4x8_c),
make_tuple(&vp9_masked_sub_pixel_variance4x4_ssse3,
&vp9_masked_sub_pixel_variance4x4_c)));
#if CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(
SSSE3_C_COMPARE, HighbdMaskedVarianceTest,
::testing::Values(
make_tuple(&vp9_highbd_masked_variance64x64_ssse3,
&vp9_highbd_masked_variance64x64_c, VPX_BITS_8),
make_tuple(&vp9_highbd_masked_variance64x32_ssse3,
&vp9_highbd_masked_variance64x32_c, VPX_BITS_8),
make_tuple(&vp9_highbd_masked_variance32x64_ssse3,
&vp9_highbd_masked_variance32x64_c, VPX_BITS_8),
make_tuple(&vp9_highbd_masked_variance32x32_ssse3,
&vp9_highbd_masked_variance32x32_c, VPX_BITS_8),
make_tuple(&vp9_highbd_masked_variance32x16_ssse3,
&vp9_highbd_masked_variance32x16_c, VPX_BITS_8),
make_tuple(&vp9_highbd_masked_variance16x32_ssse3,
&vp9_highbd_masked_variance16x32_c, VPX_BITS_8),
make_tuple(&vp9_highbd_masked_variance16x16_ssse3,
&vp9_highbd_masked_variance16x16_c, VPX_BITS_8),
make_tuple(&vp9_highbd_masked_variance16x8_ssse3,
&vp9_highbd_masked_variance16x8_c, VPX_BITS_8),
make_tuple(&vp9_highbd_masked_variance8x16_ssse3,
&vp9_highbd_masked_variance8x16_c, VPX_BITS_8),
make_tuple(&vp9_highbd_masked_variance8x8_ssse3,
&vp9_highbd_masked_variance8x8_c, VPX_BITS_8),
make_tuple(&vp9_highbd_masked_variance8x4_ssse3,
&vp9_highbd_masked_variance8x4_c, VPX_BITS_8),
make_tuple(&vp9_highbd_masked_variance4x8_ssse3,
&vp9_highbd_masked_variance4x8_c, VPX_BITS_8),
make_tuple(&vp9_highbd_masked_variance4x4_ssse3,
&vp9_highbd_masked_variance4x4_c, VPX_BITS_8),
make_tuple(&vp9_highbd_10_masked_variance64x64_ssse3,
&vp9_highbd_10_masked_variance64x64_c, VPX_BITS_10),
make_tuple(&vp9_highbd_10_masked_variance64x32_ssse3,
&vp9_highbd_10_masked_variance64x32_c, VPX_BITS_10),
make_tuple(&vp9_highbd_10_masked_variance32x64_ssse3,
&vp9_highbd_10_masked_variance32x64_c, VPX_BITS_10),
make_tuple(&vp9_highbd_10_masked_variance32x32_ssse3,
&vp9_highbd_10_masked_variance32x32_c, VPX_BITS_10),
make_tuple(&vp9_highbd_10_masked_variance32x16_ssse3,
&vp9_highbd_10_masked_variance32x16_c, VPX_BITS_10),
make_tuple(&vp9_highbd_10_masked_variance16x32_ssse3,
&vp9_highbd_10_masked_variance16x32_c, VPX_BITS_10),
make_tuple(&vp9_highbd_10_masked_variance16x16_ssse3,
&vp9_highbd_10_masked_variance16x16_c, VPX_BITS_10),
make_tuple(&vp9_highbd_10_masked_variance16x8_ssse3,
&vp9_highbd_10_masked_variance16x8_c, VPX_BITS_10),
make_tuple(&vp9_highbd_10_masked_variance8x16_ssse3,
&vp9_highbd_10_masked_variance8x16_c, VPX_BITS_10),
make_tuple(&vp9_highbd_10_masked_variance8x8_ssse3,
&vp9_highbd_10_masked_variance8x8_c, VPX_BITS_10),
make_tuple(&vp9_highbd_10_masked_variance8x4_ssse3,
&vp9_highbd_10_masked_variance8x4_c, VPX_BITS_10),
make_tuple(&vp9_highbd_10_masked_variance4x8_ssse3,
&vp9_highbd_10_masked_variance4x8_c, VPX_BITS_10),
make_tuple(&vp9_highbd_10_masked_variance4x4_ssse3,
&vp9_highbd_10_masked_variance4x4_c, VPX_BITS_10),
make_tuple(&vp9_highbd_12_masked_variance64x64_ssse3,
&vp9_highbd_12_masked_variance64x64_c, VPX_BITS_12),
make_tuple(&vp9_highbd_12_masked_variance64x32_ssse3,
&vp9_highbd_12_masked_variance64x32_c, VPX_BITS_12),
make_tuple(&vp9_highbd_12_masked_variance32x64_ssse3,
&vp9_highbd_12_masked_variance32x64_c, VPX_BITS_12),
make_tuple(&vp9_highbd_12_masked_variance32x32_ssse3,
&vp9_highbd_12_masked_variance32x32_c, VPX_BITS_12),
make_tuple(&vp9_highbd_12_masked_variance32x16_ssse3,
&vp9_highbd_12_masked_variance32x16_c, VPX_BITS_12),
make_tuple(&vp9_highbd_12_masked_variance16x32_ssse3,
&vp9_highbd_12_masked_variance16x32_c, VPX_BITS_12),
make_tuple(&vp9_highbd_12_masked_variance16x16_ssse3,
&vp9_highbd_12_masked_variance16x16_c, VPX_BITS_12),
make_tuple(&vp9_highbd_12_masked_variance16x8_ssse3,
&vp9_highbd_12_masked_variance16x8_c, VPX_BITS_12),
make_tuple(&vp9_highbd_12_masked_variance8x16_ssse3,
&vp9_highbd_12_masked_variance8x16_c, VPX_BITS_12),
make_tuple(&vp9_highbd_12_masked_variance8x8_ssse3,
&vp9_highbd_12_masked_variance8x8_c, VPX_BITS_12),
make_tuple(&vp9_highbd_12_masked_variance8x4_ssse3,
&vp9_highbd_12_masked_variance8x4_c, VPX_BITS_12),
make_tuple(&vp9_highbd_12_masked_variance4x8_ssse3,
&vp9_highbd_12_masked_variance4x8_c, VPX_BITS_12),
make_tuple(&vp9_highbd_12_masked_variance4x4_ssse3,
&vp9_highbd_12_masked_variance4x4_c, VPX_BITS_12)));
INSTANTIATE_TEST_CASE_P(
SSSE3_C_COMPARE, HighbdMaskedSubPixelVarianceTest,
::testing::Values(
make_tuple(&vp9_highbd_masked_sub_pixel_variance64x64_ssse3,
&vp9_highbd_masked_sub_pixel_variance64x64_c, VPX_BITS_8),
make_tuple(&vp9_highbd_masked_sub_pixel_variance64x32_ssse3,
&vp9_highbd_masked_sub_pixel_variance64x32_c, VPX_BITS_8),
make_tuple(&vp9_highbd_masked_sub_pixel_variance32x64_ssse3,
&vp9_highbd_masked_sub_pixel_variance32x64_c, VPX_BITS_8),
make_tuple(&vp9_highbd_masked_sub_pixel_variance32x32_ssse3,
&vp9_highbd_masked_sub_pixel_variance32x32_c, VPX_BITS_8),
make_tuple(&vp9_highbd_masked_sub_pixel_variance32x16_ssse3,
&vp9_highbd_masked_sub_pixel_variance32x16_c, VPX_BITS_8),
make_tuple(&vp9_highbd_masked_sub_pixel_variance16x32_ssse3,
&vp9_highbd_masked_sub_pixel_variance16x32_c, VPX_BITS_8),
make_tuple(&vp9_highbd_masked_sub_pixel_variance16x16_ssse3,
&vp9_highbd_masked_sub_pixel_variance16x16_c, VPX_BITS_8),
make_tuple(&vp9_highbd_masked_sub_pixel_variance16x8_ssse3,
&vp9_highbd_masked_sub_pixel_variance16x8_c, VPX_BITS_8),
make_tuple(&vp9_highbd_masked_sub_pixel_variance8x16_ssse3,
&vp9_highbd_masked_sub_pixel_variance8x16_c, VPX_BITS_8),
make_tuple(&vp9_highbd_masked_sub_pixel_variance8x8_ssse3,
&vp9_highbd_masked_sub_pixel_variance8x8_c, VPX_BITS_8),
make_tuple(&vp9_highbd_masked_sub_pixel_variance8x4_ssse3,
&vp9_highbd_masked_sub_pixel_variance8x4_c, VPX_BITS_8),
make_tuple(&vp9_highbd_masked_sub_pixel_variance4x8_ssse3,
&vp9_highbd_masked_sub_pixel_variance4x8_c, VPX_BITS_8),
make_tuple(&vp9_highbd_masked_sub_pixel_variance4x4_ssse3,
&vp9_highbd_masked_sub_pixel_variance4x4_c, VPX_BITS_8),
make_tuple(&vp9_highbd_10_masked_sub_pixel_variance64x64_ssse3,
&vp9_highbd_10_masked_sub_pixel_variance64x64_c, VPX_BITS_10),
make_tuple(&vp9_highbd_10_masked_sub_pixel_variance64x32_ssse3,
&vp9_highbd_10_masked_sub_pixel_variance64x32_c, VPX_BITS_10),
make_tuple(&vp9_highbd_10_masked_sub_pixel_variance32x64_ssse3,
&vp9_highbd_10_masked_sub_pixel_variance32x64_c, VPX_BITS_10),
make_tuple(&vp9_highbd_10_masked_sub_pixel_variance32x32_ssse3,
&vp9_highbd_10_masked_sub_pixel_variance32x32_c, VPX_BITS_10),
make_tuple(&vp9_highbd_10_masked_sub_pixel_variance32x16_ssse3,
&vp9_highbd_10_masked_sub_pixel_variance32x16_c, VPX_BITS_10),
make_tuple(&vp9_highbd_10_masked_sub_pixel_variance16x32_ssse3,
&vp9_highbd_10_masked_sub_pixel_variance16x32_c, VPX_BITS_10),
make_tuple(&vp9_highbd_10_masked_sub_pixel_variance16x16_ssse3,
&vp9_highbd_10_masked_sub_pixel_variance16x16_c, VPX_BITS_10),
make_tuple(&vp9_highbd_10_masked_sub_pixel_variance16x8_ssse3,
&vp9_highbd_10_masked_sub_pixel_variance16x8_c, VPX_BITS_10),
make_tuple(&vp9_highbd_10_masked_sub_pixel_variance8x16_ssse3,
&vp9_highbd_10_masked_sub_pixel_variance8x16_c, VPX_BITS_10),
make_tuple(&vp9_highbd_10_masked_sub_pixel_variance8x8_ssse3,
&vp9_highbd_10_masked_sub_pixel_variance8x8_c, VPX_BITS_10),
make_tuple(&vp9_highbd_10_masked_sub_pixel_variance8x4_ssse3,
&vp9_highbd_10_masked_sub_pixel_variance8x4_c, VPX_BITS_10),
make_tuple(&vp9_highbd_10_masked_sub_pixel_variance4x8_ssse3,
&vp9_highbd_10_masked_sub_pixel_variance4x8_c, VPX_BITS_10),
make_tuple(&vp9_highbd_10_masked_sub_pixel_variance4x4_ssse3,
&vp9_highbd_10_masked_sub_pixel_variance4x4_c, VPX_BITS_10),
make_tuple(&vp9_highbd_12_masked_sub_pixel_variance64x64_ssse3,
&vp9_highbd_12_masked_sub_pixel_variance64x64_c, VPX_BITS_12),
make_tuple(&vp9_highbd_12_masked_sub_pixel_variance64x32_ssse3,
&vp9_highbd_12_masked_sub_pixel_variance64x32_c, VPX_BITS_12),
make_tuple(&vp9_highbd_12_masked_sub_pixel_variance32x64_ssse3,
&vp9_highbd_12_masked_sub_pixel_variance32x64_c, VPX_BITS_12),
make_tuple(&vp9_highbd_12_masked_sub_pixel_variance32x32_ssse3,
&vp9_highbd_12_masked_sub_pixel_variance32x32_c, VPX_BITS_12),
make_tuple(&vp9_highbd_12_masked_sub_pixel_variance32x16_ssse3,
&vp9_highbd_12_masked_sub_pixel_variance32x16_c, VPX_BITS_12),
make_tuple(&vp9_highbd_12_masked_sub_pixel_variance16x32_ssse3,
&vp9_highbd_12_masked_sub_pixel_variance16x32_c, VPX_BITS_12),
make_tuple(&vp9_highbd_12_masked_sub_pixel_variance16x16_ssse3,
&vp9_highbd_12_masked_sub_pixel_variance16x16_c, VPX_BITS_12),
make_tuple(&vp9_highbd_12_masked_sub_pixel_variance16x8_ssse3,
&vp9_highbd_12_masked_sub_pixel_variance16x8_c, VPX_BITS_12),
make_tuple(&vp9_highbd_12_masked_sub_pixel_variance8x16_ssse3,
&vp9_highbd_12_masked_sub_pixel_variance8x16_c, VPX_BITS_12),
make_tuple(&vp9_highbd_12_masked_sub_pixel_variance8x8_ssse3,
&vp9_highbd_12_masked_sub_pixel_variance8x8_c, VPX_BITS_12),
make_tuple(&vp9_highbd_12_masked_sub_pixel_variance8x4_ssse3,
&vp9_highbd_12_masked_sub_pixel_variance8x4_c, VPX_BITS_12),
make_tuple(&vp9_highbd_12_masked_sub_pixel_variance4x8_ssse3,
&vp9_highbd_12_masked_sub_pixel_variance4x8_c, VPX_BITS_12),
make_tuple(&vp9_highbd_12_masked_sub_pixel_variance4x4_ssse3,
&vp9_highbd_12_masked_sub_pixel_variance4x4_c, VPX_BITS_12)));
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // HAVE_SSSE3
} // namespace
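Each variance test above checks the returned variance and the `sse` out-parameter in tandem. The two are tied together by `variance = sse - sum^2 / n` over the difference samples. The sketch below shows that bookkeeping on plain differences; the real masked kernels additionally weight each difference by the 0..64 mask before accumulating, so this is an illustration of the relation, not a drop-in reference.

```cpp
#include <cassert>
#include <cstdint>

// Illustration of the variance / SSE relationship these tests verify
// together: variance = SSE - (sum of differences)^2 / sample count.
// The masked kernels weight each difference by a 0..64 mask first;
// that step is omitted here for clarity.
unsigned int variance_from_diffs(const int *diff, int n, unsigned int *sse) {
  int64_t sum = 0;
  int64_t sq = 0;
  for (int i = 0; i < n; ++i) {
    sum += diff[i];
    sq += static_cast<int64_t>(diff[i]) * diff[i];
  }
  *sse = static_cast<unsigned int>(sq);
  return static_cast<unsigned int>(sq - (sum * sum) / n);
}
```

This coupling is also why the ExtremeValues tests memset entire buffers: all-zero and all-255 planes push both accumulators to their extremes and expose overflow in either one.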

test/quantize_test.cc
@@ -0,0 +1,344 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "test/acm_random.h"
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
#include "test/util.h"
#include "./vpx_config.h"
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_entropy.h"
#include "vpx/vpx_integer.h"
using libvpx_test::ACMRandom;
namespace {
#if CONFIG_VP9_HIGHBITDEPTH
const int number_of_iterations = 100;
typedef void (*QuantizeFunc)(const tran_low_t *coeff, intptr_t count,
int skip_block, const int16_t *zbin,
const int16_t *round, const int16_t *quant,
const int16_t *quant_shift,
tran_low_t *qcoeff, tran_low_t *dqcoeff,
const int16_t *dequant, uint16_t *eob,
const int16_t *scan, const int16_t *iscan);
typedef std::tr1::tuple<QuantizeFunc, QuantizeFunc, int>
QuantizeParam;
class QuantizeTest : public ::testing::TestWithParam<QuantizeParam> {
public:
virtual ~QuantizeTest() {}
virtual void SetUp() {
quantize_op_ = GET_PARAM(0);
ref_quantize_op_ = GET_PARAM(1);
bit_depth_ = GET_PARAM(2);
mask_ = (1 << bit_depth_) - 1;
}
virtual void TearDown() { libvpx_test::ClearSystemState(); }
protected:
int bit_depth_;
int mask_;
QuantizeFunc quantize_op_;
QuantizeFunc ref_quantize_op_;
};
class Quantize32Test : public ::testing::TestWithParam<QuantizeParam> {
public:
virtual ~Quantize32Test() {}
virtual void SetUp() {
quantize_op_ = GET_PARAM(0);
ref_quantize_op_ = GET_PARAM(1);
bit_depth_ = GET_PARAM(2);
mask_ = (1 << bit_depth_) - 1;
}
virtual void TearDown() { libvpx_test::ClearSystemState(); }
protected:
int bit_depth_;
int mask_;
QuantizeFunc quantize_op_;
QuantizeFunc ref_quantize_op_;
};
TEST_P(QuantizeTest, OperationCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff_ptr, 256);
DECLARE_ALIGNED_ARRAY(16, int16_t, zbin_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, int16_t, round_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, int16_t, quant_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, int16_t, quant_shift_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, qcoeff_ptr, 256);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, dqcoeff_ptr, 256);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_qcoeff_ptr, 256);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_dqcoeff_ptr, 256);
DECLARE_ALIGNED_ARRAY(16, int16_t, dequant_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, uint16_t, eob_ptr, 1);
DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_eob_ptr, 1);
int err_count_total = 0;
int first_failure = -1;
for (int i = 0; i < number_of_iterations; ++i) {
int skip_block = i == 0;
TX_SIZE sz = (TX_SIZE)(i % 3);  // TX_4X4, TX_8X8, TX_16X16
TX_TYPE tx_type = (TX_TYPE)((i >> 2) % 3);
const scan_order *scan_order = &vp9_intra_scan_orders[sz][tx_type];
int count = (4 << sz) * (4 << sz); // 16, 64, 256
int err_count = 0;
*eob_ptr = rnd.Rand16();
*ref_eob_ptr = *eob_ptr;
for (int j = 0; j < count; j++) {
coeff_ptr[j] = rnd.Rand16() & mask_;
}
for (int j = 0; j < 2; j++) {
zbin_ptr[j] = rnd.Rand16() & mask_;
round_ptr[j] = rnd.Rand16();
quant_ptr[j] = rnd.Rand16();
quant_shift_ptr[j] = rnd.Rand16();
dequant_ptr[j] = rnd.Rand16();
}
ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr,
quant_ptr, quant_shift_ptr, ref_qcoeff_ptr,
ref_dqcoeff_ptr, dequant_ptr,
ref_eob_ptr, scan_order->scan, scan_order->iscan);
ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_ptr, count, skip_block,
zbin_ptr, round_ptr, quant_ptr,
quant_shift_ptr, qcoeff_ptr,
dqcoeff_ptr, dequant_ptr, eob_ptr,
scan_order->scan, scan_order->iscan));
for (int j = 0; j < count; ++j) {
err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) |
(ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
}
err_count += (*ref_eob_ptr != *eob_ptr);
if (err_count && !err_count_total) {
first_failure = i;
}
err_count_total += err_count;
}
EXPECT_EQ(0, err_count_total)
<< "Error: Quantization Test, C output doesn't match SSE2 output. "
<< "First failed at test case " << first_failure;
}
TEST_P(Quantize32Test, OperationCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff_ptr, 1024);
DECLARE_ALIGNED_ARRAY(16, int16_t, zbin_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, int16_t, round_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, int16_t, quant_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, int16_t, quant_shift_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, qcoeff_ptr, 1024);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, dqcoeff_ptr, 1024);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_qcoeff_ptr, 1024);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_dqcoeff_ptr, 1024);
DECLARE_ALIGNED_ARRAY(16, int16_t, dequant_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, uint16_t, eob_ptr, 1);
DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_eob_ptr, 1);
int err_count_total = 0;
int first_failure = -1;
for (int i = 0; i < number_of_iterations; ++i) {
int skip_block = i == 0;
TX_SIZE sz = TX_32X32;
TX_TYPE tx_type = (TX_TYPE)(i % 4);
const scan_order *scan_order = &vp9_intra_scan_orders[sz][tx_type];
int count = (4 << sz) * (4 << sz); // 1024
int err_count = 0;
*eob_ptr = rnd.Rand16();
*ref_eob_ptr = *eob_ptr;
for (int j = 0; j < count; j++) {
coeff_ptr[j] = rnd.Rand16() & mask_;
}
for (int j = 0; j < 2; j++) {
zbin_ptr[j] = rnd.Rand16() & mask_;
round_ptr[j] = rnd.Rand16();
quant_ptr[j] = rnd.Rand16();
quant_shift_ptr[j] = rnd.Rand16();
dequant_ptr[j] = rnd.Rand16();
}
ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr,
quant_ptr, quant_shift_ptr, ref_qcoeff_ptr,
ref_dqcoeff_ptr, dequant_ptr,
ref_eob_ptr, scan_order->scan, scan_order->iscan);
ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_ptr, count, skip_block,
zbin_ptr, round_ptr, quant_ptr,
quant_shift_ptr, qcoeff_ptr,
dqcoeff_ptr, dequant_ptr, eob_ptr,
scan_order->scan, scan_order->iscan));
for (int j = 0; j < count; ++j) {
err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) |
(ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
}
err_count += (*ref_eob_ptr != *eob_ptr);
if (err_count && !err_count_total) {
first_failure = i;
}
err_count_total += err_count;
}
EXPECT_EQ(0, err_count_total)
<< "Error: Quantization Test, C output doesn't match SSE2 output. "
<< "First failed at test case " << first_failure;
}
TEST_P(QuantizeTest, EOBCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff_ptr, 256);
DECLARE_ALIGNED_ARRAY(16, int16_t, zbin_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, int16_t, round_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, int16_t, quant_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, int16_t, quant_shift_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, qcoeff_ptr, 256);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, dqcoeff_ptr, 256);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_qcoeff_ptr, 256);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_dqcoeff_ptr, 256);
DECLARE_ALIGNED_ARRAY(16, int16_t, dequant_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, uint16_t, eob_ptr, 1);
DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_eob_ptr, 1);
int err_count_total = 0;
int first_failure = -1;
for (int i = 0; i < number_of_iterations; ++i) {
int skip_block = i == 0;
TX_SIZE sz = (TX_SIZE)(i % 3);  // TX_4X4, TX_8X8, TX_16X16
TX_TYPE tx_type = (TX_TYPE)((i >> 2) % 3);
const scan_order *scan_order = &vp9_intra_scan_orders[sz][tx_type];
int count = (4 << sz) * (4 << sz); // 16, 64, 256
int err_count = 0;
*eob_ptr = rnd.Rand16();
*ref_eob_ptr = *eob_ptr;
// Two random entries
for (int j = 0; j < count; j++) {
coeff_ptr[j] = 0;
}
coeff_ptr[rnd(count)] = rnd.Rand16() & mask_;
coeff_ptr[rnd(count)] = rnd.Rand16() & mask_;
for (int j = 0; j < 2; j++) {
zbin_ptr[j] = rnd.Rand16() & mask_;
round_ptr[j] = rnd.Rand16();
quant_ptr[j] = rnd.Rand16();
quant_shift_ptr[j] = rnd.Rand16();
dequant_ptr[j] = rnd.Rand16();
}
ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr,
quant_ptr, quant_shift_ptr, ref_qcoeff_ptr,
ref_dqcoeff_ptr, dequant_ptr,
ref_eob_ptr, scan_order->scan, scan_order->iscan);
ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_ptr, count, skip_block,
zbin_ptr, round_ptr, quant_ptr,
quant_shift_ptr, qcoeff_ptr,
dqcoeff_ptr, dequant_ptr, eob_ptr,
scan_order->scan, scan_order->iscan));
for (int j = 0; j < count; ++j) {
err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) |
(ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
}
err_count += (*ref_eob_ptr != *eob_ptr);
if (err_count && !err_count_total) {
first_failure = i;
}
err_count_total += err_count;
}
EXPECT_EQ(0, err_count_total)
<< "Error: Quantization Test, C output doesn't match SSE2 output. "
<< "First failed at test case " << first_failure;
}
TEST_P(Quantize32Test, EOBCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff_ptr, 1024);
DECLARE_ALIGNED_ARRAY(16, int16_t, zbin_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, int16_t, round_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, int16_t, quant_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, int16_t, quant_shift_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, qcoeff_ptr, 1024);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, dqcoeff_ptr, 1024);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_qcoeff_ptr, 1024);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_dqcoeff_ptr, 1024);
DECLARE_ALIGNED_ARRAY(16, int16_t, dequant_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, uint16_t, eob_ptr, 1);
DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_eob_ptr, 1);
int err_count_total = 0;
int first_failure = -1;
for (int i = 0; i < number_of_iterations; ++i) {
int skip_block = i == 0;
TX_SIZE sz = TX_32X32;
TX_TYPE tx_type = (TX_TYPE)(i % 4);
const scan_order *scan_order = &vp9_intra_scan_orders[sz][tx_type];
int count = (4 << sz) * (4 << sz); // 1024
int err_count = 0;
*eob_ptr = rnd.Rand16();
*ref_eob_ptr = *eob_ptr;
for (int j = 0; j < count; j++) {
coeff_ptr[j] = 0;
}
// Two random entries
coeff_ptr[rnd(count)] = rnd.Rand16() & mask_;
coeff_ptr[rnd(count)] = rnd.Rand16() & mask_;
for (int j = 0; j < 2; j++) {
zbin_ptr[j] = rnd.Rand16() & mask_;
round_ptr[j] = rnd.Rand16();
quant_ptr[j] = rnd.Rand16();
quant_shift_ptr[j] = rnd.Rand16();
dequant_ptr[j] = rnd.Rand16();
}
ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr,
quant_ptr, quant_shift_ptr, ref_qcoeff_ptr,
ref_dqcoeff_ptr, dequant_ptr,
ref_eob_ptr, scan_order->scan, scan_order->iscan);
ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_ptr, count, skip_block,
zbin_ptr, round_ptr, quant_ptr,
quant_shift_ptr, qcoeff_ptr,
dqcoeff_ptr, dequant_ptr, eob_ptr,
scan_order->scan, scan_order->iscan));
for (int j = 0; j < count; ++j) {
err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) |
(ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
}
err_count += (*ref_eob_ptr != *eob_ptr);
if (err_count && !err_count_total) {
first_failure = i;
}
err_count_total += err_count;
}
EXPECT_EQ(0, err_count_total)
<< "Error: Quantization Test, C output doesn't match SSE2 output. "
<< "First failed at test case " << first_failure;
}
using std::tr1::make_tuple;
#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P(
SSE2_C_COMPARE, QuantizeTest,
::testing::Values(
make_tuple(&vp9_highbd_quantize_b_sse2,
&vp9_highbd_quantize_b_c, 8),
make_tuple(&vp9_highbd_quantize_b_sse2,
&vp9_highbd_quantize_b_c, 10),
make_tuple(&vp9_highbd_quantize_b_sse2,
&vp9_highbd_quantize_b_c, 12)));
INSTANTIATE_TEST_CASE_P(
SSE2_C_COMPARE, Quantize32Test,
::testing::Values(
make_tuple(&vp9_highbd_quantize_b_32x32_sse2,
&vp9_highbd_quantize_b_32x32_c, 8),
make_tuple(&vp9_highbd_quantize_b_32x32_sse2,
&vp9_highbd_quantize_b_32x32_c, 10),
make_tuple(&vp9_highbd_quantize_b_32x32_sse2,
&vp9_highbd_quantize_b_32x32_c, 12)));
#endif // HAVE_SSE2
#endif // CONFIG_VP9_HIGHBITDEPTH
} // namespace

File diff suppressed because it is too large


@ -1,740 +0,0 @@
/*
* Copyright (c) 2013 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <string>
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "test/codec_factory.h"
#include "test/decode_test_driver.h"
#include "test/i420_video_source.h"
#include "vp9/decoder/vp9_decoder.h"
#include "vpx/svc_context.h"
#include "vpx/vp8cx.h"
#include "vpx/vpx_encoder.h"
namespace {
using libvpx_test::CodecFactory;
using libvpx_test::Decoder;
using libvpx_test::DxDataIterator;
using libvpx_test::VP9CodecFactory;
class SvcTest : public ::testing::Test {
protected:
static const uint32_t kWidth = 352;
static const uint32_t kHeight = 288;
SvcTest()
: codec_iface_(0),
test_file_name_("hantro_collage_w352h288.yuv"),
codec_initialized_(false),
decoder_(0) {
memset(&svc_, 0, sizeof(svc_));
memset(&codec_, 0, sizeof(codec_));
memset(&codec_enc_, 0, sizeof(codec_enc_));
}
virtual ~SvcTest() {}
virtual void SetUp() {
svc_.log_level = SVC_LOG_DEBUG;
svc_.log_print = 0;
codec_iface_ = vpx_codec_vp9_cx();
const vpx_codec_err_t res =
vpx_codec_enc_config_default(codec_iface_, &codec_enc_, 0);
EXPECT_EQ(VPX_CODEC_OK, res);
codec_enc_.g_w = kWidth;
codec_enc_.g_h = kHeight;
codec_enc_.g_timebase.num = 1;
codec_enc_.g_timebase.den = 60;
codec_enc_.kf_min_dist = 100;
codec_enc_.kf_max_dist = 100;
vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t();
VP9CodecFactory codec_factory;
decoder_ = codec_factory.CreateDecoder(dec_cfg, 0);
}
virtual void TearDown() {
ReleaseEncoder();
delete decoder_;
}
void InitializeEncoder() {
const vpx_codec_err_t res =
vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
EXPECT_EQ(VPX_CODEC_OK, res);
vpx_codec_control(&codec_, VP8E_SET_CPUUSED, 4); // Make the test faster
codec_initialized_ = true;
}
void ReleaseEncoder() {
vpx_svc_release(&svc_);
if (codec_initialized_) vpx_codec_destroy(&codec_);
codec_initialized_ = false;
}
void GetStatsData(std::string *const stats_buf) {
vpx_codec_iter_t iter = NULL;
const vpx_codec_cx_pkt_t *cx_pkt;
while ((cx_pkt = vpx_codec_get_cx_data(&codec_, &iter)) != NULL) {
if (cx_pkt->kind == VPX_CODEC_STATS_PKT) {
EXPECT_GT(cx_pkt->data.twopass_stats.sz, 0U);
ASSERT_TRUE(cx_pkt->data.twopass_stats.buf != NULL);
stats_buf->append(static_cast<char*>(cx_pkt->data.twopass_stats.buf),
cx_pkt->data.twopass_stats.sz);
}
}
}
void Pass1EncodeNFrames(const int n, const int layers,
std::string *const stats_buf) {
vpx_codec_err_t res;
ASSERT_GT(n, 0);
ASSERT_GT(layers, 0);
svc_.spatial_layers = layers;
codec_enc_.g_pass = VPX_RC_FIRST_PASS;
InitializeEncoder();
libvpx_test::I420VideoSource video(test_file_name_, kWidth, kHeight,
codec_enc_.g_timebase.den,
codec_enc_.g_timebase.num, 0, 30);
video.Begin();
for (int i = 0; i < n; ++i) {
res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
video.duration(), VPX_DL_GOOD_QUALITY);
ASSERT_EQ(VPX_CODEC_OK, res);
GetStatsData(stats_buf);
video.Next();
}
// Flush encoder and test EOS packet.
res = vpx_svc_encode(&svc_, &codec_, NULL, video.pts(),
video.duration(), VPX_DL_GOOD_QUALITY);
ASSERT_EQ(VPX_CODEC_OK, res);
GetStatsData(stats_buf);
ReleaseEncoder();
}
void StoreFrames(const size_t max_frame_received,
struct vpx_fixed_buf *const outputs,
size_t *const frame_received) {
vpx_codec_iter_t iter = NULL;
const vpx_codec_cx_pkt_t *cx_pkt;
while ((cx_pkt = vpx_codec_get_cx_data(&codec_, &iter)) != NULL) {
if (cx_pkt->kind == VPX_CODEC_CX_FRAME_PKT) {
const size_t frame_size = cx_pkt->data.frame.sz;
EXPECT_GT(frame_size, 0U);
ASSERT_TRUE(cx_pkt->data.frame.buf != NULL);
ASSERT_LT(*frame_received, max_frame_received);
if (*frame_received == 0)
EXPECT_EQ(1, !!(cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY));
outputs[*frame_received].buf = malloc(frame_size + 16);
ASSERT_TRUE(outputs[*frame_received].buf != NULL);
memcpy(outputs[*frame_received].buf, cx_pkt->data.frame.buf,
frame_size);
outputs[*frame_received].sz = frame_size;
++(*frame_received);
}
}
}
void Pass2EncodeNFrames(std::string *const stats_buf,
const int n, const int layers,
struct vpx_fixed_buf *const outputs) {
vpx_codec_err_t res;
size_t frame_received = 0;
ASSERT_TRUE(outputs != NULL);
ASSERT_GT(n, 0);
ASSERT_GT(layers, 0);
svc_.spatial_layers = layers;
codec_enc_.rc_target_bitrate = 500;
if (codec_enc_.g_pass == VPX_RC_LAST_PASS) {
ASSERT_TRUE(stats_buf != NULL);
ASSERT_GT(stats_buf->size(), 0U);
codec_enc_.rc_twopass_stats_in.buf = &(*stats_buf)[0];
codec_enc_.rc_twopass_stats_in.sz = stats_buf->size();
}
InitializeEncoder();
libvpx_test::I420VideoSource video(test_file_name_, kWidth, kHeight,
codec_enc_.g_timebase.den,
codec_enc_.g_timebase.num, 0, 30);
video.Begin();
for (int i = 0; i < n; ++i) {
res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
video.duration(), VPX_DL_GOOD_QUALITY);
ASSERT_EQ(VPX_CODEC_OK, res);
StoreFrames(n, outputs, &frame_received);
video.Next();
}
// Flush encoder.
res = vpx_svc_encode(&svc_, &codec_, NULL, 0,
video.duration(), VPX_DL_GOOD_QUALITY);
EXPECT_EQ(VPX_CODEC_OK, res);
StoreFrames(n, outputs, &frame_received);
EXPECT_EQ(frame_received, static_cast<size_t>(n));
ReleaseEncoder();
}
void DecodeNFrames(const struct vpx_fixed_buf *const inputs, const int n) {
int decoded_frames = 0;
int received_frames = 0;
ASSERT_TRUE(inputs != NULL);
ASSERT_GT(n, 0);
for (int i = 0; i < n; ++i) {
ASSERT_TRUE(inputs[i].buf != NULL);
ASSERT_GT(inputs[i].sz, 0U);
const vpx_codec_err_t res_dec =
decoder_->DecodeFrame(static_cast<const uint8_t *>(inputs[i].buf),
inputs[i].sz);
ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
++decoded_frames;
DxDataIterator dec_iter = decoder_->GetDxData();
while (dec_iter.Next() != NULL) {
++received_frames;
}
}
EXPECT_EQ(decoded_frames, n);
EXPECT_EQ(received_frames, n);
}
void DropEnhancementLayers(struct vpx_fixed_buf *const inputs,
const int num_super_frames,
const int remained_spatial_layers) {
ASSERT_TRUE(inputs != NULL);
ASSERT_GT(num_super_frames, 0);
ASSERT_GT(remained_spatial_layers, 0);
for (int i = 0; i < num_super_frames; ++i) {
uint32_t frame_sizes[8] = {0};
int frame_count = 0;
int frames_found = 0;
int frame;
ASSERT_TRUE(inputs[i].buf != NULL);
ASSERT_GT(inputs[i].sz, 0U);
vpx_codec_err_t res =
vp9_parse_superframe_index(static_cast<const uint8_t*>(inputs[i].buf),
inputs[i].sz, frame_sizes, &frame_count,
NULL, NULL);
ASSERT_EQ(VPX_CODEC_OK, res);
if (frame_count == 0) {
// There's no super frame but only a single frame.
ASSERT_EQ(1, remained_spatial_layers);
} else {
// Found a super frame.
uint8_t *frame_data = static_cast<uint8_t*>(inputs[i].buf);
uint8_t *frame_start = frame_data;
for (frame = 0; frame < frame_count; ++frame) {
// Looking for a visible frame.
if (frame_data[0] & 0x02) {
++frames_found;
if (frames_found == remained_spatial_layers)
break;
}
frame_data += frame_sizes[frame];
}
ASSERT_LT(frame, frame_count) << "Couldn't find a visible frame. "
<< "remained_spatial_layers: " << remained_spatial_layers
<< " super_frame: " << i;
if (frame == frame_count - 1)
continue;
frame_data += frame_sizes[frame];
// We need to add one more frame for multiple frame contexts.
uint8_t marker =
static_cast<const uint8_t*>(inputs[i].buf)[inputs[i].sz - 1];
const uint32_t mag = ((marker >> 3) & 0x3) + 1;
const size_t index_sz = 2 + mag * frame_count;
const size_t new_index_sz = 2 + mag * (frame + 1);
marker &= 0x0f8;
marker |= frame;
// Copy existing frame sizes.
memmove(frame_data + 1, frame_start + inputs[i].sz - index_sz + 1,
new_index_sz - 2);
// New marker.
frame_data[0] = marker;
frame_data += (mag * (frame + 1) + 1);
*frame_data++ = marker;
inputs[i].sz = frame_data - frame_start;
}
}
}
void FreeBitstreamBuffers(struct vpx_fixed_buf *const inputs, const int n) {
ASSERT_TRUE(inputs != NULL);
ASSERT_GT(n, 0);
for (int i = 0; i < n; ++i) {
free(inputs[i].buf);
inputs[i].buf = NULL;
inputs[i].sz = 0;
}
}
SvcContext svc_;
vpx_codec_ctx_t codec_;
struct vpx_codec_enc_cfg codec_enc_;
vpx_codec_iface_t *codec_iface_;
std::string test_file_name_;
bool codec_initialized_;
Decoder *decoder_;
};
TEST_F(SvcTest, SvcInit) {
// test missing parameters
vpx_codec_err_t res = vpx_svc_init(NULL, &codec_, codec_iface_, &codec_enc_);
EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
res = vpx_svc_init(&svc_, NULL, codec_iface_, &codec_enc_);
EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
res = vpx_svc_init(&svc_, &codec_, NULL, &codec_enc_);
EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
res = vpx_svc_init(&svc_, &codec_, codec_iface_, NULL);
EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
svc_.spatial_layers = 6; // too many layers
res = vpx_svc_init(&svc_, &codec_, codec_iface_, &codec_enc_);
EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
svc_.spatial_layers = 0; // use default layers
InitializeEncoder();
EXPECT_EQ(VPX_SS_DEFAULT_LAYERS, svc_.spatial_layers);
}
TEST_F(SvcTest, InitTwoLayers) {
svc_.spatial_layers = 2;
InitializeEncoder();
}
TEST_F(SvcTest, InvalidOptions) {
vpx_codec_err_t res = vpx_svc_set_options(&svc_, NULL);
EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
res = vpx_svc_set_options(&svc_, "not-an-option=1");
EXPECT_EQ(VPX_CODEC_OK, res);
res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
}
TEST_F(SvcTest, SetLayersOption) {
vpx_codec_err_t res = vpx_svc_set_options(&svc_, "spatial-layers=3");
EXPECT_EQ(VPX_CODEC_OK, res);
InitializeEncoder();
EXPECT_EQ(3, svc_.spatial_layers);
}
TEST_F(SvcTest, SetMultipleOptions) {
vpx_codec_err_t res =
vpx_svc_set_options(&svc_, "spatial-layers=2 scale-factors=1/3,2/3");
EXPECT_EQ(VPX_CODEC_OK, res);
InitializeEncoder();
EXPECT_EQ(2, svc_.spatial_layers);
}
TEST_F(SvcTest, SetScaleFactorsOption) {
svc_.spatial_layers = 2;
vpx_codec_err_t res =
vpx_svc_set_options(&svc_, "scale-factors=not-scale-factors");
EXPECT_EQ(VPX_CODEC_OK, res);
res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
res = vpx_svc_set_options(&svc_, "scale-factors=1/3, 3*3");
EXPECT_EQ(VPX_CODEC_OK, res);
res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
res = vpx_svc_set_options(&svc_, "scale-factors=1/3");
EXPECT_EQ(VPX_CODEC_OK, res);
res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
res = vpx_svc_set_options(&svc_, "scale-factors=1/3,2/3");
EXPECT_EQ(VPX_CODEC_OK, res);
InitializeEncoder();
}
TEST_F(SvcTest, SetQuantizersOption) {
svc_.spatial_layers = 2;
vpx_codec_err_t res = vpx_svc_set_options(&svc_, "max-quantizers=nothing");
EXPECT_EQ(VPX_CODEC_OK, res);
res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
res = vpx_svc_set_options(&svc_, "min-quantizers=nothing");
EXPECT_EQ(VPX_CODEC_OK, res);
res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
res = vpx_svc_set_options(&svc_, "max-quantizers=40");
EXPECT_EQ(VPX_CODEC_OK, res);
res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
res = vpx_svc_set_options(&svc_, "min-quantizers=40");
EXPECT_EQ(VPX_CODEC_OK, res);
res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
res = vpx_svc_set_options(&svc_, "max-quantizers=30,30 min-quantizers=40,40");
EXPECT_EQ(VPX_CODEC_OK, res);
res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
res = vpx_svc_set_options(&svc_, "max-quantizers=40,40 min-quantizers=30,30");
InitializeEncoder();
}
TEST_F(SvcTest, SetAutoAltRefOption) {
svc_.spatial_layers = 5;
vpx_codec_err_t res = vpx_svc_set_options(&svc_, "auto-alt-refs=none");
EXPECT_EQ(VPX_CODEC_OK, res);
res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
res = vpx_svc_set_options(&svc_, "auto-alt-refs=1,1,1,1,0");
EXPECT_EQ(VPX_CODEC_OK, res);
res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
vpx_svc_set_options(&svc_, "auto-alt-refs=0,1,1,1,0");
InitializeEncoder();
}
// Test that decoder can handle an SVC frame as the first frame in a sequence.
TEST_F(SvcTest, OnePassEncodeOneFrame) {
codec_enc_.g_pass = VPX_RC_ONE_PASS;
vpx_fixed_buf output = {0};
Pass2EncodeNFrames(NULL, 1, 2, &output);
DecodeNFrames(&output, 1);
FreeBitstreamBuffers(&output, 1);
}
TEST_F(SvcTest, OnePassEncodeThreeFrames) {
codec_enc_.g_pass = VPX_RC_ONE_PASS;
vpx_fixed_buf outputs[3];
memset(&outputs[0], 0, sizeof(outputs));
Pass2EncodeNFrames(NULL, 3, 2, &outputs[0]);
DecodeNFrames(&outputs[0], 3);
FreeBitstreamBuffers(&outputs[0], 3);
}
TEST_F(SvcTest, TwoPassEncode10Frames) {
// First pass encode
std::string stats_buf;
Pass1EncodeNFrames(10, 2, &stats_buf);
// Second pass encode
codec_enc_.g_pass = VPX_RC_LAST_PASS;
vpx_fixed_buf outputs[10];
memset(&outputs[0], 0, sizeof(outputs));
Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]);
DecodeNFrames(&outputs[0], 10);
FreeBitstreamBuffers(&outputs[0], 10);
}
TEST_F(SvcTest, TwoPassEncode20FramesWithAltRef) {
// First pass encode
std::string stats_buf;
Pass1EncodeNFrames(20, 2, &stats_buf);
// Second pass encode
codec_enc_.g_pass = VPX_RC_LAST_PASS;
vpx_svc_set_options(&svc_, "auto-alt-refs=1,1");
vpx_fixed_buf outputs[20];
memset(&outputs[0], 0, sizeof(outputs));
Pass2EncodeNFrames(&stats_buf, 20, 2, &outputs[0]);
DecodeNFrames(&outputs[0], 20);
FreeBitstreamBuffers(&outputs[0], 20);
}
TEST_F(SvcTest, TwoPassEncode2SpatialLayersDecodeBaseLayerOnly) {
// First pass encode
std::string stats_buf;
Pass1EncodeNFrames(10, 2, &stats_buf);
// Second pass encode
codec_enc_.g_pass = VPX_RC_LAST_PASS;
vpx_svc_set_options(&svc_, "auto-alt-refs=1,1");
vpx_fixed_buf outputs[10];
memset(&outputs[0], 0, sizeof(outputs));
Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]);
DropEnhancementLayers(&outputs[0], 10, 1);
DecodeNFrames(&outputs[0], 10);
FreeBitstreamBuffers(&outputs[0], 10);
}
TEST_F(SvcTest, TwoPassEncode5SpatialLayersDecode54321Layers) {
// First pass encode
std::string stats_buf;
Pass1EncodeNFrames(10, 5, &stats_buf);
// Second pass encode
codec_enc_.g_pass = VPX_RC_LAST_PASS;
vpx_svc_set_options(&svc_, "auto-alt-refs=0,1,1,1,0");
vpx_fixed_buf outputs[10];
memset(&outputs[0], 0, sizeof(outputs));
Pass2EncodeNFrames(&stats_buf, 10, 5, &outputs[0]);
DecodeNFrames(&outputs[0], 10);
DropEnhancementLayers(&outputs[0], 10, 4);
DecodeNFrames(&outputs[0], 10);
DropEnhancementLayers(&outputs[0], 10, 3);
DecodeNFrames(&outputs[0], 10);
DropEnhancementLayers(&outputs[0], 10, 2);
DecodeNFrames(&outputs[0], 10);
DropEnhancementLayers(&outputs[0], 10, 1);
DecodeNFrames(&outputs[0], 10);
FreeBitstreamBuffers(&outputs[0], 10);
}
TEST_F(SvcTest, TwoPassEncode2SNRLayers) {
// First pass encode
std::string stats_buf;
vpx_svc_set_options(&svc_, "scale-factors=1/1,1/1");
Pass1EncodeNFrames(20, 2, &stats_buf);
// Second pass encode
codec_enc_.g_pass = VPX_RC_LAST_PASS;
vpx_svc_set_options(&svc_,
"auto-alt-refs=1,1 scale-factors=1/1,1/1");
vpx_fixed_buf outputs[20];
memset(&outputs[0], 0, sizeof(outputs));
Pass2EncodeNFrames(&stats_buf, 20, 2, &outputs[0]);
DecodeNFrames(&outputs[0], 20);
FreeBitstreamBuffers(&outputs[0], 20);
}
TEST_F(SvcTest, TwoPassEncode3SNRLayersDecode321Layers) {
// First pass encode
std::string stats_buf;
vpx_svc_set_options(&svc_, "scale-factors=1/1,1/1,1/1");
Pass1EncodeNFrames(20, 3, &stats_buf);
// Second pass encode
codec_enc_.g_pass = VPX_RC_LAST_PASS;
vpx_svc_set_options(&svc_,
"auto-alt-refs=1,1,1 scale-factors=1/1,1/1,1/1");
vpx_fixed_buf outputs[20];
memset(&outputs[0], 0, sizeof(outputs));
Pass2EncodeNFrames(&stats_buf, 20, 3, &outputs[0]);
DecodeNFrames(&outputs[0], 20);
DropEnhancementLayers(&outputs[0], 20, 2);
DecodeNFrames(&outputs[0], 20);
DropEnhancementLayers(&outputs[0], 20, 1);
DecodeNFrames(&outputs[0], 20);
FreeBitstreamBuffers(&outputs[0], 20);
}
TEST_F(SvcTest, SetMultipleFrameContextsOption) {
svc_.spatial_layers = 5;
vpx_codec_err_t res =
vpx_svc_set_options(&svc_, "multi-frame-contexts=1");
EXPECT_EQ(VPX_CODEC_OK, res);
res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
svc_.spatial_layers = 2;
res = vpx_svc_set_options(&svc_, "multi-frame-contexts=1");
InitializeEncoder();
}
TEST_F(SvcTest, TwoPassEncode2SpatialLayersWithMultipleFrameContexts) {
// First pass encode
std::string stats_buf;
Pass1EncodeNFrames(10, 2, &stats_buf);
// Second pass encode
codec_enc_.g_pass = VPX_RC_LAST_PASS;
codec_enc_.g_error_resilient = 0;
vpx_svc_set_options(&svc_, "auto-alt-refs=1,1 multi-frame-contexts=1");
vpx_fixed_buf outputs[10];
memset(&outputs[0], 0, sizeof(outputs));
Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]);
DecodeNFrames(&outputs[0], 10);
FreeBitstreamBuffers(&outputs[0], 10);
}
TEST_F(SvcTest,
TwoPassEncode2SpatialLayersWithMultipleFrameContextsDecodeBaselayer) {
// First pass encode
std::string stats_buf;
Pass1EncodeNFrames(10, 2, &stats_buf);
// Second pass encode
codec_enc_.g_pass = VPX_RC_LAST_PASS;
codec_enc_.g_error_resilient = 0;
vpx_svc_set_options(&svc_, "auto-alt-refs=1,1 multi-frame-contexts=1");
vpx_fixed_buf outputs[10];
memset(&outputs[0], 0, sizeof(outputs));
Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]);
DropEnhancementLayers(&outputs[0], 10, 1);
DecodeNFrames(&outputs[0], 10);
FreeBitstreamBuffers(&outputs[0], 10);
}
TEST_F(SvcTest, TwoPassEncode2SNRLayersWithMultipleFrameContexts) {
// First pass encode
std::string stats_buf;
vpx_svc_set_options(&svc_, "scale-factors=1/1,1/1");
Pass1EncodeNFrames(10, 2, &stats_buf);
// Second pass encode
codec_enc_.g_pass = VPX_RC_LAST_PASS;
codec_enc_.g_error_resilient = 0;
vpx_svc_set_options(&svc_, "auto-alt-refs=1,1 scale-factors=1/1,1/1 "
"multi-frame-contexts=1");
vpx_fixed_buf outputs[10];
memset(&outputs[0], 0, sizeof(outputs));
Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]);
DecodeNFrames(&outputs[0], 10);
FreeBitstreamBuffers(&outputs[0], 10);
}
TEST_F(SvcTest,
TwoPassEncode3SNRLayersWithMultipleFrameContextsDecode321Layer) {
// First pass encode
std::string stats_buf;
vpx_svc_set_options(&svc_, "scale-factors=1/1,1/1,1/1");
Pass1EncodeNFrames(10, 3, &stats_buf);
// Second pass encode
codec_enc_.g_pass = VPX_RC_LAST_PASS;
codec_enc_.g_error_resilient = 0;
vpx_svc_set_options(&svc_, "auto-alt-refs=1,1,1 scale-factors=1/1,1/1,1/1 "
"multi-frame-contexts=1");
vpx_fixed_buf outputs[10];
memset(&outputs[0], 0, sizeof(outputs));
Pass2EncodeNFrames(&stats_buf, 10, 3, &outputs[0]);
DecodeNFrames(&outputs[0], 10);
DropEnhancementLayers(&outputs[0], 10, 2);
DecodeNFrames(&outputs[0], 10);
DropEnhancementLayers(&outputs[0], 10, 1);
DecodeNFrames(&outputs[0], 10);
FreeBitstreamBuffers(&outputs[0], 10);
}
TEST_F(SvcTest, TwoPassEncode2TemporalLayers) {
// First pass encode
std::string stats_buf;
vpx_svc_set_options(&svc_, "scale-factors=1/1");
svc_.temporal_layers = 2;
Pass1EncodeNFrames(10, 1, &stats_buf);
// Second pass encode
codec_enc_.g_pass = VPX_RC_LAST_PASS;
svc_.temporal_layers = 2;
vpx_svc_set_options(&svc_, "auto-alt-refs=1 scale-factors=1/1");
vpx_fixed_buf outputs[10];
memset(&outputs[0], 0, sizeof(outputs));
Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]);
DecodeNFrames(&outputs[0], 10);
FreeBitstreamBuffers(&outputs[0], 10);
}
TEST_F(SvcTest, TwoPassEncode2TemporalLayersWithMultipleFrameContexts) {
// First pass encode
std::string stats_buf;
vpx_svc_set_options(&svc_, "scale-factors=1/1");
svc_.temporal_layers = 2;
Pass1EncodeNFrames(10, 1, &stats_buf);
// Second pass encode
codec_enc_.g_pass = VPX_RC_LAST_PASS;
svc_.temporal_layers = 2;
codec_enc_.g_error_resilient = 0;
vpx_svc_set_options(&svc_, "auto-alt-refs=1 scale-factors=1/1 "
"multi-frame-contexts=1");
vpx_fixed_buf outputs[10];
memset(&outputs[0], 0, sizeof(outputs));
Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]);
DecodeNFrames(&outputs[0], 10);
FreeBitstreamBuffers(&outputs[0], 10);
}
TEST_F(SvcTest, TwoPassEncode2TemporalLayersDecodeBaseLayer) {
// First pass encode
std::string stats_buf;
vpx_svc_set_options(&svc_, "scale-factors=1/1");
svc_.temporal_layers = 2;
Pass1EncodeNFrames(10, 1, &stats_buf);
// Second pass encode
codec_enc_.g_pass = VPX_RC_LAST_PASS;
svc_.temporal_layers = 2;
vpx_svc_set_options(&svc_, "auto-alt-refs=1 scale-factors=1/1");
vpx_fixed_buf outputs[10];
memset(&outputs[0], 0, sizeof(outputs));
Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]);
vpx_fixed_buf base_layer[5];
for (int i = 0; i < 5; ++i)
base_layer[i] = outputs[i * 2];
DecodeNFrames(&base_layer[0], 5);
FreeBitstreamBuffers(&outputs[0], 10);
}
TEST_F(SvcTest,
TwoPassEncode2TemporalLayersWithMultipleFrameContextsDecodeBaseLayer) {
// First pass encode
std::string stats_buf;
vpx_svc_set_options(&svc_, "scale-factors=1/1");
svc_.temporal_layers = 2;
Pass1EncodeNFrames(10, 1, &stats_buf);
// Second pass encode
codec_enc_.g_pass = VPX_RC_LAST_PASS;
svc_.temporal_layers = 2;
codec_enc_.g_error_resilient = 0;
vpx_svc_set_options(&svc_, "auto-alt-refs=1 scale-factors=1/1 "
"multi-frame-contexts=1");
vpx_fixed_buf outputs[10];
memset(&outputs[0], 0, sizeof(outputs));
Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]);
vpx_fixed_buf base_layer[5];
for (int i = 0; i < 5; ++i)
base_layer[i] = outputs[i * 2];
DecodeNFrames(&base_layer[0], 5);
FreeBitstreamBuffers(&outputs[0], 10);
}
} // namespace


@@ -51,10 +51,10 @@ LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += y4m_test.cc ../y4menc.c ../y4menc.h
## WebM Parsing
ifeq ($(CONFIG_WEBM_IO), yes)
LIBWEBM_PARSER_SRCS += ../third_party/libwebm/mkvparser.cpp
LIBWEBM_PARSER_SRCS += ../third_party/libwebm/mkvreader.cpp
LIBWEBM_PARSER_SRCS += ../third_party/libwebm/mkvparser.hpp
LIBWEBM_PARSER_SRCS += ../third_party/libwebm/mkvreader.hpp
LIBWEBM_PARSER_SRCS += ../third_party/libwebm/mkvparser/mkvparser.cc
LIBWEBM_PARSER_SRCS += ../third_party/libwebm/mkvparser/mkvreader.cc
LIBWEBM_PARSER_SRCS += ../third_party/libwebm/mkvparser/mkvparser.h
LIBWEBM_PARSER_SRCS += ../third_party/libwebm/mkvparser/mkvreader.h
LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += $(LIBWEBM_PARSER_SRCS)
LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += ../tools_common.h
LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += ../webmdec.cc
@@ -86,7 +86,7 @@ endif
ifeq ($(CONFIG_SHARED),)
## VP8
ifneq ($(CONFIG_VP8_ENCODER)$(CONFIG_VP8_DECODER),)
ifeq ($(CONFIG_VP8),yes)
# These tests require both the encoder and decoder to be built.
ifeq ($(CONFIG_VP8_ENCODER)$(CONFIG_VP8_DECODER),yesyes)
@@ -112,7 +112,7 @@ endif
endif # VP8
## VP9
ifneq ($(CONFIG_VP9_ENCODER)$(CONFIG_VP9_DECODER),)
ifeq ($(CONFIG_VP9),yes)
# These tests require both the encoder and decoder to be built.
ifeq ($(CONFIG_VP9_ENCODER)$(CONFIG_VP9_DECODER),yesyes)
@@ -136,16 +136,22 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += lpf_8_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_avg_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += quantize_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += error_block_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9) += vp9_intrapred_test.cc
ifeq ($(CONFIG_VP9_ENCODER),yes)
LIBVPX_TEST_SRCS-$(CONFIG_SPATIAL_SVC) += svc_test.cc
endif
ifeq ($(CONFIG_VP9_ENCODER)$(CONFIG_VP9_TEMPORAL_DENOISING),yesyes)
LIBVPX_TEST_SRCS-$(HAVE_SSE2) += vp9_denoiser_sse2_test.cc
endif
ifeq ($(CONFIG_VP9)$(CONFIG_WEDGE_PARTITION),yesyes)
LIBVPX_TEST_SRCS-$(HAVE_SSSE3) += masked_variance_test.cc
endif
ifeq ($(CONFIG_VP9)$(CONFIG_WEDGE_PARTITION),yesyes)
LIBVPX_TEST_SRCS-$(HAVE_SSSE3) += masked_sad_test.cc
endif
endif # VP9
LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += sad_test.cc


@@ -97,13 +97,18 @@ TEST_P(TestVectorTest, MD5Match) {
delete video;
}
#if CONFIG_VP8_DECODER
VP8_INSTANTIATE_TEST_CASE(TestVectorTest,
::testing::ValuesIn(libvpx_test::kVP8TestVectors,
libvpx_test::kVP8TestVectors +
libvpx_test::kNumVP8TestVectors));
#endif // CONFIG_VP8_DECODER
#if CONFIG_VP9_DECODER
VP9_INSTANTIATE_TEST_CASE(TestVectorTest,
::testing::ValuesIn(libvpx_test::kVP9TestVectors,
libvpx_test::kVP9TestVectors +
libvpx_test::kNumVP9TestVectors));
#endif // CONFIG_VP9_DECODER
} // namespace

File diff suppressed because it is too large


@@ -22,7 +22,7 @@ const unsigned int kHeight = 90;
const unsigned int kFramerate = 50;
const unsigned int kFrames = 10;
const int kBitrate = 500;
const int kCpuUsed = 2;
const int kCpuUsed = 0;
const double psnr_threshold = 35.0;
typedef struct {


@@ -1,72 +0,0 @@
#!/bin/sh
##
## Copyright (c) 2014 The WebM project authors. All Rights Reserved.
##
## Use of this source code is governed by a BSD-style license
## that can be found in the LICENSE file in the root of the source
## tree. An additional intellectual property rights grant can be found
## in the file PATENTS. All contributing project authors may
## be found in the AUTHORS file in the root of the source tree.
##
## This file tests the libvpx vp9_spatial_svc_encoder example. To add new
## tests to to this file, do the following:
## 1. Write a shell function (this is your test).
## 2. Add the function to vp9_spatial_svc_tests (on a new line).
##
. $(dirname $0)/tools_common.sh
# Environment check: $YUV_RAW_INPUT is required.
vp9_spatial_svc_encoder_verify_environment() {
if [ ! -e "${YUV_RAW_INPUT}" ]; then
echo "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
return 1
fi
}
# Runs vp9_spatial_svc_encoder. $1 is the test name.
vp9_spatial_svc_encoder() {
local readonly \
encoder="${LIBVPX_BIN_PATH}/vp9_spatial_svc_encoder${VPX_TEST_EXE_SUFFIX}"
local readonly test_name="$1"
local readonly \
output_file="${VPX_TEST_OUTPUT_DIR}/vp9_ssvc_encoder${test_name}.ivf"
local readonly frames_to_encode=10
local readonly max_kf=9999
shift
if [ ! -x "${encoder}" ]; then
elog "${encoder} does not exist or is not executable."
return 1
fi
eval "${VPX_TEST_PREFIX}" "${encoder}" -w "${YUV_RAW_INPUT_WIDTH}" \
-h "${YUV_RAW_INPUT_HEIGHT}" -k "${max_kf}" -f "${frames_to_encode}" \
"$@" "${YUV_RAW_INPUT}" "${output_file}" ${devnull}
[ -e "${output_file}" ] || return 1
}
# Each test is run with layer count 1-$vp9_ssvc_test_layers.
vp9_ssvc_test_layers=5
vp9_spatial_svc() {
if [ "$(vp9_encode_available)" = "yes" ]; then
local readonly test_name="vp9_spatial_svc"
for layers in $(seq 1 ${vp9_ssvc_test_layers}); do
vp9_spatial_svc_encoder "${test_name}" -l ${layers}
done
fi
}
readonly vp9_spatial_svc_tests="DISABLED_vp9_spatial_svc_mode_i
DISABLED_vp9_spatial_svc_mode_altip
DISABLED_vp9_spatial_svc_mode_ip
DISABLED_vp9_spatial_svc_mode_gf
vp9_spatial_svc"
if [ "$(vpx_config_option_enabled CONFIG_SPATIAL_SVC)" = "yes" ]; then
run_tests \
vp9_spatial_svc_encoder_verify_environment \
"${vp9_spatial_svc_tests}"
fi


@@ -1,290 +0,0 @@
#!/bin/sh
##
## Copyright (c) 2014 The WebM project authors. All Rights Reserved.
##
## Use of this source code is governed by a BSD-style license
## that can be found in the LICENSE file in the root of the source
## tree. An additional intellectual property rights grant can be found
## in the file PATENTS. All contributing project authors may
## be found in the AUTHORS file in the root of the source tree.
##
## This file tests the libvpx vpx_temporal_svc_encoder example. To add new
## tests to this file, do the following:
## 1. Write a shell function (this is your test).
## 2. Add the function to vpx_tsvc_encoder_tests (on a new line).
##
. $(dirname $0)/tools_common.sh
# Environment check: $YUV_RAW_INPUT is required.
vpx_tsvc_encoder_verify_environment() {
if [ ! -e "${YUV_RAW_INPUT}" ]; then
echo "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
return 1
fi
if [ "$(vpx_config_option_enabled CONFIG_TEMPORAL_DENOISING)" != "yes" ]; then
elog "Warning: Temporal denoising is disabled! Spatial denoising will be " \
"used instead, which is probably not what you want for this test."
fi
}
# Runs vpx_temporal_svc_encoder using the codec specified by $1 and output file
# name by $2. Additional positional parameters are passed directly to
# vpx_temporal_svc_encoder.
vpx_tsvc_encoder() {
local encoder="${LIBVPX_BIN_PATH}/vpx_temporal_svc_encoder"
encoder="${encoder}${VPX_TEST_EXE_SUFFIX}"
local codec="$1"
local output_file_base="$2"
local output_file="${VPX_TEST_OUTPUT_DIR}/${output_file_base}"
local timebase_num="1"
local timebase_den="1000"
local speed="6"
local frame_drop_thresh="30"
shift 2
if [ ! -x "${encoder}" ]; then
elog "${encoder} does not exist or is not executable."
return 1
fi
eval "${VPX_TEST_PREFIX}" "${encoder}" "${YUV_RAW_INPUT}" "${output_file}" \
"${codec}" "${YUV_RAW_INPUT_WIDTH}" "${YUV_RAW_INPUT_HEIGHT}" \
"${timebase_num}" "${timebase_den}" "${speed}" "${frame_drop_thresh}" \
"$@" \
${devnull}
}
# Confirms that all expected output files exist given the output file name
# passed to vpx_temporal_svc_encoder.
# The file name passed to vpx_temporal_svc_encoder is joined with the stream
# number and the extension .ivf to produce per stream output files. Here $1 is
# file name, and $2 is expected number of files.
files_exist() {
local file_name="${VPX_TEST_OUTPUT_DIR}/$1"
local num_files="$(($2 - 1))"
for stream_num in $(seq 0 ${num_files}); do
[ -e "${file_name}_${stream_num}.ivf" ] || return 1
done
}
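The naming convention `files_exist` checks can be exercised on its own; this is a hypothetical sketch (directory and base name invented) of the base-name + `_<stream_num>` + `.ivf` rule described in the comment above:

```shell
# Simulate the encoder's per-stream outputs for a 3-stream run.
out_dir=$(mktemp -d)
for stream_num in 0 1 2; do
  : > "${out_dir}/vp9_mode3_${stream_num}.ivf"
done

# Same check files_exist performs: streams 0..N-1 must all have a file.
check_streams() {
  file_name="${out_dir}/$1"
  num_files="$(($2 - 1))"
  for s in $(seq 0 ${num_files}); do
    [ -e "${file_name}_${s}.ivf" ] || return 1
  done
}

if check_streams vp9_mode3 3; then
  demo_result="all 3 streams present"
else
  demo_result="missing stream"
fi
echo "${demo_result}"
rm -rf "${out_dir}"
```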
# Run vpx_temporal_svc_encoder in all supported modes for vp8 and vp9.
vpx_tsvc_encoder_vp8_mode_0() {
if [ "$(vp8_encode_available)" = "yes" ]; then
vpx_tsvc_encoder vp8 "${FUNCNAME}" 0 200 || return 1
# Mode 0 produces 1 stream
files_exist "${FUNCNAME}" 1 || return 1
fi
}
vpx_tsvc_encoder_vp8_mode_1() {
if [ "$(vp8_encode_available)" = "yes" ]; then
vpx_tsvc_encoder vp8 "${FUNCNAME}" 1 200 400 || return 1
# Mode 1 produces 2 streams
files_exist "${FUNCNAME}" 2 || return 1
fi
}
vpx_tsvc_encoder_vp8_mode_2() {
if [ "$(vp8_encode_available)" = "yes" ]; then
vpx_tsvc_encoder vp8 "${FUNCNAME}" 2 200 400 || return 1
# Mode 2 produces 2 streams
files_exist "${FUNCNAME}" 2 || return 1
fi
}
vpx_tsvc_encoder_vp8_mode_3() {
if [ "$(vp8_encode_available)" = "yes" ]; then
vpx_tsvc_encoder vp8 "${FUNCNAME}" 3 200 400 600 || return 1
# Mode 3 produces 3 streams
files_exist "${FUNCNAME}" 3 || return 1
fi
}
vpx_tsvc_encoder_vp8_mode_4() {
if [ "$(vp8_encode_available)" = "yes" ]; then
vpx_tsvc_encoder vp8 "${FUNCNAME}" 4 200 400 600 || return 1
# Mode 4 produces 3 streams
files_exist "${FUNCNAME}" 3 || return 1
fi
}
vpx_tsvc_encoder_vp8_mode_5() {
if [ "$(vp8_encode_available)" = "yes" ]; then
vpx_tsvc_encoder vp8 "${FUNCNAME}" 5 200 400 600 || return 1
# Mode 5 produces 3 streams
files_exist "${FUNCNAME}" 3 || return 1
fi
}
vpx_tsvc_encoder_vp8_mode_6() {
if [ "$(vp8_encode_available)" = "yes" ]; then
vpx_tsvc_encoder vp8 "${FUNCNAME}" 6 200 400 600 || return 1
# Mode 6 produces 3 streams
files_exist "${FUNCNAME}" 3 || return 1
fi
}
vpx_tsvc_encoder_vp8_mode_7() {
if [ "$(vp8_encode_available)" = "yes" ]; then
vpx_tsvc_encoder vp8 "${FUNCNAME}" 7 200 400 600 800 1000 || return 1
# Mode 7 produces 5 streams
files_exist "${FUNCNAME}" 5 || return 1
fi
}
vpx_tsvc_encoder_vp8_mode_8() {
if [ "$(vp8_encode_available)" = "yes" ]; then
vpx_tsvc_encoder vp8 "${FUNCNAME}" 8 200 400 || return 1
# Mode 8 produces 2 streams
files_exist "${FUNCNAME}" 2 || return 1
fi
}
vpx_tsvc_encoder_vp8_mode_9() {
if [ "$(vp8_encode_available)" = "yes" ]; then
vpx_tsvc_encoder vp8 "${FUNCNAME}" 9 200 400 600 || return 1
# Mode 9 produces 3 streams
files_exist "${FUNCNAME}" 3 || return 1
fi
}
vpx_tsvc_encoder_vp8_mode_10() {
if [ "$(vp8_encode_available)" = "yes" ]; then
vpx_tsvc_encoder vp8 "${FUNCNAME}" 10 200 400 600 || return 1
# Mode 10 produces 3 streams
files_exist "${FUNCNAME}" 3 || return 1
fi
}
vpx_tsvc_encoder_vp8_mode_11() {
if [ "$(vp8_encode_available)" = "yes" ]; then
vpx_tsvc_encoder vp8 "${FUNCNAME}" 11 200 400 600 || return 1
# Mode 11 produces 3 streams
files_exist "${FUNCNAME}" 3 || return 1
fi
}
vpx_tsvc_encoder_vp9_mode_0() {
if [ "$(vp9_encode_available)" = "yes" ]; then
vpx_tsvc_encoder vp9 "${FUNCNAME}" 0 200 || return 1
# Mode 0 produces 1 stream
files_exist "${FUNCNAME}" 1 || return 1
fi
}
vpx_tsvc_encoder_vp9_mode_1() {
if [ "$(vp9_encode_available)" = "yes" ]; then
vpx_tsvc_encoder vp9 "${FUNCNAME}" 1 200 400 || return 1
# Mode 1 produces 2 streams
files_exist "${FUNCNAME}" 2 || return 1
fi
}
vpx_tsvc_encoder_vp9_mode_2() {
if [ "$(vp9_encode_available)" = "yes" ]; then
vpx_tsvc_encoder vp9 "${FUNCNAME}" 2 200 400 || return 1
# Mode 2 produces 2 streams
files_exist "${FUNCNAME}" 2 || return 1
fi
}
vpx_tsvc_encoder_vp9_mode_3() {
if [ "$(vp9_encode_available)" = "yes" ]; then
vpx_tsvc_encoder vp9 "${FUNCNAME}" 3 200 400 600 || return 1
# Mode 3 produces 3 streams
files_exist "${FUNCNAME}" 3 || return 1
fi
}
vpx_tsvc_encoder_vp9_mode_4() {
if [ "$(vp9_encode_available)" = "yes" ]; then
vpx_tsvc_encoder vp9 "${FUNCNAME}" 4 200 400 600 || return 1
# Mode 4 produces 3 streams
files_exist "${FUNCNAME}" 3 || return 1
fi
}
vpx_tsvc_encoder_vp9_mode_5() {
if [ "$(vp9_encode_available)" = "yes" ]; then
vpx_tsvc_encoder vp9 "${FUNCNAME}" 5 200 400 600 || return 1
# Mode 5 produces 3 streams
files_exist "${FUNCNAME}" 3 || return 1
fi
}
vpx_tsvc_encoder_vp9_mode_6() {
if [ "$(vp9_encode_available)" = "yes" ]; then
vpx_tsvc_encoder vp9 "${FUNCNAME}" 6 200 400 600 || return 1
# Mode 6 produces 3 streams
files_exist "${FUNCNAME}" 3 || return 1
fi
}
vpx_tsvc_encoder_vp9_mode_7() {
if [ "$(vp9_encode_available)" = "yes" ]; then
vpx_tsvc_encoder vp9 "${FUNCNAME}" 7 200 400 600 800 1000 || return 1
# Mode 7 produces 5 streams
files_exist "${FUNCNAME}" 5 || return 1
fi
}
vpx_tsvc_encoder_vp9_mode_8() {
if [ "$(vp9_encode_available)" = "yes" ]; then
vpx_tsvc_encoder vp9 "${FUNCNAME}" 8 200 400 || return 1
# Mode 8 produces 2 streams
files_exist "${FUNCNAME}" 2 || return 1
fi
}
vpx_tsvc_encoder_vp9_mode_9() {
if [ "$(vp9_encode_available)" = "yes" ]; then
vpx_tsvc_encoder vp9 "${FUNCNAME}" 9 200 400 600 || return 1
# Mode 9 produces 3 streams
files_exist "${FUNCNAME}" 3 || return 1
fi
}
vpx_tsvc_encoder_vp9_mode_10() {
if [ "$(vp9_encode_available)" = "yes" ]; then
vpx_tsvc_encoder vp9 "${FUNCNAME}" 10 200 400 600 || return 1
# Mode 10 produces 3 streams
files_exist "${FUNCNAME}" 3 || return 1
fi
}
vpx_tsvc_encoder_vp9_mode_11() {
if [ "$(vp9_encode_available)" = "yes" ]; then
vpx_tsvc_encoder vp9 "${FUNCNAME}" 11 200 400 600 || return 1
# Mode 11 produces 3 streams
files_exist "${FUNCNAME}" 3 || return 1
fi
}
vpx_tsvc_encoder_tests="vpx_tsvc_encoder_vp8_mode_0
vpx_tsvc_encoder_vp8_mode_1
vpx_tsvc_encoder_vp8_mode_2
vpx_tsvc_encoder_vp8_mode_3
vpx_tsvc_encoder_vp8_mode_4
vpx_tsvc_encoder_vp8_mode_5
vpx_tsvc_encoder_vp8_mode_6
vpx_tsvc_encoder_vp8_mode_7
vpx_tsvc_encoder_vp8_mode_8
vpx_tsvc_encoder_vp8_mode_9
vpx_tsvc_encoder_vp8_mode_10
vpx_tsvc_encoder_vp8_mode_11
vpx_tsvc_encoder_vp9_mode_0
vpx_tsvc_encoder_vp9_mode_1
vpx_tsvc_encoder_vp9_mode_2
vpx_tsvc_encoder_vp9_mode_3
vpx_tsvc_encoder_vp9_mode_4
vpx_tsvc_encoder_vp9_mode_5
vpx_tsvc_encoder_vp9_mode_6
vpx_tsvc_encoder_vp9_mode_7
vpx_tsvc_encoder_vp9_mode_8
vpx_tsvc_encoder_vp9_mode_9
vpx_tsvc_encoder_vp9_mode_10
vpx_tsvc_encoder_vp9_mode_11"
run_tests vpx_tsvc_encoder_verify_environment "${vpx_tsvc_encoder_tests}"


@@ -1,11 +1,17 @@
LOCAL_PATH := $(call my-dir)
include $(CLEAR_VARS)
LOCAL_PATH:= $(call my-dir)
LOCAL_CPP_EXTENSION := .cpp
LOCAL_SRC_FILES := mkvmuxer.cpp \
mkvmuxerutil.cpp \
mkvparser.cpp \
mkvreader.cpp \
mkvwriter.cpp
LOCAL_MODULE := libwebm
include $(CLEAR_VARS)
LOCAL_MODULE:= libwebm
LOCAL_CPPFLAGS:=-D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS
LOCAL_CPPFLAGS+=-D__STDC_LIMIT_MACROS -Wno-extern-c-compat
LOCAL_C_INCLUDES:= $(LOCAL_PATH)
LOCAL_EXPORT_C_INCLUDES:= $(LOCAL_PATH)
LOCAL_SRC_FILES:= common/file_util.cc \
common/hdr_util.cc \
mkvparser/mkvparser.cc \
mkvparser/mkvreader.cc \
mkvmuxer/mkvmuxer.cc \
mkvmuxer/mkvmuxerutil.cc \
mkvmuxer/mkvwriter.cc
include $(BUILD_STATIC_LIBRARY)


@@ -17,7 +17,7 @@ or agree to the institution of patent litigation or any other patent
enforcement activity against any entity (including a cross-claim or
counterclaim in a lawsuit) alleging that any of these implementations of WebM
or any code incorporated within any of these implementations of WebM
constitutes direct or contributory patent infringement, or inducement of
constitute direct or contributory patent infringement, or inducement of
patent infringement, then any patent rights granted to you under this License
for these implementations of WebM shall terminate as of the date such
litigation is filed.


@@ -1,7 +1,10 @@
URL: https://chromium.googlesource.com/webm/libwebm
Version: 249629d46c6e9391f25a90cff6d19075f47474cb
Version: 32d5ac49414a8914ec1e1f285f3f927c6e8ec29d
License: BSD
License File: LICENSE.txt
Description:
libwebm is used to handle WebM container I/O.
Local Changes:
* <none>


@@ -1,34 +0,0 @@
1.0.0.5
* Handled case when no duration
* Handled empty clusters
* Handled empty clusters when seeking
* Implemented check lacing bits
1.0.0.4
* Made Cues member variables mutables
* Defined against badly-formatted cue points
* Segment::GetCluster returns CuePoint too
* Separated cue-based searches
1.0.0.3
* Added Block::GetOffset() to get a frame's offset in a block
* Changed cluster count type from size_t to long
* Parsed SeekHead to find cues
* Allowed seeking beyond end of cluster cache
* Added not to attempt to reparse cues element
* Restructured Segment::LoadCluster
* Marked position of cues without parsing cues element
* Allowed cue points to be loaded incrementally
* Implemented to load lazily cue points as they're searched
* Merged Cues::LoadCuePoint into Cues::Find
* Lazy init cues
* Loaded cue point during find
1.0.0.2
* added support for Cues element
* seeking was improved
1.0.0.1
* fixed item 141
* added item 142
* added this file, RELEASE.TXT, to repository

third_party/libwebm/common/file_util.cc (vendored)

@@ -0,0 +1,67 @@
// Copyright (c) 2016 The WebM project authors. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the LICENSE file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
#include "common/file_util.h"
#include <sys/stat.h>
#ifndef _MSC_VER
#include <unistd.h> // close()
#endif
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <ios>
namespace libwebm {
std::string GetTempFileName() {
#if !defined _MSC_VER && !defined __MINGW32__
char temp_file_name_template[] = "libwebm_temp.XXXXXX";
int fd = mkstemp(temp_file_name_template);
if (fd != -1) {
close(fd);
return std::string(temp_file_name_template);
}
return std::string();
#else
char tmp_file_name[_MAX_PATH];
errno_t err = tmpnam_s(tmp_file_name);
if (err == 0) {
return std::string(tmp_file_name);
}
return std::string();
#endif
}
uint64_t GetFileSize(const std::string& file_name) {
uint64_t file_size = 0;
#ifndef _MSC_VER
struct stat st;
st.st_size = 0;
if (stat(file_name.c_str(), &st) == 0) {
#else
struct _stat st;
st.st_size = 0;
if (_stat(file_name.c_str(), &st) == 0) {
#endif
file_size = st.st_size;
}
return file_size;
}
TempFileDeleter::TempFileDeleter() { file_name_ = GetTempFileName(); }
TempFileDeleter::~TempFileDeleter() {
std::ifstream file(file_name_.c_str());
if (file.good()) {
file.close();
std::remove(file_name_.c_str());
}
}
} // namespace libwebm

third_party/libwebm/common/file_util.h (vendored)

@@ -0,0 +1,41 @@
// Copyright (c) 2016 The WebM project authors. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the LICENSE file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
#ifndef LIBWEBM_COMMON_FILE_UTIL_H_
#define LIBWEBM_COMMON_FILE_UTIL_H_
#include <stdint.h>
#include <string>
#include "mkvmuxer/mkvmuxertypes.h" // LIBWEBM_DISALLOW_COPY_AND_ASSIGN()
namespace libwebm {
// Returns a temporary file name.
std::string GetTempFileName();
// Returns size of file specified by |file_name|, or 0 upon failure.
uint64_t GetFileSize(const std::string& file_name);
// Manages life of temporary file specified at time of construction. Deletes
// file upon destruction.
class TempFileDeleter {
public:
TempFileDeleter();
explicit TempFileDeleter(std::string file_name) : file_name_(file_name) {}
~TempFileDeleter();
const std::string& name() const { return file_name_; }
private:
std::string file_name_;
LIBWEBM_DISALLOW_COPY_AND_ASSIGN(TempFileDeleter);
};
} // namespace libwebm
#endif // LIBWEBM_COMMON_FILE_UTIL_H_

third_party/libwebm/common/hdr_util.cc (vendored)

@@ -0,0 +1,182 @@
// Copyright (c) 2016 The WebM project authors. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the LICENSE file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
#include "hdr_util.h"
#include <cstddef>
#include <new>
#include "mkvparser/mkvparser.h"
namespace libwebm {
bool CopyPrimaryChromaticity(const mkvparser::PrimaryChromaticity& parser_pc,
PrimaryChromaticityPtr* muxer_pc) {
muxer_pc->reset(new (std::nothrow)
mkvmuxer::PrimaryChromaticity(parser_pc.x, parser_pc.y));
if (!muxer_pc->get())
return false;
return true;
}
bool MasteringMetadataValuePresent(double value) {
return value != mkvparser::MasteringMetadata::kValueNotPresent;
}
bool CopyMasteringMetadata(const mkvparser::MasteringMetadata& parser_mm,
mkvmuxer::MasteringMetadata* muxer_mm) {
if (MasteringMetadataValuePresent(parser_mm.luminance_max))
muxer_mm->luminance_max = parser_mm.luminance_max;
if (MasteringMetadataValuePresent(parser_mm.luminance_min))
muxer_mm->luminance_min = parser_mm.luminance_min;
PrimaryChromaticityPtr r_ptr(NULL);
PrimaryChromaticityPtr g_ptr(NULL);
PrimaryChromaticityPtr b_ptr(NULL);
PrimaryChromaticityPtr wp_ptr(NULL);
if (parser_mm.r) {
if (!CopyPrimaryChromaticity(*parser_mm.r, &r_ptr))
return false;
}
if (parser_mm.g) {
if (!CopyPrimaryChromaticity(*parser_mm.g, &g_ptr))
return false;
}
if (parser_mm.b) {
if (!CopyPrimaryChromaticity(*parser_mm.b, &b_ptr))
return false;
}
if (parser_mm.white_point) {
if (!CopyPrimaryChromaticity(*parser_mm.white_point, &wp_ptr))
return false;
}
if (!muxer_mm->SetChromaticity(r_ptr.get(), g_ptr.get(), b_ptr.get(),
wp_ptr.get())) {
return false;
}
return true;
}
bool ColourValuePresent(long long value) {
return value != mkvparser::Colour::kValueNotPresent;
}
bool CopyColour(const mkvparser::Colour& parser_colour,
mkvmuxer::Colour* muxer_colour) {
if (!muxer_colour)
return false;
if (ColourValuePresent(parser_colour.matrix_coefficients))
muxer_colour->matrix_coefficients = parser_colour.matrix_coefficients;
if (ColourValuePresent(parser_colour.bits_per_channel))
muxer_colour->bits_per_channel = parser_colour.bits_per_channel;
if (ColourValuePresent(parser_colour.chroma_subsampling_horz))
muxer_colour->chroma_subsampling_horz =
parser_colour.chroma_subsampling_horz;
if (ColourValuePresent(parser_colour.chroma_subsampling_vert))
muxer_colour->chroma_subsampling_vert =
parser_colour.chroma_subsampling_vert;
if (ColourValuePresent(parser_colour.cb_subsampling_horz))
muxer_colour->cb_subsampling_horz = parser_colour.cb_subsampling_horz;
if (ColourValuePresent(parser_colour.cb_subsampling_vert))
muxer_colour->cb_subsampling_vert = parser_colour.cb_subsampling_vert;
if (ColourValuePresent(parser_colour.chroma_siting_horz))
muxer_colour->chroma_siting_horz = parser_colour.chroma_siting_horz;
if (ColourValuePresent(parser_colour.chroma_siting_vert))
muxer_colour->chroma_siting_vert = parser_colour.chroma_siting_vert;
if (ColourValuePresent(parser_colour.range))
muxer_colour->range = parser_colour.range;
if (ColourValuePresent(parser_colour.transfer_characteristics))
muxer_colour->transfer_characteristics =
parser_colour.transfer_characteristics;
if (ColourValuePresent(parser_colour.primaries))
muxer_colour->primaries = parser_colour.primaries;
if (ColourValuePresent(parser_colour.max_cll))
muxer_colour->max_cll = parser_colour.max_cll;
if (ColourValuePresent(parser_colour.max_fall))
muxer_colour->max_fall = parser_colour.max_fall;
if (parser_colour.mastering_metadata) {
mkvmuxer::MasteringMetadata muxer_mm;
if (!CopyMasteringMetadata(*parser_colour.mastering_metadata, &muxer_mm))
return false;
if (!muxer_colour->SetMasteringMetadata(muxer_mm))
return false;
}
return true;
}
// Format of VPx private data:
//
// 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
// | ID Byte | Length | |
// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |
// | |
// : Bytes 1..Length of Codec Feature :
// | |
// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
//
// ID Byte Format
// ID byte is an unsigned byte.
// 0 1 2 3 4 5 6 7
// +-+-+-+-+-+-+-+-+
// |X| ID |
// +-+-+-+-+-+-+-+-+
//
// The X bit is reserved.
//
// Currently only profile level is supported. ID byte must be set to 1, and
// length must be 1. Supported values are:
//
// 10: Level 1
// 11: Level 1.1
// 20: Level 2
// 21: Level 2.1
// 30: Level 3
// 31: Level 3.1
// 40: Level 4
// 41: Level 4.1
// 50: Level 5
// 51: Level 5.1
// 52: Level 5.2
// 60: Level 6
// 61: Level 6.1
// 62: Level 6.2
//
// See the following link for more information:
// http://www.webmproject.org/vp9/profiles/
int ParseVpxCodecPrivate(const uint8_t* private_data, int32_t length) {
const int kVpxCodecPrivateLength = 3;
if (!private_data || length != kVpxCodecPrivateLength)
return 0;
const uint8_t id_byte = *private_data;
if (id_byte != 1)
return 0;
const int kVpxProfileLength = 1;
const uint8_t length_byte = private_data[1];
if (length_byte != kVpxProfileLength)
return 0;
const int level = static_cast<int>(private_data[2]);
const int kNumLevels = 14;
const int levels[kNumLevels] = {10, 11, 20, 21, 30, 31, 40,
41, 50, 51, 52, 60, 61, 62};
for (int i = 0; i < kNumLevels; ++i) {
if (level == levels[i])
return level;
}
return 0;
}
} // namespace libwebm

third_party/libwebm/common/hdr_util.h (vendored)

@@ -0,0 +1,51 @@
// Copyright (c) 2016 The WebM project authors. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the LICENSE file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
#ifndef LIBWEBM_COMMON_HDR_UTIL_H_
#define LIBWEBM_COMMON_HDR_UTIL_H_
#include <stdint.h>
#include <memory>
#include "mkvmuxer/mkvmuxer.h"
namespace mkvparser {
struct Colour;
struct MasteringMetadata;
struct PrimaryChromaticity;
} // namespace mkvparser
namespace libwebm {
// Utility types and functions for working with the Colour element and its
// children. Copiers return true upon success. Presence functions return true
// when the specified element is present.
// TODO(tomfinegan): These should be moved to libwebm_utils once c++11 is
// required by libwebm.
typedef std::auto_ptr<mkvmuxer::PrimaryChromaticity> PrimaryChromaticityPtr;
bool CopyPrimaryChromaticity(const mkvparser::PrimaryChromaticity& parser_pc,
PrimaryChromaticityPtr* muxer_pc);
bool MasteringMetadataValuePresent(double value);
bool CopyMasteringMetadata(const mkvparser::MasteringMetadata& parser_mm,
mkvmuxer::MasteringMetadata* muxer_mm);
bool ColourValuePresent(long long value);
bool CopyColour(const mkvparser::Colour& parser_colour,
mkvmuxer::Colour* muxer_colour);
// Returns VP9 profile upon success or 0 upon failure.
int ParseVpxCodecPrivate(const uint8_t* private_data, int32_t length);
} // namespace libwebm
#endif // LIBWEBM_COMMON_HDR_UTIL_H_


@@ -6,10 +6,10 @@
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
#ifndef WEBMIDS_HPP
#define WEBMIDS_HPP
#ifndef COMMON_WEBMIDS_H_
#define COMMON_WEBMIDS_H_
namespace mkvmuxer {
namespace libwebm {
enum MkvId {
kMkvEBML = 0x1A45DFA3,
@@ -41,6 +41,7 @@ enum MkvId {
kMkvTimecodeScale = 0x2AD7B1,
kMkvDuration = 0x4489,
kMkvDateUTC = 0x4461,
kMkvTitle = 0x7BA9,
kMkvMuxingApp = 0x4D80,
kMkvWritingApp = 0x5741,
// Cluster
@@ -94,6 +95,35 @@ enum MkvId {
kMkvAspectRatioType = 0x54B3,
kMkvFrameRate = 0x2383E3,
// end video
// colour
kMkvColour = 0x55B0,
kMkvMatrixCoefficients = 0x55B1,
kMkvBitsPerChannel = 0x55B2,
kMkvChromaSubsamplingHorz = 0x55B3,
kMkvChromaSubsamplingVert = 0x55B4,
kMkvCbSubsamplingHorz = 0x55B5,
kMkvCbSubsamplingVert = 0x55B6,
kMkvChromaSitingHorz = 0x55B7,
kMkvChromaSitingVert = 0x55B8,
kMkvRange = 0x55B9,
kMkvTransferCharacteristics = 0x55BA,
kMkvPrimaries = 0x55BB,
kMkvMaxCLL = 0x55BC,
kMkvMaxFALL = 0x55BD,
// mastering metadata
kMkvMasteringMetadata = 0x55D0,
kMkvPrimaryRChromaticityX = 0x55D1,
kMkvPrimaryRChromaticityY = 0x55D2,
kMkvPrimaryGChromaticityX = 0x55D3,
kMkvPrimaryGChromaticityY = 0x55D4,
kMkvPrimaryBChromaticityX = 0x55D5,
kMkvPrimaryBChromaticityY = 0x55D6,
kMkvWhitePointChromaticityX = 0x55D7,
kMkvWhitePointChromaticityY = 0x55D8,
kMkvLuminanceMax = 0x55D9,
kMkvLuminanceMin = 0x55DA,
// end mastering metadata
// end colour
// audio
kMkvAudio = 0xE1,
kMkvSamplingFrequency = 0xB5,
@@ -107,9 +137,16 @@ enum MkvId {
kMkvContentEncodingOrder = 0x5031,
kMkvContentEncodingScope = 0x5032,
kMkvContentEncodingType = 0x5033,
kMkvContentCompression = 0x5034,
kMkvContentCompAlgo = 0x4254,
kMkvContentCompSettings = 0x4255,
kMkvContentEncryption = 0x5035,
kMkvContentEncAlgo = 0x47E1,
kMkvContentEncKeyID = 0x47E2,
kMkvContentSignature = 0x47E3,
kMkvContentSigKeyID = 0x47E4,
kMkvContentSigAlgo = 0x47E5,
kMkvContentSigHashAlgo = 0x47E6,
kMkvContentEncAESSettings = 0x47E7,
kMkvAESSettingsCipherMode = 0x47E8,
kMkvAESSettingsCipherInitData = 0x47E9,
@@ -133,9 +170,15 @@ enum MkvId {
kMkvChapterDisplay = 0x80,
kMkvChapString = 0x85,
kMkvChapLanguage = 0x437C,
kMkvChapCountry = 0x437E
kMkvChapCountry = 0x437E,
// Tags
kMkvTags = 0x1254C367,
kMkvTag = 0x7373,
kMkvSimpleTag = 0x67C8,
kMkvTagName = 0x45A3,
kMkvTagString = 0x4487
};
} // end namespace mkvmuxer
} // namespace libwebm
#endif // WEBMIDS_HPP
#endif // COMMON_WEBMIDS_H_

File diff suppressed because it is too large

third_party/libwebm/mkvmuxer/mkvmuxer.cc (vendored)

File diff suppressed because it is too large


@@ -6,8 +6,17 @@
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
#ifndef MKVMUXERTYPES_HPP
#define MKVMUXERTYPES_HPP
#ifndef MKVMUXER_MKVMUXERTYPES_H_
#define MKVMUXER_MKVMUXERTYPES_H_
namespace mkvmuxer {
typedef unsigned char uint8;
typedef short int16;
typedef int int32;
typedef unsigned int uint32;
typedef long long int64;
typedef unsigned long long uint64;
} // namespace mkvmuxer
// Copied from Chromium basictypes.h
// A macro to disallow the copy constructor and operator= functions
@@ -16,15 +25,4 @@
TypeName(const TypeName&); \
void operator=(const TypeName&)
namespace mkvmuxer {
typedef unsigned char uint8;
typedef short int16;
typedef int int32;
typedef unsigned int uint32;
typedef long long int64;
typedef unsigned long long uint64;
} // end namespace mkvmuxer
#endif // MKVMUXERTYPES_HPP
#endif // MKVMUXER_MKVMUXERTYPES_HPP_


@@ -0,0 +1,650 @@
// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the LICENSE file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
#include "mkvmuxer/mkvmuxerutil.h"
#ifdef __ANDROID__
#include <fcntl.h>
#endif
#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <new>
#include "common/webmids.h"
#include "mkvmuxer/mkvmuxer.h"
#include "mkvmuxer/mkvwriter.h"
namespace mkvmuxer {
namespace {
// Date elements are always 8 octets in size.
const int kDateElementSize = 8;
uint64_t WriteBlock(IMkvWriter* writer, const Frame* const frame,
int64_t timecode, uint64_t timecode_scale) {
uint64_t block_additional_elem_size = 0;
uint64_t block_addid_elem_size = 0;
uint64_t block_more_payload_size = 0;
uint64_t block_more_elem_size = 0;
uint64_t block_additions_payload_size = 0;
uint64_t block_additions_elem_size = 0;
if (frame->additional()) {
block_additional_elem_size =
EbmlElementSize(libwebm::kMkvBlockAdditional, frame->additional(),
frame->additional_length());
block_addid_elem_size =
EbmlElementSize(libwebm::kMkvBlockAddID, frame->add_id());
block_more_payload_size =
block_addid_elem_size + block_additional_elem_size;
block_more_elem_size =
EbmlMasterElementSize(libwebm::kMkvBlockMore, block_more_payload_size) +
block_more_payload_size;
block_additions_payload_size = block_more_elem_size;
block_additions_elem_size =
EbmlMasterElementSize(libwebm::kMkvBlockAdditions,
block_additions_payload_size) +
block_additions_payload_size;
}
uint64_t discard_padding_elem_size = 0;
if (frame->discard_padding() != 0) {
discard_padding_elem_size =
EbmlElementSize(libwebm::kMkvDiscardPadding, frame->discard_padding());
}
const uint64_t reference_block_timestamp =
frame->reference_block_timestamp() / timecode_scale;
uint64_t reference_block_elem_size = 0;
if (!frame->is_key()) {
reference_block_elem_size =
EbmlElementSize(libwebm::kMkvReferenceBlock, reference_block_timestamp);
}
const uint64_t duration = frame->duration() / timecode_scale;
uint64_t block_duration_elem_size = 0;
if (duration > 0)
block_duration_elem_size =
EbmlElementSize(libwebm::kMkvBlockDuration, duration);
const uint64_t block_payload_size = 4 + frame->length();
const uint64_t block_elem_size =
EbmlMasterElementSize(libwebm::kMkvBlock, block_payload_size) +
block_payload_size;
const uint64_t block_group_payload_size =
block_elem_size + block_additions_elem_size + block_duration_elem_size +
discard_padding_elem_size + reference_block_elem_size;
if (!WriteEbmlMasterElement(writer, libwebm::kMkvBlockGroup,
block_group_payload_size)) {
return 0;
}
if (!WriteEbmlMasterElement(writer, libwebm::kMkvBlock, block_payload_size))
return 0;
if (WriteUInt(writer, frame->track_number()))
return 0;
if (SerializeInt(writer, timecode, 2))
return 0;
// For a Block, flags is always 0.
if (SerializeInt(writer, 0, 1))
return 0;
if (writer->Write(frame->frame(), static_cast<uint32_t>(frame->length())))
return 0;
if (frame->additional()) {
if (!WriteEbmlMasterElement(writer, libwebm::kMkvBlockAdditions,
block_additions_payload_size)) {
return 0;
}
if (!WriteEbmlMasterElement(writer, libwebm::kMkvBlockMore,
block_more_payload_size))
return 0;
if (!WriteEbmlElement(writer, libwebm::kMkvBlockAddID, frame->add_id()))
return 0;
if (!WriteEbmlElement(writer, libwebm::kMkvBlockAdditional,
frame->additional(), frame->additional_length())) {
return 0;
}
}
if (frame->discard_padding() != 0 &&
!WriteEbmlElement(writer, libwebm::kMkvDiscardPadding,
frame->discard_padding())) {
return 0;
}
if (!frame->is_key() &&
!WriteEbmlElement(writer, libwebm::kMkvReferenceBlock,
reference_block_timestamp)) {
return 0;
}
if (duration > 0 &&
!WriteEbmlElement(writer, libwebm::kMkvBlockDuration, duration)) {
return 0;
}
return EbmlMasterElementSize(libwebm::kMkvBlockGroup,
block_group_payload_size) +
block_group_payload_size;
}
uint64_t WriteSimpleBlock(IMkvWriter* writer, const Frame* const frame,
int64_t timecode) {
if (WriteID(writer, libwebm::kMkvSimpleBlock))
return 0;
const int32_t size = static_cast<int32_t>(frame->length()) + 4;
if (WriteUInt(writer, size))
return 0;
if (WriteUInt(writer, static_cast<uint64_t>(frame->track_number())))
return 0;
if (SerializeInt(writer, timecode, 2))
return 0;
uint64_t flags = 0;
if (frame->is_key())
flags |= 0x80;
if (SerializeInt(writer, flags, 1))
return 0;
if (writer->Write(frame->frame(), static_cast<uint32_t>(frame->length())))
return 0;
return static_cast<uint64_t>(GetUIntSize(libwebm::kMkvSimpleBlock) +
GetCodedUIntSize(size) + 4 + frame->length());
}
} // namespace
int32_t GetCodedUIntSize(uint64_t value) {
if (value < 0x000000000000007FULL)
return 1;
else if (value < 0x0000000000003FFFULL)
return 2;
else if (value < 0x00000000001FFFFFULL)
return 3;
else if (value < 0x000000000FFFFFFFULL)
return 4;
else if (value < 0x00000007FFFFFFFFULL)
return 5;
else if (value < 0x000003FFFFFFFFFFULL)
return 6;
else if (value < 0x0001FFFFFFFFFFFFULL)
return 7;
return 8;
}
int32_t GetUIntSize(uint64_t value) {
if (value < 0x0000000000000100ULL)
return 1;
else if (value < 0x0000000000010000ULL)
return 2;
else if (value < 0x0000000001000000ULL)
return 3;
else if (value < 0x0000000100000000ULL)
return 4;
else if (value < 0x0000010000000000ULL)
return 5;
else if (value < 0x0001000000000000ULL)
return 6;
else if (value < 0x0100000000000000ULL)
return 7;
return 8;
}
int32_t GetIntSize(int64_t value) {
// Doubling the requested value ensures positive values with their high bit
// set are written with 0-padding to avoid flipping the signedness.
const uint64_t v = (value < 0) ? value ^ -1LL : value;
return GetUIntSize(2 * v);
}
uint64_t EbmlMasterElementSize(uint64_t type, uint64_t value) {
// Size of EBML ID
int32_t ebml_size = GetUIntSize(type);
// Datasize
ebml_size += GetCodedUIntSize(value);
return static_cast<uint64_t>(ebml_size);
}
uint64_t EbmlElementSize(uint64_t type, int64_t value) {
// Size of EBML ID
int32_t ebml_size = GetUIntSize(type);
// Datasize
ebml_size += GetIntSize(value);
// Size of Datasize
ebml_size++;
return static_cast<uint64_t>(ebml_size);
}
uint64_t EbmlElementSize(uint64_t type, uint64_t value) {
return EbmlElementSize(type, value, 0);
}
uint64_t EbmlElementSize(uint64_t type, uint64_t value, uint64_t fixed_size) {
// Size of EBML ID
uint64_t ebml_size = static_cast<uint64_t>(GetUIntSize(type));
// Datasize
ebml_size +=
(fixed_size > 0) ? fixed_size : static_cast<uint64_t>(GetUIntSize(value));
// Size of Datasize
ebml_size++;
return ebml_size;
}
uint64_t EbmlElementSize(uint64_t type, float /* value */) {
// Size of EBML ID
uint64_t ebml_size = static_cast<uint64_t>(GetUIntSize(type));
// Datasize
ebml_size += sizeof(float);
// Size of Datasize
ebml_size++;
return ebml_size;
}
uint64_t EbmlElementSize(uint64_t type, const char* value) {
if (!value)
return 0;
// Size of EBML ID
uint64_t ebml_size = static_cast<uint64_t>(GetUIntSize(type));
// Datasize
ebml_size += strlen(value);
// Size of Datasize
ebml_size++;
return ebml_size;
}
uint64_t EbmlElementSize(uint64_t type, const uint8_t* value, uint64_t size) {
if (!value)
return 0;
// Size of EBML ID
uint64_t ebml_size = static_cast<uint64_t>(GetUIntSize(type));
// Datasize
ebml_size += size;
// Size of Datasize
ebml_size += GetCodedUIntSize(size);
return ebml_size;
}
uint64_t EbmlDateElementSize(uint64_t type) {
// Size of EBML ID
uint64_t ebml_size = static_cast<uint64_t>(GetUIntSize(type));
// Datasize
ebml_size += kDateElementSize;
// Size of Datasize
ebml_size++;
return ebml_size;
}
int32_t SerializeInt(IMkvWriter* writer, int64_t value, int32_t size) {
if (!writer || size < 1 || size > 8)
return -1;
for (int32_t i = 1; i <= size; ++i) {
const int32_t byte_count = size - i;
const int32_t bit_count = byte_count * 8;
const int64_t bb = value >> bit_count;
const uint8_t b = static_cast<uint8_t>(bb);
const int32_t status = writer->Write(&b, 1);
if (status < 0)
return status;
}
return 0;
}
int32_t SerializeFloat(IMkvWriter* writer, float f) {
if (!writer)
return -1;
assert(sizeof(uint32_t) == sizeof(float));
// This union is used to avoid a reinterpret_cast from float& to
// uint32_t&, which would violate strict aliasing.
union U32 {
uint32_t u32;
float f;
} value;
value.f = f;
for (int32_t i = 1; i <= 4; ++i) {
const int32_t byte_count = 4 - i;
const int32_t bit_count = byte_count * 8;
const uint8_t byte = static_cast<uint8_t>(value.u32 >> bit_count);
const int32_t status = writer->Write(&byte, 1);
if (status < 0)
return status;
}
return 0;
}
int32_t WriteUInt(IMkvWriter* writer, uint64_t value) {
if (!writer)
return -1;
int32_t size = GetCodedUIntSize(value);
return WriteUIntSize(writer, value, size);
}
int32_t WriteUIntSize(IMkvWriter* writer, uint64_t value, int32_t size) {
if (!writer || size < 0 || size > 8)
return -1;
if (size > 0) {
const uint64_t bit = 1LL << (size * 7);
if (value > (bit - 2))
return -1;
value |= bit;
} else {
size = 1;
int64_t bit;
for (;;) {
bit = 1LL << (size * 7);
const uint64_t max = bit - 2;
if (value <= max)
break;
++size;
}
if (size > 8)
return -1;
value |= bit;
}
return SerializeInt(writer, value, size);
}
int32_t WriteID(IMkvWriter* writer, uint64_t type) {
if (!writer)
return -1;
writer->ElementStartNotify(type, writer->Position());
const int32_t size = GetUIntSize(type);
return SerializeInt(writer, type, size);
}
bool WriteEbmlMasterElement(IMkvWriter* writer, uint64_t type, uint64_t size) {
if (!writer)
return false;
if (WriteID(writer, type))
return false;
if (WriteUInt(writer, size))
return false;
return true;
}
bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, uint64_t value) {
return WriteEbmlElement(writer, type, value, 0);
}
bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, uint64_t value,
uint64_t fixed_size) {
if (!writer)
return false;
if (WriteID(writer, type))
return false;
uint64_t size = static_cast<uint64_t>(GetUIntSize(value));
if (fixed_size > 0) {
if (size > fixed_size)
return false;
size = fixed_size;
}
if (WriteUInt(writer, size))
return false;
if (SerializeInt(writer, value, static_cast<int32_t>(size)))
return false;
return true;
}
bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, int64_t value) {
if (!writer)
return false;
if (WriteID(writer, type))
return false;
const uint64_t size = GetIntSize(value);
if (WriteUInt(writer, size))
return false;
if (SerializeInt(writer, value, static_cast<int32_t>(size)))
return false;
return true;
}
bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, float value) {
if (!writer)
return false;
if (WriteID(writer, type))
return false;
if (WriteUInt(writer, 4))
return false;
if (SerializeFloat(writer, value))
return false;
return true;
}
bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, const char* value) {
if (!writer || !value)
return false;
if (WriteID(writer, type))
return false;
const uint64_t length = strlen(value);
if (WriteUInt(writer, length))
return false;
if (writer->Write(value, static_cast<uint32_t>(length)))
return false;
return true;
}
bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, const uint8_t* value,
uint64_t size) {
if (!writer || !value || size < 1)
return false;
if (WriteID(writer, type))
return false;
if (WriteUInt(writer, size))
return false;
if (writer->Write(value, static_cast<uint32_t>(size)))
return false;
return true;
}
bool WriteEbmlDateElement(IMkvWriter* writer, uint64_t type, int64_t value) {
if (!writer)
return false;
if (WriteID(writer, type))
return false;
if (WriteUInt(writer, kDateElementSize))
return false;
if (SerializeInt(writer, value, kDateElementSize))
return false;
return true;
}
uint64_t WriteFrame(IMkvWriter* writer, const Frame* const frame,
Cluster* cluster) {
if (!writer || !frame || !frame->IsValid() || !cluster ||
!cluster->timecode_scale())
return 0;
// Technically the timecode for a block can be less than the
// timecode for the cluster itself (remember that block timecode
// is a signed, 16-bit integer). However, as a simplification we
// only permit non-negative cluster-relative timecodes for blocks.
const int64_t relative_timecode = cluster->GetRelativeTimecode(
frame->timestamp() / cluster->timecode_scale());
if (relative_timecode < 0 || relative_timecode > kMaxBlockTimecode)
return 0;
return frame->CanBeSimpleBlock() ?
WriteSimpleBlock(writer, frame, relative_timecode) :
WriteBlock(writer, frame, relative_timecode,
cluster->timecode_scale());
}
uint64_t WriteVoidElement(IMkvWriter* writer, uint64_t size) {
if (!writer)
return 0;
// Subtract one for the void ID and the coded size.
uint64_t void_entry_size = size - 1 - GetCodedUIntSize(size - 1);
uint64_t void_size =
EbmlMasterElementSize(libwebm::kMkvVoid, void_entry_size) +
void_entry_size;
if (void_size != size)
return 0;
const int64_t payload_position = writer->Position();
if (payload_position < 0)
return 0;
if (WriteID(writer, libwebm::kMkvVoid))
return 0;
if (WriteUInt(writer, void_entry_size))
return 0;
const uint8_t value = 0;
for (int32_t i = 0; i < static_cast<int32_t>(void_entry_size); ++i) {
if (writer->Write(&value, 1))
return 0;
}
const int64_t stop_position = writer->Position();
if (stop_position < 0 ||
stop_position - payload_position != static_cast<int64_t>(void_size))
return 0;
return void_size;
}
void GetVersion(int32_t* major, int32_t* minor, int32_t* build,
int32_t* revision) {
*major = 0;
*minor = 2;
*build = 1;
*revision = 0;
}
uint64_t MakeUID(unsigned int* seed) {
uint64_t uid = 0;
#ifdef __MINGW32__
srand(*seed);
#endif
for (int i = 0; i < 7; ++i) { // avoid problems with 8-byte values
uid <<= 8;
// TODO(fgalligan): Move random number generation to platform specific code.
#ifdef _MSC_VER
(void)seed;
const int32_t nn = rand();
#elif __ANDROID__
int32_t temp_num = 1;
int fd = open("/dev/urandom", O_RDONLY);
if (fd != -1) {
read(fd, &temp_num, sizeof(temp_num));
close(fd);
}
const int32_t nn = temp_num;
#elif defined __MINGW32__
const int32_t nn = rand();
#else
const int32_t nn = rand_r(seed);
#endif
const int32_t n = 0xFF & (nn >> 4); // throw away low-order bits
uid |= n;
}
return uid;
}
} // namespace mkvmuxer
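The `GetCodedUIntSize` ladder above implements the EBML coded-integer ("vint") sizing rule: an n-byte encoding carries 7*n payload bits, and the all-ones payload is reserved, so the largest value that fits in n bytes is 2^(7n) - 2. A minimal standalone sketch of the same rule (illustrative, not libwebm code):

```cpp
#include <cstdint>

// Standalone sketch of the EBML vint sizing rule used by GetCodedUIntSize:
// an n-byte encoding holds 7*n payload bits, and the all-ones payload is
// reserved for "unknown size", so values up to 2^(7n) - 2 fit in n bytes.
int CodedSizeOf(uint64_t value) {
  for (int bytes = 1; bytes < 8; ++bytes) {
    if (value < ((UINT64_C(1) << (7 * bytes)) - 1))
      return bytes;
  }
  return 8;  // anything larger needs the full 8-byte encoding
}
```

This matches the hex thresholds in the ladder above: 0x7F is 2^7 - 1, 0x3FFF is 2^14 - 1, and so on up to the 8-byte case.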


@@ -0,0 +1,95 @@
// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the LICENSE file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
#ifndef MKVMUXER_MKVMUXERUTIL_H_
#define MKVMUXER_MKVMUXERUTIL_H_
#include <stdint.h>
namespace mkvmuxer {
class Cluster;
class Frame;
class IMkvWriter;
const uint64_t kEbmlUnknownValue = 0x01FFFFFFFFFFFFFFULL;
const int64_t kMaxBlockTimecode = 0x07FFFLL;
// Writes out |value| in Big Endian order. Returns 0 on success.
int32_t SerializeInt(IMkvWriter* writer, int64_t value, int32_t size);
// Returns the size in bytes of the element.
int32_t GetUIntSize(uint64_t value);
int32_t GetIntSize(int64_t value);
int32_t GetCodedUIntSize(uint64_t value);
uint64_t EbmlMasterElementSize(uint64_t type, uint64_t value);
uint64_t EbmlElementSize(uint64_t type, int64_t value);
uint64_t EbmlElementSize(uint64_t type, uint64_t value);
uint64_t EbmlElementSize(uint64_t type, float value);
uint64_t EbmlElementSize(uint64_t type, const char* value);
uint64_t EbmlElementSize(uint64_t type, const uint8_t* value, uint64_t size);
uint64_t EbmlDateElementSize(uint64_t type);
// Returns the size in bytes of the element assuming that the element was
// written using |fixed_size| bytes. If |fixed_size| is set to zero, then it
// computes the necessary number of bytes based on |value|.
uint64_t EbmlElementSize(uint64_t type, uint64_t value, uint64_t fixed_size);
// Creates an EBML coded number from |value| and writes it out. The size of
// the coded number is determined by the value of |value|. |value| must not
// be in a coded form. Returns 0 on success.
int32_t WriteUInt(IMkvWriter* writer, uint64_t value);
// Creates an EBML coded number from |value| and writes it out. The size of
// the coded number is determined by the value of |size|. |value| must not
// be in a coded form. Returns 0 on success.
int32_t WriteUIntSize(IMkvWriter* writer, uint64_t value, int32_t size);
// Output an Mkv master element. Returns true if the element was written.
bool WriteEbmlMasterElement(IMkvWriter* writer, uint64_t type, uint64_t size);
// Outputs an Mkv ID, calls |IMkvWriter::ElementStartNotify|, and passes the
// ID to |SerializeInt|. Returns 0 on success.
int32_t WriteID(IMkvWriter* writer, uint64_t type);
// Output an Mkv non-master element. Returns true if the element was written.
bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, uint64_t value);
bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, int64_t value);
bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, float value);
bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, const char* value);
bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, const uint8_t* value,
uint64_t size);
bool WriteEbmlDateElement(IMkvWriter* writer, uint64_t type, int64_t value);
// Output an Mkv non-master element using fixed size. The element will be
// written out using exactly |fixed_size| bytes. If |fixed_size| is set to zero
// then it computes the necessary number of bytes based on |value|. Returns true
// if the element was written.
bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, uint64_t value,
uint64_t fixed_size);
// Output an Mkv Frame. It decides the correct element to write (Block vs
// SimpleBlock) based on the parameters of the Frame.
uint64_t WriteFrame(IMkvWriter* writer, const Frame* const frame,
Cluster* cluster);
// Output a void element. |size| must be the entire size in bytes that will be
// void. The function will calculate the size of the void header and subtract
// it from |size|.
uint64_t WriteVoidElement(IMkvWriter* writer, uint64_t size);
// Returns the version number of the muxer in |major|, |minor|, |build|,
// and |revision|.
void GetVersion(int32_t* major, int32_t* minor, int32_t* build,
int32_t* revision);
// Returns a random number to be used for UID, using |seed| to seed
// the random-number generator (see POSIX rand_r() for semantics).
uint64_t MakeUID(unsigned int* seed);
} // namespace mkvmuxer
#endif // MKVMUXER_MKVMUXERUTIL_H_
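The comments on `WriteUInt`/`WriteUIntSize` above describe producing an EBML coded number: the encoder ORs a length-descriptor marker bit into the value and serializes the result big-endian. A hedged sketch with illustrative names (not the libwebm API):

```cpp
#include <cstdint>
#include <vector>

// Sketch of the EBML vint encoding described for WriteUIntSize: set the
// marker bit at position 7*size, then emit the result as big-endian bytes.
// Assumes 1 <= size <= 8 and that |value| fits in 7*size payload bits.
std::vector<uint8_t> EncodeVint(uint64_t value, int size) {
  std::vector<uint8_t> out;
  value |= (UINT64_C(1) << (size * 7));  // length-descriptor marker bit
  for (int i = size - 1; i >= 0; --i)
    out.push_back(static_cast<uint8_t>(value >> (8 * i)));
  return out;
}
```

For example, the value 2 coded in one byte becomes 0x82: the 0x80 marker bit plus the 7-bit payload.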


@@ -6,14 +6,12 @@
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
#include "mkvwriter.hpp"
#include "mkvmuxer/mkvwriter.h"
#ifdef _MSC_VER
#include <share.h> // for _SH_DENYWR
#endif
#include <new>
namespace mkvmuxer {
MkvWriter::MkvWriter() : file_(NULL), writer_owns_file_(true) {}


@@ -6,13 +6,13 @@
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
#ifndef MKVWRITER_HPP
#define MKVWRITER_HPP
#ifndef MKVMUXER_MKVWRITER_H_
#define MKVMUXER_MKVWRITER_H_
#include <stdio.h>
#include "mkvmuxer.hpp"
#include "mkvmuxertypes.hpp"
#include "mkvmuxer/mkvmuxer.h"
#include "mkvmuxer/mkvmuxertypes.h"
namespace mkvmuxer {
@@ -46,6 +46,6 @@ class MkvWriter : public IMkvWriter {
LIBWEBM_DISALLOW_COPY_AND_ASSIGN(MkvWriter);
};
} // end namespace mkvmuxer
} // namespace mkvmuxer
#endif // MKVWRITER_HPP
#endif // MKVMUXER_MKVWRITER_H_


@@ -1,724 +0,0 @@
// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the LICENSE file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
#include "mkvmuxerutil.hpp"
#ifdef __ANDROID__
#include <fcntl.h>
#endif
#include <cassert>
#include <cmath>
#include <cstdio>
#ifdef _MSC_VER
#define _CRT_RAND_S
#endif
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <new>
#include "mkvwriter.hpp"
#include "webmids.hpp"
namespace mkvmuxer {
namespace {
// Date elements are always 8 octets in size.
const int kDateElementSize = 8;
} // namespace
int32 GetCodedUIntSize(uint64 value) {
if (value < 0x000000000000007FULL)
return 1;
else if (value < 0x0000000000003FFFULL)
return 2;
else if (value < 0x00000000001FFFFFULL)
return 3;
else if (value < 0x000000000FFFFFFFULL)
return 4;
else if (value < 0x00000007FFFFFFFFULL)
return 5;
else if (value < 0x000003FFFFFFFFFFULL)
return 6;
else if (value < 0x0001FFFFFFFFFFFFULL)
return 7;
return 8;
}
int32 GetUIntSize(uint64 value) {
if (value < 0x0000000000000100ULL)
return 1;
else if (value < 0x0000000000010000ULL)
return 2;
else if (value < 0x0000000001000000ULL)
return 3;
else if (value < 0x0000000100000000ULL)
return 4;
else if (value < 0x0000010000000000ULL)
return 5;
else if (value < 0x0001000000000000ULL)
return 6;
else if (value < 0x0100000000000000ULL)
return 7;
return 8;
}
uint64 EbmlMasterElementSize(uint64 type, uint64 value) {
// Size of EBML ID
int32 ebml_size = GetUIntSize(type);
// Datasize
ebml_size += GetCodedUIntSize(value);
return ebml_size;
}
uint64 EbmlElementSize(uint64 type, int64 value) {
return EbmlElementSize(type, static_cast<uint64>(value));
}
uint64 EbmlElementSize(uint64 type, uint64 value) {
// Size of EBML ID
int32 ebml_size = GetUIntSize(type);
// Datasize
ebml_size += GetUIntSize(value);
// Size of Datasize
ebml_size++;
return ebml_size;
}
uint64 EbmlElementSize(uint64 type, float /* value */) {
// Size of EBML ID
uint64 ebml_size = GetUIntSize(type);
// Datasize
ebml_size += sizeof(float);
// Size of Datasize
ebml_size++;
return ebml_size;
}
uint64 EbmlElementSize(uint64 type, const char* value) {
if (!value)
return 0;
// Size of EBML ID
uint64 ebml_size = GetUIntSize(type);
// Datasize
ebml_size += strlen(value);
// Size of Datasize
ebml_size++;
return ebml_size;
}
uint64 EbmlElementSize(uint64 type, const uint8* value, uint64 size) {
if (!value)
return 0;
// Size of EBML ID
uint64 ebml_size = GetUIntSize(type);
// Datasize
ebml_size += size;
// Size of Datasize
ebml_size += GetCodedUIntSize(size);
return ebml_size;
}
uint64 EbmlDateElementSize(uint64 type, int64 value) {
// Size of EBML ID
uint64 ebml_size = GetUIntSize(type);
// Datasize
ebml_size += kDateElementSize;
// Size of Datasize
ebml_size++;
return ebml_size;
}
int32 SerializeInt(IMkvWriter* writer, int64 value, int32 size) {
if (!writer || size < 1 || size > 8)
return -1;
for (int32 i = 1; i <= size; ++i) {
const int32 byte_count = size - i;
const int32 bit_count = byte_count * 8;
const int64 bb = value >> bit_count;
const uint8 b = static_cast<uint8>(bb);
const int32 status = writer->Write(&b, 1);
if (status < 0)
return status;
}
return 0;
}
int32 SerializeFloat(IMkvWriter* writer, float f) {
if (!writer)
return -1;
assert(sizeof(uint32) == sizeof(float));
// This union is merely used to avoid a reinterpret_cast from float& to
// uint32& which will result in violation of strict aliasing.
union U32 {
uint32 u32;
float f;
} value;
value.f = f;
for (int32 i = 1; i <= 4; ++i) {
const int32 byte_count = 4 - i;
const int32 bit_count = byte_count * 8;
const uint8 byte = static_cast<uint8>(value.u32 >> bit_count);
const int32 status = writer->Write(&byte, 1);
if (status < 0)
return status;
}
return 0;
}
int32 WriteUInt(IMkvWriter* writer, uint64 value) {
if (!writer)
return -1;
int32 size = GetCodedUIntSize(value);
return WriteUIntSize(writer, value, size);
}
int32 WriteUIntSize(IMkvWriter* writer, uint64 value, int32 size) {
if (!writer || size < 0 || size > 8)
return -1;
if (size > 0) {
const uint64 bit = 1LL << (size * 7);
if (value > (bit - 2))
return -1;
value |= bit;
} else {
size = 1;
int64 bit;
for (;;) {
bit = 1LL << (size * 7);
const uint64 max = bit - 2;
if (value <= max)
break;
++size;
}
if (size > 8)
return false;
value |= bit;
}
return SerializeInt(writer, value, size);
}
int32 WriteID(IMkvWriter* writer, uint64 type) {
if (!writer)
return -1;
writer->ElementStartNotify(type, writer->Position());
const int32 size = GetUIntSize(type);
return SerializeInt(writer, type, size);
}
bool WriteEbmlMasterElement(IMkvWriter* writer, uint64 type, uint64 size) {
if (!writer)
return false;
if (WriteID(writer, type))
return false;
if (WriteUInt(writer, size))
return false;
return true;
}
bool WriteEbmlElement(IMkvWriter* writer, uint64 type, uint64 value) {
if (!writer)
return false;
if (WriteID(writer, type))
return false;
const uint64 size = GetUIntSize(value);
if (WriteUInt(writer, size))
return false;
if (SerializeInt(writer, value, static_cast<int32>(size)))
return false;
return true;
}
bool WriteEbmlElement(IMkvWriter* writer, uint64 type, float value) {
if (!writer)
return false;
if (WriteID(writer, type))
return false;
if (WriteUInt(writer, 4))
return false;
if (SerializeFloat(writer, value))
return false;
return true;
}
bool WriteEbmlElement(IMkvWriter* writer, uint64 type, const char* value) {
if (!writer || !value)
return false;
if (WriteID(writer, type))
return false;
const uint64 length = strlen(value);
if (WriteUInt(writer, length))
return false;
if (writer->Write(value, static_cast<const uint32>(length)))
return false;
return true;
}
bool WriteEbmlElement(IMkvWriter* writer, uint64 type, const uint8* value,
uint64 size) {
if (!writer || !value || size < 1)
return false;
if (WriteID(writer, type))
return false;
if (WriteUInt(writer, size))
return false;
if (writer->Write(value, static_cast<uint32>(size)))
return false;
return true;
}
bool WriteEbmlDateElement(IMkvWriter* writer, uint64 type, int64 value) {
if (!writer)
return false;
if (WriteID(writer, type))
return false;
if (WriteUInt(writer, kDateElementSize))
return false;
if (SerializeInt(writer, value, kDateElementSize))
return false;
return true;
}
uint64 WriteSimpleBlock(IMkvWriter* writer, const uint8* data, uint64 length,
uint64 track_number, int64 timecode, uint64 is_key) {
if (!writer)
return false;
if (!data || length < 1)
return false;
// Here we only permit track number values to be no greater than
// 126, which is the largest value we can store using a Matroska
// integer representation of only 1 byte.
if (track_number < 1 || track_number > 126)
return false;
// Technically the timestamp for a block can be less than the
// timestamp for the cluster itself (remember that block timestamp
// is a signed, 16-bit integer). However, as a simplification we
// only permit non-negative cluster-relative timestamps for blocks.
if (timecode < 0 || timecode > kMaxBlockTimecode)
return false;
if (WriteID(writer, kMkvSimpleBlock))
return 0;
const int32 size = static_cast<int32>(length) + 4;
if (WriteUInt(writer, size))
return 0;
if (WriteUInt(writer, static_cast<uint64>(track_number)))
return 0;
if (SerializeInt(writer, timecode, 2))
return 0;
uint64 flags = 0;
if (is_key)
flags |= 0x80;
if (SerializeInt(writer, flags, 1))
return 0;
if (writer->Write(data, static_cast<uint32>(length)))
return 0;
const uint64 element_size =
GetUIntSize(kMkvSimpleBlock) + GetCodedUIntSize(size) + 4 + length;
return element_size;
}
// We must write the metadata (key)frame as a BlockGroup element,
// because we need to specify a duration for the frame. The
// BlockGroup element comprises the frame itself and its duration,
// and is laid out as follows:
//
// BlockGroup tag
// BlockGroup size
// Block tag
// Block size
// (the frame is the block payload)
// Duration tag
// Duration size
// (duration payload)
//
uint64 WriteMetadataBlock(IMkvWriter* writer, const uint8* data, uint64 length,
uint64 track_number, int64 timecode,
uint64 duration) {
// We don't backtrack when writing to the stream, so we must
// pre-compute the BlockGroup size, by summing the sizes of each
// sub-element (the block and the duration).
// We use a single byte for the track number of the block, which
// means the block header is exactly 4 bytes.
// TODO(matthewjheaney): use EbmlMasterElementSize and WriteEbmlMasterElement
const uint64 block_payload_size = 4 + length;
const int32 block_size = GetCodedUIntSize(block_payload_size);
const uint64 block_elem_size = 1 + block_size + block_payload_size;
const int32 duration_payload_size = GetUIntSize(duration);
const int32 duration_size = GetCodedUIntSize(duration_payload_size);
const uint64 duration_elem_size = 1 + duration_size + duration_payload_size;
const uint64 blockg_payload_size = block_elem_size + duration_elem_size;
const int32 blockg_size = GetCodedUIntSize(blockg_payload_size);
const uint64 blockg_elem_size = 1 + blockg_size + blockg_payload_size;
if (WriteID(writer, kMkvBlockGroup)) // 1-byte ID size
return 0;
if (WriteUInt(writer, blockg_payload_size))
return 0;
// Write Block element
if (WriteID(writer, kMkvBlock)) // 1-byte ID size
return 0;
if (WriteUInt(writer, block_payload_size))
return 0;
// Byte 1 of 4
if (WriteUInt(writer, track_number))
return 0;
// Bytes 2 & 3 of 4
if (SerializeInt(writer, timecode, 2))
return 0;
// Byte 4 of 4
const uint64 flags = 0;
if (SerializeInt(writer, flags, 1))
return 0;
// Now write the actual frame (of metadata)
if (writer->Write(data, static_cast<uint32>(length)))
return 0;
// Write Duration element
if (WriteID(writer, kMkvBlockDuration)) // 1-byte ID size
return 0;
if (WriteUInt(writer, duration_payload_size))
return 0;
if (SerializeInt(writer, duration, duration_payload_size))
return 0;
// Note that we don't write a reference time as part of the block
// group; no reference time(s) indicates that this block is a
// keyframe. (Unlike the case for a SimpleBlock element, the header
// bits of the Block sub-element of a BlockGroup element do not
// indicate keyframe status. The keyframe status is inferred from
// the absence of reference time sub-elements.)
return blockg_elem_size;
}
// Writes a WebM BlockGroup with BlockAdditional data. The structure is as
// follows:
// Indentation shows sub-levels
// BlockGroup
// Block
// Data
// BlockAdditions
// BlockMore
// BlockAddID
// 1 (Denotes Alpha)
// BlockAdditional
// Data
uint64 WriteBlockWithAdditional(IMkvWriter* writer, const uint8* data,
uint64 length, const uint8* additional,
uint64 additional_length, uint64 add_id,
uint64 track_number, int64 timecode,
uint64 is_key) {
if (!data || !additional || length < 1 || additional_length < 1)
return 0;
const uint64 block_payload_size = 4 + length;
const uint64 block_elem_size =
EbmlMasterElementSize(kMkvBlock, block_payload_size) + block_payload_size;
const uint64 block_additional_elem_size =
EbmlElementSize(kMkvBlockAdditional, additional, additional_length);
const uint64 block_addid_elem_size = EbmlElementSize(kMkvBlockAddID, add_id);
const uint64 block_more_payload_size =
block_addid_elem_size + block_additional_elem_size;
const uint64 block_more_elem_size =
EbmlMasterElementSize(kMkvBlockMore, block_more_payload_size) +
block_more_payload_size;
const uint64 block_additions_payload_size = block_more_elem_size;
const uint64 block_additions_elem_size =
EbmlMasterElementSize(kMkvBlockAdditions, block_additions_payload_size) +
block_additions_payload_size;
const uint64 block_group_payload_size =
block_elem_size + block_additions_elem_size;
const uint64 block_group_elem_size =
EbmlMasterElementSize(kMkvBlockGroup, block_group_payload_size) +
block_group_payload_size;
if (!WriteEbmlMasterElement(writer, kMkvBlockGroup, block_group_payload_size))
return 0;
if (!WriteEbmlMasterElement(writer, kMkvBlock, block_payload_size))
return 0;
if (WriteUInt(writer, track_number))
return 0;
if (SerializeInt(writer, timecode, 2))
return 0;
uint64 flags = 0;
if (is_key)
flags |= 0x80;
if (SerializeInt(writer, flags, 1))
return 0;
if (writer->Write(data, static_cast<uint32>(length)))
return 0;
if (!WriteEbmlMasterElement(writer, kMkvBlockAdditions,
block_additions_payload_size))
return 0;
if (!WriteEbmlMasterElement(writer, kMkvBlockMore, block_more_payload_size))
return 0;
if (!WriteEbmlElement(writer, kMkvBlockAddID, add_id))
return 0;
if (!WriteEbmlElement(writer, kMkvBlockAdditional, additional,
additional_length))
return 0;
return block_group_elem_size;
}
// Writes a WebM BlockGroup with DiscardPadding. The structure is as follows:
// Indentation shows sub-levels
// BlockGroup
// Block
// Data
// DiscardPadding
uint64 WriteBlockWithDiscardPadding(IMkvWriter* writer, const uint8* data,
uint64 length, int64 discard_padding,
uint64 track_number, int64 timecode,
uint64 is_key) {
if (!data || length < 1 || discard_padding <= 0)
return 0;
const uint64 block_payload_size = 4 + length;
const uint64 block_elem_size =
EbmlMasterElementSize(kMkvBlock, block_payload_size) + block_payload_size;
const uint64 discard_padding_elem_size =
EbmlElementSize(kMkvDiscardPadding, discard_padding);
const uint64 block_group_payload_size =
block_elem_size + discard_padding_elem_size;
const uint64 block_group_elem_size =
EbmlMasterElementSize(kMkvBlockGroup, block_group_payload_size) +
block_group_payload_size;
if (!WriteEbmlMasterElement(writer, kMkvBlockGroup, block_group_payload_size))
return 0;
if (!WriteEbmlMasterElement(writer, kMkvBlock, block_payload_size))
return 0;
if (WriteUInt(writer, track_number))
return 0;
if (SerializeInt(writer, timecode, 2))
return 0;
uint64 flags = 0;
if (is_key)
flags |= 0x80;
if (SerializeInt(writer, flags, 1))
return 0;
if (writer->Write(data, static_cast<uint32>(length)))
return 0;
if (WriteID(writer, kMkvDiscardPadding))
return 0;
const uint64 size = GetUIntSize(discard_padding);
if (WriteUInt(writer, size))
return false;
if (SerializeInt(writer, discard_padding, static_cast<int32>(size)))
return 0;
return block_group_elem_size;
}
uint64 WriteVoidElement(IMkvWriter* writer, uint64 size) {
if (!writer)
return 0;
// Subtract one byte for the Void ID and the bytes needed to code the size.
uint64 void_entry_size = size - 1 - GetCodedUIntSize(size - 1);
uint64 void_size =
EbmlMasterElementSize(kMkvVoid, void_entry_size) + void_entry_size;
if (void_size != size)
return 0;
const int64 payload_position = writer->Position();
if (payload_position < 0)
return 0;
if (WriteID(writer, kMkvVoid))
return 0;
if (WriteUInt(writer, void_entry_size))
return 0;
const uint8 value = 0;
for (int32 i = 0; i < static_cast<int32>(void_entry_size); ++i) {
if (writer->Write(&value, 1))
return 0;
}
const int64 stop_position = writer->Position();
if (stop_position < 0 ||
stop_position - payload_position != static_cast<int64>(void_size))
return 0;
return void_size;
}
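The sizing arithmetic above is easy to get wrong at coded-size boundaries, where the coded size of `size - 1` and of the resulting payload differ; that is what the `void_size != size` check guards. A minimal sketch of the same arithmetic, with `CodedUIntSize` as an illustrative stand-in for the library's `GetCodedUIntSize`:

```cpp
#include <cstdint>

// Stand-in for GetCodedUIntSize: bytes needed to store |value| as an EBML
// coded (variable-length) unsigned integer. Each byte contributes 7 value
// bits, and the all-ones pattern is reserved, hence the "- 1".
int CodedUIntSize(std::uint64_t value) {
  int size = 1;
  while (size < 8 && value >= (1ULL << (7 * size)) - 1) ++size;
  return size;
}

// Payload bytes inside a Void element of |size| total bytes: subtract one
// byte for the Void ID and the bytes used by the coded length field, as
// WriteVoidElement does above.
std::uint64_t VoidPayloadSize(std::uint64_t size) {
  return size - 1 - CodedUIntSize(size - 1);
}
```

For example, with `size = 100` the length field needs one byte (the coded size of 99), so 98 bytes of zero padding are written: 1 (ID) + 1 (length) + 98 = 100.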
void GetVersion(int32* major, int32* minor, int32* build, int32* revision) {
*major = 0;
*minor = 2;
*build = 1;
*revision = 0;
}
} // namespace mkvmuxer
mkvmuxer::uint64 mkvmuxer::MakeUID(unsigned int* seed) {
uint64 uid = 0;
#ifdef __MINGW32__
srand(*seed);
#endif
for (int i = 0; i < 7; ++i) { // avoid problems with 8-byte values
uid <<= 8;
// TODO(fgalligan): Move random number generation to platform specific code.
#ifdef _MSC_VER
(void)seed;
unsigned int random_value;
const errno_t e = rand_s(&random_value);
(void)e;
const int32 nn = random_value;
#elif defined(__ANDROID__)
int32 temp_num = 1;
int fd = open("/dev/urandom", O_RDONLY);
if (fd != -1) {
read(fd, &temp_num, sizeof(int32));
close(fd);
}
const int32 nn = temp_num;
#elif defined __MINGW32__
const int32 nn = rand();
#else
const int32 nn = rand_r(seed);
#endif
const int32 n = 0xFF & (nn >> 4); // throw away low-order bits
uid |= n;
}
return uid;
}
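A platform-neutral sketch of the same seven-byte scheme, using `std::mt19937` in place of the per-platform generators above (`rand_s`, `/dev/urandom`, `rand_r`); the names here are illustrative, not part of libwebm:

```cpp
#include <cstdint>
#include <random>

// Pack seven random bytes into a uint64, as MakeUID does above. Using only
// seven bytes leaves the top byte zero, avoiding problems with 8-byte values
// (e.g. when the UID is later stored in a signed field).
std::uint64_t MakeUidSketch(std::uint32_t seed) {
  std::mt19937 rng(seed);
  std::uint64_t uid = 0;
  for (int i = 0; i < 7; ++i) {
    uid <<= 8;
    uid |= rng() & 0xFF;  // one random byte per iteration
  }
  return uid;
}
```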

View File

@ -1,137 +0,0 @@
// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the LICENSE file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
#ifndef MKVMUXERUTIL_HPP
#define MKVMUXERUTIL_HPP
#include "mkvmuxertypes.hpp"
namespace mkvmuxer {
class IMkvWriter;
const uint64 kEbmlUnknownValue = 0x01FFFFFFFFFFFFFFULL;
const int64 kMaxBlockTimecode = 0x07FFFLL;
// Writes out |value| in Big Endian order. Returns 0 on success.
int32 SerializeInt(IMkvWriter* writer, int64 value, int32 size);
// Returns the size in bytes of the element.
int32 GetUIntSize(uint64 value);
int32 GetCodedUIntSize(uint64 value);
uint64 EbmlMasterElementSize(uint64 type, uint64 value);
uint64 EbmlElementSize(uint64 type, int64 value);
uint64 EbmlElementSize(uint64 type, uint64 value);
uint64 EbmlElementSize(uint64 type, float value);
uint64 EbmlElementSize(uint64 type, const char* value);
uint64 EbmlElementSize(uint64 type, const uint8* value, uint64 size);
uint64 EbmlDateElementSize(uint64 type, int64 value);
// Creates an EBML coded number from |value| and writes it out. The size of
// the coded number is determined by the value of |value|. |value| must not
// be in a coded form. Returns 0 on success.
int32 WriteUInt(IMkvWriter* writer, uint64 value);
// Creates an EBML coded number from |value| and writes it out. The size of
// the coded number is determined by the value of |size|. |value| must not
// be in a coded form. Returns 0 on success.
int32 WriteUIntSize(IMkvWriter* writer, uint64 value, int32 size);
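The coded form these declarations describe is an EBML variable-length integer: a length marker bit is folded into the leading byte, followed by the value in big-endian order. A hedged sketch, following the Matroska EBML specification rather than libwebm's exact internals:

```cpp
#include <cstdint>
#include <vector>

// Encode |value| as an EBML variable-length integer. An n-byte encoding sets
// marker bit 1 << (7 * n) over the big-endian value, so the leading byte
// tells a parser how many bytes follow.
std::vector<std::uint8_t> EncodeVint(std::uint64_t value) {
  int n = 1;
  while (n < 8 && value >= (1ULL << (7 * n)) - 1) ++n;
  const std::uint64_t marked = value | (1ULL << (7 * n));
  std::vector<std::uint8_t> out;
  for (int i = n - 1; i >= 0; --i)
    out.push_back(static_cast<std::uint8_t>(marked >> (8 * i)));
  return out;
}
```

For instance, 2 encodes as the single byte 0x82, while 500 needs two bytes, 0x41 0xF4.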
// Output an Mkv master element. Returns true if the element was written.
bool WriteEbmlMasterElement(IMkvWriter* writer, uint64 value, uint64 size);
// Outputs an Mkv ID, calls |IMkvWriter::ElementStartNotify|, and passes the
// ID to |SerializeInt|. Returns 0 on success.
int32 WriteID(IMkvWriter* writer, uint64 type);
// Output an Mkv non-master element. Returns true if the element was written.
bool WriteEbmlElement(IMkvWriter* writer, uint64 type, uint64 value);
bool WriteEbmlElement(IMkvWriter* writer, uint64 type, float value);
bool WriteEbmlElement(IMkvWriter* writer, uint64 type, const char* value);
bool WriteEbmlElement(IMkvWriter* writer, uint64 type, const uint8* value,
uint64 size);
bool WriteEbmlDateElement(IMkvWriter* writer, uint64 type, int64 value);
// Output an Mkv Simple Block.
// Inputs:
// data: Pointer to the data.
// length: Length of the data.
// track_number: Track to add the data to. Value returned by Add track
// functions. Only values in the range [1, 126] are
// permitted.
// timecode: Relative timecode of the Block. Only values in the
// range [0, 2^15) are permitted.
// is_key: Non-zero value specifies that frame is a key frame.
uint64 WriteSimpleBlock(IMkvWriter* writer, const uint8* data, uint64 length,
uint64 track_number, int64 timecode, uint64 is_key);
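The constraints in this comment fall out of the four-byte SimpleBlock header: the track number must fit a one-byte EBML coded integer (hence [1, 126]), the relative timecode is a big-endian int16 (hence [0, 2^15)), and the flags byte carries 0x80 for key frames. A sketch, with an illustrative helper name not taken from libwebm:

```cpp
#include <array>
#include <cstdint>

// Build the 4-byte SimpleBlock header described above: one-byte EBML coded
// track number (0x80 | track), big-endian 16-bit timecode relative to the
// cluster, then a flags byte with 0x80 set for key frames.
std::array<std::uint8_t, 4> SimpleBlockHeader(std::uint8_t track_number,
                                              std::int16_t timecode,
                                              bool is_key) {
  return {static_cast<std::uint8_t>(0x80 | track_number),
          static_cast<std::uint8_t>((timecode >> 8) & 0xFF),
          static_cast<std::uint8_t>(timecode & 0xFF),
          static_cast<std::uint8_t>(is_key ? 0x80 : 0x00)};
}
```

The frame payload follows these four bytes directly, as in the `writer->Write(data, ...)` calls above.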
// Output a metadata keyframe, using a Block Group element.
// Inputs:
// data: Pointer to the (meta)data.
// length: Length of the (meta)data.
// track_number: Track to add the data to. Value returned by Add track
// functions. Only values in the range [1, 126] are
// permitted.
// timecode: Timecode of frame, relative to cluster timecode. Only
// values in the range [0, 2^15) are permitted.
// duration_timecode: Duration of frame, in timecode units.
uint64 WriteMetadataBlock(IMkvWriter* writer, const uint8* data, uint64 length,
uint64 track_number, int64 timecode,
uint64 duration_timecode);
// Output an Mkv Block with BlockAdditional data.
// Inputs:
// data: Pointer to the data.
// length: Length of the data.
// additional: Pointer to the additional data
// additional_length: Length of the additional data.
// add_id: Value of BlockAddID element.
// track_number: Track to add the data to. Value returned by Add track
// functions. Only values in the range [1, 126] are
// permitted.
// timecode: Relative timecode of the Block. Only values in the
// range [0, 2^15) are permitted.
// is_key: Non-zero value specifies that frame is a key frame.
uint64 WriteBlockWithAdditional(IMkvWriter* writer, const uint8* data,
uint64 length, const uint8* additional,
uint64 additional_length, uint64 add_id,
uint64 track_number, int64 timecode,
uint64 is_key);
// Output an Mkv Block with a DiscardPadding element.
// Inputs:
// data: Pointer to the data.
// length: Length of the data.
// discard_padding: DiscardPadding value.
// track_number: Track to add the data to. Value returned by Add track
// functions. Only values in the range [1, 126] are
// permitted.
// timecode: Relative timecode of the Block. Only values in the
// range [0, 2^15) are permitted.
// is_key: Non-zero value specifies that frame is a key frame.
uint64 WriteBlockWithDiscardPadding(IMkvWriter* writer, const uint8* data,
uint64 length, int64 discard_padding,
uint64 track_number, int64 timecode,
uint64 is_key);
// Output a void element. |size| must be the entire size in bytes that will be
// void. The function will calculate the size of the void header and subtract
// it from |size|.
uint64 WriteVoidElement(IMkvWriter* writer, uint64 size);
// Returns the version number of the muxer in |major|, |minor|, |build|,
// and |revision|.
void GetVersion(int32* major, int32* minor, int32* build, int32* revision);
// Returns a random number to be used for UID, using |seed| to seed
// the random-number generator (see POSIX rand_r() for semantics).
uint64 MakeUID(unsigned int* seed);
} // end namespace mkvmuxer
#endif // MKVMUXERUTIL_HPP

View File

@ -5,16 +5,14 @@
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
#ifndef MKVPARSER_MKVPARSER_H_
#define MKVPARSER_MKVPARSER_H_
#ifndef MKVPARSER_HPP
#define MKVPARSER_HPP
#include <cstdlib>
#include <cstdio>
#include <cstddef>
namespace mkvparser {
const int E_PARSE_FAILED = -1;
const int E_FILE_FORMAT_INVALID = -2;
const int E_BUFFER_NOT_FULL = -3;
@ -27,12 +25,17 @@ class IMkvReader {
virtual ~IMkvReader();
};
template <typename Type>
Type* SafeArrayAlloc(unsigned long long num_elements,
unsigned long long element_size);
long long GetUIntLength(IMkvReader*, long long, long&);
long long ReadUInt(IMkvReader*, long long, long&);
long long ReadID(IMkvReader* pReader, long long pos, long& len);
long long UnserializeUInt(IMkvReader*, long long pos, long long size);
long UnserializeFloat(IMkvReader*, long long pos, long long size, double&);
long UnserializeInt(IMkvReader*, long long pos, long len, long long& result);
long UnserializeInt(IMkvReader*, long long pos, long long size,
long long& result);
long UnserializeString(IMkvReader*, long long pos, long long size, char*& str);
@ -123,7 +126,7 @@ class BlockEntry {
public:
virtual ~BlockEntry();
bool EOS() const;
bool EOS() const { return (GetKind() == kBlockEOS); }
const Cluster* GetCluster() const;
long GetIndex() const;
virtual const Block* GetBlock() const = 0;
@ -386,6 +389,90 @@ class Track {
ContentEncoding** content_encoding_entries_end_;
};
struct PrimaryChromaticity {
PrimaryChromaticity() : x(0), y(0) {}
~PrimaryChromaticity() {}
static bool Parse(IMkvReader* reader, long long read_pos,
long long value_size, bool is_x,
PrimaryChromaticity** chromaticity);
float x;
float y;
};
struct MasteringMetadata {
static const float kValueNotPresent;
MasteringMetadata()
: r(NULL),
g(NULL),
b(NULL),
white_point(NULL),
luminance_max(kValueNotPresent),
luminance_min(kValueNotPresent) {}
~MasteringMetadata() {
delete r;
delete g;
delete b;
delete white_point;
}
static bool Parse(IMkvReader* reader, long long element_start,
long long element_size,
MasteringMetadata** mastering_metadata);
PrimaryChromaticity* r;
PrimaryChromaticity* g;
PrimaryChromaticity* b;
PrimaryChromaticity* white_point;
float luminance_max;
float luminance_min;
};
struct Colour {
static const long long kValueNotPresent;
// Unless otherwise noted all values assigned upon construction are the
// equivalent of unspecified/default.
Colour()
: matrix_coefficients(kValueNotPresent),
bits_per_channel(kValueNotPresent),
chroma_subsampling_horz(kValueNotPresent),
chroma_subsampling_vert(kValueNotPresent),
cb_subsampling_horz(kValueNotPresent),
cb_subsampling_vert(kValueNotPresent),
chroma_siting_horz(kValueNotPresent),
chroma_siting_vert(kValueNotPresent),
range(kValueNotPresent),
transfer_characteristics(kValueNotPresent),
primaries(kValueNotPresent),
max_cll(kValueNotPresent),
max_fall(kValueNotPresent),
mastering_metadata(NULL) {}
~Colour() {
delete mastering_metadata;
mastering_metadata = NULL;
}
static bool Parse(IMkvReader* reader, long long element_start,
long long element_size, Colour** colour);
long long matrix_coefficients;
long long bits_per_channel;
long long chroma_subsampling_horz;
long long chroma_subsampling_vert;
long long cb_subsampling_horz;
long long cb_subsampling_vert;
long long chroma_siting_horz;
long long chroma_siting_vert;
long long range;
long long transfer_characteristics;
long long primaries;
long long max_cll;
long long max_fall;
MasteringMetadata* mastering_metadata;
};
class VideoTrack : public Track {
VideoTrack(const VideoTrack&);
VideoTrack& operator=(const VideoTrack&);
@ -393,20 +480,34 @@ class VideoTrack : public Track {
VideoTrack(Segment*, long long element_start, long long element_size);
public:
virtual ~VideoTrack();
static long Parse(Segment*, const Info&, long long element_start,
long long element_size, VideoTrack*&);
long long GetWidth() const;
long long GetHeight() const;
long long GetDisplayWidth() const;
long long GetDisplayHeight() const;
long long GetDisplayUnit() const;
long long GetStereoMode() const;
double GetFrameRate() const;
bool VetEntry(const BlockEntry*) const;
long Seek(long long time_ns, const BlockEntry*&) const;
Colour* GetColour() const;
private:
long long m_width;
long long m_height;
long long m_display_width;
long long m_display_height;
long long m_display_unit;
long long m_stereo_mode;
double m_rate;
Colour* m_colour;
};
class AudioTrack : public Track {
@ -582,6 +683,85 @@ class Chapters {
int m_editions_count;
};
class Tags {
Tags(const Tags&);
Tags& operator=(const Tags&);
public:
Segment* const m_pSegment;
const long long m_start;
const long long m_size;
const long long m_element_start;
const long long m_element_size;
Tags(Segment*, long long payload_start, long long payload_size,
long long element_start, long long element_size);
~Tags();
long Parse();
class Tag;
class SimpleTag;
class SimpleTag {
friend class Tag;
SimpleTag();
SimpleTag(const SimpleTag&);
~SimpleTag();
SimpleTag& operator=(const SimpleTag&);
public:
const char* GetTagName() const;
const char* GetTagString() const;
private:
void Init();
void ShallowCopy(SimpleTag&) const;
void Clear();
long Parse(IMkvReader*, long long pos, long long size);
char* m_tag_name;
char* m_tag_string;
};
class Tag {
friend class Tags;
Tag();
Tag(const Tag&);
~Tag();
Tag& operator=(const Tag&);
public:
int GetSimpleTagCount() const;
const SimpleTag* GetSimpleTag(int index) const;
private:
void Init();
void ShallowCopy(Tag&) const;
void Clear();
long Parse(IMkvReader*, long long pos, long long size);
long ParseSimpleTag(IMkvReader*, long long pos, long long size);
bool ExpandSimpleTagsArray();
SimpleTag* m_simple_tags;
int m_simple_tags_size;
int m_simple_tags_count;
};
int GetTagCount() const;
const Tag* GetTag(int index) const;
private:
long ParseTag(long long pos, long long size);
bool ExpandTagsArray();
Tag* m_tags;
int m_tags_size;
int m_tags_count;
};
class SegmentInfo {
SegmentInfo(const SegmentInfo&);
SegmentInfo& operator=(const SegmentInfo&);
@ -684,7 +864,7 @@ class CuePoint {
long long m_element_start;
long long m_element_size;
void Load(IMkvReader*);
bool Load(IMkvReader*);
long long GetTimeCode() const; // absolute but unscaled
long long GetTime(const Segment*) const; // absolute and scaled (ns units)
@ -697,7 +877,7 @@ class CuePoint {
// reference = clusters containing req'd referenced blocks
// reftime = timecode of the referenced block
void Parse(IMkvReader*, long long, long long);
bool Parse(IMkvReader*, long long, long long);
};
const TrackPosition* Find(const Track*) const;
@ -730,14 +910,6 @@ class Cues {
long long time_ns, const Track*, const CuePoint*&,
const CuePoint::TrackPosition*&) const;
#if 0
bool FindNext( //upper_bound of time_ns
long long time_ns,
const Track*,
const CuePoint*&,
const CuePoint::TrackPosition*&) const;
#endif
const CuePoint* GetFirst() const;
const CuePoint* GetLast() const;
const CuePoint* GetNext(const CuePoint*) const;
@ -751,8 +923,8 @@ class Cues {
bool DoneParsing() const;
private:
void Init() const;
void PreloadCuePoint(long&, long long) const;
bool Init() const;
bool PreloadCuePoint(long&, long long) const;
mutable CuePoint** m_cue_points;
mutable long m_count;
@ -877,18 +1049,12 @@ class Segment {
long ParseNext(const Cluster* pCurr, const Cluster*& pNext, long long& pos,
long& size);
#if 0
//This pair parses one cluster, but only changes the state of the
//segment object when the cluster is actually added to the index.
long ParseCluster(long long& cluster_pos, long long& new_pos) const;
bool AddCluster(long long cluster_pos, long long new_pos);
#endif
const SeekHead* GetSeekHead() const;
const Tracks* GetTracks() const;
const SegmentInfo* GetInfo() const;
const Cues* GetCues() const;
const Chapters* GetChapters() const;
const Tags* GetTags() const;
long long GetDuration() const;
@ -914,6 +1080,7 @@ class Segment {
Tracks* m_pTracks;
Cues* m_pCues;
Chapters* m_pChapters;
Tags* m_pTags;
Cluster** m_clusters;
long m_clusterCount; // number of entries for which m_index >= 0
long m_clusterPreloadCount; // number of entries for which m_index < 0
@ -923,8 +1090,8 @@ class Segment {
long DoLoadClusterUnknownSize(long long&, long&);
long DoParseNext(const Cluster*&, long long&, long&);
void AppendCluster(Cluster*);
void PreloadCluster(Cluster*, ptrdiff_t);
bool AppendCluster(Cluster*);
bool PreloadCluster(Cluster*, ptrdiff_t);
// void ParseSeekHead(long long pos, long long size);
// void ParseSeekEntry(long long pos, long long size);
@ -933,7 +1100,7 @@ class Segment {
const BlockEntry* GetBlock(const CuePoint&, const CuePoint::TrackPosition&);
};
} // end namespace mkvparser
} // namespace mkvparser
inline long mkvparser::Segment::LoadCluster() {
long long pos;
@ -942,4 +1109,4 @@ inline long mkvparser::Segment::LoadCluster() {
return LoadCluster(pos, size);
}
#endif // MKVPARSER_HPP
#endif // MKVPARSER_MKVPARSER_H_

View File

@ -5,8 +5,7 @@
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
#include "mkvreader.hpp"
#include "mkvparser/mkvreader.h"
#include <cassert>
@ -129,4 +128,4 @@ int MkvReader::Read(long long offset, long len, unsigned char* buffer) {
return 0; // success
}
} // end namespace mkvparser
} // namespace mkvparser

View File

@ -5,13 +5,13 @@
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
#ifndef MKVPARSER_MKVREADER_H_
#define MKVPARSER_MKVREADER_H_
#ifndef MKVREADER_HPP
#define MKVREADER_HPP
#include "mkvparser.hpp"
#include <cstdio>
#include "mkvparser/mkvparser.h"
namespace mkvparser {
class MkvReader : public IMkvReader {
@ -40,6 +40,6 @@ class MkvReader : public IMkvReader {
bool reader_owns_file_;
};
} // end namespace mkvparser
} // namespace mkvparser
#endif // MKVREADER_HPP
#endif // MKVPARSER_MKVREADER_H_

View File

@ -80,10 +80,13 @@
The available initialization methods are:
\if encoder - #vpx_codec_enc_init (calls vpx_codec_enc_init_ver()) \endif
\if multi-encoder - #vpx_codec_enc_init_multi (calls vpx_codec_enc_init_multi_ver()) \endif
\if decoder - #vpx_codec_dec_init (calls vpx_codec_dec_init_ver()) \endif
\if encoder
\li #vpx_codec_enc_init (calls vpx_codec_enc_init_ver())
\li #vpx_codec_enc_init_multi (calls vpx_codec_enc_init_multi_ver())
\endif
\if decoder
\li #vpx_codec_dec_init (calls vpx_codec_dec_init_ver())
\endif
\section usage_errors Error Handling

View File

@ -71,9 +71,34 @@ static int alloc_mi(VP9_COMMON *cm, int mi_size) {
return 0;
}
#if CONFIG_PALETTE
void vp9_free_palette_map(VP9_COMMON *cm) {
int i, j;
MODE_INFO *mi;
for (i = 0; i < cm->mi_rows; i++)
for (j = 0; j < cm->mi_cols; j++) {
mi = cm->mip + cm->mi_stride + 1 + (i * cm->mi_stride + j);
if (mi->mbmi.palette_color_map != NULL) {
vpx_free(mi->mbmi.palette_color_map);
mi->mbmi.palette_color_map = NULL;
}
if (mi->mbmi.palette_uv_color_map != NULL) {
vpx_free(mi->mbmi.palette_uv_color_map);
mi->mbmi.palette_uv_color_map = NULL;
}
}
}
#endif // CONFIG_PALETTE
static void free_mi(VP9_COMMON *cm) {
int i;
#if CONFIG_PALETTE
if (cm && cm->mip)
vp9_free_palette_map(cm);
#endif // CONFIG_PALETTE
for (i = 0; i < 2; ++i) {
vpx_free(cm->mip_array[i]);
cm->mip_array[i] = NULL;
@ -97,6 +122,9 @@ void vp9_free_ref_frame_buffers(VP9_COMMON *cm) {
}
vp9_free_frame_buffer(&cm->post_proc_buffer);
#if CONFIG_LOOP_POSTFILTER
vp9_free_frame_buffer(&cm->tmp_loop_buf);
#endif
}
void vp9_free_context_buffers(VP9_COMMON *cm) {

View File

@ -37,4 +37,8 @@ void vp9_swap_mi_and_prev_mi(struct VP9Common *cm);
} // extern "C"
#endif
#if CONFIG_PALETTE
void vp9_free_palette_map(struct VP9Common *cm);
#endif // CONFIG_PALETTE
#endif // VP9_COMMON_VP9_ALLOCCOMMON_H_

View File

@ -20,6 +20,7 @@
#include "vp9/common/vp9_common_data.h"
#include "vp9/common/vp9_filter.h"
#include "vp9/common/vp9_mv.h"
#include "vp9/common/vp9_quant_common.h"
#include "vp9/common/vp9_scale.h"
#ifdef __cplusplus
@ -30,6 +31,34 @@ extern "C" {
#define SKIP_CONTEXTS 3
#define INTER_MODE_CONTEXTS 7
#if CONFIG_SR_MODE
#define SR_CONTEXTS 3 // number of enabled tx_sizes for SR mode
#define USE_POST_F 0 // 1: use post filters
#define SR_USE_MULTI_F 0 // 1: choose from multiple post filters
// SR_USFILTER_NUM_D: Number of 1D filters to choose in the post filter family
// SR_USFILTER_NUM: Number of combined 2D filters to choose
// If this number changes, also update "idx_to_v", "idx_to_h", "hv_to_idx",
// and the probability model ("vp9_sr_usfilter_tree", "default_sr_usfilter_probs")
#define SR_USFILTER_NUM_D 4
#define SR_USFILTER_NUM (SR_USFILTER_NUM_D * SR_USFILTER_NUM_D)
#define SR_USFILTER_CONTEXTS 1
// SR_USFILTER_CONTEXTS: Depends on the post filters of upper and left blocks
#endif // CONFIG_SR_MODE
#if CONFIG_COPY_MODE
#define COPY_MODE_CONTEXTS 5
#endif // CONFIG_COPY_MODE
#if CONFIG_PALETTE
#define PALETTE_BUF_SIZE 16
#define PALETTE_MAX_SIZE 8
#define PALETTE_DELTA_BIT 0
#define PALETTE_COLOR_CONTEXTS 16
#endif // CONFIG_PALETTE
/* Segment Feature Masks */
#define MAX_MV_REF_CANDIDATES 2
@ -37,6 +66,31 @@ extern "C" {
#define COMP_INTER_CONTEXTS 5
#define REF_CONTEXTS 5
#if CONFIG_MULTI_REF
#define SINGLE_REFS 6
#define COMP_REFS 5
#else // CONFIG_MULTI_REF
#define SINGLE_REFS 3
#define COMP_REFS 2
#endif // CONFIG_MULTI_REF
#if CONFIG_NEW_QUANT
#define QUANT_PROFILES 3
#define Q_CTX_BASED_PROFILES 1
#if QUANT_PROFILES > 1
#define Q_THRESHOLD_MIN 0
#define Q_THRESHOLD_MAX 1000
static INLINE int switchable_dq_profile_used(int q_ctx, BLOCK_SIZE bsize) {
return ((bsize >= BLOCK_32X32) * q_ctx);
}
#endif // QUANT_PROFILES > 1
#endif // CONFIG_NEW_QUANT
typedef enum {
PLANE_TYPE_Y = 0,
PLANE_TYPE_UV = 1,
@ -69,23 +123,95 @@ typedef enum {
D207_PRED, // Directional 207 deg = 180 + 27
D63_PRED, // Directional 63 deg = round(arctan(2/1) * 180/pi)
TM_PRED, // True-motion
#if CONFIG_INTRABC
NEWDV, // New displacement vector within the same frame buffer
#endif // CONFIG_INTRABC
NEARESTMV,
NEARMV,
ZEROMV,
NEWMV,
#if CONFIG_NEW_INTER
NEW2MV,
NEAREST_NEARESTMV,
NEAREST_NEARMV,
NEAR_NEARESTMV,
NEAREST_NEWMV,
NEW_NEARESTMV,
NEAR_NEWMV,
NEW_NEARMV,
ZERO_ZEROMV,
NEW_NEWMV,
#endif // CONFIG_NEW_INTER
MB_MODE_COUNT
} PREDICTION_MODE;
#if CONFIG_COPY_MODE
typedef enum {
NOREF,
REF0,
REF1,
REF2,
COPY_MODE_COUNT
} COPY_MODE;
#endif // CONFIG_COPY_MODE
static INLINE int is_inter_mode(PREDICTION_MODE mode) {
#if CONFIG_NEW_INTER
return mode >= NEARESTMV && mode <= NEW2MV;
#else
return mode >= NEARESTMV && mode <= NEWMV;
#endif // CONFIG_NEW_INTER
}
#define INTRA_MODES (TM_PRED + 1)
#if CONFIG_NEW_INTER
static INLINE int is_inter_compound_mode(PREDICTION_MODE mode) {
return mode >= NEAREST_NEARESTMV && mode <= NEW_NEWMV;
}
#endif // CONFIG_NEW_INTER
static INLINE int have_newmv_in_inter_mode(PREDICTION_MODE mode) {
#if CONFIG_NEW_INTER
return (mode == NEWMV ||
mode == NEW2MV ||
mode == NEW_NEWMV ||
mode == NEAREST_NEWMV ||
mode == NEW_NEARESTMV ||
mode == NEAR_NEWMV ||
mode == NEW_NEARMV);
#else
return (mode == NEWMV);
#endif // CONFIG_NEW_INTER
}
#if CONFIG_INTRABC
static INLINE int is_intrabc_mode(PREDICTION_MODE mode) {
return mode == NEWDV;
}
#endif // CONFIG_INTRABC
#define INTRA_MODES (TM_PRED + 1)
#if CONFIG_NEW_INTER
#define INTER_MODES (1 + NEW2MV - NEARESTMV)
#else
#define INTER_MODES (1 + NEWMV - NEARESTMV)
#endif // CONFIG_NEW_INTER
#define INTER_OFFSET(mode) ((mode) - NEARESTMV)
#if CONFIG_NEW_INTER
#define INTER_COMPOUND_MODES (1 + NEW_NEWMV - NEAREST_NEARESTMV)
#define INTER_COMPOUND_OFFSET(mode) ((mode) - NEAREST_NEARESTMV)
#endif // CONFIG_NEW_INTER
#if CONFIG_TX64X64
#define MAXTXLEN 64
#else
#define MAXTXLEN 32
#endif
/* For keyframes, intra block modes are predicted by the (already decoded)
modes for the Y blocks to the left and above us; for interframes, there
is a single probability table. */
@ -93,6 +219,9 @@ static INLINE int is_inter_mode(PREDICTION_MODE mode) {
typedef struct {
PREDICTION_MODE as_mode;
int_mv as_mv[2]; // first, second inter predictor motion vectors
#if CONFIG_NEW_INTER
int_mv ref_mv[2];
#endif // CONFIG_NEW_INTER
} b_mode_info;
// Note that the rate-distortion optimization loop, bit-stream writer, and
@ -102,9 +231,17 @@ typedef enum {
NONE = -1,
INTRA_FRAME = 0,
LAST_FRAME = 1,
#if CONFIG_MULTI_REF
LAST2_FRAME = 2,
LAST3_FRAME = 3,
LAST4_FRAME = 4,
GOLDEN_FRAME = 5,
ALTREF_FRAME = 6,
#else // CONFIG_MULTI_REF
GOLDEN_FRAME = 2,
ALTREF_FRAME = 3,
MAX_REF_FRAMES = 4
#endif // CONFIG_MULTI_REF
MAX_REF_FRAMES
} MV_REFERENCE_FRAME;
// This structure now relates to 8x8 block regions.
@ -112,6 +249,13 @@ typedef struct {
// Common for both INTER and INTRA blocks
BLOCK_SIZE sb_type;
PREDICTION_MODE mode;
#if CONFIG_FILTERINTRA
int filterbit, uv_filterbit;
#endif
#if CONFIG_SR_MODE
int sr;
int us_filter_idx;
#endif // CONFIG_SR_MODE
TX_SIZE tx_size;
int8_t skip;
int8_t segment_id;
@ -126,11 +270,62 @@ typedef struct {
int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
uint8_t mode_context[MAX_REF_FRAMES];
INTERP_FILTER interp_filter;
#if CONFIG_EXT_TX
EXT_TX_TYPE ext_txfrm;
#endif
#if CONFIG_TX_SKIP
int tx_skip[PLANE_TYPES];
int tx_skip_shift;
#endif // CONFIG_TX_SKIP
#if CONFIG_COPY_MODE
COPY_MODE copy_mode;
int inter_ref_count;
#endif // CONFIG_COPY_MODE
#if CONFIG_INTERINTRA
PREDICTION_MODE interintra_mode;
PREDICTION_MODE interintra_uv_mode;
#if CONFIG_WEDGE_PARTITION
int use_wedge_interintra;
int interintra_wedge_index;
int interintra_uv_wedge_index;
#endif // CONFIG_WEDGE_PARTITION
#endif // CONFIG_INTERINTRA
#if CONFIG_WEDGE_PARTITION
int use_wedge_interinter;
int interinter_wedge_index;
#endif // CONFIG_WEDGE_PARTITION
#if CONFIG_PALETTE
int palette_enabled[2];
int palette_size[2];
int palette_indexed_size;
int palette_literal_size;
int current_palette_size;
int palette_delta_bitdepth;
uint8_t palette_indexed_colors[PALETTE_MAX_SIZE];
int8_t palette_color_delta[PALETTE_MAX_SIZE];
uint8_t *palette_color_map;
uint8_t *palette_uv_color_map;
#if CONFIG_VP9_HIGHBITDEPTH
uint16_t palette_colors[3 * PALETTE_MAX_SIZE];
uint16_t palette_literal_colors[PALETTE_MAX_SIZE];
#else
uint8_t palette_colors[3 * PALETTE_MAX_SIZE];
uint8_t palette_literal_colors[PALETTE_MAX_SIZE];
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // CONFIG_PALETTE
#if CONFIG_NEW_QUANT
int dq_off_index;
int send_dq_bit;
#endif // CONFIG_NEW_QUANT
} MB_MODE_INFO;
typedef struct MODE_INFO {
struct MODE_INFO *src_mi;
MB_MODE_INFO mbmi;
#if CONFIG_FILTERINTRA
int b_filter_info[4];
#endif
b_mode_info bmi[4];
} MODE_INFO;
@ -139,6 +334,21 @@ static INLINE PREDICTION_MODE get_y_mode(const MODE_INFO *mi, int block) {
: mi->mbmi.mode;
}
#if CONFIG_FILTERINTRA
static INLINE int is_filter_allowed(PREDICTION_MODE mode) {
#if CONFIG_INTRABC
return !is_intrabc_mode(mode);
#else
(void)mode;
return 1;
#endif // CONFIG_INTRABC
}
static INLINE int is_filter_enabled(TX_SIZE txsize) {
return (txsize < TX_SIZES);
}
#endif
static INLINE int is_inter_block(const MB_MODE_INFO *mbmi) {
return mbmi->ref_frame[0] > INTRA_FRAME;
}
@ -160,6 +370,9 @@ enum mv_precision {
struct buf_2d {
uint8_t *buf;
uint8_t *buf0;
int width;
int height;
int stride;
};
@ -171,8 +384,20 @@ struct macroblockd_plane {
struct buf_2d dst;
struct buf_2d pre[2];
const int16_t *dequant;
#if CONFIG_NEW_QUANT
const dequant_val_type_nuq* dequant_val_nuq[QUANT_PROFILES];
#endif // CONFIG_NEW_QUANT
#if CONFIG_TX_SKIP
const int16_t *dequant_pxd;
#if CONFIG_NEW_QUANT
const dequant_val_type_nuq* dequant_val_nuq_pxd[QUANT_PROFILES];
#endif // CONFIG_NEW_QUANT
#endif // CONFIG_TX_SKIP
ENTROPY_CONTEXT *above_context;
ENTROPY_CONTEXT *left_context;
#if CONFIG_PALETTE
uint8_t *color_index_map;
#endif
};
#define BLOCK_OFFSET(x, i) ((x) + (i) * 16)
@ -207,26 +432,39 @@ typedef struct macroblockd {
/* pointer to current frame */
const YV12_BUFFER_CONFIG *cur_buf;
// mc_buf has a x2 factor in each dimension because the reference
// image may be scaled down by up to a factor of 2.
/* mc buffer */
DECLARE_ALIGNED(16, uint8_t, mc_buf[80 * 2 * 80 * 2]);
DECLARE_ALIGNED(16, uint8_t, mc_buf[(CODING_UNIT_SIZE + 16) * 2 *
(CODING_UNIT_SIZE + 16) * 2]);
#if CONFIG_VP9_HIGHBITDEPTH
/* Bit depth: 8, 10, 12 */
int bd;
DECLARE_ALIGNED(16, uint16_t, mc_buf_high[80 * 2 * 80 * 2]);
DECLARE_ALIGNED(16, uint16_t, mc_buf_high[(CODING_UNIT_SIZE + 16) * 2 *
(CODING_UNIT_SIZE + 16) * 2]);
#endif
int lossless;
int corrupted;
DECLARE_ALIGNED(16, tran_low_t, dqcoeff[MAX_MB_PLANE][64 * 64]);
DECLARE_ALIGNED(16, tran_low_t, dqcoeff[MAX_MB_PLANE][CODING_UNIT_SIZE *
CODING_UNIT_SIZE]);
#if CONFIG_PALETTE
DECLARE_ALIGNED(16, uint8_t, color_index_map[2][CODING_UNIT_SIZE *
CODING_UNIT_SIZE]);
DECLARE_ALIGNED(16, uint8_t, palette_map_buffer[CODING_UNIT_SIZE *
CODING_UNIT_SIZE]);
#endif // CONFIG_PALETTE
ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16];
ENTROPY_CONTEXT left_context[MAX_MB_PLANE][2 * MI_BLOCK_SIZE];
PARTITION_CONTEXT *above_seg_context;
PARTITION_CONTEXT left_seg_context[8];
PARTITION_CONTEXT left_seg_context[MI_BLOCK_SIZE];
#if CONFIG_GLOBAL_MOTION
Global_Motion_Params (*global_motion)[MAX_GLOBAL_MOTION_MODELS];
#endif // CONFIG_GLOBAL_MOTION
} MACROBLOCKD;
static INLINE BLOCK_SIZE get_subsize(BLOCK_SIZE bsize,
@ -234,25 +472,204 @@ static INLINE BLOCK_SIZE get_subsize(BLOCK_SIZE bsize,
return subsize_lookup[partition][bsize];
}
#if CONFIG_EXT_PARTITION
static INLINE PARTITION_TYPE get_partition(const MODE_INFO *const mi,
int mi_stride, int mi_rows,
int mi_cols, int mi_row,
int mi_col, BLOCK_SIZE bsize) {
const int bsl = b_width_log2_lookup[bsize];
const int bs = (1 << bsl) / 4;
MODE_INFO *m = mi[mi_row * mi_stride + mi_col].src_mi;
PARTITION_TYPE partition = partition_lookup[bsl][m->mbmi.sb_type];
if (partition != PARTITION_NONE && bsize > BLOCK_8X8 &&
mi_row + bs < mi_rows && mi_col + bs < mi_cols) {
BLOCK_SIZE h = get_subsize(bsize, PARTITION_HORZ_A);
BLOCK_SIZE v = get_subsize(bsize, PARTITION_VERT_A);
MODE_INFO *m_right = mi[mi_row * mi_stride + mi_col + bs].src_mi;
MODE_INFO *m_below = mi[(mi_row + bs) * mi_stride + mi_col].src_mi;
if (m->mbmi.sb_type == h) {
return m_below->mbmi.sb_type == h ? PARTITION_HORZ : PARTITION_HORZ_B;
} else if (m_below->mbmi.sb_type == h) {
return m->mbmi.sb_type == h ? PARTITION_HORZ : PARTITION_HORZ_A;
} else if (m->mbmi.sb_type == v) {
return m_right->mbmi.sb_type == v ? PARTITION_VERT : PARTITION_VERT_B;
} else if (m_right->mbmi.sb_type == v) {
return m->mbmi.sb_type == v ? PARTITION_VERT : PARTITION_VERT_A;
} else {
return PARTITION_SPLIT;
}
}
return partition;
}
#endif
extern const TX_TYPE intra_mode_to_tx_type_lookup[INTRA_MODES];
#if CONFIG_SUPERTX
#define PARTITION_SUPERTX_CONTEXTS 2
#if CONFIG_TX64X64
#define MAX_SUPERTX_BLOCK_SIZE BLOCK_64X64
#else
#define MAX_SUPERTX_BLOCK_SIZE BLOCK_32X32
#endif // CONFIG_TX64X64
static INLINE TX_SIZE bsize_to_tx_size(BLOCK_SIZE bsize) {
const TX_SIZE bsize_to_tx_size_lookup[BLOCK_SIZES] = {
TX_4X4, TX_4X4, TX_4X4,
TX_8X8, TX_8X8, TX_8X8,
TX_16X16, TX_16X16, TX_16X16,
TX_32X32, TX_32X32, TX_32X32,
#if CONFIG_TX64X64
TX_64X64
#if CONFIG_EXT_CODING_UNIT_SIZE
, TX_64X64, TX_64X64, TX_64X64
#endif // CONFIG_EXT_CODING_UNIT_SIZE
#else
TX_32X32
#if CONFIG_EXT_CODING_UNIT_SIZE
, TX_32X32, TX_32X32, TX_32X32
#endif // CONFIG_EXT_CODING_UNIT_SIZE
#endif // CONFIG_TX64X64
};
return bsize_to_tx_size_lookup[bsize];
}
static INLINE int supertx_enabled(const MB_MODE_INFO *mbmi) {
return (int)mbmi->tx_size >
MIN(b_width_log2_lookup[mbmi->sb_type],
b_height_log2_lookup[mbmi->sb_type]);
}
#endif // CONFIG_SUPERTX
#if CONFIG_EXT_TX
#if CONFIG_WAVELETS
#define GET_EXT_TX_TYPES(tx_size) \
((tx_size) >= TX_32X32 ? EXT_TX_TYPES_LARGE : EXT_TX_TYPES)
#define GET_EXT_TX_TREE(tx_size) \
((tx_size) >= TX_32X32 ? vp9_ext_tx_large_tree : vp9_ext_tx_tree)
#define GET_EXT_TX_ENCODINGS(tx_size) \
((tx_size) >= TX_32X32 ? ext_tx_large_encodings : ext_tx_encodings)
#else
#define GET_EXT_TX_TYPES(tx_size) \
((tx_size) >= TX_32X32 ? 1 : EXT_TX_TYPES)
#define GET_EXT_TX_TREE(tx_size) \
((tx_size) >= TX_32X32 ? NULL : vp9_ext_tx_tree)
#define GET_EXT_TX_ENCODINGS(tx_size) \
((tx_size) >= TX_32X32 ? NULL : ext_tx_encodings)
#endif // CONFIG_WAVELETS
static TX_TYPE ext_tx_to_txtype[EXT_TX_TYPES] = {
DCT_DCT,
ADST_DCT,
DCT_ADST,
ADST_ADST,
FLIPADST_DCT,
DCT_FLIPADST,
FLIPADST_FLIPADST,
ADST_FLIPADST,
FLIPADST_ADST,
DST_DST,
DST_DCT,
DCT_DST,
DST_ADST,
ADST_DST,
DST_FLIPADST,
FLIPADST_DST,
};
static INLINE int is_dst_used(TX_TYPE tx_type) {
return (tx_type == DST_DST ||
tx_type == DST_DCT || tx_type == DCT_DST ||
tx_type == DST_ADST || tx_type == ADST_DST ||
tx_type == DST_FLIPADST || tx_type == FLIPADST_DST);
}
#if CONFIG_WAVELETS
static TX_TYPE ext_tx_to_txtype_large[EXT_TX_TYPES_LARGE] = {
DCT_DCT,
WAVELET1_DCT_DCT
};
#endif // CONFIG_WAVELETS
#endif // CONFIG_EXT_TX
static INLINE TX_TYPE get_tx_type_large(PLANE_TYPE plane_type,
const MACROBLOCKD *xd) {
#if CONFIG_EXT_TX && CONFIG_WAVELETS
const MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
if (plane_type != PLANE_TYPE_Y || xd->lossless)
return DCT_DCT;
if (is_inter_block(mbmi)) {
return ext_tx_to_txtype_large[mbmi->ext_txfrm];
}
#endif // CONFIG_EXT_TX && CONFIG_WAVELETS
(void) plane_type;
(void) xd;
return DCT_DCT;
}
static INLINE TX_TYPE get_tx_type(PLANE_TYPE plane_type,
const MACROBLOCKD *xd) {
const MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
(void) plane_type;
#if CONFIG_EXT_TX
if (xd->lossless)
return DCT_DCT;
if (is_inter_block(mbmi)) {
return ext_tx_to_txtype[mbmi->ext_txfrm];
}
#if CONFIG_INTRABC
if (is_intrabc_mode(mbmi->mode))
return DCT_DCT;
#endif // CONFIG_INTRABC
return intra_mode_to_tx_type_lookup[plane_type == PLANE_TYPE_Y ?
mbmi->mode : mbmi->uv_mode];
#else // CONFIG_EXT_TX
if (plane_type != PLANE_TYPE_Y || xd->lossless || is_inter_block(mbmi))
return DCT_DCT;
#if CONFIG_INTRABC
if (is_intrabc_mode(mbmi->mode))
return DCT_DCT;
#endif // CONFIG_INTRABC
return intra_mode_to_tx_type_lookup[mbmi->mode];
#endif // CONFIG_EXT_TX
}
static INLINE TX_TYPE get_tx_type_4x4(PLANE_TYPE plane_type,
const MACROBLOCKD *xd, int ib) {
const MODE_INFO *const mi = xd->mi[0].src_mi;
PREDICTION_MODE mode;
(void) plane_type;
#if CONFIG_EXT_TX
if (xd->lossless)
return DCT_DCT;
if (is_inter_block(&mi->mbmi)) {
return ext_tx_to_txtype[mi->mbmi.ext_txfrm];
}
mode = get_y_mode(mi, ib);
#if CONFIG_INTRABC
if (is_intrabc_mode(mode))
return DCT_DCT;
#endif // CONFIG_INTRABC
return intra_mode_to_tx_type_lookup[plane_type == PLANE_TYPE_Y ?
mode : mi->mbmi.uv_mode];
#else // CONFIG_EXT_TX
if (plane_type != PLANE_TYPE_Y || xd->lossless || is_inter_block(&mi->mbmi))
return DCT_DCT;
mode = get_y_mode(mi, ib);
#if CONFIG_INTRABC
if (is_intrabc_mode(mode))
return DCT_DCT;
#endif // CONFIG_INTRABC
return intra_mode_to_tx_type_lookup[mode];
#endif // CONFIG_EXT_TX
}
void vp9_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y);
@ -269,8 +686,18 @@ static INLINE TX_SIZE get_uv_tx_size_impl(TX_SIZE y_tx_size, BLOCK_SIZE bsize,
static INLINE TX_SIZE get_uv_tx_size(const MB_MODE_INFO *mbmi,
const struct macroblockd_plane *pd) {
#if CONFIG_SUPERTX
if (!supertx_enabled(mbmi)) {
return get_uv_tx_size_impl(mbmi->tx_size, mbmi->sb_type, pd->subsampling_x,
pd->subsampling_y);
} else {
return uvsupertx_size_lookup[mbmi->tx_size][pd->subsampling_x]
[pd->subsampling_y];
}
#else
return get_uv_tx_size_impl(mbmi->tx_size, mbmi->sb_type, pd->subsampling_x,
pd->subsampling_y);
#endif // CONFIG_SUPERTX
}
static INLINE BLOCK_SIZE get_plane_block_size(BLOCK_SIZE bsize,
@ -307,6 +734,46 @@ void vp9_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int has_eob,
int aoff, int loff);
#if CONFIG_INTERINTRA
static INLINE int is_interintra_allowed(BLOCK_SIZE sb_type) {
return ((sb_type >= BLOCK_8X8) && (sb_type < BLOCK_64X64));
}
#endif // CONFIG_INTERINTRA
#if CONFIG_WEDGE_PARTITION
#define WEDGE_BITS_SML 3
#define WEDGE_BITS_MED 4
#define WEDGE_BITS_BIG 5
#define WEDGE_NONE -1
#define WEDGE_WEIGHT_BITS 6
static INLINE int get_wedge_bits(BLOCK_SIZE sb_type) {
if (sb_type < BLOCK_8X8)
return 0;
if (sb_type <= BLOCK_8X8)
return WEDGE_BITS_SML;
else if (sb_type <= BLOCK_32X32)
return WEDGE_BITS_MED;
else
return WEDGE_BITS_BIG;
}
#endif // CONFIG_WEDGE_PARTITION
#if CONFIG_NEW_QUANT && CONFIG_TX_SKIP
static INLINE int is_rect_quant_used(const MB_MODE_INFO *mbmi,
int plane) {
return
mbmi->tx_skip[plane != 0] &&
((plane == 0 && (mbmi->mode == V_PRED ||
mbmi->mode == H_PRED ||
mbmi->mode == TM_PRED)) ||
(plane != 0 && (mbmi->uv_mode == V_PRED ||
mbmi->uv_mode == H_PRED ||
mbmi->uv_mode == TM_PRED)));
}
#endif // CONFIG_NEW_QUANT && CONFIG_TX_SKIP
#ifdef __cplusplus
} // extern "C"
#endif


@ -16,8 +16,9 @@
#include <assert.h>
#include "./vpx_config.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"
#include "vp9/common/vp9_systemdependent.h"
#ifdef __cplusplus
@ -27,12 +28,6 @@ extern "C" {
#define MIN(x, y) (((x) < (y)) ? (x) : (y))
#define MAX(x, y) (((x) > (y)) ? (x) : (y))
#define ROUND_POWER_OF_TWO(value, n) \
(((value) + (1 << ((n) - 1))) >> (n))
#define ALIGN_POWER_OF_TWO(value, n) \
(((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
// Only need this for fixed-size arrays, for structs just assign.
#define vp9_copy(dest, src) { \
assert(sizeof(dest) == sizeof(src)); \
@ -83,9 +78,6 @@ static INLINE uint16_t clip_pixel_highbd(int val, int bd) {
typedef int64_t tran_high_t;
typedef int32_t tran_low_t;
#define CONVERT_TO_SHORTPTR(x) ((uint16_t*)(((uintptr_t)x) << 1))
#define CONVERT_TO_BYTEPTR(x) ((uint8_t*)(((uintptr_t)x) >> 1 ))
#else
// Note:
@ -118,6 +110,17 @@ typedef int16_t tran_low_t;
#define VP9_FRAME_MARKER 0x2
static INLINE int get_unsigned_bits_gen(unsigned int num_values) {
int cat = 0;
if (num_values <= 1)
return 0;
num_values--;
while (num_values > 0) {
cat++;
num_values >>= 1;
}
return cat;
}
#ifdef __cplusplus
} // extern "C"


@ -12,27 +12,63 @@
// Log 2 conversion lookup tables for block width and height
const int b_width_log2_lookup[BLOCK_SIZES] =
{0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4,
#if CONFIG_EXT_CODING_UNIT_SIZE
4, 5, 5
#endif
};
const int b_height_log2_lookup[BLOCK_SIZES] =
{0, 1, 0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4,
#if CONFIG_EXT_CODING_UNIT_SIZE
5, 4, 5
#endif
};
const int num_4x4_blocks_wide_lookup[BLOCK_SIZES] =
{1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 8, 16, 16,
#if CONFIG_EXT_CODING_UNIT_SIZE
16, 32, 32
#endif
};
const int num_4x4_blocks_high_lookup[BLOCK_SIZES] =
{1, 2, 1, 2, 4, 2, 4, 8, 4, 8, 16, 8, 16,
#if CONFIG_EXT_CODING_UNIT_SIZE
32, 16, 32
#endif
};
// Log 2 conversion lookup tables for modeinfo width and height
const int mi_width_log2_lookup[BLOCK_SIZES] =
{0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3,
#if CONFIG_EXT_CODING_UNIT_SIZE
3, 4, 4
#endif
};
const int num_8x8_blocks_wide_lookup[BLOCK_SIZES] =
{1, 1, 1, 1, 1, 2, 2, 2, 4, 4, 4, 8, 8,
#if CONFIG_EXT_CODING_UNIT_SIZE
8, 16, 16
#endif
};
const int num_8x8_blocks_high_lookup[BLOCK_SIZES] =
{1, 1, 1, 1, 2, 1, 2, 4, 2, 4, 8, 4, 8,
#if CONFIG_EXT_CODING_UNIT_SIZE
16, 8, 16
#endif
};
// MIN(3, MIN(b_width_log2(bsize), b_height_log2(bsize)))
const int size_group_lookup[BLOCK_SIZES] =
{0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3,
#if CONFIG_EXT_CODING_UNIT_SIZE
3, 3, 3
#endif
};
const int num_pels_log2_lookup[BLOCK_SIZES] =
{4, 5, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12,
#if CONFIG_EXT_CODING_UNIT_SIZE
13, 13, 14
#endif
};
const PARTITION_TYPE partition_lookup[][BLOCK_SIZES] = {
{ // 4X4
@ -41,34 +77,141 @@ const PARTITION_TYPE partition_lookup[][BLOCK_SIZES] = {
PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
PARTITION_INVALID,
#if CONFIG_EXT_CODING_UNIT_SIZE
// 64x128,128x64,128x128
PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
#endif
}, { // 8X8
// 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
PARTITION_SPLIT, PARTITION_VERT, PARTITION_HORZ, PARTITION_NONE,
PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
#if CONFIG_EXT_CODING_UNIT_SIZE
// 64x128,128x64,128x128
PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
#endif
}, { // 16X16
// 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
PARTITION_VERT, PARTITION_HORZ, PARTITION_NONE, PARTITION_INVALID,
PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
PARTITION_INVALID, PARTITION_INVALID,
#if CONFIG_EXT_CODING_UNIT_SIZE
// 64x128,128x64,128x128
PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
#endif
}, { // 32X32
// 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_VERT,
PARTITION_HORZ, PARTITION_NONE, PARTITION_INVALID,
PARTITION_INVALID, PARTITION_INVALID,
#if CONFIG_EXT_CODING_UNIT_SIZE
// 64x128,128x64,128x128
PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
#endif
}, { // 64X64
// 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_VERT, PARTITION_HORZ,
PARTITION_NONE,
#if CONFIG_EXT_CODING_UNIT_SIZE
// 64x128,128x64,128x128
PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
#endif
},
#if CONFIG_EXT_CODING_UNIT_SIZE
{ // 128x128
// 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
// 64x128,128x64,128x128
PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
PARTITION_SPLIT, PARTITION_VERT, PARTITION_HORZ, PARTITION_NONE
}
#endif
};
#if CONFIG_EXT_PARTITION
const BLOCK_SIZE subsize_lookup[EXT_PARTITION_TYPES][BLOCK_SIZES] = {
{ // PARTITION_NONE
BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
BLOCK_8X8, BLOCK_8X16, BLOCK_16X8,
BLOCK_16X16, BLOCK_16X32, BLOCK_32X16,
BLOCK_32X32, BLOCK_32X64, BLOCK_64X32,
BLOCK_64X64,
#if CONFIG_EXT_CODING_UNIT_SIZE
BLOCK_64X128, BLOCK_128X64, BLOCK_128X128,
#endif
}, { // PARTITION_HORZ
BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
BLOCK_8X4, BLOCK_INVALID, BLOCK_INVALID,
BLOCK_16X8, BLOCK_INVALID, BLOCK_INVALID,
BLOCK_32X16, BLOCK_INVALID, BLOCK_INVALID,
BLOCK_64X32,
#if CONFIG_EXT_CODING_UNIT_SIZE
BLOCK_INVALID, BLOCK_INVALID, BLOCK_128X64,
#endif
}, { // PARTITION_VERT
BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
BLOCK_4X8, BLOCK_INVALID, BLOCK_INVALID,
BLOCK_8X16, BLOCK_INVALID, BLOCK_INVALID,
BLOCK_16X32, BLOCK_INVALID, BLOCK_INVALID,
BLOCK_32X64,
#if CONFIG_EXT_CODING_UNIT_SIZE
BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X128,
#endif
}, { // PARTITION_SPLIT
BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
BLOCK_4X4, BLOCK_INVALID, BLOCK_INVALID,
BLOCK_8X8, BLOCK_INVALID, BLOCK_INVALID,
BLOCK_16X16, BLOCK_INVALID, BLOCK_INVALID,
BLOCK_32X32,
#if CONFIG_EXT_CODING_UNIT_SIZE
BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X64,
#endif
}, { // PARTITION_HORZ_A
BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
BLOCK_8X4, BLOCK_INVALID, BLOCK_INVALID,
BLOCK_16X8, BLOCK_INVALID, BLOCK_INVALID,
BLOCK_32X16, BLOCK_INVALID, BLOCK_INVALID,
BLOCK_64X32,
#if CONFIG_EXT_CODING_UNIT_SIZE
BLOCK_INVALID, BLOCK_INVALID, BLOCK_128X64,
#endif
}, { // PARTITION_HORZ_B
BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
BLOCK_8X4, BLOCK_INVALID, BLOCK_INVALID,
BLOCK_16X8, BLOCK_INVALID, BLOCK_INVALID,
BLOCK_32X16, BLOCK_INVALID, BLOCK_INVALID,
BLOCK_64X32,
#if CONFIG_EXT_CODING_UNIT_SIZE
BLOCK_INVALID, BLOCK_INVALID, BLOCK_128X64,
#endif
}, { // PARTITION_VERT_A
BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
BLOCK_4X8, BLOCK_INVALID, BLOCK_INVALID,
BLOCK_8X16, BLOCK_INVALID, BLOCK_INVALID,
BLOCK_16X32, BLOCK_INVALID, BLOCK_INVALID,
BLOCK_32X64,
#if CONFIG_EXT_CODING_UNIT_SIZE
BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X128,
#endif
}, { // PARTITION_VERT_B
BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
BLOCK_4X8, BLOCK_INVALID, BLOCK_INVALID,
BLOCK_8X16, BLOCK_INVALID, BLOCK_INVALID,
BLOCK_16X32, BLOCK_INVALID, BLOCK_INVALID,
BLOCK_32X64,
#if CONFIG_EXT_CODING_UNIT_SIZE
BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X128,
#endif
}
};
#else
const BLOCK_SIZE subsize_lookup[PARTITION_TYPES][BLOCK_SIZES] = {
{ // PARTITION_NONE
BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
@ -76,47 +219,79 @@ const BLOCK_SIZE subsize_lookup[PARTITION_TYPES][BLOCK_SIZES] = {
BLOCK_16X16, BLOCK_16X32, BLOCK_32X16,
BLOCK_32X32, BLOCK_32X64, BLOCK_64X32,
BLOCK_64X64,
#if CONFIG_EXT_CODING_UNIT_SIZE
BLOCK_64X128, BLOCK_128X64, BLOCK_128X128,
#endif
}, { // PARTITION_HORZ
BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
BLOCK_8X4, BLOCK_INVALID, BLOCK_INVALID,
BLOCK_16X8, BLOCK_INVALID, BLOCK_INVALID,
BLOCK_32X16, BLOCK_INVALID, BLOCK_INVALID,
BLOCK_64X32,
#if CONFIG_EXT_CODING_UNIT_SIZE
BLOCK_INVALID, BLOCK_INVALID, BLOCK_128X64,
#endif
}, { // PARTITION_VERT
BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
BLOCK_4X8, BLOCK_INVALID, BLOCK_INVALID,
BLOCK_8X16, BLOCK_INVALID, BLOCK_INVALID,
BLOCK_16X32, BLOCK_INVALID, BLOCK_INVALID,
BLOCK_32X64,
#if CONFIG_EXT_CODING_UNIT_SIZE
BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X128,
#endif
}, { // PARTITION_SPLIT
BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
BLOCK_4X4, BLOCK_INVALID, BLOCK_INVALID,
BLOCK_8X8, BLOCK_INVALID, BLOCK_INVALID,
BLOCK_16X16, BLOCK_INVALID, BLOCK_INVALID,
BLOCK_32X32,
#if CONFIG_EXT_CODING_UNIT_SIZE
BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X64,
#endif
}
};
#endif // CONFIG_EXT_PARTITION
const TX_SIZE max_txsize_lookup[BLOCK_SIZES] = {
TX_4X4, TX_4X4, TX_4X4,
TX_8X8, TX_8X8, TX_8X8,
TX_16X16, TX_16X16, TX_16X16,
TX_32X32, TX_32X32, TX_32X32,
#if CONFIG_TX64X64
TX_64X64,
#if CONFIG_EXT_CODING_UNIT_SIZE
TX_64X64, TX_64X64, TX_64X64,
#endif
#else
TX_32X32,
#if CONFIG_EXT_CODING_UNIT_SIZE
TX_32X32, TX_32X32, TX_32X32,
#endif
#endif // CONFIG_TX64X64
};
const BLOCK_SIZE txsize_to_bsize[TX_SIZES] = {
BLOCK_4X4, // TX_4X4
BLOCK_8X8, // TX_8X8
BLOCK_16X16, // TX_16X16
BLOCK_32X32, // TX_32X32
#if CONFIG_TX64X64
BLOCK_64X64, // TX_64X64
#endif
};
const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES] = {
TX_4X4, // ONLY_4X4
TX_8X8, // ALLOW_8X8
TX_16X16, // ALLOW_16X16
TX_32X32, // ALLOW_32X32
#if CONFIG_TX64X64
TX_64X64, // ALLOW_64X64
TX_64X64, // TX_MODE_SELECT
#else
TX_32X32, // TX_MODE_SELECT
#endif
};
const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2] = {
@ -135,6 +310,11 @@ const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2] = {
{{BLOCK_32X64, BLOCK_32X32}, {BLOCK_INVALID, BLOCK_16X32}},
{{BLOCK_64X32, BLOCK_INVALID}, {BLOCK_32X32, BLOCK_32X16}},
{{BLOCK_64X64, BLOCK_64X32}, {BLOCK_32X64, BLOCK_32X32}},
#if CONFIG_EXT_CODING_UNIT_SIZE
{{BLOCK_64X128, BLOCK_64X64}, {BLOCK_INVALID, BLOCK_32X64}},
{{BLOCK_128X64, BLOCK_INVALID}, {BLOCK_64X64, BLOCK_64X32}},
{{BLOCK_128X128, BLOCK_128X64}, {BLOCK_64X128, BLOCK_64X64}},
#endif // CONFIG_EXT_CODING_UNIT_SIZE
};
// Generates 4 bit field in which each bit set to 1 represents
@ -144,6 +324,24 @@ const struct {
PARTITION_CONTEXT above;
PARTITION_CONTEXT left;
} partition_context_lookup[BLOCK_SIZES]= {
#if CONFIG_EXT_CODING_UNIT_SIZE
{31, 31}, // 4X4 - {0b11111, 0b11111}
{31, 30}, // 4X8 - {0b11111, 0b11110}
{30, 31}, // 8X4 - {0b11110, 0b11111}
{30, 30}, // 8X8 - {0b11110, 0b11110}
{30, 28}, // 8X16 - {0b11110, 0b11100}
{28, 30}, // 16X8 - {0b11100, 0b11110}
{28, 28}, // 16X16 - {0b11100, 0b11100}
{28, 24}, // 16X32 - {0b11100, 0b11000}
{24, 28}, // 32X16 - {0b11000, 0b11100}
{24, 24}, // 32X32 - {0b11000, 0b11000}
{24, 16}, // 32X64 - {0b11000, 0b10000}
{16, 24}, // 64X32 - {0b10000, 0b11000}
{16, 16}, // 64X64 - {0b10000, 0b10000}
{16, 0}, // 64X128- {0b10000, 0b00000}
{0, 16}, // 128X64- {0b00000, 0b10000}
{0, 0 }, // 128X128-{0b00000, 0b00000}
#else
{15, 15}, // 4X4 - {0b1111, 0b1111}
{15, 14}, // 4X8 - {0b1111, 0b1110}
{14, 15}, // 8X4 - {0b1110, 0b1111}
@ -157,4 +355,31 @@ const struct {
{8, 0 }, // 32X64 - {0b1000, 0b0000}
{0, 8 }, // 64X32 - {0b0000, 0b1000}
{0, 0 }, // 64X64 - {0b0000, 0b0000}
#endif
};
#if CONFIG_SUPERTX
const TX_SIZE uvsupertx_size_lookup[TX_SIZES][2][2] = {
// ss_x == 0 ss_x == 0 ss_x == 1 ss_x == 1
// ss_y == 0 ss_y == 1 ss_y == 0 ss_y == 1
{{TX_4X4, TX_4X4}, {TX_4X4, TX_4X4}},
{{TX_8X8, TX_4X4}, {TX_4X4, TX_4X4}},
{{TX_16X16, TX_8X8}, {TX_8X8, TX_8X8}},
{{TX_32X32, TX_16X16}, {TX_16X16, TX_16X16}},
#if CONFIG_TX64X64
{{TX_64X64, TX_32X32}, {TX_32X32, TX_32X32}},
#endif // CONFIG_TX64X64
};
#if CONFIG_EXT_PARTITION
const int partition_supertx_context_lookup[EXT_PARTITION_TYPES] = {
-1, 0, 0, 1, 0, 0, 0, 0
};
#else
const int partition_supertx_context_lookup[PARTITION_TYPES] = {
-1, 0, 0, 1
};
#endif
#endif // CONFIG_SUPERTX


@ -27,11 +27,23 @@ extern const int num_4x4_blocks_wide_lookup[BLOCK_SIZES];
extern const int size_group_lookup[BLOCK_SIZES];
extern const int num_pels_log2_lookup[BLOCK_SIZES];
extern const PARTITION_TYPE partition_lookup[][BLOCK_SIZES];
#if CONFIG_EXT_PARTITION
extern const BLOCK_SIZE subsize_lookup[EXT_PARTITION_TYPES][BLOCK_SIZES];
#else
extern const BLOCK_SIZE subsize_lookup[PARTITION_TYPES][BLOCK_SIZES];
#endif
extern const TX_SIZE max_txsize_lookup[BLOCK_SIZES];
extern const BLOCK_SIZE txsize_to_bsize[TX_SIZES];
extern const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES];
extern const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2];
#if CONFIG_SUPERTX
extern const TX_SIZE uvsupertx_size_lookup[TX_SIZES][2][2];
#if CONFIG_EXT_PARTITION
extern const int partition_supertx_context_lookup[EXT_PARTITION_TYPES];
#else
extern const int partition_supertx_context_lookup[PARTITION_TYPES];
#endif
#endif
#ifdef __cplusplus
} // extern "C"

File diff suppressed because it is too large


@ -22,6 +22,7 @@ extern "C" {
#endif
#define DIFF_UPDATE_PROB 252
#define GROUP_DIFF_UPDATE_PROB 252
// Coefficient token alphabet
#define ZERO_TOKEN 0 // 0 Extra Bits 0+0
@ -50,13 +51,35 @@ DECLARE_ALIGNED(16, extern const uint8_t, vp9_pt_energy_class[ENTROPY_TOKENS]);
#define CAT5_MIN_VAL 35
#define CAT6_MIN_VAL 67
#if CONFIG_TX64X64
#define DCT_MAX_VALUE 32768
#define NUM_CAT6_BITS 15
#else
#define DCT_MAX_VALUE 16384
#define NUM_CAT6_BITS 14
#endif // CONFIG_TX64X64
#if CONFIG_VP9_HIGHBITDEPTH
#if CONFIG_TX64X64
#define DCT_MAX_VALUE_HIGH10 131072
#define DCT_MAX_VALUE_HIGH12 524288
#define NUM_CAT6_BITS_HIGH10 17
#define NUM_CAT6_BITS_HIGH12 19
#else
#define DCT_MAX_VALUE_HIGH10 65536
#define DCT_MAX_VALUE_HIGH12 262144
#define NUM_CAT6_BITS_HIGH10 16
#define NUM_CAT6_BITS_HIGH12 18
#endif // CONFIG_TX64X64
#endif // CONFIG_VP9_HIGHBITDEPTH
// Extra bit probabilities.
DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat1_prob[1]);
DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat2_prob[2]);
DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat3_prob[3]);
DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat4_prob[4]);
DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat5_prob[5]);
DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat6_prob[NUM_CAT6_BITS]);
#if CONFIG_VP9_HIGHBITDEPTH
DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat1_prob_high10[1]);
@ -64,13 +87,15 @@ DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat2_prob_high10[2]);
DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat3_prob_high10[3]);
DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat4_prob_high10[4]);
DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat5_prob_high10[5]);
DECLARE_ALIGNED(16, extern const uint8_t,
vp9_cat6_prob_high10[NUM_CAT6_BITS_HIGH10]);
DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat1_prob_high12[1]);
DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat2_prob_high12[2]);
DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat3_prob_high12[3]);
DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat4_prob_high12[4]);
DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat5_prob_high12[5]);
DECLARE_ALIGNED(16, extern const uint8_t,
vp9_cat6_prob_high12[NUM_CAT6_BITS_HIGH12]);
#endif // CONFIG_VP9_HIGHBITDEPTH
#define EOB_MODEL_TOKEN 3
@ -90,18 +115,18 @@ extern const vp9_extra_bit vp9_extra_bits_high10[ENTROPY_TOKENS];
extern const vp9_extra_bit vp9_extra_bits_high12[ENTROPY_TOKENS];
#endif // CONFIG_VP9_HIGHBITDEPTH
/* Coefficients are predicted via a 3-dimensional probability table. */
#define REF_TYPES 2 // intra=0, inter=1
/* Middle dimension reflects the coefficient position within the transform. */
#if CONFIG_TX_SKIP
#define FOR_SCREEN_CONTENT 0
#define COEF_BANDS 7
#define TX_SKIP_COEFF_BAND 6
#else
#define COEF_BANDS 6
#endif // CONFIG_TX_SKIP
/* Inside dimension is a measure of nearby complexity, reflecting how many
   of the nearby coefficients are nonzero. For the first coefficient (DC, unless
@ -128,6 +153,10 @@ typedef unsigned int vp9_coeff_count[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS]
[ENTROPY_TOKENS];
typedef unsigned int vp9_coeff_stats[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS]
[ENTROPY_NODES][2];
#if CONFIG_TX_SKIP
typedef unsigned int vp9_coeff_stats_pxd[REF_TYPES][COEFF_CONTEXTS]
[ENTROPY_NODES][2];
#endif // CONFIG_TX_SKIP
#define SUBEXP_PARAM 4 /* Subexponential code parameter */
#define MODULUS_PARAM 13 /* Modulus parameter */
@ -153,8 +182,19 @@ static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE bsize) {
// This macro is currently unused but may be used by certain implementations
#define MAXBAND_INDEX 21
#if CONFIG_TX64X64
#define MAX_NUM_COEFS 4096
#else
#define MAX_NUM_COEFS 1024
#endif
DECLARE_ALIGNED(16, extern const uint8_t,
vp9_coefband_trans_8x8plus[MAX_NUM_COEFS]);
DECLARE_ALIGNED(16, extern const uint8_t, vp9_coefband_trans_4x4[16]);
#if CONFIG_TX_SKIP
DECLARE_ALIGNED(16, extern uint8_t,
vp9_coefband_tx_skip[MAX_NUM_COEFS]);
#endif // CONFIG_TX_SKIP
static INLINE const uint8_t *get_band_translate(TX_SIZE tx_size) {
return tx_size == TX_4X4 ? vp9_coefband_trans_4x4
@ -183,6 +223,12 @@ typedef unsigned int vp9_coeff_count_model[REF_TYPES][COEF_BANDS]
void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full);
#if CONFIG_TX_SKIP
typedef vp9_prob vp9_coeff_probs_pxd[REF_TYPES][COEFF_CONTEXTS][ENTROPY_NODES];
typedef unsigned int vp9_coeff_counts_pxd[REF_TYPES][COEFF_CONTEXTS]
[ENTROPY_TOKENS];
#endif // CONFIG_TX_SKIP
static INLINE int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a,
const ENTROPY_CONTEXT *l) {
ENTROPY_CONTEXT above_ec = 0, left_ec = 0;
@ -204,24 +250,68 @@ static INLINE int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a,
above_ec = !!*(const uint64_t *)a;
left_ec = !!*(const uint64_t *)l;
break;
#if CONFIG_TX64X64
case TX_64X64:
above_ec = !!*(const uint64_t *)a;
left_ec = !!*(const uint64_t *)l;
break;
#endif
default:
assert(0 && "Invalid transform size.");
break;
}
return combine_entropy_contexts(above_ec, left_ec);
}
static INLINE int get_entropy_context_sb(const MACROBLOCKD *xd,
BLOCK_SIZE bsize) {
const struct macroblockd_plane *pd = &xd->plane[0];
const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
return get_entropy_context(max_tx_size, pd->above_context, pd->left_context);
}
static INLINE const scan_order *get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size,
PLANE_TYPE type, int block_idx) {
const MODE_INFO *const mi = xd->mi[0].src_mi;
#if CONFIG_TX_SKIP
if (mi->mbmi.tx_skip[type])
return &vp9_default_scan_orders_pxd[tx_size];
#endif // CONFIG_TX_SKIP
#if CONFIG_EXT_TX
if (xd->lossless
#if CONFIG_INTRABC
|| is_intrabc_mode(mi->mbmi.mode)
#endif
) {
return &vp9_default_scan_orders[tx_size];
} else if (is_inter_block(&mi->mbmi)) {
TX_TYPE tx_type = (tx_size <= TX_16X16) ?
get_tx_type_4x4(type, xd, block_idx) : get_tx_type_large(type, xd);
return &vp9_inter_scan_orders[tx_size][tx_type];
} else {
const PREDICTION_MODE mode =
(type == PLANE_TYPE_Y ? get_y_mode(mi, block_idx) : mi->mbmi.uv_mode);
return &vp9_intra_scan_orders[tx_size][intra_mode_to_tx_type_lookup[mode]];
}
#else // CONFIG_EXT_TX
if (type != PLANE_TYPE_Y || xd->lossless
#if CONFIG_INTRABC
|| is_intrabc_mode(mi->mbmi.mode)
#endif
) {
return &vp9_default_scan_orders[tx_size];
} else if (is_inter_block(&mi->mbmi)) {
return &vp9_default_scan_orders[tx_size];
} else {
const PREDICTION_MODE mode = get_y_mode(mi, block_idx);
return &vp9_intra_scan_orders[tx_size][intra_mode_to_tx_type_lookup[mode]];
}
#endif // CONFIG_EXT_TX
}
#ifdef __cplusplus

File diff suppressed because it is too large


@ -19,75 +19,267 @@
extern "C" {
#endif
#if CONFIG_INTRABC
#define INTRABC_PROB 192
#endif // CONFIG_INTRABC
#define TX_SIZE_CONTEXTS 2
struct VP9Common;
struct tx_probs {
#if CONFIG_TX64X64
vp9_prob p64x64[TX_SIZE_CONTEXTS][4];
#endif
vp9_prob p32x32[TX_SIZE_CONTEXTS][3];
vp9_prob p16x16[TX_SIZE_CONTEXTS][2];
vp9_prob p8x8[TX_SIZE_CONTEXTS][1];
};
struct tx_counts {
#if CONFIG_TX64X64
// counter for entropy coding
// (In some cases, tx_size may not be written to the bitstream)
unsigned int p64x64[TX_SIZE_CONTEXTS][5];
#endif
unsigned int p32x32[TX_SIZE_CONTEXTS][4];
unsigned int p16x16[TX_SIZE_CONTEXTS][3];
unsigned int p8x8[TX_SIZE_CONTEXTS][2];
#if CONFIG_SR_MODE
// counter for tx_size actual usage (to determine ALLOW_16X16, etc.)
#if CONFIG_TX64X64
unsigned int real_p64x64[TX_SIZE_CONTEXTS][5];
#endif // CONFIG_TX64X64
unsigned int real_p32x32[TX_SIZE_CONTEXTS][4];
unsigned int real_p16x16[TX_SIZE_CONTEXTS][3];
unsigned int real_p8x8[TX_SIZE_CONTEXTS][2];
#endif // CONFIG_SR_MODE
};
typedef struct frame_contexts {
vp9_prob y_mode_prob[BLOCK_SIZE_GROUPS][INTRA_MODES - 1];
vp9_prob uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
#if CONFIG_EXT_PARTITION
vp9_prob partition_prob[PARTITION_CONTEXTS][EXT_PARTITION_TYPES - 1];
#else
vp9_prob partition_prob[PARTITION_CONTEXTS][PARTITION_TYPES - 1];
#endif
vp9_coeff_probs_model coef_probs[TX_SIZES][PLANE_TYPES];
vp9_prob switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS]
[SWITCHABLE_FILTERS - 1];
vp9_prob inter_mode_probs[INTER_MODE_CONTEXTS][INTER_MODES - 1];
#if CONFIG_NEW_INTER
vp9_prob inter_compound_mode_probs[INTER_MODE_CONTEXTS]
[INTER_COMPOUND_MODES - 1];
#endif // CONFIG_NEW_INTER
vp9_prob intra_inter_prob[INTRA_INTER_CONTEXTS];
vp9_prob comp_inter_prob[COMP_INTER_CONTEXTS];
vp9_prob single_ref_probs[REF_CONTEXTS][SINGLE_REFS - 1];
vp9_prob comp_ref_probs[REF_CONTEXTS][COMP_REFS - 1];
struct tx_probs tx_probs;
vp9_prob skip_probs[SKIP_CONTEXTS];
nmv_context nmvc;
#if CONFIG_SR_MODE
vp9_prob sr_probs[SR_CONTEXTS];
#if SR_USE_MULTI_F
vp9_prob sr_usfilter_probs[SR_USFILTER_CONTEXTS][SR_USFILTER_NUM - 1];
#endif // SR_USE_MULTI_F
#endif // CONFIG_SR_MODE
#if CONFIG_INTRABC
nmv_context ndvc;
#endif // CONFIG_INTRABC
#if CONFIG_FILTERINTRA
vp9_prob filterintra_prob[TX_SIZES][INTRA_MODES];
#endif // CONFIG_FILTERINTRA
#if CONFIG_EXT_TX
#if CONFIG_WAVELETS
vp9_prob ext_tx_prob[TX_SIZES][EXT_TX_TYPES - 1];
#else
vp9_prob ext_tx_prob[3][EXT_TX_TYPES - 1];
#endif // CONFIG_WAVELETS
#endif // CONFIG_EXT_TX
#if CONFIG_PALETTE
vp9_prob palette_enabled_prob[10][3];
vp9_prob palette_uv_enabled_prob[2];
vp9_prob palette_size_prob[10][PALETTE_SIZES - 1];
vp9_prob palette_uv_size_prob[10][PALETTE_SIZES - 1];
vp9_prob palette_color_prob[PALETTE_MAX_SIZE - 1][PALETTE_COLOR_CONTEXTS]
[PALETTE_COLORS - 1];
vp9_prob palette_uv_color_prob[PALETTE_MAX_SIZE - 1][PALETTE_COLOR_CONTEXTS]
[PALETTE_COLORS - 1];
#endif // CONFIG_PALETTE
#if CONFIG_SUPERTX
vp9_prob supertx_prob[PARTITION_SUPERTX_CONTEXTS][TX_SIZES];
#endif // CONFIG_SUPERTX
#if CONFIG_TX_SKIP
vp9_prob y_tx_skip_prob[2];
vp9_prob uv_tx_skip_prob[2];
vp9_coeff_probs_pxd coef_probs_pxd[TX_SIZES][PLANE_TYPES];
#endif // CONFIG_TX_SKIP
#if CONFIG_COPY_MODE
vp9_prob copy_noref_prob[COPY_MODE_CONTEXTS][BLOCK_SIZES];
vp9_prob copy_mode_probs_l2[COPY_MODE_CONTEXTS][1];
vp9_prob copy_mode_probs[COPY_MODE_CONTEXTS][COPY_MODE_COUNT - 2];
#endif // CONFIG_COPY_MODE
#if CONFIG_INTERINTRA
vp9_prob interintra_prob[BLOCK_SIZES];
#if CONFIG_WEDGE_PARTITION
vp9_prob wedge_interintra_prob[BLOCK_SIZES];
#endif // CONFIG_WEDGE_PARTITION
#endif // CONFIG_INTERINTRA
#if CONFIG_WEDGE_PARTITION
vp9_prob wedge_interinter_prob[BLOCK_SIZES];
#endif // CONFIG_WEDGE_PARTITION
#if CONFIG_GLOBAL_MOTION
vp9_prob global_motion_types_prob[GLOBAL_MOTION_TYPES - 1];
#endif // CONFIG_GLOBAL_MOTION
#if CONFIG_NEW_QUANT && QUANT_PROFILES > 1 && !Q_CTX_BASED_PROFILES
vp9_prob dq_profile_prob[QUANT_PROFILES - 1];
#endif // CONFIG_NEW_QUANT && QUANT_PROFILES > 1 && !Q_CTX_BASED_PROFILES
} FRAME_CONTEXT;
typedef struct {
unsigned int y_mode[BLOCK_SIZE_GROUPS][INTRA_MODES];
unsigned int uv_mode[INTRA_MODES][INTRA_MODES];
#if CONFIG_EXT_PARTITION
unsigned int partition[PARTITION_CONTEXTS][EXT_PARTITION_TYPES];
#else
unsigned int partition[PARTITION_CONTEXTS][PARTITION_TYPES];
#endif
vp9_coeff_count_model coef[TX_SIZES][PLANE_TYPES];
unsigned int eob_branch[TX_SIZES][PLANE_TYPES][REF_TYPES]
[COEF_BANDS][COEFF_CONTEXTS];
unsigned int switchable_interp[SWITCHABLE_FILTER_CONTEXTS]
[SWITCHABLE_FILTERS];
unsigned int inter_mode[INTER_MODE_CONTEXTS][INTER_MODES];
#if CONFIG_NEW_INTER
unsigned int inter_compound_mode[INTER_MODE_CONTEXTS][INTER_COMPOUND_MODES];
#endif // CONFIG_NEW_INTER
unsigned int intra_inter[INTRA_INTER_CONTEXTS][2];
unsigned int comp_inter[COMP_INTER_CONTEXTS][2];
unsigned int single_ref[REF_CONTEXTS][SINGLE_REFS-1][2];
unsigned int comp_ref[REF_CONTEXTS][COMP_REFS-1][2];
struct tx_counts tx;
unsigned int skip[SKIP_CONTEXTS][2];
nmv_context_counts mv;
#if CONFIG_SR_MODE
unsigned int sr[SR_CONTEXTS][2];
#if SR_USE_MULTI_F
unsigned int sr_usfilters[SR_USFILTER_CONTEXTS][SR_USFILTER_NUM];
#endif // SR_USE_MULTI_F
#endif // CONFIG_SR_MODE
#if CONFIG_INTRABC
nmv_context_counts dv;
#endif // CONFIG_INTRABC
#if CONFIG_FILTERINTRA
unsigned int filterintra[TX_SIZES][INTRA_MODES][2];
#endif // CONFIG_FILTERINTRA
#if CONFIG_EXT_TX
#if CONFIG_WAVELETS
unsigned int ext_tx[TX_SIZES][EXT_TX_TYPES];
#else
unsigned int ext_tx[3][EXT_TX_TYPES];
#endif // CONFIG_WAVELETS
#endif // CONFIG_EXT_TX
#if CONFIG_SUPERTX
unsigned int supertx[PARTITION_SUPERTX_CONTEXTS][TX_SIZES][2];
unsigned int supertx_size[BLOCK_SIZES];
#endif // CONFIG_SUPERTX
#if CONFIG_TX_SKIP
unsigned int y_tx_skip[2][2];
unsigned int uv_tx_skip[2][2];
vp9_coeff_counts_pxd coef_pxd[TX_SIZES][PLANE_TYPES];
unsigned int eob_branch_pxd[TX_SIZES][PLANE_TYPES][REF_TYPES][COEFF_CONTEXTS];
#endif // CONFIG_TX_SKIP
#if CONFIG_COPY_MODE
unsigned int copy_noref[COPY_MODE_CONTEXTS][BLOCK_SIZES][2];
unsigned int copy_mode_l2[COPY_MODE_CONTEXTS][2];
unsigned int copy_mode[COPY_MODE_CONTEXTS][COPY_MODE_COUNT - 1];
#endif // CONFIG_COPY_MODE
#if CONFIG_INTERINTRA
unsigned int interintra[BLOCK_SIZES][2];
#if CONFIG_WEDGE_PARTITION
unsigned int wedge_interintra[BLOCK_SIZES][2];
#endif // CONFIG_WEDGE_PARTITION
#endif // CONFIG_INTERINTRA
#if CONFIG_WEDGE_PARTITION
unsigned int wedge_interinter[BLOCK_SIZES][2];
#endif // CONFIG_WEDGE_PARTITION
#if CONFIG_PALETTE
unsigned int y_palette_enabled[10][3][2];
unsigned int uv_palette_enabled[2][2];
unsigned int y_palette_size[10][PALETTE_SIZES];
unsigned int uv_palette_size[10][PALETTE_SIZES];
#endif // CONFIG_PALETTE
#if CONFIG_GLOBAL_MOTION
unsigned int global_motion_types[GLOBAL_MOTION_TYPES];
#endif // CONFIG_GLOBAL_MOTION
#if CONFIG_NEW_QUANT && QUANT_PROFILES > 1 && !Q_CTX_BASED_PROFILES
unsigned int dq_profile[QUANT_PROFILES];
#endif // CONFIG_NEW_QUANT && QUANT_PROFILES > 1 && !Q_CTX_BASED_PROFILES
} FRAME_COUNTS;
extern const vp9_prob vp9_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
extern const vp9_prob vp9_kf_y_mode_prob[INTRA_MODES][INTRA_MODES]
[INTRA_MODES - 1];
#if CONFIG_EXT_PARTITION
extern const vp9_prob vp9_kf_partition_probs[PARTITION_CONTEXTS]
[EXT_PARTITION_TYPES - 1];
#else
extern const vp9_prob vp9_kf_partition_probs[PARTITION_CONTEXTS]
[PARTITION_TYPES - 1];
#endif
extern const vp9_tree_index vp9_intra_mode_tree[TREE_SIZE(INTRA_MODES)];
extern const vp9_tree_index vp9_inter_mode_tree[TREE_SIZE(INTER_MODES)];
extern const vp9_tree_index vp9_partition_tree[TREE_SIZE(PARTITION_TYPES)];
#if CONFIG_SR_MODE && SR_USE_MULTI_F
extern const vp9_tree_index vp9_sr_usfilter_tree[TREE_SIZE(SR_USFILTER_NUM)];
#endif // CONFIG_SR_MODE && SR_USE_MULTI_F
#if CONFIG_EXT_PARTITION
extern const vp9_tree_index vp9_ext_partition_tree
[TREE_SIZE(EXT_PARTITION_TYPES)];
#endif
extern const vp9_tree_index vp9_switchable_interp_tree
[TREE_SIZE(SWITCHABLE_FILTERS)];
#if CONFIG_EXT_TX
extern const vp9_tree_index vp9_ext_tx_tree[TREE_SIZE(EXT_TX_TYPES)];
#if CONFIG_WAVELETS
extern const
vp9_tree_index vp9_ext_tx_large_tree[TREE_SIZE(EXT_TX_TYPES_LARGE)];
#endif // CONFIG_WAVELETS
#endif // CONFIG_EXT_TX
#if CONFIG_PALETTE
extern const vp9_tree_index vp9_palette_size_tree[TREE_SIZE(PALETTE_SIZES)];
extern const vp9_tree_index vp9_palette_color_tree[TREE_SIZE(PALETTE_COLORS)];
#endif // CONFIG_PALETTE
#if CONFIG_COPY_MODE
extern const vp9_tree_index vp9_copy_mode_tree_l2[TREE_SIZE(2)];
extern const vp9_tree_index vp9_copy_mode_tree[TREE_SIZE(COPY_MODE_COUNT - 1)];
#endif // CONFIG_COPY_MODE
#if CONFIG_NEW_INTER
extern const vp9_tree_index vp9_inter_compound_mode_tree
[TREE_SIZE(INTER_COMPOUND_MODES)];
#endif // CONFIG_NEW_INTER
#if CONFIG_NEW_QUANT && QUANT_PROFILES > 1 && !Q_CTX_BASED_PROFILES
extern const vp9_tree_index vp9_dq_profile_tree[TREE_SIZE(QUANT_PROFILES)];
#endif // CONFIG_NEW_QUANT && QUANT_PROFILES > 1 && !Q_CTX_BASED_PROFILES
void vp9_setup_past_independence(struct VP9Common *cm);
#if CONFIG_ROW_TILE
void vp9_dec_setup_past_independence(struct VP9Common *cm,
int dec_tile_row, int dec_tile_col);
#endif
void vp9_init_mode_probs(FRAME_CONTEXT *fc);
void vp9_adapt_mode_probs(struct VP9Common *cm);
#if CONFIG_TX64X64
void tx_counts_to_branch_counts_64x64(const unsigned int *tx_count_64x64p,
unsigned int (*ct_64x64p)[2]);
#endif
void tx_counts_to_branch_counts_32x32(const unsigned int *tx_count_32x32p,
unsigned int (*ct_32x32p)[2]);
void tx_counts_to_branch_counts_16x16(const unsigned int *tx_count_16x16p,
@ -99,8 +291,12 @@ static INLINE const vp9_prob *get_y_mode_probs(const MODE_INFO *mi,
const MODE_INFO *above_mi,
const MODE_INFO *left_mi,
int block) {
PREDICTION_MODE above = vp9_above_block_mode(mi, above_mi, block);
PREDICTION_MODE left = vp9_left_block_mode(mi, left_mi, block);
#if CONFIG_INTRABC
if (is_intrabc_mode(above)) above = DC_PRED;
if (is_intrabc_mode(left)) left = DC_PRED;
#endif
return vp9_kf_y_mode_prob[above][left];
}

View File

@ -8,6 +8,8 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include <math.h>
#include "vp9/common/vp9_onyxc_int.h"
#include "vp9/common/vp9_entropymv.h"
@ -118,6 +120,19 @@ static const uint8_t log_in_base_2[] = {
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10
};
#if CONFIG_GLOBAL_MOTION
const vp9_tree_index vp9_global_motion_types_tree
[TREE_SIZE(GLOBAL_MOTION_TYPES)] = {
-GLOBAL_ZERO, 2,
-GLOBAL_TRANSLATION, -GLOBAL_ROTZOOM
};
static const vp9_prob default_global_motion_types_prob
[GLOBAL_MOTION_TYPES - 1] = {
224, 128
};
#endif // CONFIG_GLOBAL_MOTION
static INLINE int mv_class_base(MV_CLASS_TYPE c) {
return c ? CLASS0_SIZE << (c + 2) : 0;
}
@ -142,7 +157,7 @@ int vp9_get_mv_mag(MV_CLASS_TYPE c, int offset) {
static void inc_mv_component(int v, nmv_component_counts *comp_counts,
int incr, int usehp) {
int s, z, c, o, d, e, f;
assert(v != 0); /* should not be zero */
s = v < 0;
comp_counts->sign[s] += incr;
z = (s ? -v : v) - 1; /* magnitude - 1 */
@ -230,4 +245,10 @@ void vp9_adapt_mv_probs(VP9_COMMON *cm, int allow_hp) {
void vp9_init_mv_probs(VP9_COMMON *cm) {
cm->fc.nmvc = default_nmv_context;
#if CONFIG_INTRABC
cm->fc.ndvc = default_nmv_context;
#endif // CONFIG_INTRABC
#if CONFIG_GLOBAL_MOTION
vp9_copy(cm->fc.global_motion_types_prob, default_global_motion_types_prob);
#endif // CONFIG_GLOBAL_MOTION
}

View File

@ -14,6 +14,7 @@
#include "./vpx_config.h"
#include "vp9/common/vp9_enums.h"
#include "vp9/common/vp9_mv.h"
#include "vp9/common/vp9_prob.h"
@ -127,6 +128,11 @@ typedef struct {
void vp9_inc_mv(const MV *mv, nmv_context_counts *mvctx);
#if CONFIG_GLOBAL_MOTION
extern const vp9_tree_index vp9_global_motion_types_tree
[TREE_SIZE(GLOBAL_MOTION_TYPES)];
#endif // CONFIG_GLOBAL_MOTION
#ifdef __cplusplus
} // extern "C"
#endif

View File

@ -17,13 +17,22 @@
extern "C" {
#endif
#if CONFIG_EXT_CODING_UNIT_SIZE
#define CODING_UNIT_SIZE_LOG2 7
#else
#define CODING_UNIT_SIZE_LOG2 6
#endif
#define CODING_UNIT_SIZE (1 << CODING_UNIT_SIZE_LOG2)
#define MI_SIZE_LOG2 3
#define MI_BLOCK_SIZE_LOG2 (CODING_UNIT_SIZE_LOG2 - MI_SIZE_LOG2)
#define MI_SIZE (1 << MI_SIZE_LOG2) // pixels per mi-unit
#define MI_BLOCK_SIZE (1 << MI_BLOCK_SIZE_LOG2) // mi-units per max block
#define MI_MASK (MI_BLOCK_SIZE - 1)
#define MI_MASK_2 (MI_BLOCK_SIZE * 2 - 1)
// Bitstream profiles indicated by 2-3 bits in the uncompressed header.
// 00: Profile 0. 8-bit 4:2:0 only.
@ -54,10 +63,31 @@ typedef enum BLOCK_SIZE {
BLOCK_32X64,
BLOCK_64X32,
BLOCK_64X64,
#if CONFIG_EXT_CODING_UNIT_SIZE
BLOCK_64X128,
BLOCK_128X64,
BLOCK_128X128,
#endif // CONFIG_EXT_CODING_UNIT_SIZE
BLOCK_SIZES,
BLOCK_INVALID = BLOCK_SIZES,
BLOCK_LARGEST = BLOCK_SIZES - 1
} BLOCK_SIZE;
#if CONFIG_EXT_PARTITION
typedef enum PARTITION_TYPE {
PARTITION_NONE,
PARTITION_HORZ,
PARTITION_VERT,
PARTITION_SPLIT,
PARTITION_HORZ_A, // HORZ split and the left partition is split again
PARTITION_HORZ_B, // HORZ split and the right partition is split again
PARTITION_VERT_A, // VERT split and the top partition is split again
PARTITION_VERT_B, // VERT split and the bottom partition is split again
EXT_PARTITION_TYPES,
PARTITION_TYPES = PARTITION_SPLIT + 1,
PARTITION_INVALID = EXT_PARTITION_TYPES
} PARTITION_TYPE;
#else
typedef enum PARTITION_TYPE {
PARTITION_NONE,
PARTITION_HORZ,
@ -66,10 +96,15 @@ typedef enum PARTITION_TYPE {
PARTITION_TYPES,
PARTITION_INVALID = PARTITION_TYPES
} PARTITION_TYPE;
#endif
typedef char PARTITION_CONTEXT;
#define PARTITION_PLOFFSET 4 // number of probability models per block size
#if CONFIG_EXT_CODING_UNIT_SIZE
#define PARTITION_CONTEXTS (5 * PARTITION_PLOFFSET)
#else
#define PARTITION_CONTEXTS (4 * PARTITION_PLOFFSET)
#endif
// block transform size
typedef enum {
@ -77,17 +112,28 @@ typedef enum {
TX_8X8 = 1, // 8x8 transform
TX_16X16 = 2, // 16x16 transform
TX_32X32 = 3, // 32x32 transform
#if CONFIG_TX64X64
TX_64X64 = 4, // 64x64 transform
#endif
TX_SIZES
} TX_SIZE;
#define MAX_TX_SIZE_LOG2 (TX_SIZES + 1)
#define MAX_MIN_TX_IN_BLOCK_LOG2 MAX((CODING_UNIT_SIZE_LOG2 - \
MAX_TX_SIZE_LOG2), 1)
#define MAX_MIN_TX_IN_BLOCK (1 << MAX_MIN_TX_IN_BLOCK_LOG2)
// frame transform mode
typedef enum {
ONLY_4X4 = 0, // only 4x4 transform used
ALLOW_8X8 = 1, // allow block transform size up to 8x8
ALLOW_16X16 = 2, // allow block transform size up to 16x16
ALLOW_32X32 = 3, // allow block transform size up to 32x32
#if CONFIG_TX64X64
ALLOW_64X64 = 4, // allow block transform size up to 64x64
#endif
TX_MODE_SELECT, // transform specified for each block
TX_MODES,
} TX_MODE;
typedef enum {
@ -95,24 +141,89 @@ typedef enum {
ADST_DCT = 1, // ADST in vertical, DCT in horizontal
DCT_ADST = 2, // DCT in vertical, ADST in horizontal
ADST_ADST = 3, // ADST in both directions
TX_TYPES,
#if CONFIG_EXT_TX
FLIPADST_DCT = 4,
DCT_FLIPADST = 5,
FLIPADST_FLIPADST = 6,
ADST_FLIPADST = 7,
FLIPADST_ADST = 8,
DST_DST = 9,
DST_DCT = 10,
DCT_DST = 11,
DST_ADST = 12,
ADST_DST = 13,
DST_FLIPADST = 14,
FLIPADST_DST = 15,
#if CONFIG_WAVELETS
WAVELET1_DCT_DCT,
#endif // CONFIG_WAVELETS
TOTAL_TX_TYPES,
#endif // CONFIG_EXT_TX
} TX_TYPE;
#if CONFIG_EXT_TX
typedef enum {
NORM = 0,
ALT1 = 1,
#if CONFIG_WAVELETS
EXT_TX_TYPES_LARGE = 2,
#endif // CONFIG_WAVELETS
ALT2 = 2,
ALT3 = 3,
ALT4 = 4,
ALT5 = 5,
ALT6 = 6,
ALT7 = 7,
ALT8 = 8,
ALT9 = 9,
ALT10 = 10,
ALT11 = 11,
ALT12 = 12,
ALT13 = 13,
ALT14 = 14,
ALT15 = 15,
EXT_TX_TYPES
} EXT_TX_TYPE;
#endif // CONFIG_EXT_TX
#if CONFIG_PALETTE
typedef enum {
TWO_COLORS,
THREE_COLORS,
FOUR_COLORS,
FIVE_COLORS,
SIX_COLORS,
SEVEN_COLORS,
EIGHT_COLORS,
PALETTE_SIZES
} PALETTE_SIZE;
typedef enum {
PALETTE_COLOR_ONE,
PALETTE_COLOR_TWO,
PALETTE_COLOR_THREE,
PALETTE_COLOR_FOUR,
PALETTE_COLOR_FIVE,
PALETTE_COLOR_SIX,
PALETTE_COLOR_SEVEN,
PALETTE_COLOR_EIGHT,
PALETTE_COLORS
} PALETTE_COLOR;
#endif // CONFIG_PALETTE
typedef enum {
VP9_LAST_FLAG = 1 << 0,
#if CONFIG_MULTI_REF
VP9_LAST2_FLAG = 1 << 1,
VP9_LAST3_FLAG = 1 << 2,
VP9_LAST4_FLAG = 1 << 3,
VP9_GOLD_FLAG = 1 << 4,
VP9_ALT_FLAG = 1 << 5,
#else // CONFIG_MULTI_REF
VP9_GOLD_FLAG = 1 << 1,
VP9_ALT_FLAG = 1 << 2,
#endif // CONFIG_MULTI_REF
} VP9_REFFRAME;
#ifdef __cplusplus

File diff suppressed because it is too large

View File

@ -34,6 +34,11 @@ extern "C" {
#define dual_set_epi16(a, b) \
_mm_set_epi16(b, b, b, b, a, a, a, a)
#if CONFIG_TX_SKIP
#define TX_SKIP_SHIFT_LQ 2
#define TX_SKIP_SHIFT_HQ 3
#endif
// Constants:
// for (int i = 1; i< 32; ++i)
// printf("static const int cospi_%d_64 = %.0f;\n", i,
@ -116,17 +121,261 @@ typedef struct {
} highbd_transform_2d;
#endif // CONFIG_VP9_HIGHBITDEPTH
#if CONFIG_EMULATE_HARDWARE
// When CONFIG_EMULATE_HARDWARE is 1 the transform performs a
// non-normative method to handle overflows. A stream that causes
// overflows in the inverse transform is considered invalid in VP9,
// and a hardware implementer is free to choose any reasonable
// method to handle overflows. However to aid in hardware
// verification they can use a specific implementation of the
// WRAPLOW() macro below that is identical to their intended
// hardware implementation (and also use configure options to trigger
// the C-implementation of the transform).
//
// The particular WRAPLOW implementation below performs strict
// overflow wrapping to match common hardware implementations.
// bd of 8 uses trans_low with 16bits, need to remove 16bits
// bd of 10 uses trans_low with 18bits, need to remove 14bits
// bd of 12 uses trans_low with 20bits, need to remove 12bits
// bd of x uses trans_low with 8+x bits, need to remove 24-x bits
#define WRAPLOW(x, bd) ((((int32_t)(x)) << (24 - bd)) >> (24 - bd))
#else
#define WRAPLOW(x, bd) (x)
#endif // CONFIG_EMULATE_HARDWARE
static INLINE uint8_t clip_pixel_add(uint8_t dest, tran_high_t trans) {
trans = WRAPLOW(trans, 8);
return clip_pixel(WRAPLOW(dest + trans, 8));
}
#if CONFIG_EXT_TX
#define USE_DST2 1
#if USE_DST2
static const tran_high_t Tx4[4 * 4] = {
// dst2
6270, 15137, 15137, 6270,
11585, 11585, -11585, -11585,
15137, -6270, -6270, 15137,
11585, -11585, 11585, -11585,
};
static const tran_high_t Tx8[8 * 8] = {
// dst2
3196, 9102, 13623, 16069, 16069, 13623, 9102, 3196,
6270, 15137, 15137, 6270, -6270, -15137, -15137, -6270,
9102, 16069, 3196, -13623, -13623, 3196, 16069, 9102,
11585, 11585, -11585, -11585, 11585, 11585, -11585, -11585,
13623, 3196, -16069, 9102, 9102, -16069, 3196, 13623,
15137, -6270, -6270, 15137, -15137, 6270, 6270, -15137,
16069, -13623, 9102, -3196, -3196, 9102, -13623, 16069,
11585, -11585, 11585, -11585, 11585, -11585, 11585, -11585,
};
static const tran_high_t Tx16[16 * 16] = {
// dst2
1606, 4756, 7723, 10394, 12665, 14449, 15679, 16305,
16305, 15679, 14449, 12665, 10394, 7723, 4756, 1606,
3196, 9102, 13623, 16069, 16069, 13623, 9102, 3196,
-3196, -9102, -13623, -16069, -16069, -13623, -9102, -3196,
4756, 12665, 16305, 14449, 7723, -1606, -10394, -15679,
-15679, -10394, -1606, 7723, 14449, 16305, 12665, 4756,
6270, 15137, 15137, 6270, -6270, -15137, -15137, -6270,
6270, 15137, 15137, 6270, -6270, -15137, -15137, -6270,
7723, 16305, 10394, -4756, -15679, -12665, 1606, 14449,
14449, 1606, -12665, -15679, -4756, 10394, 16305, 7723,
9102, 16069, 3196, -13623, -13623, 3196, 16069, 9102,
-9102, -16069, -3196, 13623, 13623, -3196, -16069, -9102,
10394, 14449, -4756, -16305, -1606, 15679, 7723, -12665,
-12665, 7723, 15679, -1606, -16305, -4756, 14449, 10394,
11585, 11585, -11585, -11585, 11585, 11585, -11585, -11585,
11585, 11585, -11585, -11585, 11585, 11585, -11585, -11585,
12665, 7723, -15679, -1606, 16305, -4756, -14449, 10394,
10394, -14449, -4756, 16305, -1606, -15679, 7723, 12665,
13623, 3196, -16069, 9102, 9102, -16069, 3196, 13623,
-13623, -3196, 16069, -9102, -9102, 16069, -3196, -13623,
14449, -1606, -12665, 15679, -4756, -10394, 16305, -7723,
-7723, 16305, -10394, -4756, 15679, -12665, -1606, 14449,
15137, -6270, -6270, 15137, -15137, 6270, 6270, -15137,
15137, -6270, -6270, 15137, -15137, 6270, 6270, -15137,
15679, -10394, 1606, 7723, -14449, 16305, -12665, 4756,
4756, -12665, 16305, -14449, 7723, 1606, -10394, 15679,
16069, -13623, 9102, -3196, -3196, 9102, -13623, 16069,
-16069, 13623, -9102, 3196, 3196, -9102, 13623, -16069,
16305, -15679, 14449, -12665, 10394, -7723, 4756, -1606,
-1606, 4756, -7723, 10394, -12665, 14449, -15679, 16305,
11585, -11585, 11585, -11585, 11585, -11585, 11585, -11585,
11585, -11585, 11585, -11585, 11585, -11585, 11585, -11585,
};
#endif // USE_DST2
static INLINE void vp9_fgentx4(const tran_low_t *input, tran_low_t *output,
const tran_high_t *T) {
tran_high_t sum;
int i;
const tran_high_t *Tx = T;
for (i = 0; i < 4; ++i, Tx += 4) {
sum = Tx[0] * input[0] + Tx[1] * input[1] +
Tx[2] * input[2] + Tx[3] * input[3];
output[i] = ROUND_POWER_OF_TWO(sum, DCT_CONST_BITS);
}
}
static INLINE void vp9_fgentx8(const tran_low_t *input, tran_low_t *output,
const tran_high_t *T) {
tran_high_t sum;
int i;
const tran_high_t *Tx = T;
for (i = 0; i < 8; ++i, Tx += 8) {
sum = Tx[0] * input[0] + Tx[1] * input[1] +
Tx[2] * input[2] + Tx[3] * input[3] +
Tx[4] * input[4] + Tx[5] * input[5] +
Tx[6] * input[6] + Tx[7] * input[7];
output[i] = ROUND_POWER_OF_TWO(sum, DCT_CONST_BITS);
}
}
static INLINE void vp9_fgentx16(const tran_low_t *input, tran_low_t *output,
const tran_high_t *T) {
tran_high_t sum;
int i;
const tran_high_t *Tx = T;
for (i = 0; i < 16; ++i, Tx += 16) {
sum = Tx[0] * input[0] + Tx[1] * input[1] +
Tx[2] * input[2] + Tx[3] * input[3] +
Tx[4] * input[4] + Tx[5] * input[5] +
Tx[6] * input[6] + Tx[7] * input[7] +
Tx[8] * input[8] + Tx[9] * input[9] +
Tx[10] * input[10] + Tx[11] * input[11] +
Tx[12] * input[12] + Tx[13] * input[13] +
Tx[14] * input[14] + Tx[15] * input[15];
output[i] = ROUND_POWER_OF_TWO(sum, DCT_CONST_BITS);
}
}
static INLINE void vp9_igentx4(const tran_low_t *input, tran_low_t *output,
const tran_high_t *T) {
tran_high_t sum[4];
int i;
const tran_high_t *Tx = T;
for (i = 0; i < 4; ++i, ++Tx) {
sum[i] = Tx[0] * input[0] + Tx[4] * input[1] +
Tx[8] * input[2] + Tx[12] * input[3];
}
for (i = 0; i < 4; ++i) {
output[i] = WRAPLOW(ROUND_POWER_OF_TWO(sum[i], DCT_CONST_BITS), 8);
}
}
static INLINE void vp9_igentx8(const tran_low_t *input, tran_low_t *output,
const tran_high_t *T) {
tran_high_t sum[8];
int i;
const tran_high_t *Tx = T;
for (i = 0; i < 8; ++i, ++Tx) {
sum[i] = Tx[0] * input[0] + Tx[8] * input[1] +
Tx[16] * input[2] + Tx[24] * input[3] +
Tx[32] * input[4] + Tx[40] * input[5] +
Tx[48] * input[6] + Tx[56] * input[7];
}
for (i = 0; i < 8; ++i) {
output[i] = WRAPLOW(ROUND_POWER_OF_TWO(sum[i], DCT_CONST_BITS), 8);
}
}
static INLINE void vp9_igentx16(const tran_low_t *input, tran_low_t *output,
const tran_high_t *T) {
tran_high_t sum[16];
int i;
const tran_high_t *Tx = T;
for (i = 0; i < 16; ++i, ++Tx) {
sum[i] = Tx[0] * input[0] + Tx[16] * input[1] +
Tx[32] * input[2] + Tx[48] * input[3] +
Tx[64] * input[4] + Tx[80] * input[5] +
Tx[96] * input[6] + Tx[112] * input[7] +
Tx[128] * input[8] + Tx[144] * input[9] +
Tx[160] * input[10] + Tx[176] * input[11] +
Tx[192] * input[12] + Tx[208] * input[13] +
Tx[224] * input[14] + Tx[240] * input[15];
}
for (i = 0; i < 16; ++i) {
output[i] = WRAPLOW(ROUND_POWER_OF_TWO(sum[i], DCT_CONST_BITS), 8);
}
}
#if CONFIG_VP9_HIGHBITDEPTH
static INLINE void vp9_highbd_igentx4(const tran_low_t *input,
tran_low_t *output,
int bd, const tran_high_t *T) {
tran_high_t sum[4];
int i;
const tran_high_t *Tx = T;
(void) bd;
for (i = 0; i < 4; ++i, Tx += 1) {
sum[i] = Tx[0] * input[0] + Tx[4] * input[1] +
Tx[8] * input[2] + Tx[12] * input[3];
}
for (i = 0; i < 4; ++i) {
output[i] = WRAPLOW(ROUND_POWER_OF_TWO(sum[i], DCT_CONST_BITS), bd);
}
}
static INLINE void vp9_highbd_igentx8(const tran_low_t *input,
tran_low_t *output,
int bd, const tran_high_t *T) {
tran_high_t sum[8];
int i;
const tran_high_t *Tx = T;
(void) bd;
for (i = 0; i < 8; ++i, Tx += 1) {
sum[i] = Tx[0] * input[0] + Tx[8] * input[1] +
Tx[16] * input[2] + Tx[24] * input[3] +
Tx[32] * input[4] + Tx[40] * input[5] +
Tx[48] * input[6] + Tx[56] * input[7];
}
for (i = 0; i < 8; ++i) {
output[i] = WRAPLOW(ROUND_POWER_OF_TWO(sum[i], DCT_CONST_BITS), bd);
}
}
static INLINE void vp9_highbd_igentx16(const tran_low_t *input,
tran_low_t *output,
int bd, const tran_high_t *T) {
tran_high_t sum[16];
int i;
const tran_high_t *Tx = T;
(void) bd;
for (i = 0; i < 16; ++i, Tx += 1) {
sum[i] = Tx[0] * input[0] + Tx[16] * input[1] +
Tx[32] * input[2] + Tx[48] * input[3] +
Tx[64] * input[4] + Tx[80] * input[5] +
Tx[96] * input[6] + Tx[112] * input[7] +
Tx[128] * input[8] + Tx[144] * input[9] +
Tx[160] * input[10] + Tx[176] * input[11] +
Tx[192] * input[12] + Tx[208] * input[13] +
Tx[224] * input[14] + Tx[240] * input[15];
}
for (i = 0; i < 16; ++i) {
output[i] = WRAPLOW(ROUND_POWER_OF_TWO(sum[i], DCT_CONST_BITS), bd);
}
}
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // CONFIG_EXT_TX
void vp9_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
int eob);
void vp9_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
int eob);
void vp9_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
int eob);
void vp9_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride,
int eob);
void vp9_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,
int eob);
#if CONFIG_TX64X64
void vp9_idct64x64_add(const tran_low_t *input, uint8_t *dest, int stride,
int eob);
#endif // CONFIG_TX64X64
void vp9_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
int stride, int eob);
void vp9_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
@ -135,6 +384,9 @@ void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
int stride, int eob);
#if CONFIG_VP9_HIGHBITDEPTH
void vp9_highbd_idct4(const tran_low_t *input, tran_low_t *output, int bd);
void vp9_highbd_idct8(const tran_low_t *input, tran_low_t *output, int bd);
void vp9_highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd);
void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
int eob, int bd);
void vp9_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
@ -145,15 +397,67 @@ void vp9_highbd_idct16x16_add(const tran_low_t *input, uint8_t *dest,
int stride, int eob, int bd);
void vp9_highbd_idct32x32_add(const tran_low_t *input, uint8_t *dest,
int stride, int eob, int bd);
#if CONFIG_TX64X64
void vp9_highbd_idct64x64_add(const tran_low_t *input, uint8_t *dest,
int stride, int eob, int bd);
#endif
void vp9_highbd_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input,
uint8_t *dest, int stride, int eob, int bd);
void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input,
uint8_t *dest, int stride, int eob, int bd);
void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input,
uint8_t *dest, int stride, int eob, int bd);
static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans,
int bd) {
trans = WRAPLOW(trans, bd);
return clip_pixel_highbd(WRAPLOW(dest + trans, bd), bd);
}
#endif // CONFIG_VP9_HIGHBITDEPTH
#if CONFIG_TX_SKIP
void vp9_tx_identity_add_rect(const tran_low_t *input, uint8_t *dest,
int row, int col, int stride_in,
int stride_out, int shift);
void vp9_tx_identity_add(const tran_low_t *input, uint8_t *dest,
int stride, int bs, int shift);
#if CONFIG_VP9_HIGHBITDEPTH
void vp9_highbd_tx_identity_add_rect(const tran_low_t *input, uint8_t *dest,
int row, int col, int stride_in,
int stride_out, int shift, int bd);
void vp9_highbd_tx_identity_add(const tran_low_t *input, uint8_t *dest,
int stride, int bs, int shift, int bd);
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // CONFIG_TX_SKIP
void vp9_dst1d_type1(int64_t *in, int64_t *out, int N);
void vp9_idst4x4_add(const tran_low_t *input, uint8_t *dest, int stride);
void vp9_idst8x8_add(const tran_low_t *input, uint8_t *dest, int stride);
void vp9_idst16x16_add(const tran_low_t *input, uint8_t *dest, int stride);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP9_COMMON_VP9_IDCT_H_
#if CONFIG_SR_MODE
void vp9_iwht4x4(const tran_low_t *input, int16_t *dest, int stride,
int eob);
void vp9_idct4x4(const tran_low_t *input, int16_t *dest, int stride,
int eob);
void vp9_idct8x8(const tran_low_t *input, int16_t *dest, int stride,
int eob);
void vp9_idct16x16(const tran_low_t *input, int16_t *dest, int stride,
int eob);
void vp9_idct32x32(const tran_low_t *input, int16_t *dest, int stride,
int eob);
void vp9_iht4x4(TX_TYPE tx_type, const tran_low_t *input, int16_t *dest,
int stride, int eob);
void vp9_iht8x8(TX_TYPE tx_type, const tran_low_t *input, int16_t *dest,
int stride, int eob);
void vp9_iht16x16(TX_TYPE tx_type, const tran_low_t *input, int16_t *dest,
int stride, int eob);
#if CONFIG_TX64X64
void vp9_idct64x64(const tran_low_t *input, int16_t *dest, int stride,
int eob);
#endif // CONFIG_TX64X64
#endif // CONFIG_SR_MODE

vp9/common/vp9_idwt.c Normal file
View File

@ -0,0 +1,407 @@
/*
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <math.h>
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_systemdependent.h"
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_idwt.h"
// Note: block length must be even for this implementation
static void synthesis_53_row(int length,
tran_low_t *lowpass, tran_low_t *highpass,
tran_low_t *x) {
tran_low_t r, *a, *b;
int n;
n = length >> 1;
b = highpass;
a = lowpass;
r = *highpass;
while (n--) {
*a++ -= (r + (*b) + 1) >> 1;
r = *b++;
}
n = length >> 1;
b = highpass;
a = lowpass;
while (--n) {
*x++ = ((r = *a++) + 1) >> 1;
*x++ = *b++ + ((r + (*a) + 2) >> 2);
}
*x++ = ((r = *a) + 1) >> 1;
*x++ = *b + ((r + 1) >> 1);
}
static void synthesis_53_col(int length,
tran_low_t *lowpass, tran_low_t *highpass,
tran_low_t *x) {
tran_low_t r, *a, *b;
int n;
n = length >> 1;
b = highpass;
a = lowpass;
r = *highpass;
while (n--) {
*a++ -= (r + (*b) + 1) >> 1;
r = *b++;
}
n = length >> 1;
b = highpass;
a = lowpass;
while (--n) {
r = *a++;
*x++ = r;
*x++ = ((*b++) << 1) + ((r + (*a) + 1) >> 1);
}
*x++ = *a;
*x++ = ((*b) << 1) + *a;
}
static void dyadic_synthesize_53(int levels, int width, int height,
tran_low_t *c, int pitch_c,
int16_t *x, int pitch_x,
int dwt_scale_bits) {
int th[16], tw[16], lv, i, j, nh, nw, hh = height, hw = width;
tran_low_t buffer[2 * DWT_MAX_LENGTH];
const int dwt_scale_rnd = 1 << (dwt_scale_bits - 1);
th[0] = hh;
tw[0] = hw;
for (i = 1; i <= levels; i++) {
th[i] = (th[i - 1] + 1) >> 1;
tw[i] = (tw[i - 1] + 1) >> 1;
}
for (lv = levels - 1; lv >= 0; lv--) {
nh = th[lv];
nw = tw[lv];
hh = th[lv + 1];
hw = tw[lv + 1];
if ((nh < 2) || (nw < 2)) continue;
for (j = 0; j < nw; j++) {
for (i = 0; i < nh; i++)
buffer[i] = c[i * pitch_c + j];
synthesis_53_col(nh, buffer, buffer + hh, buffer + nh);
for (i = 0; i < nh; i++)
c[i * pitch_c + j] = buffer[i + nh];
}
for (i = 0; i < nh; i++) {
memcpy(buffer, &c[i * pitch_c], nw * sizeof(*buffer));
synthesis_53_row(nw, buffer, buffer + hw, &c[i * pitch_c]);
}
}
for (i = 0; i < height; i++) {
for (j = 0; j < width; j++) {
x[i * pitch_x + j] = c[i * pitch_c + j] >= 0 ?
((c[i * pitch_c + j] + dwt_scale_rnd) >> dwt_scale_bits) :
-((-c[i * pitch_c + j] + dwt_scale_rnd) >> dwt_scale_bits);
}
}
}
// Note: block length must be even for this implementation
static void synthesis_26_row(int length,
tran_low_t *lowpass, tran_low_t *highpass,
tran_low_t *x) {
tran_low_t r, s, *a, *b;
int i, n = length >> 1;
if (n >= 4) {
a = lowpass;
b = highpass;
r = *lowpass;
while (--n) {
*b++ += (r - a[1] + 4) >> 3;
r = *a++;
}
*b += (r - *a + 4) >> 3;
}
a = lowpass;
b = highpass;
for (i = length >> 1; i; i--) {
s = *b++;
r = *a++;
*x++ = (r + s + 1) >> 1;
*x++ = (r - s + 1) >> 1;
}
}
static void synthesis_26_col(int length,
tran_low_t *lowpass, tran_low_t *highpass,
tran_low_t *x) {
tran_low_t r, s, *a, *b;
int i, n = length >> 1;
if (n >= 4) {
a = lowpass;
b = highpass;
r = *lowpass;
while (--n) {
*b++ += (r - a[1] + 4) >> 3;
r = *a++;
}
*b += (r - *a + 4) >> 3;
}
a = lowpass;
b = highpass;
for (i = length >> 1; i; i--) {
s = *b++;
r = *a++;
*x++ = r + s;
*x++ = r - s;
}
}
static void dyadic_synthesize_26(int levels, int width, int height,
tran_low_t *c, int pitch_c,
int16_t *x, int pitch_x,
int dwt_scale_bits) {
int th[16], tw[16], lv, i, j, nh, nw, hh = height, hw = width;
tran_low_t buffer[2 * DWT_MAX_LENGTH];
const int dwt_scale_rnd = 1 << (dwt_scale_bits - 1);
th[0] = hh;
tw[0] = hw;
for (i = 1; i <= levels; i++) {
th[i] = (th[i - 1] + 1) >> 1;
tw[i] = (tw[i - 1] + 1) >> 1;
}
for (lv = levels - 1; lv >= 0; lv--) {
nh = th[lv];
nw = tw[lv];
hh = th[lv + 1];
hw = tw[lv + 1];
if ((nh < 2) || (nw < 2)) continue;
for (j = 0; j < nw; j++) {
for (i = 0; i < nh; i++)
buffer[i] = c[i * pitch_c + j];
synthesis_26_col(nh, buffer, buffer + hh, buffer + nh);
for (i = 0; i < nh; i++)
c[i * pitch_c + j] = buffer[i + nh];
}
for (i = 0; i < nh; i++) {
memcpy(buffer, &c[i * pitch_c], nw * sizeof(*buffer));
synthesis_26_row(nw, buffer, buffer + hw, &c[i * pitch_c]);
}
}
for (i = 0; i < height; i++) {
for (j = 0; j < width; j++) {
x[i * pitch_x + j] = c[i * pitch_c + j] >= 0 ?
((c[i * pitch_c + j] + dwt_scale_rnd) >> dwt_scale_bits) :
-((-c[i * pitch_c + j] + dwt_scale_rnd) >> dwt_scale_bits);
}
}
}
static void synthesis_97(int length, double *lowpass, double *highpass,
double *x) {
const double a_predict1 = -1.586134342;
const double a_update1 = -0.05298011854;
const double a_predict2 = 0.8829110762;
const double a_update2 = 0.4435068522;
const double s_low = 1.149604398;
const double s_high = 1/1.149604398;
const double inv_s_low = 1 / s_low;
const double inv_s_high = 1 / s_high;
int i;
double y[DWT_MAX_LENGTH];
// Undo pack and scale
for (i = 0; i < length / 2; i++) {
y[i * 2] = lowpass[i] * inv_s_low;
y[i * 2 + 1] = highpass[i] * inv_s_high;
}
memcpy(x, y, sizeof(*y) * length);
// Undo update 2
for (i = 2; i < length; i += 2) {
x[i] -= a_update2 * (x[i - 1] + x[i + 1]);
}
x[0] -= 2 * a_update2 * x[1];
// Undo predict 2
for (i = 1; i < length - 2; i += 2) {
x[i] -= a_predict2 * (x[i - 1] + x[i + 1]);
}
x[length - 1] -= 2 * a_predict2 * x[length - 2];
// Undo update 1
for (i = 2; i < length; i += 2) {
x[i] -= a_update1 * (x[i - 1] + x[i + 1]);
}
x[0] -= 2 * a_update1 * x[1];
// Undo predict 1
for (i = 1; i < length - 2; i += 2) {
x[i] -= a_predict1 * (x[i - 1] + x[i + 1]);
}
x[length - 1] -= 2 * a_predict1 * x[length - 2];
}
static void dyadic_synthesize_97(int levels, int width, int height,
tran_low_t *c, int pitch_c,
int16_t *x, int pitch_x,
int dwt_scale_bits) {
int th[16], tw[16], lv, i, j, nh, nw, hh = height, hw = width;
double buffer[2 * DWT_MAX_LENGTH];
double y[DWT_MAX_LENGTH * DWT_MAX_LENGTH];
for (i = 0; i < height; i++)
for (j = 0; j < width; j++)
y[i * DWT_MAX_LENGTH + j] = c[i * pitch_c + j];
th[0] = hh;
tw[0] = hw;
for (i = 1; i <= levels; i++) {
th[i] = (th[i - 1] + 1) >> 1;
tw[i] = (tw[i - 1] + 1) >> 1;
}
for (lv = levels - 1; lv >= 0; lv--) {
nh = th[lv];
nw = tw[lv];
hh = th[lv + 1];
hw = tw[lv + 1];
if ((nh < 2) || (nw < 2)) continue;
for (j = 0; j < nw; j++) {
for (i = 0; i < nh; i++)
buffer[i] = y[i * DWT_MAX_LENGTH + j];
synthesis_97(nh, buffer, buffer + hh, buffer + nh);
for (i = 0; i < nh; i++)
y[i * DWT_MAX_LENGTH + j] = buffer[i + nh];
}
for (i = 0; i < nh; i++) {
memcpy(buffer, &y[i * DWT_MAX_LENGTH], nw * sizeof(*buffer));
synthesis_97(nw, buffer, buffer + hw, &y[i * DWT_MAX_LENGTH]);
}
}
for (i = 0; i < height; i++)
for (j = 0; j < width; j++)
x[i * pitch_x + j] = round(y[i * DWT_MAX_LENGTH + j] /
(1 << dwt_scale_bits));
}
void vp9_idwt32x32_c(const tran_low_t *input, tran_low_t *output, int stride) {
tran_low_t in[32 * 32];
vpx_memcpy(in, input, sizeof(in));
#if DWT_TYPE == 26
dyadic_synthesize_26(4, 32, 32, in, 32, output, stride, 2);
#elif DWT_TYPE == 97
dyadic_synthesize_97(4, 32, 32, in, 32, output, stride, 2);
#elif DWT_TYPE == 53
dyadic_synthesize_53(4, 32, 32, in, 32, output, stride, 2);
#endif
}
void vp9_idwtdct32x32_c(const tran_low_t *input, tran_low_t *output,
int stride) {
const int dwt_levels = 1;
tran_low_t buffer[16 * 16];
tran_low_t buffer2[32 * 32];
int i;
for (i = 0; i < 32; ++i) {
memcpy(&buffer2[i * 32], &input[i * 32], sizeof(buffer2[0]) * 32);
}
for (i = 0; i < 16; ++i) {
memcpy(&buffer[i * 16], &input[i * 32], sizeof(buffer[0]) * 16);
}
vp9_idct16x16_noscale(buffer, buffer2, 32);
#if DWT_TYPE == 26
dyadic_synthesize_26(dwt_levels, 32, 32, buffer2, 32, output, stride, 2);
#elif DWT_TYPE == 97
dyadic_synthesize_97(dwt_levels, 32, 32, buffer2, 32, output, stride, 2);
#elif DWT_TYPE == 53
dyadic_synthesize_53(dwt_levels, 32, 32, buffer2, 32, output, stride, 2);
#endif
}
void vp9_idwt32x32_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
int i, j;
tran_low_t output[32 * 32];
vp9_idwt32x32_c(input, output, 32);
for (i = 0; i < 32; ++i) {
for (j = 0; j < 32; ++j) {
dest[j * stride + i] =
clip_pixel_add(dest[j * stride + i], output[j * 32 + i]);
}
}
}
void vp9_idwtdct32x32_add_c(const tran_low_t *input, uint8_t *dest,
int stride) {
int i, j;
tran_low_t output[32 * 32];
vp9_idwtdct32x32_c(input, output, 32);
for (i = 0; i < 32; ++i) {
for (j = 0; j < 32; ++j) {
dest[j * stride + i] =
clip_pixel_add(dest[j * stride + i], output[j * 32 + i]);
}
}
}
#if CONFIG_TX64X64
void vp9_idwt64x64_c(const tran_low_t *input, tran_low_t *output, int stride) {
tran_low_t in[64 * 64];
vpx_memcpy(in, input, sizeof(in));
#if DWT_TYPE == 26
dyadic_synthesize_26(4, 64, 64, in, 64, output, stride, 1);
#elif DWT_TYPE == 97
dyadic_synthesize_97(4, 64, 64, in, 64, output, stride, 1);
#elif DWT_TYPE == 53
dyadic_synthesize_53(4, 64, 64, in, 64, output, stride, 1);
#endif
}
void vp9_idwtdct64x64_c(const tran_low_t *input, tran_low_t *output,
int stride) {
const int dwt_levels = 1;
tran_low_t buffer[32 * 32];
tran_low_t buffer2[64 * 64];
int i;
for (i = 0; i < 64; ++i) {
memcpy(&buffer2[i * 64], &input[i * 64], sizeof(buffer2[0]) * 64);
}
for (i = 0; i < 32; ++i) {
memcpy(&buffer[i * 32], &input[i * 64], sizeof(buffer[0]) * 32);
}
vp9_idct32x32_noscale(buffer, buffer2, 64);
#if DWT_TYPE == 26
dyadic_synthesize_26(dwt_levels, 64, 64, buffer2, 64, output, stride, 1);
#elif DWT_TYPE == 97
dyadic_synthesize_97(dwt_levels, 64, 64, buffer2, 64, output, stride, 1);
#elif DWT_TYPE == 53
dyadic_synthesize_53(dwt_levels, 64, 64, buffer2, 64, output, stride, 1);
#endif
}
void vp9_idwt64x64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
int i, j;
tran_low_t output[64 * 64];
vp9_idwt64x64_c(input, output, 64);
for (i = 0; i < 64; ++i) {
for (j = 0; j < 64; ++j) {
dest[j * stride + i] =
clip_pixel_add(dest[j * stride + i], output[j * 64 + i]);
}
}
}
void vp9_idwtdct64x64_add_c(const tran_low_t *input, uint8_t *dest,
int stride) {
int i, j;
tran_low_t output[64 * 64];
vp9_idwtdct64x64_c(input, output, 64);
for (i = 0; i < 64; ++i) {
for (j = 0; j < 64; ++j) {
dest[j * stride + i] =
clip_pixel_add(dest[j * stride + i], output[j * 64 + i]);
}
}
}
#endif // CONFIG_TX64X64
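The dyadic_synthesize_* calls above undo a multi-level wavelet decomposition; for the 53 filter selected by DWT_TYPE this is the integer (LeGall 5/3) lifting scheme, which is exactly invertible in integer arithmetic. A minimal one-level 1-D sketch (the helper names are hypothetical; the real dyadic_synthesize_53 works in place on 2-D tiles with its own boundary extension):

```c
#include <assert.h>
#include <string.h>

#define LEN 8  /* even number of samples */

/* One-level 5/3 lifting analysis: even samples -> s[], odd samples -> d[]. */
static void dwt53_fwd(const int *x, int *s, int *d) {
  int n;
  for (n = 0; n < LEN / 2; ++n) {
    const int right = (2 * n + 2 < LEN) ? x[2 * n + 2] : x[2 * n];
    d[n] = x[2 * n + 1] - ((x[2 * n] + right) >> 1);  /* predict step */
  }
  for (n = 0; n < LEN / 2; ++n) {
    const int left = (n > 0) ? d[n - 1] : d[0];
    s[n] = x[2 * n] + ((left + d[n] + 2) >> 2);       /* update step */
  }
}

/* Synthesis runs the lifting steps in reverse order with flipped signs. */
static void dwt53_inv(const int *s, const int *d, int *y) {
  int n;
  for (n = 0; n < LEN / 2; ++n) {
    const int left = (n > 0) ? d[n - 1] : d[0];
    y[2 * n] = s[n] - ((left + d[n] + 2) >> 2);       /* recover evens */
  }
  for (n = 0; n < LEN / 2; ++n) {
    const int right = (2 * n + 2 < LEN) ? y[2 * n + 2] : y[2 * n];
    y[2 * n + 1] = d[n] + ((y[2 * n] + right) >> 1);  /* recover odds */
  }
}
```

Because each lifting step is undone exactly, analysis followed by synthesis reproduces the input bit-for-bit, which is why a wavelet transform can be reversible despite the integer shifts.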

vp9/common/vp9_idwt.h

@@ -0,0 +1,39 @@
/*
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP9_COMMON_VP9_IDWT_H_
#define VP9_COMMON_VP9_IDWT_H_
#include <assert.h>
#include "./vpx_config.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_enums.h"
#include "vp9/common/vp9_idct.h"
#define DWT_MAX_LENGTH 64
#define DWT_TYPE 53 // 26/53/97
#ifdef __cplusplus
extern "C" {
#endif
#if CONFIG_TX64X64
void vp9_idwt64x64(tran_low_t *input, tran_low_t *output, int stride);
void vp9_idwtdct64x64(tran_low_t *input, tran_low_t *output, int stride);
#endif // CONFIG_TX64X64
void vp9_idwt32x32(tran_low_t *input, tran_low_t *output, int stride);
void vp9_idwtdct32x32(tran_low_t *input, tran_low_t *output, int stride);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP9_COMMON_VP9_IDWT_H_


@ -8,6 +8,9 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include <math.h>
#include "./vpx_config.h"
#include "vp9/common/vp9_loopfilter.h"
#include "vp9/common/vp9_onyxc_int.h"
@@ -38,6 +41,9 @@ static const uint64_t left_64x64_txform_mask[TX_SIZES]= {
0xffffffffffffffff, // TX_8x8
0x5555555555555555, // TX_16x16
0x1111111111111111, // TX_32x32
#if CONFIG_TX64X64
0x0101010101010101, // TX_64x64
#endif
};
// 64 bit masks for above transform size. Each 1 represents a position where
@@ -62,6 +68,9 @@ static const uint64_t above_64x64_txform_mask[TX_SIZES]= {
0xffffffffffffffff, // TX_8x8
0x00ff00ff00ff00ff, // TX_16x16
0x000000ff000000ff, // TX_32x32
#if CONFIG_TX64X64
0x00000000000000ff, // TX_64x64
#endif
};
// 64 bit masks for prediction sizes (left). Each 1 represents a position
@@ -140,6 +149,9 @@ static const uint16_t left_64x64_txform_mask_uv[TX_SIZES]= {
0xffff, // TX_8x8
0x5555, // TX_16x16
0x1111, // TX_32x32
#if CONFIG_TX64X64
0x0101, // TX_64x64, never used
#endif
};
static const uint16_t above_64x64_txform_mask_uv[TX_SIZES]= {
@@ -147,6 +159,9 @@ static const uint16_t above_64x64_txform_mask_uv[TX_SIZES]= {
0xffff, // TX_8x8
0x0f0f, // TX_16x16
0x000f, // TX_32x32
#if CONFIG_TX64X64
0x0003, // TX_64x64, never used
#endif
};
// 16 bit left mask to shift and set for each uv prediction size.
@@ -203,9 +218,258 @@ static const uint16_t above_border_uv = 0x000f;
static const int mode_lf_lut[MB_MODE_COUNT] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // INTRA_MODES
#if CONFIG_INTRABC
0,
#endif // CONFIG_INTRABC
1, 1, 0, 1, // INTER_MODES (ZEROMV == 0)
#if CONFIG_NEW_INTER
1, // NEW2MV mode
1, 1, 1, 1, 1, 1, 1, 0, 1 // INTER_COMPOUND_MODES (ZERO_ZEROMV == 0)
#endif // CONFIG_NEW_INTER
};
#if CONFIG_LOOP_POSTFILTER
#define BILATERAL_WEIGHT_BITS 4
static double bilateral_filters_r_kf[BILATERAL_LEVELS_KF + 1][513];
static double bilateral_filters_r[BILATERAL_LEVELS + 1][513];
static double bilateral_filters_s_kf[BILATERAL_LEVELS_KF + 1]
[BILATERAL_WIN][BILATERAL_WIN];
static double bilateral_filters_s[BILATERAL_LEVELS + 1]
[BILATERAL_WIN][BILATERAL_WIN];
void vp9_loop_bilateral_precal() {
int i;
for (i = 1; i < BILATERAL_LEVELS_KF + 1; i++) {
const bilateral_params_t param = vp9_bilateral_level_to_params(i, 1);
const int sigma_x = param.sigma_x;
const int sigma_y = param.sigma_y;
const int sigma_r = param.sigma_r;
const double sigma_r_d = (double)sigma_r / BILATERAL_PRECISION;
const double sigma_x_d = (double)sigma_x / BILATERAL_PRECISION;
const double sigma_y_d = (double)sigma_y / BILATERAL_PRECISION;
double *fr = bilateral_filters_r_kf[i] + 256;
int j, x, y;
for (j = 0; j <= 256; j++) {
fr[j] = exp(-(j * j) / (2 * sigma_r_d * sigma_r_d));
fr[-j] = fr[j];
}
for (y = -BILATERAL_HALFWIN; y <= BILATERAL_HALFWIN; y++) {
for (x = -BILATERAL_HALFWIN; x <= BILATERAL_HALFWIN; x++) {
bilateral_filters_s_kf[i][y + BILATERAL_HALFWIN]
[x + BILATERAL_HALFWIN] =
exp(-(x * x) / (2 * sigma_x_d * sigma_x_d)
-(y * y) / (2 * sigma_y_d * sigma_y_d));
}
}
}
for (i = 1; i < BILATERAL_LEVELS + 1; i++) {
const bilateral_params_t param = vp9_bilateral_level_to_params(i, 0);
const int sigma_x = param.sigma_x;
const int sigma_y = param.sigma_y;
const int sigma_r = param.sigma_r;
const double sigma_r_d = (double)sigma_r / BILATERAL_PRECISION;
const double sigma_x_d = (double)sigma_x / BILATERAL_PRECISION;
const double sigma_y_d = (double)sigma_y / BILATERAL_PRECISION;
double *fr = bilateral_filters_r[i] + 256;
int j, x, y;
for (j = 0; j <= 256; j++) {
fr[j] = exp(-(j * j) / (2 * sigma_r_d * sigma_r_d));
fr[-j] = fr[j];
}
for (y = -BILATERAL_HALFWIN; y <= BILATERAL_HALFWIN; y++) {
for (x = -BILATERAL_HALFWIN; x <= BILATERAL_HALFWIN; x++) {
bilateral_filters_s[i][y + BILATERAL_HALFWIN][x + BILATERAL_HALFWIN] =
exp(-(x * x) / (2 * sigma_x_d * sigma_x_d)
-(y * y) / (2 * sigma_y_d * sigma_y_d));
}
}
}
}
int vp9_bilateral_level_bits(const VP9_COMMON *const cm) {
return cm->frame_type == KEY_FRAME ?
BILATERAL_LEVEL_BITS_KF : BILATERAL_LEVEL_BITS;
}
int vp9_loop_bilateral_used(int level, int kf) {
const bilateral_params_t param = vp9_bilateral_level_to_params(level, kf);
return (param.sigma_x && param.sigma_y && param.sigma_r);
}
void vp9_loop_bilateral_init(loop_filter_info_n *lfi, int level, int kf) {
lfi->bilateral_used = vp9_loop_bilateral_used(level, kf);
if (lfi->bilateral_used) {
int i;
lfi->wr_lut = kf ? bilateral_filters_r_kf[level] :
bilateral_filters_r[level];
for (i = 0; i < BILATERAL_WIN; i++)
lfi->wx_lut[i] = kf ? bilateral_filters_s_kf[level][i] :
bilateral_filters_s[level][i];
}
}
static int is_in_image(int x, int y, int width, int height) {
return (x >= 0 && x < width && y >= 0 && y < height);
}
void loop_bilateral_filter(uint8_t *data, int width, int height,
int stride, loop_filter_info_n *lfi,
uint8_t *tmpdata, int tmpstride) {
int i, j;
const double *wr_lut_ = lfi->wr_lut + 256;
uint8_t *data_p = data;
uint8_t *tmpdata_p = tmpdata;
for (i = 0; i < height; ++i) {
for (j = 0; j < width; ++j) {
int x, y;
double flsum = 0, wtsum = 0, wt;
uint8_t *data_p2 = data_p + j - BILATERAL_HALFWIN * stride;
for (y = -BILATERAL_HALFWIN; y <= BILATERAL_HALFWIN; ++y) {
for (x = -BILATERAL_HALFWIN; x <= BILATERAL_HALFWIN; ++x) {
if (!is_in_image(j + x, i + y, width, height))
continue;
wt = lfi->wx_lut[y + BILATERAL_HALFWIN][x + BILATERAL_HALFWIN] *
wr_lut_[data_p2[x] - data_p[j]];
wtsum += wt;
flsum += wt * data_p2[x];
}
data_p2 += stride;
}
assert(wtsum > 0);
tmpdata_p[j] = clip_pixel((int)(flsum / wtsum + 0.5));
}
tmpdata_p += tmpstride;
data_p += stride;
}
for (i = 0; i < height; ++i) {
vpx_memcpy(data + i * stride, tmpdata + i * tmpstride,
width * sizeof(*data));
}
}
#if CONFIG_VP9_HIGHBITDEPTH
void loop_bilateral_filter_highbd(uint8_t *data8, int width, int height,
int stride, loop_filter_info_n *lfi,
uint8_t *tmpdata8, int tmpstride, int bit_depth) {
int i, j;
const double *wr_lut_ = lfi->wr_lut + 256;
uint16_t *data = CONVERT_TO_SHORTPTR(data8);
uint16_t *tmpdata = CONVERT_TO_SHORTPTR(tmpdata8);
uint16_t *data_p = data;
uint16_t *tmpdata_p = tmpdata;
for (i = 0; i < height; ++i) {
for (j = 0; j < width; ++j) {
int x, y, diff_r;
double flsum = 0, wtsum = 0, wt;
uint16_t *data_p2 = data_p + j - BILATERAL_HALFWIN * stride;
for (y = -BILATERAL_HALFWIN; y <= BILATERAL_HALFWIN; ++y) {
for (x = -BILATERAL_HALFWIN; x <= BILATERAL_HALFWIN; ++x) {
if (!is_in_image(j + x, i + y, width, height))
continue;
diff_r = (data_p2[x] - data_p[j]) >> (bit_depth - 8);
assert(diff_r >= -256 && diff_r <= 256);
wt = lfi->wx_lut[y + BILATERAL_HALFWIN][x + BILATERAL_HALFWIN] *
wr_lut_[diff_r];
wtsum += wt;
flsum += wt * data_p2[x];
}
data_p2 += stride;
}
assert(wtsum > 0);
tmpdata_p[j] = (int)(flsum / wtsum + 0.5);
}
tmpdata_p += tmpstride;
data_p += stride;
}
for (i = 0; i < height; ++i) {
vpx_memcpy(data + i * stride, tmpdata + i * tmpstride,
width * sizeof(*data));
}
}
#endif // CONFIG_VP9_HIGHBITDEPTH
void vp9_loop_bilateral_rows(YV12_BUFFER_CONFIG *frame,
VP9_COMMON *cm,
int start_mi_row, int end_mi_row,
int y_only) {
const int ywidth = frame->y_crop_width;
const int ystride = frame->y_stride;
const int uvwidth = frame->uv_crop_width;
const int uvstride = frame->uv_stride;
const int ystart = start_mi_row << MI_SIZE_LOG2;
const int uvstart = ystart >> cm->subsampling_y;
int yend = end_mi_row << MI_SIZE_LOG2;
int uvend = yend >> cm->subsampling_y;
YV12_BUFFER_CONFIG *tmp_buf;
yend = MIN(yend, cm->height);
uvend = MIN(uvend, cm->subsampling_y ? (cm->height + 1) >> 1 : cm->height);
if (vp9_realloc_frame_buffer(&cm->tmp_loop_buf, cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
#if CONFIG_VP9_HIGHBITDEPTH
cm->use_highbitdepth,
#endif // CONFIG_VP9_HIGHBITDEPTH
0, NULL, NULL, NULL) < 0)
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate post-processing buffer");
tmp_buf = &cm->tmp_loop_buf;
#if CONFIG_VP9_HIGHBITDEPTH
if (cm->use_highbitdepth)
loop_bilateral_filter_highbd(frame->y_buffer + ystart * ystride,
ywidth, yend - ystart, ystride, &cm->lf_info,
tmp_buf->y_buffer + ystart * tmp_buf->y_stride,
tmp_buf->y_stride, cm->bit_depth);
else
#endif // CONFIG_VP9_HIGHBITDEPTH
loop_bilateral_filter(frame->y_buffer + ystart * ystride,
ywidth, yend - ystart, ystride, &cm->lf_info,
tmp_buf->y_buffer + ystart * tmp_buf->y_stride,
tmp_buf->y_stride);
if (!y_only) {
#if CONFIG_VP9_HIGHBITDEPTH
if (cm->use_highbitdepth) {
loop_bilateral_filter_highbd(frame->u_buffer + uvstart * uvstride,
uvwidth, uvend - uvstart, uvstride, &cm->lf_info,
tmp_buf->u_buffer + uvstart * tmp_buf->uv_stride,
tmp_buf->uv_stride, cm->bit_depth);
loop_bilateral_filter_highbd(frame->v_buffer + uvstart * uvstride,
uvwidth, uvend - uvstart, uvstride, &cm->lf_info,
tmp_buf->v_buffer + uvstart * tmp_buf->uv_stride,
tmp_buf->uv_stride, cm->bit_depth);
} else {
#endif // CONFIG_VP9_HIGHBITDEPTH
loop_bilateral_filter(frame->u_buffer + uvstart * uvstride,
uvwidth, uvend - uvstart, uvstride, &cm->lf_info,
tmp_buf->u_buffer + uvstart * tmp_buf->uv_stride,
tmp_buf->uv_stride);
loop_bilateral_filter(frame->v_buffer + uvstart * uvstride,
uvwidth, uvend - uvstart, uvstride, &cm->lf_info,
tmp_buf->v_buffer + uvstart * tmp_buf->uv_stride,
tmp_buf->uv_stride);
#if CONFIG_VP9_HIGHBITDEPTH
}
#endif // CONFIG_VP9_HIGHBITDEPTH
}
}
#endif // CONFIG_LOOP_POSTFILTER
static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) {
int lvl;
@@ -246,6 +510,10 @@ void vp9_loop_filter_init(VP9_COMMON *cm) {
// init hev threshold const vectors
for (lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++)
vpx_memset(lfi->lfthr[lvl].hev_thr, (lvl >> 4), SIMD_WIDTH);
#if CONFIG_LOOP_POSTFILTER
vp9_loop_bilateral_precal();
#endif // CONFIG_LOOP_POSTFILTER
}
void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) {
@@ -702,14 +970,16 @@ static void highbd_filter_selectively_horiz(uint16_t *s, int pitch,
// block we are currently looking at. Shift is used to position the
// 1's we produce.
// TODO(JBB) Need another function for different resolution color..
static void build_masks(const VP9_COMMON *const cm,
const MODE_INFO *mi, const int shift_y,
const int shift_uv,
LOOP_FILTER_MASK *lfm) {
const loop_filter_info_n *const lfi_n = &cm->lf_info;
const MB_MODE_INFO *mbmi = &mi->mbmi;
const BLOCK_SIZE block_size = mbmi->sb_type;
const TX_SIZE tx_size_y = mbmi->tx_size;
const TX_SIZE tx_size_uv = get_uv_tx_size_impl(
tx_size_y, block_size, cm->subsampling_x, cm->subsampling_y);
const int filter_level = get_filter_level(lfi_n, mbmi);
uint64_t *const left_y = &lfm->left_y[tx_size_y];
uint64_t *const above_y = &lfm->above_y[tx_size_y];
@@ -782,12 +1052,21 @@ static void build_masks(const loop_filter_info_n *const lfi_n,
// This function does the same thing as the one above with the exception that
// it only affects the y masks. It exists because for blocks < 16x16 in size,
// we only update u and v masks on the first block.
static void build_y_mask(const VP9_COMMON *const cm,
const MODE_INFO *mi, const int shift_y,
#if CONFIG_SUPERTX
int supertx_enabled,
#endif
LOOP_FILTER_MASK *lfm) {
const loop_filter_info_n *const lfi_n = &cm->lf_info;
const MB_MODE_INFO *mbmi = &mi->mbmi;
const TX_SIZE tx_size_y = mbmi->tx_size;
#if CONFIG_SUPERTX
const BLOCK_SIZE block_size =
supertx_enabled ? (BLOCK_SIZE)(3 * tx_size_y) : mbmi->sb_type;
#else
const BLOCK_SIZE block_size = mbmi->sb_type;
#endif
const int filter_level = get_filter_level(lfi_n, mbmi);
uint64_t *const left_y = &lfm->left_y[tx_size_y];
uint64_t *const above_y = &lfm->above_y[tx_size_y];
@@ -829,7 +1108,6 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
MODE_INFO *mi, const int mode_info_stride,
LOOP_FILTER_MASK *lfm) {
int idx_32, idx_16, idx_8;
MODE_INFO *mip = mi;
MODE_INFO *mip2 = mi;
@@ -866,23 +1144,36 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
// through the recursive loop structure multiple times.
switch (mip->mbmi.sb_type) {
case BLOCK_64X64:
build_masks(cm, mip, 0, 0, lfm);
break;
case BLOCK_64X32:
build_masks(cm, mip, 0, 0, lfm);
#if CONFIG_SUPERTX && CONFIG_TX64X64
if (supertx_enabled(&mip->mbmi))
break;
#endif
mip2 = mip + mode_info_stride * 4;
if (4 >= max_rows)
break;
build_masks(cm, mip2, 32, 8, lfm);
break;
case BLOCK_32X64:
build_masks(cm, mip, 0, 0, lfm);
#if CONFIG_SUPERTX && CONFIG_TX64X64
if (supertx_enabled(&mip->mbmi))
break;
#endif
mip2 = mip + 4;
if (4 >= max_cols)
break;
build_masks(cm, mip2, 4, 2, lfm);
break;
default:
#if CONFIG_SUPERTX && CONFIG_TX64X64
if (mip->mbmi.tx_size == TX_64X64) {
build_masks(cm, mip, 0, 0, lfm);
} else {
#endif
for (idx_32 = 0; idx_32 < 4; mip += offset_32[idx_32], ++idx_32) {
const int shift_y = shift_32_y[idx_32];
const int shift_uv = shift_32_uv[idx_32];
@@ -892,23 +1183,36 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
continue;
switch (mip->mbmi.sb_type) {
case BLOCK_32X32:
build_masks(cm, mip, shift_y, shift_uv, lfm);
break;
case BLOCK_32X16:
build_masks(cm, mip, shift_y, shift_uv, lfm);
#if CONFIG_SUPERTX
if (supertx_enabled(&mip->mbmi))
break;
#endif
if (mi_32_row_offset + 2 >= max_rows)
continue;
mip2 = mip + mode_info_stride * 2;
build_masks(cm, mip2, shift_y + 16, shift_uv + 4, lfm);
break;
case BLOCK_16X32:
build_masks(cm, mip, shift_y, shift_uv, lfm);
#if CONFIG_SUPERTX
if (supertx_enabled(&mip->mbmi))
break;
#endif
if (mi_32_col_offset + 2 >= max_cols)
continue;
mip2 = mip + 2;
build_masks(cm, mip2, shift_y + 2, shift_uv + 1, lfm);
break;
default:
#if CONFIG_SUPERTX
if (mip->mbmi.tx_size == TX_32X32) {
build_masks(cm, mip, shift_y, shift_uv, lfm);
} else {
#endif
for (idx_16 = 0; idx_16 < 4; mip += offset_16[idx_16], ++idx_16) {
const int shift_y = shift_32_y[idx_32] + shift_16_y[idx_16];
const int shift_uv = shift_32_uv[idx_32] + shift_16_uv[idx_16];
@@ -922,27 +1226,48 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
switch (mip->mbmi.sb_type) {
case BLOCK_16X16:
build_masks(cm, mip, shift_y, shift_uv, lfm);
break;
case BLOCK_16X8:
build_masks(cm, mip, shift_y, shift_uv, lfm);
#if CONFIG_SUPERTX
if (supertx_enabled(&mip->mbmi))
break;
#endif
if (mi_16_row_offset + 1 >= max_rows)
continue;
mip2 = mip + mode_info_stride;
build_y_mask(cm, mip2, shift_y + 8,
#if CONFIG_SUPERTX
0,
#endif
lfm);
break;
case BLOCK_8X16:
build_masks(cm, mip, shift_y, shift_uv, lfm);
#if CONFIG_SUPERTX
if (supertx_enabled(&mip->mbmi))
break;
#endif
if (mi_16_col_offset + 1 >= max_cols)
continue;
mip2 = mip + 1;
build_y_mask(cm, mip2, shift_y + 1,
#if CONFIG_SUPERTX
0,
#endif
lfm);
break;
default: {
#if CONFIG_SUPERTX
if (mip->mbmi.tx_size == TX_16X16) {
build_masks(cm, mip, shift_y, shift_uv, lfm);
} else {
#endif
const int shift_y = shift_32_y[idx_32] +
shift_16_y[idx_16] +
shift_8_y[0];
build_masks(cm, mip, shift_y, shift_uv, lfm);
mip += offset[0];
for (idx_8 = 1; idx_8 < 4; mip += offset[idx_8], ++idx_8) {
const int shift_y = shift_32_y[idx_32] +
@@ -956,15 +1281,28 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
if (mi_8_col_offset >= max_cols ||
mi_8_row_offset >= max_rows)
continue;
build_y_mask(cm, mip, shift_y,
#if CONFIG_SUPERTX
supertx_enabled(&mip->mbmi),
#endif
lfm);
}
#if CONFIG_SUPERTX
}
#endif
break;
}
}
}
#if CONFIG_SUPERTX
}
#endif
break;
}
}
#if CONFIG_SUPERTX && CONFIG_TX64X64
}
#endif
break;
}
// The largest loopfilter we have is 16x16 so we use the 16x16 mask
@@ -1193,7 +1531,7 @@ static void filter_block_plane_non420(VP9_COMMON *cm,
const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;
// Filter level can vary per MI
if (!(lfl[(r << MI_BLOCK_SIZE_LOG2) + (c >> ss_x)] =
get_filter_level(&cm->lf_info, &mi[0].mbmi)))
continue;
@@ -1255,7 +1593,8 @@ static void filter_block_plane_non420(VP9_COMMON *cm,
mask_8x8_c & border_mask,
mask_4x4_c & border_mask,
mask_4x4_int[r],
&cm->lf_info,
&lfl[r << MI_BLOCK_SIZE_LOG2],
(int)cm->bit_depth);
} else {
filter_selectively_vert(dst->buf, dst->stride,
@@ -1263,7 +1602,7 @@ static void filter_block_plane_non420(VP9_COMMON *cm,
mask_8x8_c & border_mask,
mask_4x4_c & border_mask,
mask_4x4_int[r],
&cm->lf_info, &lfl[r << MI_BLOCK_SIZE_LOG2]);
}
#else
filter_selectively_vert(dst->buf, dst->stride,
@@ -1271,7 +1610,7 @@ static void filter_block_plane_non420(VP9_COMMON *cm,
mask_8x8_c & border_mask,
mask_4x4_c & border_mask,
mask_4x4_int[r],
&cm->lf_info, &lfl[r << MI_BLOCK_SIZE_LOG2]);
#endif // CONFIG_VP9_HIGHBITDEPTH
dst->buf += 8 * dst->stride;
mi_8x8 += row_step_stride;
@@ -1304,7 +1643,8 @@ static void filter_block_plane_non420(VP9_COMMON *cm,
mask_8x8_r,
mask_4x4_r,
mask_4x4_int_r,
&cm->lf_info,
&lfl[r << MI_BLOCK_SIZE_LOG2],
(int)cm->bit_depth);
} else {
filter_selectively_horiz(dst->buf, dst->stride,
@@ -1312,7 +1652,7 @@ static void filter_block_plane_non420(VP9_COMMON *cm,
mask_8x8_r,
mask_4x4_r,
mask_4x4_int_r,
&cm->lf_info, &lfl[r << MI_BLOCK_SIZE_LOG2]);
}
#else
filter_selectively_horiz(dst->buf, dst->stride,
@@ -1320,7 +1660,7 @@ static void filter_block_plane_non420(VP9_COMMON *cm,
mask_8x8_r,
mask_4x4_r,
mask_4x4_int_r,
&cm->lf_info, &lfl[r << MI_BLOCK_SIZE_LOG2]);
#endif // CONFIG_VP9_HIGHBITDEPTH
dst->buf += 8 * dst->stride;
}
@@ -1576,8 +1916,12 @@ void vp9_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer,
struct macroblockd_plane planes[MAX_MB_PLANE],
int start, int stop, int y_only) {
const int num_planes = y_only ? 1 : MAX_MB_PLANE;
#if CONFIG_EXT_PARTITION || CONFIG_EXT_CODING_UNIT_SIZE
const int use_420 = 0;
#else
const int use_420 = y_only || (planes[1].subsampling_y == 1 &&
planes[1].subsampling_x == 1);
#endif
LOOP_FILTER_MASK lfm;
int mi_row, mi_col;
@@ -1610,7 +1954,8 @@ void vp9_loop_filter_frame(YV12_BUFFER_CONFIG *frame,
int frame_filter_level,
int y_only, int partial_frame) {
int start_mi_row, end_mi_row, mi_rows_to_filter;
if (!frame_filter_level)
return;
start_mi_row = 0;
mi_rows_to_filter = cm->mi_rows;
if (partial_frame && cm->mi_rows > 8) {
@@ -1619,15 +1964,45 @@ void vp9_loop_filter_frame(YV12_BUFFER_CONFIG *frame,
mi_rows_to_filter = MAX(cm->mi_rows / 8, 8);
}
end_mi_row = start_mi_row + mi_rows_to_filter;
if (frame_filter_level) {
vp9_loop_filter_frame_init(cm, frame_filter_level);
vp9_loop_filter_rows(frame, cm, xd->plane,
start_mi_row, end_mi_row,
y_only);
}
}
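Both vp9_loop_filter_frame above and vp9_loop_bilateral_frame below use the same partial-frame heuristic: filter only a band that starts at the frame's middle MI row, aligned down to a multiple of 8, and is at least 8 MI rows (one eighth of the frame) tall. Extracted as a standalone sketch (the helper name is ours):

```c
#include <assert.h>

/* Row-range selection used for partial-frame loop filtering. */
static void partial_filter_rows(int mi_rows, int partial_frame,
                                int *start_mi_row, int *mi_rows_to_filter) {
  *start_mi_row = 0;
  *mi_rows_to_filter = mi_rows;
  if (partial_frame && mi_rows > 8) {
    /* middle of the frame, aligned down to 8 MI rows */
    *start_mi_row = (mi_rows >> 1) & 0xfffffff8;
    /* at least 8 rows, otherwise one eighth of the frame */
    *mi_rows_to_filter = mi_rows / 8 > 8 ? mi_rows / 8 : 8;
  }
}
```

Partial filtering is used by the encoder's filter-level search: running the filter over a representative band instead of the whole frame makes each candidate level much cheaper to evaluate.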
#if CONFIG_LOOP_POSTFILTER
void vp9_loop_bilateral_frame(YV12_BUFFER_CONFIG *frame,
VP9_COMMON *cm,
int bilateral_level,
int y_only, int partial_frame) {
int start_mi_row, end_mi_row, mi_rows_to_filter;
const int loop_bilateral_used = vp9_loop_bilateral_used(
bilateral_level, cm->frame_type == KEY_FRAME);
if (!loop_bilateral_used)
return;
start_mi_row = 0;
mi_rows_to_filter = cm->mi_rows;
if (partial_frame && cm->mi_rows > 8) {
start_mi_row = cm->mi_rows >> 1;
start_mi_row &= 0xfffffff8;
mi_rows_to_filter = MAX(cm->mi_rows / 8, 8);
}
end_mi_row = start_mi_row + mi_rows_to_filter;
if (loop_bilateral_used) {
vp9_loop_bilateral_init(&cm->lf_info, bilateral_level,
cm->frame_type == KEY_FRAME);
vp9_loop_bilateral_rows(frame, cm, start_mi_row, end_mi_row, y_only);
}
}
#endif // CONFIG_LOOP_POSTFILTER
int vp9_loop_filter_worker(LFWorkerData *const lf_data, void *unused) {
(void)unused;
if (lf_data->cm->lf.filter_level) {
vp9_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
lf_data->start, lf_data->stop, lf_data->y_only);
}
return 1;
}


@@ -26,9 +26,79 @@ extern "C" {
#define SIMD_WIDTH 16
#if CONFIG_MULTI_REF
#define MAX_REF_LF_DELTAS 7
#else // CONFIG_MULTI_REF
#define MAX_REF_LF_DELTAS 4
#endif // CONFIG_MULTI_REF
#define MAX_MODE_LF_DELTAS 2
struct VP9Common;
#if CONFIG_LOOP_POSTFILTER
#define BILATERAL_LEVEL_BITS_KF 4
#define BILATERAL_LEVELS_KF (1 << BILATERAL_LEVEL_BITS_KF)
#define BILATERAL_LEVEL_BITS 3
#define BILATERAL_LEVELS (1 << BILATERAL_LEVEL_BITS)
#define DEF_BILATERAL_LEVEL 2
#define BILATERAL_PRECISION 16
#define BILATERAL_HALFWIN 3
#define BILATERAL_WIN (2 * BILATERAL_HALFWIN + 1)
typedef struct bilateral_params {
int sigma_x; // spatial variance x
int sigma_y; // spatial variance y
int sigma_r; // range variance
} bilateral_params_t;
static bilateral_params_t
bilateral_level_to_params_arr[BILATERAL_LEVELS + 1] = {
// Values are rounded to 1/16 th precision
{0, 0, 0}, // 0 - default
{8, 9, 30},
{9, 8, 30},
{9, 11, 32},
{11, 9, 32},
{14, 14, 32},
{18, 18, 36},
{24, 24, 40},
{32, 32, 40},
};
static bilateral_params_t
bilateral_level_to_params_arr_kf[BILATERAL_LEVELS_KF + 1] = {
// Values are rounded to 1/16 th precision
{0, 0, 0}, // 0 - default
{8, 8, 30},
{9, 9, 32},
{10, 10, 32},
{12, 12, 32},
{14, 14, 32},
{18, 18, 36},
{24, 24, 40},
{30, 30, 44},
{36, 36, 48},
{42, 42, 48},
{48, 48, 48},
{48, 48, 56},
{56, 56, 48},
{56, 56, 56},
{56, 56, 64},
{64, 64, 48},
};
int vp9_bilateral_level_bits(const struct VP9Common *const cm);
int vp9_loop_bilateral_used(int level, int kf);
static INLINE bilateral_params_t vp9_bilateral_level_to_params(
int index, int kf) {
return kf ? bilateral_level_to_params_arr_kf[index] :
bilateral_level_to_params_arr[index];
}
#endif // CONFIG_LOOP_POSTFILTER
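A bilateral level indexes one of the two parameter tables above (the key-frame table has more entries), and level 0 maps to all-zero sigmas, which vp9_loop_bilateral_used treats as "filter off". A condensed sketch of that lookup with a truncated copy of the inter-frame table:

```c
#include <assert.h>

typedef struct { int sigma_x, sigma_y, sigma_r; } bparams_t;

/* First entries of the inter-frame table; values are in 1/16th units. */
static const bparams_t level_to_params[3] = {
  {0, 0, 0},   /* level 0: disabled */
  {8, 9, 30},
  {9, 8, 30},
};

static int bilateral_used(int level) {
  const bparams_t p = level_to_params[level];
  /* any zero sigma disables the filter */
  return p.sigma_x && p.sigma_y && p.sigma_r;
}
```

This is why the encoder can signal "no bilateral filtering" without a separate flag: it simply codes level 0.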
struct loopfilter {
int filter_level;
@@ -38,13 +108,19 @@ struct loopfilter {
uint8_t mode_ref_delta_enabled;
uint8_t mode_ref_delta_update;
// 0 = Intra, Last, Last2+Last3+Last4(CONFIG_MULTI_REF),
// GF, ARF
signed char ref_deltas[MAX_REF_LF_DELTAS];
signed char last_ref_deltas[MAX_REF_LF_DELTAS];
// 0 = ZERO_MV, MV
signed char mode_deltas[MAX_MODE_LF_DELTAS];
signed char last_mode_deltas[MAX_MODE_LF_DELTAS];
#if CONFIG_LOOP_POSTFILTER
int bilateral_level;
int last_bilateral_level;
#endif
};
// Need to align this structure so when it is declared and
@@ -58,6 +134,14 @@ typedef struct {
typedef struct {
loop_filter_thresh lfthr[MAX_LOOP_FILTER + 1];
uint8_t lvl[MAX_SEGMENTS][MAX_REF_FRAMES][MAX_MODE_LF_DELTAS];
#if CONFIG_LOOP_POSTFILTER
double * wx_lut[BILATERAL_WIN];
double * wr_lut;
int bilateral_sigma_x_set;
int bilateral_sigma_y_set;
int bilateral_sigma_r_set;
int bilateral_used;
#endif
} loop_filter_info_n;
// This structure holds bit masks for all 8x8 blocks in a 64x64 region.
@@ -76,12 +160,11 @@ typedef struct {
uint16_t left_uv[TX_SIZES];
uint16_t above_uv[TX_SIZES];
uint16_t int_4x4_uv;
uint8_t lfl_y[MI_BLOCK_SIZE * MI_BLOCK_SIZE];
uint8_t lfl_uv[MI_BLOCK_SIZE / 2 * MI_BLOCK_SIZE / 2];
} LOOP_FILTER_MASK;
/* assorted loopfilter functions which get used elsewhere */
struct macroblockd;
struct VP9LfSyncData;
@@ -115,6 +198,24 @@ void vp9_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer,
struct VP9Common *cm,
struct macroblockd_plane planes[MAX_MB_PLANE],
int start, int stop, int y_only);
#if CONFIG_LOOP_POSTFILTER
void vp9_loop_bilateral_frame(YV12_BUFFER_CONFIG *frame,
struct VP9Common *cm,
int bilateral_level,
int y_only, int partial_frame);
void vp9_loop_filter_bilateral_frame(YV12_BUFFER_CONFIG *frame,
struct VP9Common *cm,
struct macroblockd *mbd,
int frame_filter_level,
int bilateral_level,
int y_only, int partial_frame);
void vp9_loop_bilateral_init(loop_filter_info_n *lfi, int level, int kf);
void vp9_loop_bilateral_rows(YV12_BUFFER_CONFIG *frame,
struct VP9Common *cm,
int start_mi_row, int end_mi_row,
int y_only);
#endif // CONFIG_LOOP_POSTFILTER
typedef struct LoopFilterWorkerData {
YV12_BUFFER_CONFIG *frame_buffer;


@@ -0,0 +1,386 @@
/*
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be
* found in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <assert.h>
#include "vp9/common/vp9_common_data.h"
#include "vp9/common/vp9_mv.h"
#include "vp9/common/vp9_motion_model.h"
inline projectPointsType get_projectPointsType(TransformationType type) {
switch (type) {
case HOMOGRAPHY:
return projectPointsHomography;
case AFFINE:
return projectPointsAffine;
case ROTZOOM:
return projectPointsRotZoom;
case TRANSLATION:
return projectPointsTranslation;
default:
assert(0);
return NULL;
}
}
void projectPointsTranslation(double *mat, double *points, double *proj,
const int n,
const int stride_points,
const int stride_proj) {
int i;
for (i = 0; i < n; ++i) {
const double x = *(points++), y = *(points++);
*(proj++) = x + mat[0];
*(proj++) = y + mat[1];
points += stride_points - 2;
proj += stride_proj - 2;
}
}
void projectPointsRotZoom(double *mat, double *points,
double *proj, const int n,
const int stride_points, const int stride_proj) {
int i;
for (i = 0; i < n; ++i) {
const double x = *(points++), y = *(points++);
*(proj++) = mat[0] * x + mat[1] * y + mat[2];
*(proj++) = -mat[1] * x + mat[0] * y + mat[3];
points += stride_points - 2;
proj += stride_proj - 2;
}
}
void projectPointsAffine(double *mat, double *points,
double *proj, const int n,
const int stride_points, const int stride_proj) {
int i;
for (i = 0; i < n; ++i) {
const double x = *(points++), y = *(points++);
*(proj++) = mat[0] * x + mat[1] * y + mat[4];
*(proj++) = mat[2] * x + mat[3] * y + mat[5];
points += stride_points - 2;
proj += stride_proj - 2;
}
}
void projectPointsHomography(double *mat, double *points,
double *proj, const int n,
const int stride_points, const int stride_proj) {
int i;
double x, y, Z;
for (i = 0; i < n; ++i) {
x = *(points++), y = *(points++);
Z = 1. / (mat[6] * x + mat[7] * y + mat[8]);
*(proj++) = (mat[0] * x + mat[1] * y + mat[2]) * Z;
*(proj++) = (mat[3] * x + mat[4] * y + mat[5]) * Z;
points += stride_points - 2;
proj += stride_proj - 2;
}
}
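Each projectPoints* variant maps (x, y) pairs through a model with increasing freedom: 2 parameters for translation, 4 for rotzoom, 6 for affine, 8 (plus the projective divide) for homography. The rotzoom model applies the matrix [a b; -b a] plus a translation, so it can rotate and uniformly scale. A quick single-point check, replicating projectPointsRotZoom's arithmetic (the helper name is ours):

```c
#include <assert.h>

/* mat = {a, b, tx, ty}: x' = a*x + b*y + tx, y' = -b*x + a*y + ty. */
static void rotzoom_point(const double *mat, double x, double y,
                          double *px, double *py) {
  *px = mat[0] * x + mat[1] * y + mat[2];
  *py = -mat[1] * x + mat[0] * y + mat[3];
}
```

With a = 2, b = 0 this is a pure 2x zoom plus translation; with a = 0, b = 1 it is a 90-degree rotation.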
#define clip_pixel(v) ((v) < 0 ? 0 : ((v) > 255 ? 255 : (v)))
double getCubicValue(double p[4], double x) {
return p[1] + 0.5 * x * (p[2] - p[0]
+ x * (2.0 * p[0] - 5.0 * p[1] + 4.0 * p[2] - p[3]
+ x * (3.0 * (p[1] - p[2]) + p[3] - p[0])));
}
void get_subcolumn(unsigned char *ref, double col[4],
int stride, int x, int y_start) {
int i;
for (i = 0; i < 4; ++i) {
col[i] = ref[(i + y_start) * stride + x];
}
}
double bicubic(unsigned char *ref, double x, double y, int stride) {
double arr[4];
int k;
int i = (int) x;
int j = (int) y;
for (k = 0; k < 4; ++k) {
double arr_temp[4];
get_subcolumn(ref, arr_temp, stride, i + k - 1, j - 1);
arr[k] = getCubicValue(arr_temp, y - j);
}
return getCubicValue(arr, x - i);
}
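getCubicValue evaluates the Catmull-Rom cubic through four neighboring samples: it interpolates (returns p[1] at x = 0 and p[2] at x = 1) and reproduces linear ramps exactly, which is what bicubic() relies on when it runs the formula first down four columns and then across the resulting row. Replicating the formula to check those properties:

```c
#include <assert.h>
#include <math.h>

/* Catmull-Rom evaluation, identical in form to getCubicValue above. */
static double cubic(const double p[4], double x) {
  return p[1] + 0.5 * x * (p[2] - p[0] +
         x * (2.0 * p[0] - 5.0 * p[1] + 4.0 * p[2] - p[3] +
              x * (3.0 * (p[1] - p[2]) + p[3] - p[0])));
}
```

Because the curve passes exactly through p[1] and p[2], stitching adjacent 4-sample windows together yields a continuous interpolant across the whole image.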
unsigned char interpolate(unsigned char *ref, double x, double y,
int width, int height, int stride) {
if (x < 0 && y < 0) return ref[0];
else if (x < 0 && y > height - 1)
return ref[(height - 1) * stride];
else if (x > width - 1 && y < 0)
return ref[width - 1];
else if (x > width - 1 && y > height - 1)
return ref[(height - 1) * stride + (width - 1)];
else if (x < 0) {
int v;
int i = (int) y;
double a = y - i;
if (y > 1 && y < height - 2) {
double arr[4];
get_subcolumn(ref, arr, stride, 0, i - 1);
return clip_pixel(getCubicValue(arr, a));
}
v = (int)(ref[i * stride] * (1 - a) + ref[(i + 1) * stride] * a + 0.5);
return clip_pixel(v);
} else if (y < 0) {
int v;
int j = (int) x;
double b = x - j;
if (x > 1 && x < width - 2) {
double arr[4] = {ref[j - 1], ref[j], ref[j + 1], ref[j + 2]};
return clip_pixel(getCubicValue(arr, b));
}
v = (int)(ref[j] * (1 - b) + ref[j + 1] * b + 0.5);
return clip_pixel(v);
} else if (x > width - 1) {
int v;
int i = (int) y;
double a = y - i;
if (y > 1 && y < height - 2) {
double arr[4];
get_subcolumn(ref, arr, stride, width - 1, i - 1);
return clip_pixel(getCubicValue(arr, a));
}
v = (int)(ref[i * stride + width - 1] * (1 - a) +
ref[(i + 1) * stride + width - 1] * a + 0.5);
return clip_pixel(v);
} else if (y > height - 1) {
int v;
int j = (int) x;
double b = x - j;
if (x > 1 && x < width - 2) {
int row = (height - 1) * stride;
double arr[4] = {ref[row + j - 1], ref[row + j],
ref[row + j + 1], ref[row + j + 2]};
return clip_pixel(getCubicValue(arr, b));
}
v = (int)(ref[(height - 1) * stride + j] * (1 - b) +
ref[(height - 1) * stride + j + 1] * b + 0.5);
return clip_pixel(v);
} else if (x > 1 && y > 1 && x < width - 2 && y < height - 2) {
return clip_pixel(bicubic(ref, x, y, stride));
} else {
int i = (int) y;
int j = (int) x;
double a = y - i;
double b = x - j;
int v = (int)(ref[i * stride + j] * (1 - a) * (1 - b) +
ref[i * stride + j + 1] * (1 - a) * b +
ref[(i + 1) * stride + j] * a * (1 - b) +
ref[(i + 1) * stride + j + 1] * a * b);
return clip_pixel(v);
}
}
static void WarpImage(TransformationType type, double *H,
unsigned char *ref,
int width, int height, int stride,
unsigned char *pred,
int p_col, int p_row,
int p_width, int p_height, int p_stride,
int subsampling_col, int subsampling_row,
int x_scale, int y_scale) {
int i, j;
projectPointsType projectPoints = get_projectPointsType(type);
if (projectPoints == NULL)
return;
for (i = p_row; i < p_row + p_height; ++i) {
for (j = p_col; j < p_col + p_width; ++j) {
double in[2], out[2];
in[0] = subsampling_col ? 2 * j + 0.5 : j;
in[1] = subsampling_row ? 2 * i + 0.5 : i;
projectPoints(H, in, out, 1, 2, 2);
out[0] = subsampling_col ? (out[0] - 0.5) / 2.0 : out[0];
out[1] = subsampling_row ? (out[1] - 0.5) / 2.0 : out[1];
out[0] *= x_scale / 16.0;
out[1] *= y_scale / 16.0;
pred[(j - p_col) + (i - p_row) * p_stride] =
interpolate(ref, out[0], out[1], width, height, stride);
}
}
}
double compute_warp_and_error(Global_Motion_Params *gm,
projectPointsType projectPoints,
unsigned char *ref,
int width, int height, int stride,
unsigned char *src,
int p_col, int p_row,
int p_width, int p_height, int p_stride,
int subsampling_col, int subsampling_row,
int x_scale, int y_scale) {
double H[9];
int i, j;
int64_t sumerr = 0;
if (projectPoints == NULL)
return -1;
vp9_convert_params_to_rotzoom(gm, H);
for (i = p_row; i < p_row + p_height; ++i) {
for (j = p_col; j < p_col + p_width; ++j) {
double in[2], out[2];
uint8_t pred;
int err;
in[0] = subsampling_col ? 2 * j + 0.5 : j;
in[1] = subsampling_row ? 2 * i + 0.5 : i;
projectPoints(H, in, out, 1, 2, 2);
out[0] = subsampling_col ? (out[0] - 0.5) / 2.0 : out[0];
out[1] = subsampling_row ? (out[1] - 0.5) / 2.0 : out[1];
out[0] *= x_scale / 16.0;
out[1] *= y_scale / 16.0;
pred = interpolate(ref, out[0], out[1], width, height, stride);
err = pred - src[(j - p_col) + (i - p_row) * p_stride];
sumerr += err * err;
}
}
return sumerr / (width * height);
}
// Computes the ratio of the warp error to the zero motion error
double vp9_warp_erroradv_unq(TransformationType type, double *H,
unsigned char *ref,
int width, int height, int stride,
unsigned char *src,
int p_col, int p_row,
int p_width, int p_height, int p_stride,
int subsampling_col, int subsampling_row,
int x_scale, int y_scale) {
double H_z_translation[] = {0, 0};
double H_z_rotzoom[] = {1, 0, 0, 0};
double H_z_affine[] = {1, 0, 0, 1, 0, 0};
double H_z_homography[] = {1, 0, 0, 0, 1, 0, 0, 0, 1};
double *H_z = H_z_rotzoom;
int i, j;
int64_t sumerr = 0;
int64_t sumerr_z = 0;
projectPointsType projectPoints = get_projectPointsType(type);
if (type == TRANSLATION)
H_z = H_z_translation;
else if (type == ROTZOOM)
H_z = H_z_rotzoom;
else if (type == AFFINE)
H_z = H_z_affine;
else if (type == HOMOGRAPHY)
H_z = H_z_homography;
else
assert(0 && "Unknown TransformationType");
if (projectPoints == NULL)
return -1;
for (i = p_row; i < p_row + p_height; ++i) {
for (j = p_col; j < p_col + p_width; ++j) {
double in[2], out[2], out_z[2];
uint8_t pred, pred_z;
int err, err_z;
in[0] = subsampling_col ? 2 * j + 0.5 : j;
in[1] = subsampling_row ? 2 * i + 0.5 : i;
projectPoints(H, in, out, 1, 2, 2);
out[0] = subsampling_col ? (out[0] - 0.5) / 2.0 : out[0];
out[1] = subsampling_row ? (out[1] - 0.5) / 2.0 : out[1];
out[0] *= x_scale / 16.0;
out[1] *= y_scale / 16.0;
pred = interpolate(ref, out[0], out[1], width, height, stride);
err = pred - src[(j - p_col) + (i - p_row) * p_stride];
sumerr += err * err;
projectPoints(H_z, in, out_z, 1, 2, 2);
out_z[0] = subsampling_col ? (out_z[0] - 0.5) / 2.0 : out_z[0];
out_z[1] = subsampling_row ? (out_z[1] - 0.5) / 2.0 : out_z[1];
out_z[0] *= x_scale / 16.0;
out_z[1] *= y_scale / 16.0;
pred_z = interpolate(ref, out_z[0], out_z[1], width, height, stride);
err_z = pred_z - src[(j - p_col) + (i - p_row) * p_stride];
sumerr_z += err_z * err_z;
}
}
return (double)sumerr / sumerr_z;
}
void vp9_convert_params_to_rotzoom(Global_Motion_Params *model,
double *H) {
double z = (double) model->zoom / (1 << ZOOM_PRECISION_BITS);
double r = (double) model->rotation / (1 << ROTATION_PRECISION_BITS);
H[0] = (1 + z) * cos(r * M_PI / 180.0);
H[1] = -(1 + z) * sin(r * M_PI / 180.0);
H[2] = (double) model->mv.as_mv.col / 8.0;
H[3] = (double) model->mv.as_mv.row / 8.0;
}
void vp9_warp_plane(Global_Motion_Params *gm,
unsigned char *ref,
int width, int height, int stride,
unsigned char *pred,
int p_col, int p_row,
int p_width, int p_height, int p_stride,
int subsampling_col, int subsampling_row,
int x_scale, int y_scale) {
double H[9];
vp9_convert_params_to_rotzoom(gm, H);
WarpImage(ROTZOOM, H,
ref, width, height, stride,
pred, p_col, p_row, p_width, p_height, p_stride,
subsampling_col, subsampling_row,
x_scale, y_scale);
}
double vp9_warp_erroradv(Global_Motion_Params *gm,
unsigned char *ref,
int width, int height, int stride,
unsigned char *src,
int p_col, int p_row,
int p_width, int p_height, int p_stride,
int subsampling_col, int subsampling_row,
int x_scale, int y_scale) {
double H[9];
vp9_convert_params_to_rotzoom(gm, H);
return vp9_warp_erroradv_unq(ROTZOOM, H,
ref, width, height, stride,
src, p_col, p_row, p_width, p_height, p_stride,
subsampling_col, subsampling_row,
x_scale, y_scale);
}
static int_mv vp9_get_global_mv(int col, int row, Global_Motion_Params *model) {
int_mv mv;
double H[4];
double x, y;
vp9_convert_params_to_rotzoom(model, H);
x = H[0] * col + H[1] * row + H[2];
y = -H[1] * col + H[0] * row + H[3];
mv.as_mv.col = (int)floor(x * 8 + 0.5) - col;
mv.as_mv.row = (int)floor(y * 8 + 0.5) - row;
return mv;
}
int_mv vp9_get_global_sb_center_mv(int col, int row, int bw, int bh,
Global_Motion_Params *model) {
col += bw / 2;
row += bh / 2;
return vp9_get_global_mv(col, row, model);
}
int_mv vp9_get_global_sub8x8_center_mv(int col, int row, int block,
Global_Motion_Params *model) {
if (block == 0 || block == 2)
col += 2;
else
col += 6;
if (block == 0 || block == 1)
row += 2;
else
row += 6;
return vp9_get_global_mv(col, row, model);
}
@@ -0,0 +1,114 @@
/*
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be
* found in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP9_COMMON_VP9_MOTION_MODEL_H
#define VP9_COMMON_VP9_MOTION_MODEL_H
#include <stdio.h>
#include <stdlib.h>
#include <memory.h>
#include <math.h>
#include <assert.h>
#include "./vpx_config.h"
#include "vpx_ports/mem.h"
#include "vp9/common/vp9_enums.h"
#include "vp9/common/vp9_mv.h"
typedef void (*projectPointsType)(double *mat, double *points, double *proj,
const int n, const int stride_points,
const int stride_proj);
typedef enum {
UNKNOWN_TRANSFORM = -1,
HOMOGRAPHY, // homography, 8-parameter
AFFINE, // affine, 6-parameter
ROTZOOM, // simplified affine with rotation and zoom only, 4-parameter
TRANSLATION // translational motion, 2-parameter
} TransformationType;
static INLINE int get_numparams(TransformationType type) {
switch (type) {
case HOMOGRAPHY:
return 9;
case AFFINE:
return 6;
case ROTZOOM:
return 4;
case TRANSLATION:
return 2;
default:
assert(0);
return 0;
}
}
void projectPointsHomography(double *mat, double *points, double *proj,
const int n, const int stride_points,
const int stride_proj);
void projectPointsAffine(double *mat, double *points, double *proj,
const int n, const int stride_points,
const int stride_proj);
void projectPointsRotZoom(double *mat, double *points, double *proj,
const int n, const int stride_points,
const int stride_proj);
void projectPointsTranslation(double *mat, double *points, double *proj,
const int n, const int stride_points,
const int stride_proj);
projectPointsType get_projectPointsType(TransformationType type);
void vp9_convert_params_to_rotzoom(Global_Motion_Params *model, double *H);
void vp9_warp_plane(Global_Motion_Params *gm,
unsigned char *ref,
int width, int height, int stride,
unsigned char *pred,
int p_col, int p_row,
int p_width, int p_height, int p_stride,
int subsampling_col, int subsampling_row,
int x_scale, int y_scale);
double vp9_warp_erroradv(Global_Motion_Params *gm,
unsigned char *ref,
int width, int height, int stride,
unsigned char *src,
int p_col, int p_row,
int p_width, int p_height, int p_stride,
int subsampling_col, int subsampling_row,
int x_scale, int y_scale);
double vp9_warp_erroradv_unq(TransformationType type, double *H,
unsigned char *ref,
int width, int height, int stride,
unsigned char *src,
int p_col, int p_row,
int p_width, int p_height, int p_stride,
int subsampling_col, int subsampling_row,
int x_scale, int y_scale);
double compute_warp_and_error(Global_Motion_Params *gm,
projectPointsType projectPoints,
unsigned char *ref,
int width, int height, int stride,
unsigned char *src,
int p_col, int p_row,
int p_width, int p_height, int p_stride,
int subsampling_col, int subsampling_row,
int x_scale, int y_scale);
unsigned char interpolate(unsigned char *ref, double x, double y,
int width, int height, int stride);
int_mv vp9_get_global_sb_center_mv(int col, int row, int bw, int bh,
Global_Motion_Params *model);
int_mv vp9_get_global_sub8x8_center_mv(int col, int row, int block,
Global_Motion_Params *model);
#endif // VP9_COMMON_VP9_MOTION_MODEL_H
@@ -48,6 +48,40 @@ static INLINE void clamp_mv(MV *mv, int min_col, int max_col,
mv->row = clamp(mv->row, min_row, max_row);
}
#if CONFIG_GLOBAL_MOTION
#define MAX_GLOBAL_MOTION_MODELS 1
#define ZOOM_PRECISION_BITS 11
#define ROTATION_PRECISION_BITS 11
#define ABS_ZOOM_BITS 11
#define ABS_ROTATION_BITS 11
#define ABS_TRANSLATION_BITS 11
typedef enum {
GLOBAL_ZERO = 0,
GLOBAL_TRANSLATION = 1,
GLOBAL_ROTZOOM = 2,
GLOBAL_MOTION_TYPES
} GLOBAL_MOTION_TYPE;
// Currently this is specialized for rotzoom model only
typedef struct {
GLOBAL_MOTION_TYPE gmtype;
int rotation; // positive or negative rotation angle in degrees
int zoom; // this is actually the zoom multiplier minus 1
int_mv mv;
} Global_Motion_Params;
static INLINE GLOBAL_MOTION_TYPE get_gmtype(const Global_Motion_Params *gm) {
if (gm->rotation == 0 && gm->zoom == 0) {
return (gm->mv.as_int == 0 ? GLOBAL_ZERO : GLOBAL_TRANSLATION);
} else {
return GLOBAL_ROTZOOM;
}
}
#endif // CONFIG_GLOBAL_MOTION
#ifdef __cplusplus
} // extern "C"
#endif
@@ -1,4 +1,3 @@
/*
* Copyright (c) 2012 The WebM project authors. All Rights Reserved.
*
@@ -11,6 +10,260 @@
#include "vp9/common/vp9_mvref_common.h"
#if CONFIG_NEW_INTER && CONFIG_NEWMVREF
// This function returns either the appropriate subblock or block's mv,
// depending on whether block_size < 8x8 for both current block and the
// examined candidate block.
static int_mv get_subblock_mv(const MODE_INFO *candidate,
const MODE_INFO *current,
int curr_blk_idx, int ref,
int search_row, int search_col) {
int candidate_type = candidate->mbmi.sb_type;
if (curr_blk_idx >= 0 && candidate_type < BLOCK_8X8) {
int candidate_blk_idx = 0;
assert(current->mbmi.sb_type < BLOCK_8X8);
// Both current block and the candidate block are in sub8x8 mode
if ((search_row == -1 && search_col == 0) || // top
(search_row == 0 && search_col == -1)) { // left
int i = curr_blk_idx + current->mbmi.sb_type * 4;
int j = (search_row == 0); // top: 0; left: 1
candidate_blk_idx = idx_to_subblock_top_left[i][j][candidate_type];
return (candidate_blk_idx >= 0) ?
candidate->bmi[candidate_blk_idx].as_mv[ref] :
candidate->mbmi.mv[ref];
} else if ((search_row == -1 && search_col == 1) || // top_right
(search_row == -1 && search_col == -1)) { // top_left
candidate_blk_idx =
idx_to_subblock_topright_topleft[search_col == -1][candidate_type];
return candidate->bmi[candidate_blk_idx].as_mv[ref];
}
}
return candidate->mbmi.mv[ref];
}
static int get_mvref_zone_idx(const TileInfo *const tile, int bsize,
int mi_row, int mi_col) {
int mvref_zone_idx = 0;
int row_8x8 = mi_row % 8;
int col_8x8 = mi_col % 8;
switch (bsize) {
case BLOCK_4X4:
case BLOCK_4X8:
case BLOCK_8X4:
case BLOCK_8X8:
mvref_zone_idx =
(mi_col >= (tile->mi_col_end - 1) || // right-most column
(mv_ref_topright_avail_8x8[row_8x8][col_8x8] == 0)) ? 1 : 0;
break;
default:
// Only <= BLOCK_8X8 are supported currently
assert(0);
break;
}
return mvref_zone_idx;
}
// This function searches the neighbourhood of a given MB/SB
// to try to find candidate reference vectors.
static void find_mv_refs_idx_8x8(const VP9_COMMON *cm, const MACROBLOCKD *xd,
const TileInfo *const tile,
MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
int_mv *mv_ref_list,
int block, int mi_row, int mi_col) {
int_mv mv_ref_candidates[MAX_MV_REF_CANDIDATES + 1];
const int *ref_sign_bias = cm->ref_frame_sign_bias;
int i;
int refmv_count = 0;
int different_ref_found = 0;
int zone_idx = get_mvref_zone_idx(tile, mi->mbmi.sb_type, mi_row, mi_col);
int max_nearest_blks = (zone_idx == 0) ? 4 : 3;
const POSITION *mv_ref_search = mv_ref_blocks_8x8[zone_idx];
// Zero out the mv reference vector list
vpx_memset(mv_ref_list, 0, sizeof(*mv_ref_list) * MAX_MV_REF_CANDIDATES);
vpx_memset(mv_ref_candidates, 0,
sizeof(*mv_ref_candidates) * (MAX_MV_REF_CANDIDATES + 1));
// The nearest 4 (when top right is available) or 3 neighboring blocks
// are treated differently:
// If their block size < 8x8, we get the mv from the bmi substructure.
for (i = 0; i < max_nearest_blks; ++i) {
const POSITION *const mv_ref = &mv_ref_search[i];
if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
const MODE_INFO *const candidate_mi =
xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride].src_mi;
const MB_MODE_INFO *const candidate = &candidate_mi->mbmi;
different_ref_found = 1;
if (candidate->ref_frame[0] == ref_frame) {
ADD_MV_REF_CANDIDATE(get_subblock_mv(
candidate_mi, mi, block, 0, mv_ref->row, mv_ref->col));
} else if (candidate->ref_frame[1] == ref_frame) {
ADD_MV_REF_CANDIDATE(get_subblock_mv(
candidate_mi, mi, block, 1, mv_ref->row, mv_ref->col));
}
}
}
// Check the rest of the neighbors in much the same way as before
// except we don't need to keep track of subblocks.
for (; i < MVREF_NEIGHBOURS; ++i) {
const POSITION *const mv_ref = &mv_ref_search[i];
if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
const MB_MODE_INFO *const candidate =
&xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride].src_mi->mbmi;
different_ref_found = 1;
if (candidate->ref_frame[0] == ref_frame)
ADD_MV_REF_CANDIDATE(candidate->mv[0]);
else if (candidate->ref_frame[1] == ref_frame)
ADD_MV_REF_CANDIDATE(candidate->mv[1]);
}
}
// Since we couldn't find 3 mvs from the same reference frame,
// go back through the neighbors and find motion vectors from
// different reference frames.
if (different_ref_found) {
for (i = 0; i < MVREF_NEIGHBOURS; ++i) {
const POSITION *mv_ref = &mv_ref_search[i];
if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
const MB_MODE_INFO *const candidate =
&xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride].src_mi->mbmi;
// If the candidate is INTRA we don't want to consider its mv.
IF_DIFF_REF_FRAME_ADD_MV_CANDIDATE(candidate);
}
}
}
Done:
if (refmv_count == 2) {
mv_ref_list[0].as_mv.row =
(mv_ref_candidates[0].as_mv.row + mv_ref_candidates[1].as_mv.row) >> 1;
mv_ref_list[0].as_mv.col =
(mv_ref_candidates[0].as_mv.col + mv_ref_candidates[1].as_mv.col) >> 1;
mv_ref_list[1].as_int = mv_ref_candidates[2].as_int;
} else {
for (i = 0; i < 2; ++i) {
mv_ref_list[i].as_int = mv_ref_candidates[i].as_int;
}
}
// Clamp vectors
for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i)
clamp_mv_ref(&mv_ref_list[i].as_mv, xd);
}
typedef enum MV_SEARCH_POS {
TOP = 0,
LEFT = 1,
TOPLEFT = 2,
TOPRIGHT = 3,
TOPRIGHT_ALT = 4,
NUM_SEARCH_POS = 5
} MV_SEARCH_POS;
// Adaptive median
static int get_adaptive_median(int topright, int left, int topleft) {
int a = topright;
int b = left;
int c = topright + left - topleft;
if (a >= b) {
if (b >= c) return b;
else if (a >= c) return c;
else return a;
} else {
if (b < c) return b;
else if (a >= c) return a;
else return c;
}
}
// This function searches the neighbourhood of a given MB/SB to try
// to find the nearestmv through adaptive median filtering.
static int find_best_mvref_8x8(const VP9_COMMON *cm, const MACROBLOCKD *xd,
const TileInfo *const tile,
MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
int_mv *best_mvref,
int block, int mi_row, int mi_col) {
int i;
int zone_idx = get_mvref_zone_idx(tile, mi->mbmi.sb_type, mi_row, mi_col);
int max_nearest_blks = (zone_idx == 0) ? 4 : 3;
const POSITION adapt_median_neighbor_pos[NUM_SEARCH_POS - 1] = {
// TOP, LEFT, TOPLEFT, TOPRIGHT
{-1, 0}, {0, -1}, {-1, -1}, {-1, 1}
};
int_mv mv_ref_mvs[NUM_SEARCH_POS];
int is_avail[NUM_SEARCH_POS] = { 0, 0, 0, 0, 0 };
vpx_memset(mv_ref_mvs, 0, sizeof(mv_ref_mvs[0]) * NUM_SEARCH_POS);
// If the neighboring block size < 8x8, the mv is obtained from
// the bmi substructure.
for (i = 0; i < max_nearest_blks; ++i) {
const POSITION *const mv_ref_pos = &adapt_median_neighbor_pos[i];
if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref_pos)) {
const MODE_INFO *const candidate_mi =
xd->mi[mv_ref_pos->col + mv_ref_pos->row * xd->mi_stride].src_mi;
const MB_MODE_INFO *const candidate = &candidate_mi->mbmi;
if (candidate->ref_frame[0] == ref_frame) {
mv_ref_mvs[i] = get_subblock_mv(candidate_mi, mi, block, 0,
mv_ref_pos->row, mv_ref_pos->col);
is_avail[i] = 1;
} else if (candidate->ref_frame[1] == ref_frame) {
mv_ref_mvs[i] = get_subblock_mv(candidate_mi, mi, block, 1,
mv_ref_pos->row, mv_ref_pos->col);
is_avail[i] = 1;
}
}
}
if (is_avail[TOP] && is_avail[TOPRIGHT]) {
mv_ref_mvs[TOPRIGHT_ALT].as_mv.row =
(mv_ref_mvs[TOP].as_mv.row + mv_ref_mvs[TOPRIGHT].as_mv.row) >> 1;
mv_ref_mvs[TOPRIGHT_ALT].as_mv.col =
(mv_ref_mvs[TOP].as_mv.col + mv_ref_mvs[TOPRIGHT].as_mv.col) >> 1;
} else if (is_avail[TOP]) {
mv_ref_mvs[TOPRIGHT_ALT].as_int = mv_ref_mvs[TOP].as_int;
} else if (is_avail[TOPRIGHT]) {
mv_ref_mvs[TOPRIGHT_ALT].as_int = mv_ref_mvs[TOPRIGHT].as_int;
}
if (is_avail[TOP] || is_avail[LEFT] || is_avail[TOPLEFT] ||
is_avail[TOPRIGHT]) {
best_mvref->as_mv.row = get_adaptive_median(
mv_ref_mvs[TOPRIGHT_ALT].as_mv.row,
mv_ref_mvs[LEFT].as_mv.row,
mv_ref_mvs[TOPLEFT].as_mv.row);
best_mvref->as_mv.col = get_adaptive_median(
mv_ref_mvs[TOPRIGHT_ALT].as_mv.col,
mv_ref_mvs[LEFT].as_mv.col,
mv_ref_mvs[TOPLEFT].as_mv.col);
// Clamp vectors
clamp_mv_ref(&(best_mvref->as_mv), xd);
return 1;
} else {
best_mvref->as_int = 0;
return 0;
}
}
#endif // CONFIG_NEW_INTER && CONFIG_NEWMVREF
// This function searches the neighbourhood of a given MB/SB
// to try and find candidate reference vectors.
static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
@@ -23,10 +276,12 @@ static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
const MODE_INFO *prev_mi = !cm->error_resilient_mode && cm->prev_mi
? cm->prev_mi[mi_row * xd->mi_stride + mi_col].src_mi
: NULL;
const MB_MODE_INFO *const prev_mbmi = prev_mi ? &prev_mi->src_mi->mbmi : NULL;
const MB_MODE_INFO *const prev_mbmi = prev_mi ? &prev_mi->mbmi : NULL;
const POSITION *const mv_ref_search = mv_ref_blocks[mi->mbmi.sb_type];
int different_ref_found = 0;
#if !CONFIG_NEW_INTER
int context_counter = 0;
#endif // !CONFIG_NEW_INTER
// Blank the reference vector list
vpx_memset(mv_ref_list, 0, sizeof(*mv_ref_list) * MAX_MV_REF_CANDIDATES);
@@ -41,13 +296,16 @@ static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
xd->mi_stride].src_mi;
const MB_MODE_INFO *const candidate = &candidate_mi->mbmi;
// Keep counts for entropy encoding.
#if !CONFIG_NEW_INTER
context_counter += mode_2_counter[candidate->mode];
#endif // !CONFIG_NEW_INTER
different_ref_found = 1;
if (candidate->ref_frame[0] == ref_frame)
if (candidate->ref_frame[0] == ref_frame) {
ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 0, mv_ref->col, block));
else if (candidate->ref_frame[1] == ref_frame)
} else if (candidate->ref_frame[1] == ref_frame) {
ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 1, mv_ref->col, block));
}
}
}
@@ -98,30 +356,82 @@ static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
Done:
#if !CONFIG_NEW_INTER
mi->mbmi.mode_context[ref_frame] = counter_to_context[context_counter];
#endif // !CONFIG_NEW_INTER
// Clamp vectors
for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i)
clamp_mv_ref(&mv_ref_list[i].as_mv, xd);
}
#if CONFIG_NEW_INTER
// This function keeps a mode count for a given MB/SB
void vp9_update_mv_context(const VP9_COMMON *cm, const MACROBLOCKD *xd,
const TileInfo *const tile,
MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
int_mv *mv_ref_list,
int block, int mi_row, int mi_col) {
int i, refmv_count = 0;
const POSITION *const mv_ref_search = mv_ref_blocks[mi->mbmi.sb_type];
int context_counter = 0;
// Blank the reference vector list
vpx_memset(mv_ref_list, 0, sizeof(*mv_ref_list) * MAX_MV_REF_CANDIDATES);
// The nearest 2 blocks are examined only.
// If the size < 8x8, we get the mv from the bmi substructure;
for (i = 0; i < 2; ++i) {
const POSITION *const mv_ref = &mv_ref_search[i];
if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
const MODE_INFO *const candidate_mi =
xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride].src_mi;
const MB_MODE_INFO *const candidate = &candidate_mi->mbmi;
// Keep counts for entropy encoding.
context_counter += mode_2_counter[candidate->mode];
if (candidate->ref_frame[0] == ref_frame) {
ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 0, mv_ref->col, block));
} else if (candidate->ref_frame[1] == ref_frame) {
ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 1, mv_ref->col, block));
}
}
}
Done:
mi->mbmi.mode_context[ref_frame] = counter_to_context[context_counter];
}
#endif // CONFIG_NEW_INTER
void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd,
const TileInfo *const tile,
MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
int_mv *mv_ref_list,
int mi_row, int mi_col) {
#if CONFIG_NEW_INTER
vp9_update_mv_context(cm, xd, tile, mi, ref_frame, mv_ref_list, -1,
mi_row, mi_col);
#if CONFIG_NEWMVREF
if (mi->mbmi.sb_type <= BLOCK_8X8) {
int_mv best_mvref;
find_best_mvref_8x8(cm, xd, tile, mi, ref_frame, &best_mvref,
-1, mi_row, mi_col);
find_mv_refs_idx_8x8(cm, xd, tile, mi, ref_frame, mv_ref_list,
-1, mi_row, mi_col);
if (best_mvref.as_int != 0) {
mv_ref_list[1].as_int = mv_ref_list[0].as_int;
mv_ref_list[0].as_int = best_mvref.as_int;
}
} else {
#endif // CONFIG_NEWMVREF
#endif // CONFIG_NEW_INTER
find_mv_refs_idx(cm, xd, tile, mi, ref_frame, mv_ref_list, -1,
mi_row, mi_col);
}
static void lower_mv_precision(MV *mv, int allow_hp) {
const int use_hp = allow_hp && vp9_use_mv_hp(mv);
if (!use_hp) {
if (mv->row & 1)
mv->row += (mv->row > 0 ? -1 : 1);
if (mv->col & 1)
mv->col += (mv->col > 0 ? -1 : 1);
#if CONFIG_NEW_INTER && CONFIG_NEWMVREF
}
#endif // CONFIG_NEW_INTER && CONFIG_NEWMVREF
}
void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp,
@@ -129,8 +439,10 @@ void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp,
int i;
// Make sure all the candidates are properly clamped etc
for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) {
lower_mv_precision(&mvlist[i].as_mv, allow_hp);
clamp_mv2(&mvlist[i].as_mv, xd);
MV *mv = &mvlist[i].as_mv;
const int usehp = allow_hp && vp9_use_mv_hp(mv);
vp9_lower_mv_precision(mv, usehp);
clamp_mv2(mv, xd);
}
*nearest = mvlist[0];
*near = mvlist[1];
@ -139,33 +451,151 @@ void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp,
void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
const TileInfo *const tile,
int block, int ref, int mi_row, int mi_col,
#if CONFIG_NEW_INTER
int_mv *mv_list,
#endif // CONFIG_NEW_INTER
int_mv *nearest, int_mv *near) {
#if CONFIG_NEW_INTER
#if CONFIG_NEWMVREF
int_mv best_mvref;
#endif // CONFIG_NEWMVREF
#else
int_mv mv_list[MAX_MV_REF_CANDIDATES];
#endif // !CONFIG_NEW_INTER
MODE_INFO *const mi = xd->mi[0].src_mi;
b_mode_info *bmi = mi->bmi;
int n;
assert(MAX_MV_REF_CANDIDATES == 2);
find_mv_refs_idx(cm, xd, tile, mi, mi->mbmi.ref_frame[ref], mv_list, block,
mi_row, mi_col);
#if CONFIG_NEW_INTER && CONFIG_NEWMVREF
find_best_mvref_8x8(cm, xd, tile, mi, mi->mbmi.ref_frame[ref],
&best_mvref, block, mi_row, mi_col);
find_mv_refs_idx_8x8(cm, xd, tile, mi, mi->mbmi.ref_frame[ref],
mv_list, block, mi_row, mi_col);
#else
find_mv_refs_idx(cm, xd, tile, mi, mi->mbmi.ref_frame[ref],
mv_list, block, mi_row, mi_col);
#endif // CONFIG_NEW_INTER && CONFIG_NEWMVREF
near->as_int = 0;
switch (block) {
case 0:
nearest->as_int = mv_list[0].as_int;
near->as_int = mv_list[1].as_int;
#if CONFIG_NEW_INTER && CONFIG_NEWMVREF
if (best_mvref.as_int != 0) {
nearest->as_int = best_mvref.as_int;
if (best_mvref.as_int != mv_list[0].as_int)
near->as_int = mv_list[0].as_int;
else
near->as_int = mv_list[1].as_int;
} else {
#endif // CONFIG_NEW_INTER && CONFIG_NEWMVREF
nearest->as_int = mv_list[0].as_int;
near->as_int = mv_list[1].as_int;
#if CONFIG_NEW_INTER && CONFIG_NEWMVREF
}
#endif // CONFIG_NEW_INTER && CONFIG_NEWMVREF
break;
case 1:
#if !CONFIG_NEW_INTER
case 2:
#endif // !CONFIG_NEW_INTER
nearest->as_int = bmi[0].as_mv[ref].as_int;
#if CONFIG_NEW_INTER && CONFIG_NEWMVREF
if (best_mvref.as_int != 0 &&
best_mvref.as_int != nearest->as_int)
near->as_int = best_mvref.as_int;
else
#endif // CONFIG_NEW_INTER && CONFIG_NEWMVREF
for (n = 0; n < MAX_MV_REF_CANDIDATES; ++n)
if (nearest->as_int != mv_list[n].as_int) {
near->as_int = mv_list[n].as_int;
break;
}
break;
#if CONFIG_NEW_INTER
case 2: {
#if CONFIG_NEWMVREF
if (bmi[0].as_mv[ref].as_int !=
bmi[1].as_mv[ref].as_int) {
// Average of TOP and TOPRIGHT
nearest->as_mv.row = (
bmi[0].as_mv[ref].as_mv.row +
bmi[1].as_mv[ref].as_mv.row) >> 1;
nearest->as_mv.col = (
bmi[0].as_mv[ref].as_mv.col +
bmi[1].as_mv[ref].as_mv.col) >> 1;
near->as_int = bmi[0].as_mv[ref].as_int;
} else {
nearest->as_int = bmi[0].as_mv[ref].as_int;
if (best_mvref.as_int != 0 &&
best_mvref.as_int != nearest->as_int) {
near->as_int = best_mvref.as_int;
} else {
for (n = 0; n < MAX_MV_REF_CANDIDATES; ++n)
if (nearest->as_int != mv_list[n].as_int) {
near->as_int = mv_list[n].as_int;
break;
}
}
}
#else
int_mv candidates[1 + MAX_MV_REF_CANDIDATES];
candidates[0] = bmi[1].as_mv[ref];
candidates[1] = mv_list[0];
candidates[2] = mv_list[1];
nearest->as_int = bmi[0].as_mv[ref].as_int;
for (n = 0; n < 1 + MAX_MV_REF_CANDIDATES; ++n)
if (nearest->as_int != candidates[n].as_int) {
near->as_int = candidates[n].as_int;
break;
}
#endif // CONFIG_NEWMVREF
break;
}
#endif // CONFIG_NEW_INTER
case 3: {
#if CONFIG_NEW_INTER && CONFIG_NEWMVREF
if (bmi[0].as_mv[ref].as_int != bmi[1].as_mv[ref].as_int ||
bmi[0].as_mv[ref].as_int != bmi[2].as_mv[ref].as_int ||
bmi[1].as_mv[ref].as_int != bmi[2].as_mv[ref].as_int) {
nearest->as_mv.row = get_adaptive_median(
bmi[1].as_mv[ref].as_mv.row,
bmi[2].as_mv[ref].as_mv.row,
bmi[0].as_mv[ref].as_mv.row);
nearest->as_mv.col = get_adaptive_median(
bmi[1].as_mv[ref].as_mv.col,
bmi[2].as_mv[ref].as_mv.col,
bmi[0].as_mv[ref].as_mv.col);
/*nearest->as_mv.row =
(bmi[0].as_mv[ref].as_mv.row +
bmi[1].as_mv[ref].as_mv.row +
(bmi[2].as_mv[ref].as_mv.row << 1)) >> 2;
nearest->as_mv.col =
(bmi[0].as_mv[ref].as_mv.col +
bmi[1].as_mv[ref].as_mv.col +
(bmi[2].as_mv[ref].as_mv.col << 1)) >> 2;*/
for (n = 2; n >= 0; --n)
if (nearest->as_int != bmi[n].as_mv[ref].as_int) {
near->as_int = bmi[n].as_mv[ref].as_int;
break;
}
} else {
nearest->as_int = bmi[2].as_mv[ref].as_int;
if (best_mvref.as_int != 0 &&
best_mvref.as_int != nearest->as_int) {
near->as_int = best_mvref.as_int;
} else {
for (n = 0; n < MAX_MV_REF_CANDIDATES; ++n)
if (nearest->as_int != mv_list[n].as_int) {
near->as_int = mv_list[n].as_int;
break;
}
}
}
#else
int_mv candidates[2 + MAX_MV_REF_CANDIDATES];
candidates[0] = bmi[1].as_mv[ref];
candidates[1] = bmi[0].as_mv[ref];
@@ -178,9 +608,222 @@ void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
near->as_int = candidates[n].as_int;
break;
}
#endif // CONFIG_NEW_INTER && CONFIG_NEWMVREF
break;
}
default:
assert("Invalid block index.");
assert(0 && "Invalid block index.");
}
}
#if CONFIG_COPY_MODE
static int compare_interinfo(MB_MODE_INFO *mbmi, MB_MODE_INFO *ref_mbmi) {
if (mbmi == ref_mbmi) {
return 1;
} else {
int is_same;
#if CONFIG_INTERINTRA
MV_REFERENCE_FRAME mbmi_ref1_backup = mbmi->ref_frame[1];
MV_REFERENCE_FRAME refmbmi_ref1_backup = ref_mbmi->ref_frame[1];
if (mbmi->ref_frame[1] == INTRA_FRAME)
mbmi->ref_frame[1] = NONE;
if (ref_mbmi->ref_frame[1] == INTRA_FRAME)
ref_mbmi->ref_frame[1] = NONE;
#endif // CONFIG_INTERINTRA
if (mbmi->ref_frame[0] == ref_mbmi->ref_frame[0] &&
mbmi->ref_frame[1] == ref_mbmi->ref_frame[1]) {
if (mbmi->ref_frame[1] > INTRA_FRAME)
is_same = mbmi->mv[0].as_int == ref_mbmi->mv[0].as_int &&
mbmi->mv[1].as_int == ref_mbmi->mv[1].as_int &&
mbmi->interp_filter == ref_mbmi->interp_filter;
else
is_same = mbmi->mv[0].as_int == ref_mbmi->mv[0].as_int &&
mbmi->interp_filter == ref_mbmi->interp_filter;
} else {
is_same = 0;
}
#if CONFIG_INTERINTRA
mbmi->ref_frame[1] = mbmi_ref1_backup;
ref_mbmi->ref_frame[1] = refmbmi_ref1_backup;
#endif // CONFIG_INTERINTRA
return is_same;
}
}
static int check_inside(const TileInfo *const tile, int mi_row, int mi_col) {
return mi_row >= tile->mi_row_start && mi_col >= tile->mi_col_start &&
mi_row < tile->mi_row_end && mi_col < tile->mi_col_end;
}
static int is_right_available(BLOCK_SIZE bsize,
#if CONFIG_EXT_PARTITION
PARTITION_TYPE partition,
#endif
int mi_row, int mi_col) {
int depth, max_depth = (CODING_UNIT_SIZE_LOG2 - 2) -
MIN(b_width_log2_lookup[bsize], b_height_log2_lookup[bsize]);
int block[(CODING_UNIT_SIZE_LOG2 - 2)] = {0};
if (bsize == BLOCK_LARGEST)
return 1;
mi_row = mi_row % MI_BLOCK_SIZE;
mi_col = mi_col % MI_BLOCK_SIZE;
for (depth = 1; depth <= max_depth; depth++) {
block[depth] = (mi_row >> (MI_BLOCK_SIZE_LOG2 - depth)) * 2 +
(mi_col >> (MI_BLOCK_SIZE_LOG2 - depth));
mi_row = mi_row % (MI_BLOCK_SIZE >> depth);
mi_col = mi_col % (MI_BLOCK_SIZE >> depth);
}
if (b_width_log2_lookup[bsize] < b_height_log2_lookup[bsize]) {
if (block[max_depth] == 0)
return 1;
} else if (b_width_log2_lookup[bsize] > b_height_log2_lookup[bsize]) {
if (block[max_depth] > 0)
return 0;
} else {
#if CONFIG_EXT_PARTITION
if (block[max_depth] == 0)
return 1;
if (block[max_depth] == 2)
return partition != PARTITION_VERT_A;
#else
if (block[max_depth] == 0 || block[max_depth] == 2)
return 1;
#endif
else if (block[max_depth] == 3)
return 0;
}
for (depth = max_depth - 1; depth > 0; depth--) {
if (block[depth] == 0 || block[depth] == 2)
return 1;
else if (block[depth] == 3)
return 0;
}
return 1;
}
static int is_second_rec(int mi_row, int mi_col, BLOCK_SIZE bsize) {
int bw = 4 << b_width_log2_lookup[bsize];
int bh = 4 << b_height_log2_lookup[bsize];
if (bw < bh)
return (mi_col << 3) % (bw << 1) == 0 ? 0 : 1;
else if (bh < bw)
return (mi_row << 3) % (bh << 1) == 0 ? 0 : 2;
else
return 0;
}
int vp9_construct_ref_inter_list(VP9_COMMON *cm, MACROBLOCKD *xd,
const TileInfo *const tile,
BLOCK_SIZE bsize,
#if CONFIG_EXT_PARTITION
PARTITION_TYPE partition,
#endif
int mi_row, int mi_col,
MB_MODE_INFO *ref_list[2 *
(MI_BLOCK_SIZE + 1)]) {
int bw = 4 << b_width_log2_lookup[bsize];
int bh = 4 << b_height_log2_lookup[bsize];
int row_offset, col_offset;
int mi_offset;
MB_MODE_INFO *ref_mbmi;
int ref_index, ref_num = 0;
int row_offset_cand[2 * (MI_BLOCK_SIZE + 1)];
int col_offset_cand[2 * (MI_BLOCK_SIZE + 1)];
int offset_num = 0, i, switchflag;
int is_sec_rec = is_second_rec(mi_row, mi_col, bsize);
if (is_sec_rec != 2) {
row_offset_cand[offset_num] = -1;
col_offset_cand[offset_num] = 0;
offset_num++;
}
if (is_sec_rec != 1) {
row_offset_cand[offset_num] = bh / (2 * MI_SIZE);
col_offset_cand[offset_num] = -1;
offset_num++;
}
row_offset = bh / MI_SIZE - 1;
col_offset = 1;
if (is_sec_rec < 2)
switchflag = 1;
else
switchflag = 0;
while ((is_sec_rec == 0 && ((row_offset >= 0) ||
col_offset < (bw / MI_SIZE + 1))) ||
(is_sec_rec == 1 && col_offset < (bw / MI_SIZE + 1)) ||
(is_sec_rec == 2 && row_offset >= 0)) {
switch (switchflag) {
case 0:
if (row_offset >= 0) {
if (row_offset != bh / (2 * MI_SIZE)) {
row_offset_cand[offset_num] = row_offset;
col_offset_cand[offset_num] = -1;
offset_num++;
}
row_offset--;
}
break;
case 1:
if (col_offset < (bw / MI_SIZE + 1)) {
row_offset_cand[offset_num] = -1;
col_offset_cand[offset_num] = col_offset;
offset_num++;
col_offset++;
}
break;
default:
assert(0);
}
if (is_sec_rec == 0)
switchflag = 1 - switchflag;
}
row_offset_cand[offset_num] = -1;
col_offset_cand[offset_num] = -1;
offset_num++;
for (i = 0; i < offset_num; i++) {
row_offset = row_offset_cand[i];
col_offset = col_offset_cand[i];
if ((col_offset < (bw / MI_SIZE) ||
(col_offset == (bw / MI_SIZE) && is_right_available(bsize,
#if CONFIG_EXT_PARTITION
partition,
#endif
mi_row, mi_col)))
&& check_inside(tile, mi_row + row_offset, mi_col + col_offset)) {
mi_offset = row_offset * cm->mi_stride + col_offset;
ref_mbmi = &xd->mi[mi_offset].src_mi->mbmi;
if (is_inter_block(ref_mbmi)) {
for (ref_index = 0; ref_index < ref_num; ref_index++) {
if (compare_interinfo(ref_mbmi, ref_list[ref_index]))
break;
}
if (ref_index == ref_num) {
ref_list[ref_num] = ref_mbmi;
ref_num++;
}
}
}
}
return ref_num;
}
#endif // CONFIG_COPY_MODE
#if CONFIG_INTRABC
void vp9_find_ref_dv(int_mv *ref_dv, int mi_row, int mi_col) {
(void) mi_col;
if (mi_row < 8) {
ref_dv->as_mv.row = 0;
ref_dv->as_mv.col = -8 * 8;
} else {
ref_dv->as_mv.row = -8 * 8;
ref_dv->as_mv.col = 0;
}
}
#endif // CONFIG_INTRABC


@@ -22,6 +22,9 @@ extern "C" {
VP9_INTERP_EXTEND) << 3)
#define MVREF_NEIGHBOURS 8
#if CONFIG_NEWMVREF
#define MAX_ZONES 2
#endif // CONFIG_NEWMVREF
typedef struct position {
int row;
@@ -55,10 +58,25 @@ static const int mode_2_counter[MB_MODE_COUNT] = {
9, // D207_PRED
9, // D63_PRED
9, // TM_PRED
#if CONFIG_INTRABC
9, // NEWDV
#endif // CONFIG_INTRABC
0, // NEARESTMV
0, // NEARMV
3, // ZEROMV
1, // NEWMV
#if CONFIG_NEW_INTER
1, // NEW2MV
0, // NEAREST_NEARESTMV
0, // NEAREST_NEARMV
0, // NEAR_NEARESTMV
1, // NEAREST_NEWMV
1, // NEW_NEARESTMV
1, // NEAR_NEWMV
1, // NEW_NEARMV
3, // ZERO_ZEROMV
1, // NEW_NEWMV
#endif // CONFIG_NEW_INTER
};
// There are 3^3 different combinations of 3 counts that can be either 0,1 or
@@ -112,7 +130,15 @@ static const POSITION mv_ref_blocks[BLOCK_SIZES][MVREF_NEIGHBOURS] = {
// 64X32
{{-1, 0}, {0, -1}, {-1, 4}, {2, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-1, 2}},
// 64X64
{{-1, 3}, {3, -1}, {-1, 4}, {4, -1}, {-1, -1}, {-1, 0}, {0, -1}, {-1, 6}}
{{-1, 3}, {3, -1}, {-1, 4}, {4, -1}, {-1, -1}, {-1, 0}, {0, -1}, {-1, 6}},
#if CONFIG_EXT_CODING_UNIT_SIZE
// 64x128
{{0, -1}, {-1, 0}, {4, -1}, {-1, 2}, {-1, -1}, {0, -3}, {-3, 0}, {2, -1}},
// 128x64
{{-1, 0}, {0, -1}, {-1, 4}, {2, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-1, 2}},
// 128x128
{{-1, 3}, {3, -1}, {-1, 4}, {4, -1}, {-1, -1}, {-1, 0}, {0, -1}, {-1, 6}},
#endif
};
static const int idx_n_column_to_subblock[4][2] = {
@@ -122,8 +148,92 @@ static const int idx_n_column_to_subblock[4][2] = {
{3, 3}
};
#if CONFIG_NEWMVREF
static const POSITION mv_ref_blocks_8x8[MAX_ZONES][MVREF_NEIGHBOURS] = {
{ // 8X8, Zone I, where top right neighbors are available
{-1, 0}, { 0, -1}, {-1, 1}, {-1, -1}, // nearest neighboring blocks
{-2, 0}, { 0, -2}, {-2, -1}, {-1, -2}
},
{ // 8X8, Zone II, where no top right neighbor is available
{-1, 0}, { 0, -1}, {-1, -1}, // nearest neighboring blocks
{-2, 0}, { 0, -2}, {-2, -1}, {-1, -2}, {-2, -2}
}
};
static const int mv_ref_topright_avail_8x8[8][8] = {
{1, 1, 1, 1, 1, 1, 1, 1},
{1, 0, 1, 0, 1, 0, 1, 0},
{1, 1, 1, 0, 1, 1, 1, 0},
{1, 0, 1, 0, 1, 0, 1, 0},
{1, 1, 1, 1, 1, 1, 1, 0},
{1, 0, 1, 0, 1, 0, 1, 0},
{1, 1, 1, 0, 1, 1, 1, 0},
{1, 0, 1, 0, 1, 0, 1, 0}
};
static const int idx_to_subblock_top_left[12][2][3] = {
{ // 4x4 subblock 0 (current)
{2, 0, 2}, // top: 4x4, 4x8, 8x4
{1, 1, 0} // left: 4x4, 4x8, 8x4
},
{ // 4x4 subblock 1 (current)
{3, 1, 2}, // top: 4x4, 4x8, 8x4
{1, 1, 0} // left: 4x4, 4x8, 8x4
},
{ // 4x4 subblock 2 (current)
{2, 0, 2}, // top: 4x4, 4x8, 8x4
{3, 1, 2} // left: 4x4, 4x8, 8x4
},
{ // 4x4 subblock 3 (current)
{3, 1, 2}, // top: 4x4, 4x8, 8x4
{3, 1, 2} // left: 4x4, 4x8, 8x4
},
{ // 4x8 subblock 0 (current)
{ 2, 0, 2}, // top: 4x4, 4x8, 8x4
{-1, 1, -1} // left: 4x4, 4x8, 8x4
},
{ // 4x8 subblock 1 (current)
{ 3, 1, 2}, // top: 4x4, 4x8, 8x4
{-1, 1, -1} // left: 4x4, 4x8, 8x4
},
{ // 4x8 subblock 2 (current)
{ 2, 0, 2}, // top: 4x4, 4x8, 8x4
{-1, 1, -1} // left: 4x4, 4x8, 8x4
},
{ // 4x8 subblock 3 (current)
{ 3, 1, 2}, // top: 4x4, 4x8, 8x4
{-1, 1, -1} // left: 4x4, 4x8, 8x4
},
{ // 8x4 subblock 0 (current)
{-1, -1, 2}, // top: 4x4, 4x8, 8x4
{ 1, 1, 0} // left: 4x4, 4x8, 8x4
},
{ // 8x4 subblock 1 (current)
{-1, -1, 2}, // top: 4x4, 4x8, 8x4
{ 1, 1, 0} // left: 4x4, 4x8, 8x4
},
{ // 8x4 subblock 2 (current)
{-1, -1, 2}, // top: 4x4, 4x8, 8x4
{ 3, 1, 2} // left: 4x4, 4x8, 8x4
},
{ // 8x4 subblock 3 (current)
{-1, -1, 2}, // top: 4x4, 4x8, 8x4
{ 3, 1, 2} // left: 4x4, 4x8, 8x4
}
};
static const int idx_to_subblock_topright_topleft[2][3] = {
{2, 0, 2}, // top-right: 4x4, 4x8, 8x4
{3, 1, 2} // top-left: 4x4, 4x8, 8x4
};
#endif // CONFIG_NEWMVREF
// clamp_mv_ref
#if CONFIG_EXT_CODING_UNIT_SIZE
#define MV_BORDER (32 << 3) // Allow 32 pels in 1/8th pel units
#else
#define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units
#endif
static INLINE void clamp_mv_ref(MV *mv, const MACROBLOCKD *xd) {
clamp_mv(mv, xd->mb_to_left_edge - MV_BORDER,
@@ -142,7 +252,6 @@ static INLINE int_mv get_sub_block_mv(const MODE_INFO *candidate, int which_mv,
: candidate->mbmi.mv[which_mv];
}
// Performs mv sign inversion if indicated by the reference frame combination.
static INLINE int_mv scale_mv(const MB_MODE_INFO *mbmi, int ref,
const MV_REFERENCE_FRAME this_ref_frame,
@@ -190,12 +299,56 @@ static INLINE int_mv scale_mv(const MB_MODE_INFO *mbmi, int ref,
static INLINE int is_inside(const TileInfo *const tile,
int mi_col, int mi_row, int mi_rows,
const POSITION *mi_pos) {
#if CONFIG_ROW_TILE
(void) mi_rows;
return !(mi_row + mi_pos->row < tile->mi_row_start ||
mi_col + mi_pos->col < tile->mi_col_start ||
mi_row + mi_pos->row >= tile->mi_row_end ||
mi_col + mi_pos->col >= tile->mi_col_end);
#else
return !(mi_row + mi_pos->row < 0 ||
mi_col + mi_pos->col < tile->mi_col_start ||
mi_row + mi_pos->row >= mi_rows ||
mi_col + mi_pos->col >= tile->mi_col_end);
#endif
}
#if CONFIG_NEWMVREF
// This macro is used to add a motion vector as a candidate for the mv ref if
// it isn't already taken. If it is the third distinct motion vector, it also
// skips all additional processing and jumps to the Done label.
#define ADD_MV_REF_CANDIDATE(mv) \
do { \
if (refmv_count) { \
if (refmv_count == 1 && \
(mv).as_int != mv_ref_candidates[0].as_int) { \
mv_ref_candidates[refmv_count++] = (mv); \
} else if (refmv_count == 2 && \
(mv).as_int != mv_ref_candidates[0].as_int && \
(mv).as_int != mv_ref_candidates[1].as_int) { \
mv_ref_candidates[refmv_count] = (mv); \
goto Done; \
} \
} else { \
mv_ref_candidates[refmv_count++] = (mv); \
} \
} while (0)
// If the block is inter coded, scale and add each of its mvs as a candidate
// when the corresponding reference frame differs from ref_frame (and, for the
// second mv, when it also differs from the first mv).
#define IF_DIFF_REF_FRAME_ADD_MV_CANDIDATE(mbmi) \
do { \
if (is_inter_block(mbmi)) { \
if ((mbmi)->ref_frame[0] != ref_frame) \
ADD_MV_REF_CANDIDATE(scale_mv((mbmi), 0, ref_frame, ref_sign_bias)); \
if (has_second_ref(mbmi) && \
(mbmi)->ref_frame[1] != ref_frame && \
(mbmi)->mv[1].as_int != (mbmi)->mv[0].as_int) \
ADD_MV_REF_CANDIDATE(scale_mv((mbmi), 1, ref_frame, ref_sign_bias)); \
} \
} while (0)
#endif // CONFIG_NEWMVREF
// TODO(jingning): this mv clamping function should be block size dependent.
static INLINE void clamp_mv2(MV *mv, const MACROBLOCKD *xd) {
clamp_mv(mv, xd->mb_to_left_edge - LEFT_TOP_MARGIN,
@@ -204,11 +357,29 @@ static INLINE void clamp_mv2(MV *mv, const MACROBLOCKD *xd) {
xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
}
#if CONFIG_NEW_INTER
// This function keeps a mode count for a given MB/SB
void vp9_update_mv_context(const VP9_COMMON *cm, const MACROBLOCKD *xd,
const TileInfo *const tile,
MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
int_mv *mv_ref_list,
int block, int mi_row, int mi_col);
#endif // CONFIG_NEW_INTER
void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd,
const TileInfo *const tile,
MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
int_mv *mv_ref_list, int mi_row, int mi_col);
static INLINE void vp9_lower_mv_precision(MV *mv, const int usehp) {
if (!usehp) {
if (mv->row & 1)
mv->row += (mv->row > 0 ? -1 : 1);
if (mv->col & 1)
mv->col += (mv->col > 0 ? -1 : 1);
}
}
// Check a list of motion vectors by SAD score, using a number of rows of
// pixels above and columns of pixels to the left, and select the one with the
// best score as the reference motion vector.
@@ -218,8 +389,25 @@ void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp,
void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
const TileInfo *const tile,
int block, int ref, int mi_row, int mi_col,
#if CONFIG_NEW_INTER
int_mv *mv_list,
#endif // CONFIG_NEW_INTER
int_mv *nearest, int_mv *near);
#if CONFIG_COPY_MODE
int vp9_construct_ref_inter_list(VP9_COMMON *cm, MACROBLOCKD *xd,
const TileInfo *const tile,
BLOCK_SIZE bsize,
#if CONFIG_EXT_PARTITION
PARTITION_TYPE partition,
#endif
int mi_row, int mi_col,
MB_MODE_INFO *ref_list[18]);
#endif // CONFIG_COPY_MODE
#if CONFIG_INTRABC
void vp9_find_ref_dv(int_mv *ref_dv, int mi_row, int mi_col);
#endif // CONFIG_INTRABC
#ifdef __cplusplus
} // extern "C"
#endif


@@ -30,7 +30,11 @@
extern "C" {
#endif
#if CONFIG_MULTI_REF
#define REFS_PER_FRAME 6
#else // CONFIG_MULTI_REF
#define REFS_PER_FRAME 3
#endif // CONFIG_MULTI_REF
#define REF_FRAMES_LOG2 3
#define REF_FRAMES (1 << REF_FRAMES_LOG2)
@@ -65,11 +69,29 @@ typedef struct {
typedef struct VP9Common {
struct vpx_internal_error_info error;
DECLARE_ALIGNED(16, int16_t, y_dequant[QINDEX_RANGE][8]);
DECLARE_ALIGNED(16, int16_t, uv_dequant[QINDEX_RANGE][8]);
#if CONFIG_NEW_QUANT
DECLARE_ALIGNED(16, dequant_val_type_nuq,
y_dequant_val_nuq[QUANT_PROFILES][QINDEX_RANGE][COEF_BANDS]);
DECLARE_ALIGNED(16, dequant_val_type_nuq,
uv_dequant_val_nuq[QUANT_PROFILES][QINDEX_RANGE][COEF_BANDS]);
#endif // CONFIG_NEW_QUANT
COLOR_SPACE color_space;
#if CONFIG_TX_SKIP
DECLARE_ALIGNED(16, int16_t, y_dequant_pxd[QINDEX_RANGE][8]);
DECLARE_ALIGNED(16, int16_t, uv_dequant_pxd[QINDEX_RANGE][8]);
#if CONFIG_NEW_QUANT
DECLARE_ALIGNED(16, dequant_val_type_nuq,
y_dequant_val_nuq_pxd[QUANT_PROFILES][QINDEX_RANGE]
[COEF_BANDS]);
DECLARE_ALIGNED(16, dequant_val_type_nuq,
uv_dequant_val_nuq_pxd[QUANT_PROFILES][QINDEX_RANGE]
[COEF_BANDS]);
#endif // CONFIG_NEW_QUANT
#endif // CONFIG_TX_SKIP
vpx_color_space_t color_space;
int width;
int height;
@@ -103,9 +125,18 @@ typedef struct VP9Common {
int new_fb_idx;
YV12_BUFFER_CONFIG post_proc_buffer;
#if CONFIG_LOOP_POSTFILTER
YV12_BUFFER_CONFIG tmp_loop_buf;
#endif
FRAME_TYPE last_frame_type; /* last frame's frame type for motion search.*/
FRAME_TYPE frame_type;
#if CONFIG_MULTI_REF
// frame type for the frame before the last
FRAME_TYPE last2_frame_type;
// frame type for the frame two frames before the last
FRAME_TYPE last3_frame_type;
#endif // CONFIG_MULTI_REF
int show_frame;
int last_show_frame;
@@ -166,7 +197,11 @@ typedef struct VP9Common {
// Context probabilities for reference frame prediction
int allow_comp_inter_inter;
MV_REFERENCE_FRAME comp_fixed_ref;
#if CONFIG_MULTI_REF
MV_REFERENCE_FRAME comp_var_ref[5];
#else // CONFIG_MULTI_REF
MV_REFERENCE_FRAME comp_var_ref[2];
#endif // CONFIG_MULTI_REF
REFERENCE_MODE reference_mode;
FRAME_CONTEXT fc; /* this frame entropy */
@@ -189,6 +224,8 @@ typedef struct VP9Common {
int frame_parallel_decoding_mode;
int log2_tile_cols, log2_tile_rows;
int tile_cols, tile_rows;
int tile_width, tile_height;
// Private data associated with the frame buffer callbacks.
void *cb_priv;
@@ -200,6 +237,33 @@ typedef struct VP9Common {
PARTITION_CONTEXT *above_seg_context;
ENTROPY_CONTEXT *above_context;
#if CONFIG_PALETTE
#if CONFIG_VP9_HIGHBITDEPTH
uint16_t current_palette_colors[PALETTE_BUF_SIZE];
#else
uint8_t current_palette_colors[PALETTE_BUF_SIZE];
#endif
int current_palette_size;
int current_palette_count[PALETTE_BUF_SIZE];
int allow_palette_mode;
int palette_counter;
int palette_blocks_signalled;
#endif // CONFIG_PALETTE
#if CONFIG_INTRABC
int allow_intrabc_mode;
int intrabc_counter;
int intrabc_blocks_signalled;
#endif // CONFIG_INTRABC
#if CONFIG_GLOBAL_MOTION
int num_global_motion[MAX_REF_FRAMES];
Global_Motion_Params global_motion[MAX_REF_FRAMES][MAX_GLOBAL_MOTION_MODELS];
#endif
#if CONFIG_ROW_TILE
int tile_size_bytes;
int tile_col_size_bytes;
#endif
} VP9_COMMON;
static INLINE YV12_BUFFER_CONFIG *get_ref_frame(VP9_COMMON *cm, int index) {
@@ -250,6 +314,12 @@ static INLINE void init_macroblockd(VP9_COMMON *cm, MACROBLOCKD *xd) {
i * sizeof(*cm->above_context) * 2 * mi_cols_aligned_to_sb(cm->mi_cols);
}
#if CONFIG_PALETTE
for (i = 0; i < 2; ++i) {
xd->plane[i].color_index_map = xd->color_index_map[i];
}
#endif
xd->above_seg_context = cm->above_seg_context;
xd->mi_stride = cm->mi_stride;
}
@@ -266,7 +336,7 @@ static INLINE const vp9_prob* get_partition_probs(const VP9_COMMON *cm,
static INLINE void set_skip_context(MACROBLOCKD *xd, int mi_row, int mi_col) {
const int above_idx = mi_col * 2;
const int left_idx = (mi_row * 2) & 15;
const int left_idx = (mi_row * 2) & MI_MASK_2;
int i;
for (i = 0; i < MAX_MB_PLANE; ++i) {
struct macroblockd_plane *const pd = &xd->plane[i];
@@ -280,7 +350,7 @@ static INLINE int calc_mi_size(int len) {
return len + MI_BLOCK_SIZE;
}
static INLINE void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile,
static void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile,
int mi_row, int bh,
int mi_col, int bw,
int mi_rows, int mi_cols) {
@@ -290,7 +360,11 @@ static INLINE void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile,
xd->mb_to_right_edge = ((mi_cols - bw - mi_col) * MI_SIZE) * 8;
// Are edges available for intra prediction?
#if CONFIG_ROW_TILE
xd->up_available = (mi_row > tile->mi_row_start);
#else
xd->up_available = (mi_row != 0);
#endif
xd->left_available = (mi_col > tile->mi_col_start);
}
@@ -311,7 +385,12 @@ static INLINE void update_partition_context(MACROBLOCKD *xd,
BLOCK_SIZE bsize) {
PARTITION_CONTEXT *const above_ctx = xd->above_seg_context + mi_col;
PARTITION_CONTEXT *const left_ctx = xd->left_seg_context + (mi_row & MI_MASK);
#if CONFIG_EXT_PARTITION
const int bw = num_8x8_blocks_wide_lookup[bsize];
const int bh = num_8x8_blocks_high_lookup[bsize];
vpx_memset(above_ctx, partition_context_lookup[subsize].above, bw);
vpx_memset(left_ctx, partition_context_lookup[subsize].left, bh);
#else
// num_4x4_blocks_wide_lookup[bsize] / 2
const int bs = num_8x8_blocks_wide_lookup[bsize];
@@ -320,8 +399,50 @@ static INLINE void update_partition_context(MACROBLOCKD *xd,
// bits of smaller block sizes to be zero.
vpx_memset(above_ctx, partition_context_lookup[subsize].above, bs);
vpx_memset(left_ctx, partition_context_lookup[subsize].left, bs);
#endif
}
#if CONFIG_EXT_PARTITION
static INLINE void update_ext_partition_context(MACROBLOCKD *xd,
int mi_row, int mi_col,
BLOCK_SIZE subsize,
BLOCK_SIZE bsize,
PARTITION_TYPE partition) {
if (bsize >= BLOCK_8X8) {
const int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
switch (partition) {
case PARTITION_SPLIT:
if (bsize != BLOCK_8X8)
break;
case PARTITION_NONE:
case PARTITION_HORZ:
case PARTITION_VERT:
update_partition_context(xd, mi_row, mi_col, subsize, bsize);
break;
case PARTITION_HORZ_A:
update_partition_context(xd, mi_row, mi_col, bsize2, subsize);
update_partition_context(xd, mi_row + hbs, mi_col, subsize, subsize);
break;
case PARTITION_HORZ_B:
update_partition_context(xd, mi_row, mi_col, subsize, subsize);
update_partition_context(xd, mi_row + hbs, mi_col, bsize2, subsize);
break;
case PARTITION_VERT_A:
update_partition_context(xd, mi_row, mi_col, bsize2, subsize);
update_partition_context(xd, mi_row, mi_col + hbs, subsize, subsize);
break;
case PARTITION_VERT_B:
update_partition_context(xd, mi_row, mi_col, subsize, subsize);
update_partition_context(xd, mi_row, mi_col + hbs, bsize2, subsize);
break;
default:
assert(0 && "Invalid partition type");
}
}
}
#endif
static INLINE int partition_plane_context(const MACROBLOCKD *xd,
int mi_row, int mi_col,
BLOCK_SIZE bsize) {
@@ -345,6 +466,18 @@ static INLINE int partition_plane_context(const MACROBLOCKD *xd,
return (left * 2 + above) + bsl * PARTITION_PLOFFSET;
}
static INLINE int16_t vp9_get_quant(VP9_COMMON *const cm,
int qindex, int isuv, int isac) {
int quant;
if (!isuv) {
quant = isac == 0 ? vp9_dc_quant(qindex, cm->y_dc_delta_q, cm->bit_depth)
: vp9_ac_quant(qindex, 0, cm->bit_depth);
} else {
quant = isac == 0 ? vp9_dc_quant(qindex, cm->uv_dc_delta_q, cm->bit_depth)
: vp9_ac_quant(qindex, cm->uv_ac_delta_q, cm->bit_depth);
}
return quant;
}
#ifdef __cplusplus
} // extern "C"
#endif

vp9/common/vp9_palette.c (new file)
@@ -0,0 +1,381 @@
/*
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "vp9/common/vp9_palette.h"
#if CONFIG_PALETTE
void vp9_insertion_sort(double *data, int n) {
int i, j, k;
double val;
if (n <= 1)
return;
for (i = 1; i < n; i++) {
val = data[i];
j = 0;
while (val > data[j] && j < i)
j++;
if (j == i)
continue;
for (k = i; k > j; k--)
data[k] = data[k - 1];
data[j] = val;
}
}
int vp9_count_colors(const uint8_t *src, int stride, int rows, int cols) {
int n = 0, r, c, i, val_count[256];
uint8_t val;
vpx_memset(val_count, 0, sizeof(val_count));
for (r = 0; r < rows; r++) {
for (c = 0; c < cols; c++) {
val = src[r * stride + c];
val_count[val]++;
}
}
for (i = 0; i < 256; i++) {
if (val_count[i]) {
n++;
}
}
return n;
}
#if CONFIG_VP9_HIGHBITDEPTH
int vp9_count_colors_highbd(const uint8_t *src8, int stride, int rows, int cols,
int bit_depth) {
int n = 0, r, c, i;
uint16_t val;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
int* val_count = vpx_calloc(1 << bit_depth, sizeof(*val_count));
for (r = 0; r < rows; r++) {
for (c = 0; c < cols; c++) {
val = src[r * stride + c];
val_count[val]++;
}
}
for (i = 0; i < (1 << bit_depth); i++) {
if (val_count[i]) {
n++;
}
}
vpx_free(val_count);
return n;
}
#endif // CONFIG_VP9_HIGHBITDEPTH
#if CONFIG_VP9_HIGHBITDEPTH
void vp9_palette_color_insertion(uint16_t *old_colors, int *m, int *count,
const MB_MODE_INFO *mbmi) {
const uint16_t *new_colors = mbmi->palette_literal_colors;
uint16_t val;
#else
void vp9_palette_color_insertion(uint8_t *old_colors, int *m, int *count,
const MB_MODE_INFO *mbmi) {
const uint8_t *new_colors = mbmi->palette_literal_colors;
uint8_t val;
#endif // CONFIG_VP9_HIGHBITDEPTH
int k = *m, n = mbmi->palette_literal_size;
int i, j, l, min_idx = -1;
if (mbmi->palette_indexed_size > 0) {
for (i = 0; i < mbmi->palette_indexed_size; i++)
count[mbmi->palette_indexed_colors[i]] +=
(8 - abs(mbmi->palette_color_delta[i]));
}
i = 0;
while (i < k) {
count[i] -= 1;
i++;
}
if (n <= 0)
return;
for (i = 0; i < n; i++) {
val = new_colors[i];
j = 0;
while (j < k && val != old_colors[j])
j++;
if (j < k && val == old_colors[j]) {
count[j] += 8;
continue;
}
if (k + 1 > PALETTE_BUF_SIZE) {
min_idx = 0;
for (l = 1; l < k; l++)
if (count[l] < count[min_idx])
min_idx = l;
old_colors[min_idx] = val;
count[min_idx] = 8;
} else {
old_colors[k] = val;
count[k] = 8;
k++;
}
}
*m = k;
}
#if CONFIG_VP9_HIGHBITDEPTH
int vp9_palette_color_lookup(uint16_t *dic, int n, uint16_t val, int bits) {
#else
int vp9_palette_color_lookup(uint8_t *dic, int n, uint8_t val, int bits) {
#endif // CONFIG_VP9_HIGHBITDEPTH
int j, min, arg_min = 0, i = 1;
if (n < 1)
return -1;
min = abs(val - dic[0]);
arg_min = 0;
while (i < n) {
j = abs(val - dic[i]);
if (j < min) {
min = j;
arg_min = i;
}
i++;
}
if (min < (1 << bits))
return arg_min;
else
return -1;
}
int vp9_ceil_log2(int n) {
int i = 1, p = 2;
while (p < n) {
i++;
p = p << 1;
}
return i;
}
static double calc_dist(const double *p1, const double *p2, int dim) {
double dist = 0;
int i = 0;
for (i = 0; i < dim; i++) {
dist = dist + (p1[i] - p2[i]) * (p1[i] - p2[i]);
}
return dist;
}
void vp9_calc_indices(const double *data, const double *centroids, int *indices,
int n, int k, int dim) {
int i, j;
double min_dist, this_dist;
for (i = 0; i < n; i++) {
min_dist = calc_dist(data + i * dim, centroids, dim);
indices[i] = 0;
for (j = 1; j < k; j++) {
this_dist = calc_dist(data + i * dim, centroids + j * dim, dim);
if (this_dist < min_dist) {
min_dist = this_dist;
indices[i] = j;
}
}
}
}
static void calc_centroids(const double *data, double *centroids,
const int *indices, int n, int k, int dim) {
int i, j, index;
int count[256];
srand((unsigned int) data[0]);
vpx_memset(count, 0, sizeof(count[0]) * k);
vpx_memset(centroids, 0, sizeof(centroids[0]) * k * dim);
for (i = 0; i < n; i++) {
index = indices[i];
count[index]++;
for (j = 0; j < dim; j++) {
centroids[index * dim + j] += data[i * dim + j];
}
}
for (i = 0; i < k; i++) {
if (count[i] == 0) {
vpx_memcpy(centroids + i * dim, data + (rand() % n) * dim,
sizeof(centroids[0]) * dim);
} else {
const double norm = 1.0 / count[i];
for (j = 0; j < dim; j++)
centroids[i * dim + j] *= norm;
}
}
}
static double calc_total_dist(const double *data, const double *centroids,
const int *indices, int n, int k, int dim) {
double dist = 0;
int i;
(void) k;
for (i = 0; i < n; i++) {
dist += calc_dist(data + i * dim, centroids + indices[i] * dim, dim);
}
return dist;
}
int vp9_k_means(const double *data, double *centroids, int *indices,
int n, int k, int dim, int max_itr) {
int i = 0;
int *pre_indices;
double pre_total_dist, cur_total_dist;
double pre_centroids[256];
pre_indices = vpx_memalign(16, n * sizeof(indices[0]));
vp9_calc_indices(data, centroids, indices, n, k, dim);
pre_total_dist = calc_total_dist(data, centroids, indices, n, k, dim);
vpx_memcpy(pre_centroids, centroids, sizeof(pre_centroids[0]) * k * dim);
vpx_memcpy(pre_indices, indices, sizeof(pre_indices[0]) * n);
while (i < max_itr) {
calc_centroids(data, centroids, indices, n, k, dim);
vp9_calc_indices(data, centroids, indices, n, k, dim);
cur_total_dist = calc_total_dist(data, centroids, indices, n, k, dim);
if (cur_total_dist > pre_total_dist) {
vpx_memcpy(centroids, pre_centroids, sizeof(pre_centroids[0]) * k * dim);
vpx_memcpy(indices, pre_indices, sizeof(pre_indices[0]) * n);
break;
}
if (!memcmp(centroids, pre_centroids, sizeof(pre_centroids[0]) * k * dim))
break;
vpx_memcpy(pre_centroids, centroids, sizeof(pre_centroids[0]) * k * dim);
vpx_memcpy(pre_indices, indices, sizeof(pre_indices[0]) * n);
pre_total_dist = cur_total_dist;
i++;
}
vpx_free(pre_indices);
return i;
}
void vp9_update_palette_counts(FRAME_COUNTS *counts, const MB_MODE_INFO *mbmi,
BLOCK_SIZE bsize, int palette_ctx) {
int idx = bsize - BLOCK_8X8;
counts->y_palette_enabled[idx][palette_ctx][mbmi->palette_enabled[0]]++;
counts->uv_palette_enabled[mbmi->palette_enabled[0]]
[mbmi->palette_enabled[1]]++;
if (mbmi->palette_enabled[0])
counts->y_palette_size[idx][mbmi->palette_size[0] - 2]++;
if (mbmi->palette_enabled[1])
counts->uv_palette_size[idx][mbmi->palette_size[1] - 2]++;
}
static const int palette_color_context_lookup[PALETTE_COLOR_CONTEXTS] = {
3993, 4235, 4378, 4380, // (3, 0, 0, 0), (3, 2, 0, 0),
// (3, 3, 2, 0), (3, 3, 2, 2),
5720, 6655, 7018, 7040, // (4, 3, 3, 0), (5, 0, 0, 0),
// (5, 3, 0, 0), (5, 3, 2, 0),
7260, 8228, 8250, 8470, // (5, 5, 0, 0), (6, 2, 0, 0),
// (6, 2, 2, 0), (6, 4, 0, 0),
9680, 10648, 10890, 13310 // (7, 3, 0, 0), (8, 0, 0, 0),
// (8, 2, 0, 0), (10, 0, 0, 0)
};
int vp9_get_palette_color_context(const uint8_t *color_map, int cols,
int r, int c, int n, int *color_order) {
int i, j, max, max_idx, temp;
int scores[PALETTE_MAX_SIZE + 10];
int weights[4] = {3, 2, 3, 2};
int color_ctx = 0;
int color_neighbors[4];
assert(n <= PALETTE_MAX_SIZE);
if (c - 1 >= 0)
color_neighbors[0] = color_map[r * cols + c - 1];
else
color_neighbors[0] = -1;
if (c - 1 >= 0 && r - 1 >= 0)
color_neighbors[1] = color_map[(r - 1) * cols + c - 1];
else
color_neighbors[1] = -1;
if (r - 1 >= 0)
color_neighbors[2] = color_map[(r - 1) * cols + c];
else
color_neighbors[2] = -1;
if (r - 1 >= 0 && c + 1 <= cols - 1)
color_neighbors[3] = color_map[(r - 1) * cols + c + 1];
else
color_neighbors[3] = -1;
for (i = 0; i < PALETTE_MAX_SIZE; i++)
color_order[i] = i;
vpx_memset(scores, 0, PALETTE_MAX_SIZE * sizeof(scores[0]));
for (i = 0; i < 4; i++) {
if (color_neighbors[i] >= 0)
scores[color_neighbors[i]] += weights[i];
}
for (i = 0; i < 4; i++) {
max = scores[i];
max_idx = i;
j = i + 1;
while (j < n) {
if (scores[j] > max) {
max = scores[j];
max_idx = j;
}
j++;
}
if (max_idx != i) {
temp = scores[i];
scores[i] = scores[max_idx];
scores[max_idx] = temp;
temp = color_order[i];
color_order[i] = color_order[max_idx];
color_order[max_idx] = temp;
}
}
for (i = 0; i < 4; i++)
color_ctx = color_ctx * 11 + scores[i];
for (i = 0; i < PALETTE_COLOR_CONTEXTS; i++)
if (color_ctx == palette_color_context_lookup[i]) {
color_ctx = i;
break;
}
return color_ctx;
}
#endif // CONFIG_PALETTE

vp9/common/vp9_palette.h (new file)
@@ -0,0 +1,42 @@
/*
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP9_COMMON_VP9_PALETTE_H_
#define VP9_COMMON_VP9_PALETTE_H_
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_entropymode.h"
#if CONFIG_PALETTE
int vp9_count_colors(const uint8_t *src, int stride, int rows, int cols);
#if CONFIG_VP9_HIGHBITDEPTH
int vp9_count_colors_highbd(const uint8_t *src8, int stride, int rows, int cols,
int bit_depth);
void vp9_palette_color_insertion(uint16_t *old_colors, int *m, int *count,
const MB_MODE_INFO *mbmi);
int vp9_palette_color_lookup(uint16_t *dic, int n, uint16_t val, int bits);
#else
void vp9_palette_color_insertion(uint8_t *old_colors, int *m, int *count,
const MB_MODE_INFO *mbmi);
int vp9_palette_color_lookup(uint8_t *dic, int n, uint8_t val, int bits);
#endif // CONFIG_VP9_HIGHBITDEPTH
void vp9_insertion_sort(double *data, int n);
int vp9_ceil_log2(int n);
int vp9_k_means(const double *data, double *centroids, int *indices,
int n, int k, int dim, int max_itr);
void vp9_calc_indices(const double *data, const double *centroids, int *indices,
int n, int k, int dim);
void vp9_update_palette_counts(FRAME_COUNTS *counts, const MB_MODE_INFO *mbmi,
BLOCK_SIZE bsize, int palette_ctx);
int vp9_get_palette_color_context(const uint8_t *color_map, int cols,
int r, int c, int n, int *color_order);
#endif
#endif // VP9_COMMON_VP9_PALETTE_H_


@@ -109,6 +109,420 @@ int vp9_get_reference_mode_context(const VP9_COMMON *cm,
return ctx;
}
#if CONFIG_MULTI_REF
#define CHECK_LAST_OR_LAST2(ref_frame) \
((ref_frame == LAST_FRAME) || (ref_frame == LAST2_FRAME))
#define CHECK_GOLDEN_LAST3_LAST4(ref_frame) \
((ref_frame == GOLDEN_FRAME) || (ref_frame == LAST3_FRAME) || \
(ref_frame == LAST4_FRAME))
// TODO(zoeliu): Would like to create a master function.
// Returns a context number for the given MB prediction signal.
// Signals whether the first reference frame for a compound mode is
// GOLDEN/LAST3/LAST4 as opposed to LAST/LAST2.
//
// NOTE(zoeliu): The probability of ref_frame[0] is either
// GOLDEN_FRAME/LAST3_FRAME/LAST4_FRAME.
int vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm,
const MACROBLOCKD *xd) {
int pred_context;
const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd));
const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd));
const int above_in_image = above_mbmi != NULL;
const int left_in_image = left_mbmi != NULL;
// Note:
// The mode info data structure has a one element border above and to the
// left of the entries corresponding to real macroblocks.
// The prediction flags in these dummy entries are initialised to 0.
const int fix_ref_idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref];
const int var_ref_idx = !fix_ref_idx;
if (above_in_image && left_in_image) { // both edges available
const int above_intra = !is_inter_block(above_mbmi);
const int left_intra = !is_inter_block(left_mbmi);
if (above_intra && left_intra) { // intra/intra (2)
pred_context = 2;
} else if (above_intra || left_intra) { // intra/inter
const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
if (!has_second_ref(edge_mbmi)) // single pred (1/3)
pred_context = 1 +
2 * (!CHECK_GOLDEN_LAST3_LAST4(edge_mbmi->ref_frame[0]));
else // comp pred (1/3)
pred_context = 1 +
2 * (!CHECK_GOLDEN_LAST3_LAST4(edge_mbmi->ref_frame[var_ref_idx]));
} else { // inter/inter
const int l_sg = !has_second_ref(left_mbmi);
const int a_sg = !has_second_ref(above_mbmi);
const MV_REFERENCE_FRAME vrfa = a_sg ? above_mbmi->ref_frame[0]
: above_mbmi->ref_frame[var_ref_idx];
const MV_REFERENCE_FRAME vrfl = l_sg ? left_mbmi->ref_frame[0]
: left_mbmi->ref_frame[var_ref_idx];
if (vrfa == vrfl && CHECK_GOLDEN_LAST3_LAST4(vrfa)) {
pred_context = 0;
} else if (l_sg && a_sg) { // single/single
if ((vrfa == ALTREF_FRAME && CHECK_LAST_OR_LAST2(vrfl)) ||
(vrfl == ALTREF_FRAME && CHECK_LAST_OR_LAST2(vrfa))) {
pred_context = 4;
} else if (vrfa == vrfl || (CHECK_LAST_OR_LAST2(vrfa) &&
CHECK_LAST_OR_LAST2(vrfl))) {
pred_context = 3;
} else { // Either vrfa or vrfl is GOLDEN / LAST3 / LAST4
// NOTE(zoeliu): Following assert may be removed once confirmed.
assert(CHECK_GOLDEN_LAST3_LAST4(vrfa) ||
CHECK_GOLDEN_LAST3_LAST4(vrfl));
pred_context = 1;
}
} else if (l_sg || a_sg) { // single/comp
const MV_REFERENCE_FRAME vrfc = l_sg ? vrfa : vrfl;
const MV_REFERENCE_FRAME rfs = a_sg ? vrfa : vrfl;
if (CHECK_GOLDEN_LAST3_LAST4(vrfc) && !CHECK_GOLDEN_LAST3_LAST4(rfs))
pred_context = 1;
else if (CHECK_GOLDEN_LAST3_LAST4(rfs) &&
!CHECK_GOLDEN_LAST3_LAST4(vrfc))
pred_context = 2;
else
pred_context = 4;
} else { // comp/comp
if ((CHECK_LAST_OR_LAST2(vrfa) && CHECK_LAST_OR_LAST2(vrfl))) {
pred_context = 4;
} else {
// NOTE(zoeliu): Following assert may be removed once confirmed.
assert(CHECK_GOLDEN_LAST3_LAST4(vrfa) ||
CHECK_GOLDEN_LAST3_LAST4(vrfl));
pred_context = 2;
}
}
}
} else if (above_in_image || left_in_image) { // one edge available
const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
if (!is_inter_block(edge_mbmi)) {
pred_context = 2;
} else {
if (has_second_ref(edge_mbmi))
pred_context =
4 * (!CHECK_GOLDEN_LAST3_LAST4(edge_mbmi->ref_frame[var_ref_idx]));
else
pred_context = 3 * (!CHECK_GOLDEN_LAST3_LAST4(edge_mbmi->ref_frame[0]));
}
} else { // no edges available (2)
pred_context = 2;
}
assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
return pred_context;
}
// Returns a context number for the given MB prediction signal.
// Signals whether the first reference frame for a compound mode is LAST,
// given that it is known to be either LAST or LAST2.
//
// NOTE(zoeliu): The probability of ref_frame[0] is LAST_FRAME,
// conditioning on it is either LAST_FRAME or LAST2_FRAME.
int vp9_get_pred_context_comp_ref_p1(const VP9_COMMON *cm,
const MACROBLOCKD *xd) {
int pred_context;
const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd));
const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd));
const int above_in_image = above_mbmi != NULL;
const int left_in_image = left_mbmi != NULL;
// Note:
// The mode info data structure has a one element border above and to the
// left of the entries corresponding to real macroblocks.
// The prediction flags in these dummy entries are initialised to 0.
const int fix_ref_idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref];
const int var_ref_idx = !fix_ref_idx;
if (above_in_image && left_in_image) { // both edges available
const int above_intra = !is_inter_block(above_mbmi);
const int left_intra = !is_inter_block(left_mbmi);
if (above_intra && left_intra) { // intra/intra (2)
pred_context = 2;
} else if (above_intra || left_intra) { // intra/inter
const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
if (!has_second_ref(edge_mbmi)) // single pred (1/3)
pred_context = 1 + 2 * (edge_mbmi->ref_frame[0] != LAST_FRAME);
else // comp pred (1/3)
pred_context = 1 + 2 * (edge_mbmi->ref_frame[var_ref_idx]
!= LAST_FRAME);
} else { // inter/inter
const int l_sg = !has_second_ref(left_mbmi);
const int a_sg = !has_second_ref(above_mbmi);
const MV_REFERENCE_FRAME vrfa = a_sg ? above_mbmi->ref_frame[0]
: above_mbmi->ref_frame[var_ref_idx];
const MV_REFERENCE_FRAME vrfl = l_sg ? left_mbmi->ref_frame[0]
: left_mbmi->ref_frame[var_ref_idx];
if (vrfa == vrfl && vrfa == LAST_FRAME)
pred_context = 0;
else if (l_sg && a_sg) { // single/single
if (vrfa == LAST_FRAME || vrfl == LAST_FRAME)
pred_context = 1;
else if (CHECK_GOLDEN_LAST3_LAST4(vrfa) ||
CHECK_GOLDEN_LAST3_LAST4(vrfl))
pred_context = 2 + (vrfa != vrfl);
else if (vrfa == vrfl)
pred_context = 3;
else
pred_context = 4;
} else if (l_sg || a_sg) { // single/comp
const MV_REFERENCE_FRAME vrfc = l_sg ? vrfa : vrfl;
const MV_REFERENCE_FRAME rfs = a_sg ? vrfa : vrfl;
if (vrfc == LAST_FRAME && rfs != LAST_FRAME)
pred_context = 1;
else if (rfs == LAST_FRAME && vrfc != LAST_FRAME)
pred_context = 2;
else
pred_context = 3 +
(vrfc == LAST2_FRAME || CHECK_GOLDEN_LAST3_LAST4(rfs));
} else { // comp/comp
if (vrfa == LAST_FRAME || vrfl == LAST_FRAME)
pred_context = 2;
else
pred_context = 3 + (CHECK_GOLDEN_LAST3_LAST4(vrfa) ||
CHECK_GOLDEN_LAST3_LAST4(vrfl));
}
}
} else if (above_in_image || left_in_image) { // one edge available
const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
if (!is_inter_block(edge_mbmi)) {
pred_context = 2;
} else {
if (has_second_ref(edge_mbmi)) {
pred_context = 4 * (edge_mbmi->ref_frame[var_ref_idx] != LAST_FRAME);
} else {
if (edge_mbmi->ref_frame[0] == LAST_FRAME)
pred_context = 0;
else
pred_context = 2 + CHECK_GOLDEN_LAST3_LAST4(edge_mbmi->ref_frame[0]);
}
}
} else { // no edges available (2)
pred_context = 2;
}
assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
return pred_context;
}
#define CHECK_LAST3_OR_LAST4(ref_frame) \
((ref_frame == LAST3_FRAME) || (ref_frame == LAST4_FRAME))
// Returns a context number for the given MB prediction signal:
// whether the first reference frame of a compound mode is GOLDEN,
// given that it is known to be either GOLDEN, LAST3 or LAST4.
//
// NOTE(zoeliu): The probability that ref_frame[0] is GOLDEN_FRAME,
// conditioned on it being either GOLDEN, LAST3 or LAST4.
int vp9_get_pred_context_comp_ref_p2(const VP9_COMMON *cm,
const MACROBLOCKD *xd) {
int pred_context;
const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd));
const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd));
const int above_in_image = above_mbmi != NULL;
const int left_in_image = left_mbmi != NULL;
// Note:
// The mode info data structure has a one element border above and to the
// left of the entries corresponding to real macroblocks.
// The prediction flags in these dummy entries are initialised to 0.
const int fix_ref_idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref];
const int var_ref_idx = !fix_ref_idx;
if (above_in_image && left_in_image) { // both edges available
const int above_intra = !is_inter_block(above_mbmi);
const int left_intra = !is_inter_block(left_mbmi);
if (above_intra && left_intra) { // intra/intra (2)
pred_context = 2;
} else if (above_intra || left_intra) { // intra/inter
const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
if (!has_second_ref(edge_mbmi)) // single pred (1/3)
pred_context = 1 + 2 * (edge_mbmi->ref_frame[0] != GOLDEN_FRAME);
else // comp pred (1/3)
pred_context = 1 +
2 * (edge_mbmi->ref_frame[var_ref_idx] != GOLDEN_FRAME);
} else { // inter/inter
const int l_sg = !has_second_ref(left_mbmi);
const int a_sg = !has_second_ref(above_mbmi);
const MV_REFERENCE_FRAME vrfa = a_sg ? above_mbmi->ref_frame[0]
: above_mbmi->ref_frame[var_ref_idx];
const MV_REFERENCE_FRAME vrfl = l_sg ? left_mbmi->ref_frame[0]
: left_mbmi->ref_frame[var_ref_idx];
if (vrfa == vrfl && vrfa == GOLDEN_FRAME)
pred_context = 0;
else if (l_sg && a_sg) { // single/single
if (vrfa == GOLDEN_FRAME || vrfl == GOLDEN_FRAME)
pred_context = 1;
else if (CHECK_LAST_OR_LAST2(vrfa) || CHECK_LAST_OR_LAST2(vrfl))
pred_context = 2 + (vrfa != vrfl);
else if (vrfa == vrfl)
pred_context = 3;
else
pred_context = 4;
} else if (l_sg || a_sg) { // single/comp
const MV_REFERENCE_FRAME vrfc = l_sg ? vrfa : vrfl;
const MV_REFERENCE_FRAME rfs = a_sg ? vrfa : vrfl;
if (vrfc == GOLDEN_FRAME && rfs != GOLDEN_FRAME)
pred_context = 1;
else if (rfs == GOLDEN_FRAME && vrfc != GOLDEN_FRAME)
pred_context = 2;
else
pred_context = 3 +
(CHECK_LAST3_OR_LAST4(vrfc) || CHECK_LAST_OR_LAST2(rfs));
} else { // comp/comp
if (vrfa == GOLDEN_FRAME || vrfl == GOLDEN_FRAME)
pred_context = 2;
else
pred_context = 3 +
(CHECK_LAST_OR_LAST2(vrfa) || CHECK_LAST_OR_LAST2(vrfl));
}
}
} else if (above_in_image || left_in_image) { // one edge available
const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
if (!is_inter_block(edge_mbmi)) {
pred_context = 2;
} else {
if (has_second_ref(edge_mbmi)) {
pred_context = 4 * (edge_mbmi->ref_frame[var_ref_idx] != GOLDEN_FRAME);
} else {
if (edge_mbmi->ref_frame[0] == GOLDEN_FRAME)
pred_context = 0;
else
pred_context = 2 + CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0]);
}
}
} else { // no edges available (2)
pred_context = 2;
}
assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
return pred_context;
}
#define CHECK_LAST_LAST2_GOLDEN(ref_frame) \
((ref_frame == LAST_FRAME) || (ref_frame == LAST2_FRAME) || \
(ref_frame == GOLDEN_FRAME))
// Returns a context number for the given MB prediction signal:
// whether the first reference frame of a compound mode is LAST3,
// given that it is known to be either LAST3 or LAST4.
//
// NOTE(zoeliu): The probability that ref_frame[0] is LAST3_FRAME,
// conditioned on it being either LAST3 or LAST4.
int vp9_get_pred_context_comp_ref_p3(const VP9_COMMON *cm,
const MACROBLOCKD *xd) {
int pred_context;
const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd));
const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd));
const int above_in_image = above_mbmi != NULL;
const int left_in_image = left_mbmi != NULL;
// Note:
// The mode info data structure has a one element border above and to the
// left of the entries corresponding to real macroblocks.
// The prediction flags in these dummy entries are initialised to 0.
const int fix_ref_idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref];
const int var_ref_idx = !fix_ref_idx;
if (above_in_image && left_in_image) { // both edges available
const int above_intra = !is_inter_block(above_mbmi);
const int left_intra = !is_inter_block(left_mbmi);
if (above_intra && left_intra) { // intra/intra (2)
pred_context = 2;
} else if (above_intra || left_intra) { // intra/inter
const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
if (!has_second_ref(edge_mbmi)) // single pred (1/3)
pred_context = 1 + 2 * (edge_mbmi->ref_frame[0] != LAST3_FRAME);
else // comp pred (1/3)
pred_context = 1 +
2 * (edge_mbmi->ref_frame[var_ref_idx] != LAST3_FRAME);
} else { // inter/inter
const int l_sg = !has_second_ref(left_mbmi);
const int a_sg = !has_second_ref(above_mbmi);
const MV_REFERENCE_FRAME vrfa = a_sg ? above_mbmi->ref_frame[0]
: above_mbmi->ref_frame[var_ref_idx];
const MV_REFERENCE_FRAME vrfl = l_sg ? left_mbmi->ref_frame[0]
: left_mbmi->ref_frame[var_ref_idx];
if (vrfa == vrfl && vrfa == LAST3_FRAME)
pred_context = 0;
else if (l_sg && a_sg) { // single/single
if (vrfa == LAST3_FRAME || vrfl == LAST3_FRAME)
pred_context = 1;
else if (CHECK_LAST_LAST2_GOLDEN(vrfa) || CHECK_LAST_LAST2_GOLDEN(vrfl))
pred_context = 2 + (vrfa != vrfl);
else if (vrfa == vrfl)
pred_context = 3;
else
pred_context = 4;
} else if (l_sg || a_sg) { // single/comp
const MV_REFERENCE_FRAME vrfc = l_sg ? vrfa : vrfl;
const MV_REFERENCE_FRAME rfs = a_sg ? vrfa : vrfl;
if (vrfc == LAST3_FRAME && rfs != LAST3_FRAME)
pred_context = 1;
else if (rfs == LAST3_FRAME && vrfc != LAST3_FRAME)
pred_context = 2;
else
pred_context = 3 +
(vrfc == LAST4_FRAME || CHECK_LAST_LAST2_GOLDEN(rfs));
} else { // comp/comp
if (vrfa == LAST3_FRAME || vrfl == LAST3_FRAME)
pred_context = 2;
else
pred_context = 3 +
(CHECK_LAST_LAST2_GOLDEN(vrfa) || CHECK_LAST_LAST2_GOLDEN(vrfl));
}
}
} else if (above_in_image || left_in_image) { // one edge available
const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
if (!is_inter_block(edge_mbmi)) {
pred_context = 2;
} else {
if (has_second_ref(edge_mbmi)) {
pred_context = 4 * (edge_mbmi->ref_frame[var_ref_idx] != LAST3_FRAME);
} else {
if (edge_mbmi->ref_frame[0] == LAST3_FRAME)
pred_context = 0;
else
pred_context = 2 + CHECK_LAST_LAST2_GOLDEN(edge_mbmi->ref_frame[0]);
}
}
} else { // no edges available (2)
pred_context = 2;
}
assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
return pred_context;
}
#else // CONFIG_MULTI_REF
// Returns a context number for the given MB prediction signal
int vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm,
const MACROBLOCKD *xd) {
@@ -192,6 +606,479 @@ int vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm,
return pred_context;
}
#endif // CONFIG_MULTI_REF
#if CONFIG_MULTI_REF
#define CHECK_LAST_LAST2_LAST3(ref_frame) \
((ref_frame == LAST_FRAME) || (ref_frame == LAST2_FRAME) || \
(ref_frame == LAST3_FRAME))
#define CHECK_GOLDEN_OR_ALTREF(ref_frame) \
((ref_frame == GOLDEN_FRAME) || (ref_frame == ALTREF_FRAME))
// For the bit to signal whether the single reference is an ALTREF_FRAME
// or a GOLDEN_FRAME.
//
// NOTE(zoeliu): The probability that ref_frame[0] is ALTREF/GOLDEN.
int vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) {
int pred_context;
const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd));
const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd));
const int has_above = above_mbmi != NULL;
const int has_left = left_mbmi != NULL;
// Note:
// The mode info data structure has a one element border above and to the
// left of the entries corresponding to real macroblocks.
// The prediction flags in these dummy entries are initialised to 0.
if (has_above && has_left) { // both edges available
const int above_intra = !is_inter_block(above_mbmi);
const int left_intra = !is_inter_block(left_mbmi);
if (above_intra && left_intra) { // intra/intra
pred_context = 2;
} else if (above_intra || left_intra) { // intra/inter or inter/intra
const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
if (!has_second_ref(edge_mbmi))
pred_context = 4 * (!CHECK_GOLDEN_OR_ALTREF(edge_mbmi->ref_frame[0]));
else
pred_context = 1 + (!CHECK_GOLDEN_OR_ALTREF(edge_mbmi->ref_frame[0]) ||
!CHECK_GOLDEN_OR_ALTREF(edge_mbmi->ref_frame[1]));
} else { // inter/inter
const int above_has_second = has_second_ref(above_mbmi);
const int left_has_second = has_second_ref(left_mbmi);
const MV_REFERENCE_FRAME above0 = above_mbmi->ref_frame[0];
const MV_REFERENCE_FRAME above1 = above_mbmi->ref_frame[1];
const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0];
const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1];
if (above_has_second && left_has_second) {
pred_context = 1 + (!CHECK_GOLDEN_OR_ALTREF(above0) ||
!CHECK_GOLDEN_OR_ALTREF(above1) ||
!CHECK_GOLDEN_OR_ALTREF(left0) ||
!CHECK_GOLDEN_OR_ALTREF(left1));
} else if (above_has_second || left_has_second) {
const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0;
const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0;
const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1;
if (!CHECK_GOLDEN_OR_ALTREF(rfs))
pred_context = 3 + (!CHECK_GOLDEN_OR_ALTREF(crf1) ||
!CHECK_GOLDEN_OR_ALTREF(crf2));
else
pred_context = !CHECK_GOLDEN_OR_ALTREF(crf1) ||
!CHECK_GOLDEN_OR_ALTREF(crf2);
} else {
pred_context = 2 * (!CHECK_GOLDEN_OR_ALTREF(above0)) +
2 * (!CHECK_GOLDEN_OR_ALTREF(left0));
}
}
} else if (has_above || has_left) { // one edge available
const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi;
if (!is_inter_block(edge_mbmi)) { // intra
pred_context = 2;
} else { // inter
if (!has_second_ref(edge_mbmi))
pred_context = 4 * (!CHECK_GOLDEN_OR_ALTREF(edge_mbmi->ref_frame[0]));
else
pred_context = 1 + (!CHECK_GOLDEN_OR_ALTREF(edge_mbmi->ref_frame[0]) ||
!CHECK_GOLDEN_OR_ALTREF(edge_mbmi->ref_frame[1]));
}
} else { // no edges available
pred_context = 2;
}
assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
return pred_context;
}
// For the bit to signal whether the single reference is ALTREF_FRAME or
// GOLDEN_FRAME, given that it is known to be one of these two choices.
//
// NOTE(zoeliu): The probability that ref_frame[0] is ALTREF_FRAME,
// conditioned on it being either ALTREF_FRAME or GOLDEN_FRAME.
int vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) {
int pred_context;
const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd));
const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd));
const int has_above = above_mbmi != NULL;
const int has_left = left_mbmi != NULL;
// Note:
// The mode info data structure has a one element border above and to the
// left of the entries corresponding to real macroblocks.
// The prediction flags in these dummy entries are initialised to 0.
if (has_above && has_left) { // both edges available
const int above_intra = !is_inter_block(above_mbmi);
const int left_intra = !is_inter_block(left_mbmi);
if (above_intra && left_intra) { // intra/intra
pred_context = 2;
} else if (above_intra || left_intra) { // intra/inter or inter/intra
const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
if (!has_second_ref(edge_mbmi)) {
if (!CHECK_GOLDEN_OR_ALTREF(edge_mbmi->ref_frame[0]))
pred_context = 3;
else
pred_context = 4 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME);
} else {
pred_context = 1 + 2 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME ||
edge_mbmi->ref_frame[1] == GOLDEN_FRAME);
}
} else { // inter/inter
const int above_has_second = has_second_ref(above_mbmi);
const int left_has_second = has_second_ref(left_mbmi);
const MV_REFERENCE_FRAME above0 = above_mbmi->ref_frame[0];
const MV_REFERENCE_FRAME above1 = above_mbmi->ref_frame[1];
const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0];
const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1];
if (above_has_second && left_has_second) {
if (above0 == left0 && above1 == left1)
pred_context = 3 * (above0 == GOLDEN_FRAME ||
above1 == GOLDEN_FRAME ||
left0 == GOLDEN_FRAME ||
left1 == GOLDEN_FRAME);
else
pred_context = 2;
} else if (above_has_second || left_has_second) {
const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0;
const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0;
const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1;
if (rfs == GOLDEN_FRAME)
pred_context = 3 + (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME);
else if (rfs == ALTREF_FRAME)
pred_context = (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME);
else
pred_context = 1 + 2 * (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME);
} else {
if (!CHECK_GOLDEN_OR_ALTREF(above0) && !CHECK_GOLDEN_OR_ALTREF(left0)) {
pred_context = 2 + (above0 == left0);
} else if (!CHECK_GOLDEN_OR_ALTREF(above0) ||
!CHECK_GOLDEN_OR_ALTREF(left0)) {
const MV_REFERENCE_FRAME edge0 =
!CHECK_GOLDEN_OR_ALTREF(above0) ? left0 : above0;
pred_context = 4 * (edge0 == GOLDEN_FRAME);
} else {
pred_context = 2 * (above0 == GOLDEN_FRAME) +
2 * (left0 == GOLDEN_FRAME);
}
}
}
} else if (has_above || has_left) { // one edge available
const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi;
if (!is_inter_block(edge_mbmi) ||
(!CHECK_GOLDEN_OR_ALTREF(edge_mbmi->ref_frame[0]) &&
!has_second_ref(edge_mbmi)))
pred_context = 2;
else if (!has_second_ref(edge_mbmi))
pred_context = 4 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME);
else
pred_context = 3 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME ||
edge_mbmi->ref_frame[1] == GOLDEN_FRAME);
} else { // no edges available (2)
pred_context = 2;
}
assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
return pred_context;
}
// For the bit to signal whether the single reference is LAST3/LAST4 or
// LAST2/LAST, given that it is known to be one of these two groups.
//
// NOTE(zoeliu): The probability that ref_frame[0] is LAST3/LAST4,
// conditioned on it being one of LAST, LAST2, LAST3 or LAST4.
int vp9_get_pred_context_single_ref_p3(const MACROBLOCKD *xd) {
int pred_context;
const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd));
const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd));
const int has_above = above_mbmi != NULL;
const int has_left = left_mbmi != NULL;
// Note:
// The mode info data structure has a one element border above and to the
// left of the entries corresponding to real macroblocks.
// The prediction flags in these dummy entries are initialised to 0.
if (has_above && has_left) { // both edges available
const int above_intra = !is_inter_block(above_mbmi);
const int left_intra = !is_inter_block(left_mbmi);
if (above_intra && left_intra) { // intra/intra
pred_context = 2;
} else if (above_intra || left_intra) { // intra/inter or inter/intra
const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
if (!has_second_ref(edge_mbmi)) {
if (CHECK_GOLDEN_OR_ALTREF(edge_mbmi->ref_frame[0]))
pred_context = 3;
else
pred_context = 4 * CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0]);
} else {
pred_context = 1 +
2 * (CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0]) ||
CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[1]));
}
} else { // inter/inter
const int above_has_second = has_second_ref(above_mbmi);
const int left_has_second = has_second_ref(left_mbmi);
const MV_REFERENCE_FRAME above0 = above_mbmi->ref_frame[0];
const MV_REFERENCE_FRAME above1 = above_mbmi->ref_frame[1];
const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0];
const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1];
if (above_has_second && left_has_second) {
if (above0 == left0 && above1 == left1)
pred_context = 3 * (CHECK_LAST_OR_LAST2(above0) ||
CHECK_LAST_OR_LAST2(above1) ||
CHECK_LAST_OR_LAST2(left0) ||
CHECK_LAST_OR_LAST2(left1));
else
pred_context = 2;
} else if (above_has_second || left_has_second) {
const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0;
const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0;
const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1;
if (CHECK_LAST_OR_LAST2(rfs))
pred_context = 3 + (CHECK_LAST_OR_LAST2(crf1) ||
CHECK_LAST_OR_LAST2(crf2));
else if (rfs == LAST3_FRAME || rfs == LAST4_FRAME)
pred_context = (CHECK_LAST_OR_LAST2(crf1) ||
CHECK_LAST_OR_LAST2(crf2));
else
pred_context = 1 + 2 * (CHECK_LAST_OR_LAST2(crf1) ||
CHECK_LAST_OR_LAST2(crf2));
} else {
if (CHECK_GOLDEN_OR_ALTREF(above0) && CHECK_GOLDEN_OR_ALTREF(left0)) {
pred_context = 2 + (above0 == left0);
} else if (CHECK_GOLDEN_OR_ALTREF(above0) ||
CHECK_GOLDEN_OR_ALTREF(left0)) {
const MV_REFERENCE_FRAME edge0 =
CHECK_GOLDEN_OR_ALTREF(above0) ? left0 : above0;
pred_context = 4 * CHECK_LAST_OR_LAST2(edge0);
} else {
pred_context = 2 * CHECK_LAST_OR_LAST2(above0) +
2 * CHECK_LAST_OR_LAST2(left0);
}
}
}
} else if (has_above || has_left) { // one edge available
const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi;
if (!is_inter_block(edge_mbmi) ||
(CHECK_GOLDEN_OR_ALTREF(edge_mbmi->ref_frame[0]) &&
!has_second_ref(edge_mbmi)))
pred_context = 2;
else if (!has_second_ref(edge_mbmi))
pred_context = 4 * (CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0]));
else
pred_context = 3 * (CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0]) ||
CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[1]));
} else { // no edges available (2)
pred_context = 2;
}
assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
return pred_context;
}
// For the bit to signal whether the single reference is LAST2_FRAME or
// LAST_FRAME, given that it is known to be one of these two choices.
//
// NOTE(zoeliu): The probability that ref_frame[0] is LAST2_FRAME,
// conditioned on it being either LAST2_FRAME or LAST_FRAME.
int vp9_get_pred_context_single_ref_p4(const MACROBLOCKD *xd) {
int pred_context;
const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd));
const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd));
const int has_above = above_mbmi != NULL;
const int has_left = left_mbmi != NULL;
// Note:
// The mode info data structure has a one element border above and to the
// left of the entries corresponding to real macroblocks.
// The prediction flags in these dummy entries are initialised to 0.
if (has_above && has_left) { // both edges available
const int above_intra = !is_inter_block(above_mbmi);
const int left_intra = !is_inter_block(left_mbmi);
if (above_intra && left_intra) { // intra/intra
pred_context = 2;
} else if (above_intra || left_intra) { // intra/inter or inter/intra
const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
if (!has_second_ref(edge_mbmi)) {
if (!CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0]))
pred_context = 3;
else
pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST_FRAME);
} else {
pred_context = 1 +
2 * (edge_mbmi->ref_frame[0] == LAST_FRAME ||
edge_mbmi->ref_frame[1] == LAST_FRAME);
}
} else { // inter/inter
const int above_has_second = has_second_ref(above_mbmi);
const int left_has_second = has_second_ref(left_mbmi);
const MV_REFERENCE_FRAME above0 = above_mbmi->ref_frame[0];
const MV_REFERENCE_FRAME above1 = above_mbmi->ref_frame[1];
const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0];
const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1];
if (above_has_second && left_has_second) {
if (above0 == left0 && above1 == left1)
pred_context = 3 * (above0 == LAST_FRAME || above1 == LAST_FRAME ||
left0 == LAST_FRAME || left1 == LAST_FRAME);
else
pred_context = 2;
} else if (above_has_second || left_has_second) {
const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0;
const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0;
const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1;
if (rfs == LAST_FRAME)
pred_context = 3 + (crf1 == LAST_FRAME || crf2 == LAST_FRAME);
else if (rfs == LAST2_FRAME)
pred_context = (crf1 == LAST_FRAME || crf2 == LAST_FRAME);
else
pred_context = 1 + 2 * (crf1 == LAST_FRAME || crf2 == LAST_FRAME);
} else {
if (!CHECK_LAST_OR_LAST2(above0) &&
!CHECK_LAST_OR_LAST2(left0)) {
pred_context = 2 + (above0 == left0);
} else if (!CHECK_LAST_OR_LAST2(above0) ||
!CHECK_LAST_OR_LAST2(left0)) {
const MV_REFERENCE_FRAME edge0 =
!CHECK_LAST_OR_LAST2(above0) ? left0 : above0;
pred_context = 4 * (edge0 == LAST_FRAME);
} else {
pred_context = 2 * (above0 == LAST_FRAME) + 2 * (left0 == LAST_FRAME);
}
}
}
} else if (has_above || has_left) { // one edge available
const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi;
if (!is_inter_block(edge_mbmi) ||
(!CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0]) &&
!has_second_ref(edge_mbmi)))
pred_context = 2;
else if (!has_second_ref(edge_mbmi))
pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST_FRAME);
else
pred_context = 3 * (edge_mbmi->ref_frame[0] == LAST_FRAME ||
edge_mbmi->ref_frame[1] == LAST_FRAME);
} else { // no edges available (2)
pred_context = 2;
}
assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
return pred_context;
}
// For the bit to signal whether the single reference is LAST4_FRAME or
// LAST3_FRAME, given that it is known to be one of these two choices.
//
// NOTE(zoeliu): The probability that ref_frame[0] is LAST4_FRAME,
// conditioned on it being either LAST4_FRAME or LAST3_FRAME.
int vp9_get_pred_context_single_ref_p5(const MACROBLOCKD *xd) {
int pred_context;
const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd));
const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd));
const int has_above = above_mbmi != NULL;
const int has_left = left_mbmi != NULL;
// Note:
// The mode info data structure has a one element border above and to the
// left of the entries corresponding to real macroblocks.
// The prediction flags in these dummy entries are initialised to 0.
if (has_above && has_left) { // both edges available
const int above_intra = !is_inter_block(above_mbmi);
const int left_intra = !is_inter_block(left_mbmi);
if (above_intra && left_intra) { // intra/intra
pred_context = 2;
} else if (above_intra || left_intra) { // intra/inter or inter/intra
const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
if (!has_second_ref(edge_mbmi)) {
if (!CHECK_LAST3_OR_LAST4(edge_mbmi->ref_frame[0]))
pred_context = 3;
else
pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST3_FRAME);
} else {
pred_context = 1 +
2 * (edge_mbmi->ref_frame[0] == LAST3_FRAME ||
edge_mbmi->ref_frame[1] == LAST3_FRAME);
}
} else { // inter/inter
const int above_has_second = has_second_ref(above_mbmi);
const int left_has_second = has_second_ref(left_mbmi);
const MV_REFERENCE_FRAME above0 = above_mbmi->ref_frame[0];
const MV_REFERENCE_FRAME above1 = above_mbmi->ref_frame[1];
const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0];
const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1];
if (above_has_second && left_has_second) {
if (above0 == left0 && above1 == left1)
pred_context = 3 * (above0 == LAST3_FRAME || above1 == LAST3_FRAME ||
left0 == LAST3_FRAME || left1 == LAST3_FRAME);
else
pred_context = 2;
} else if (above_has_second || left_has_second) {
const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0;
const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0;
const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1;
if (rfs == LAST3_FRAME)
pred_context = 3 + (crf1 == LAST3_FRAME || crf2 == LAST3_FRAME);
else if (rfs == LAST4_FRAME)
pred_context = (crf1 == LAST3_FRAME || crf2 == LAST3_FRAME);
else
pred_context = 1 + 2 * (crf1 == LAST3_FRAME || crf2 == LAST3_FRAME);
} else {
if (!CHECK_LAST3_OR_LAST4(above0) &&
!CHECK_LAST3_OR_LAST4(left0)) {
pred_context = 2 + (above0 == left0);
} else if (!CHECK_LAST3_OR_LAST4(above0) ||
!CHECK_LAST3_OR_LAST4(left0)) {
const MV_REFERENCE_FRAME edge0 =
!CHECK_LAST3_OR_LAST4(above0) ? left0 : above0;
pred_context = 4 * (edge0 == LAST3_FRAME);
} else {
pred_context = 2 * (above0 == LAST3_FRAME) +
2 * (left0 == LAST3_FRAME);
}
}
}
} else if (has_above || has_left) { // one edge available
const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi;
if (!is_inter_block(edge_mbmi) ||
(!CHECK_LAST3_OR_LAST4(edge_mbmi->ref_frame[0]) &&
!has_second_ref(edge_mbmi)))
pred_context = 2;
else if (!has_second_ref(edge_mbmi))
pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST3_FRAME);
else
pred_context = 3 * (edge_mbmi->ref_frame[0] == LAST3_FRAME ||
edge_mbmi->ref_frame[1] == LAST3_FRAME);
} else { // no edges available (2)
pred_context = 2;
}
assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
return pred_context;
}
#else // CONFIG_MULTI_REF
int vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) {
int pred_context;
const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd));
@@ -343,6 +1230,9 @@ int vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) {
assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
return pred_context;
}
#endif // CONFIG_MULTI_REF
// Returns a context number for the given MB prediction signal
// The mode info data structure has a one element border above and to the
// left of the entries corresponding to real blocks.
@@ -383,3 +1273,47 @@ int vp9_get_segment_id(const VP9_COMMON *cm, const uint8_t *segment_ids,
assert(segment_id >= 0 && segment_id < MAX_SEGMENTS);
return segment_id;
}
#if CONFIG_COPY_MODE
int vp9_get_copy_mode_context(const MACROBLOCKD *xd) {
const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd));
const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd));
const int has_above = above_mbmi != NULL;
const int has_left = left_mbmi != NULL;
if (has_above && has_left) {
const int above_intra = !is_inter_block(above_mbmi);
const int left_intra = !is_inter_block(left_mbmi);
if (above_intra && left_intra) {
return 4;
} else if (above_intra || left_intra) {
return 3;
} else {
const int above_predict = above_mbmi->copy_mode != NOREF;
const int left_predict = left_mbmi->copy_mode != NOREF;
if (above_predict && left_predict)
return 0;
else if (above_predict || left_predict)
return 1;
else
return 2;
}
} else if (has_above || has_left) {
const MB_MODE_INFO *const ref_mbmi = has_above ? above_mbmi : left_mbmi;
const int ref_intra = !is_inter_block(ref_mbmi);
if (ref_intra) {
return 3;
} else {
const int ref_predict = ref_mbmi->copy_mode != NOREF;
if (ref_predict)
return 0;
else
return 1;
}
} else {
return 0;
}
}
#endif // CONFIG_COPY_MODE


@@ -57,6 +57,54 @@ static INLINE vp9_prob vp9_get_skip_prob(const VP9_COMMON *cm,
return cm->fc.skip_probs[vp9_get_skip_context(xd)];
}
#if CONFIG_SR_MODE
#include "vp9/common/vp9_sr_txfm.h"
static INLINE int vp9_get_sr_context(const MACROBLOCKD *xd,
BLOCK_SIZE bsize) {
TX_SIZE max_tx_size = max_txsize_lookup[bsize];
int ctx;
(void)xd;
assert(max_tx_size >= MIN_SR_TX_SIZE &&
max_tx_size <= MAX_SR_TX_SIZE);
ctx = max_tx_size - MIN_SR_TX_SIZE;
return ctx;
}
static INLINE vp9_prob vp9_get_sr_prob(const VP9_COMMON *cm,
const MACROBLOCKD *xd,
BLOCK_SIZE bsize) {
int sr_ctx = vp9_get_sr_context(xd, bsize);
assert(sr_ctx >= 0 && sr_ctx < SR_CONTEXTS);
return cm->fc.sr_probs[sr_ctx];
}
#if SR_USE_MULTI_F
static INLINE vp9_prob vp9_get_sr_usfilter_context(const MACROBLOCKD *xd) {
(void) xd;
return 0;
/*const MODE_INFO *const above_mi = get_above_mi(xd);
const MODE_INFO *const left_mi = get_left_mi(xd);
int above_sr_ver =
(above_mi != NULL && above_mi->mbmi.sr && !above_mi->mbmi.skip) ?
idx_to_v(above_mi->mbmi.us_filter_idx) : SR_USFILTER_NUM_D;
int left_sr_hor =
(left_mi != NULL && left_mi->mbmi.sr && !left_mi->mbmi.skip) ?
idx_to_h(left_mi->mbmi.us_filter_idx) : SR_USFILTER_NUM_D;
return above_sr_ver * 3 + left_sr_hor;*/
}
static INLINE const vp9_prob * vp9_get_sr_usfilter_prob(const VP9_COMMON *cm,
const MACROBLOCKD *xd) {
int sr_usfilter_ctx = vp9_get_sr_usfilter_context(xd);
assert(sr_usfilter_ctx >= 0 && sr_usfilter_ctx < SR_USFILTER_CONTEXTS);
return cm->fc.sr_usfilter_probs[sr_usfilter_ctx];
}
#endif // SR_USE_MULTI_F
#endif // CONFIG_SR_MODE
int vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd);
int vp9_get_intra_inter_context(const MACROBLOCKD *xd);
@@ -79,23 +127,75 @@ int vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm,
static INLINE vp9_prob vp9_get_pred_prob_comp_ref_p(const VP9_COMMON *cm,
const MACROBLOCKD *xd) {
const int pred_context = vp9_get_pred_context_comp_ref_p(cm, xd);
return cm->fc.comp_ref_probs[pred_context][0];
}
#if CONFIG_MULTI_REF
int vp9_get_pred_context_comp_ref_p1(const VP9_COMMON *cm,
const MACROBLOCKD *xd);
static INLINE vp9_prob vp9_get_pred_prob_comp_ref_p1(const VP9_COMMON *cm,
const MACROBLOCKD *xd) {
const int pred_context = vp9_get_pred_context_comp_ref_p1(cm, xd);
return cm->fc.comp_ref_probs[pred_context][1];
}
int vp9_get_pred_context_comp_ref_p2(const VP9_COMMON *cm,
const MACROBLOCKD *xd);
static INLINE vp9_prob vp9_get_pred_prob_comp_ref_p2(const VP9_COMMON *cm,
const MACROBLOCKD *xd) {
const int pred_context = vp9_get_pred_context_comp_ref_p2(cm, xd);
return cm->fc.comp_ref_probs[pred_context][2];
}
int vp9_get_pred_context_comp_ref_p3(const VP9_COMMON *cm,
const MACROBLOCKD *xd);
static INLINE vp9_prob vp9_get_pred_prob_comp_ref_p3(const VP9_COMMON *cm,
const MACROBLOCKD *xd) {
const int pred_context = vp9_get_pred_context_comp_ref_p3(cm, xd);
return cm->fc.comp_ref_probs[pred_context][3];
}
#endif // CONFIG_MULTI_REF
int vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd);
static INLINE vp9_prob vp9_get_pred_prob_single_ref_p1(const VP9_COMMON *cm,
const MACROBLOCKD *xd) {
return cm->fc.single_ref_prob[vp9_get_pred_context_single_ref_p1(xd)][0];
return cm->fc.single_ref_probs[vp9_get_pred_context_single_ref_p1(xd)][0];
}
int vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd);
static INLINE vp9_prob vp9_get_pred_prob_single_ref_p2(const VP9_COMMON *cm,
const MACROBLOCKD *xd) {
return cm->fc.single_ref_prob[vp9_get_pred_context_single_ref_p2(xd)][1];
return cm->fc.single_ref_probs[vp9_get_pred_context_single_ref_p2(xd)][1];
}
#if CONFIG_MULTI_REF
int vp9_get_pred_context_single_ref_p3(const MACROBLOCKD *xd);
static INLINE vp9_prob vp9_get_pred_prob_single_ref_p3(const VP9_COMMON *cm,
const MACROBLOCKD *xd) {
return cm->fc.single_ref_probs[vp9_get_pred_context_single_ref_p3(xd)][2];
}
int vp9_get_pred_context_single_ref_p4(const MACROBLOCKD *xd);
static INLINE vp9_prob vp9_get_pred_prob_single_ref_p4(const VP9_COMMON *cm,
const MACROBLOCKD *xd) {
return cm->fc.single_ref_probs[vp9_get_pred_context_single_ref_p4(xd)][3];
}
int vp9_get_pred_context_single_ref_p5(const MACROBLOCKD *xd);
static INLINE vp9_prob vp9_get_pred_prob_single_ref_p5(const VP9_COMMON *cm,
const MACROBLOCKD *xd) {
return cm->fc.single_ref_probs[vp9_get_pred_context_single_ref_p5(xd)][4];
}
#endif // CONFIG_MULTI_REF
int vp9_get_tx_size_context(const MACROBLOCKD *xd);
static INLINE const vp9_prob *get_tx_probs(TX_SIZE max_tx_size, int ctx,
@@ -107,6 +207,10 @@ static INLINE const vp9_prob *get_tx_probs(TX_SIZE max_tx_size, int ctx,
return tx_probs->p16x16[ctx];
case TX_32X32:
return tx_probs->p32x32[ctx];
#if CONFIG_TX64X64
case TX_64X64:
return tx_probs->p64x64[ctx];
#endif
default:
assert(0 && "Invalid max_tx_size.");
return NULL;
@@ -128,12 +232,41 @@ static INLINE unsigned int *get_tx_counts(TX_SIZE max_tx_size, int ctx,
return tx_counts->p16x16[ctx];
case TX_32X32:
return tx_counts->p32x32[ctx];
#if CONFIG_TX64X64
case TX_64X64:
return tx_counts->p64x64[ctx];
#endif
default:
assert(0 && "Invalid max_tx_size.");
return NULL;
}
}
#if CONFIG_SR_MODE
static INLINE unsigned int *get_real_tx_counts(TX_SIZE max_tx_size, int ctx,
struct tx_counts *tx_counts) {
switch (max_tx_size) {
case TX_8X8:
return tx_counts->real_p8x8[ctx];
case TX_16X16:
return tx_counts->real_p16x16[ctx];
case TX_32X32:
return tx_counts->real_p32x32[ctx];
#if CONFIG_TX64X64
case TX_64X64:
return tx_counts->real_p64x64[ctx];
#endif // CONFIG_TX64X64
default:
assert(0 && "Invalid max_tx_size.");
return NULL;
}
}
#endif // CONFIG_SR_MODE
#if CONFIG_COPY_MODE
int vp9_get_copy_mode_context(const MACROBLOCKD *xd);
#endif // CONFIG_COPY_MODE
#ifdef __cplusplus
} // extern "C"
#endif

File diff suppressed because it is too large


@@ -0,0 +1,22 @@
/*
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP9_COMMON_VP9_QCTX_TOKEN_PROBS_H_
#define VP9_COMMON_VP9_QCTX_TOKEN_PROBS_H_
#include "vp9/common/vp9_entropymode.h"
#if CONFIG_QCTX_TPROBS
#define QCTX_BINS_BITS 2
extern const vp9_coeff_probs_model
default_qctx_coef_probs[1 << QCTX_BINS_BITS][TX_SIZES][PLANE_TYPES];
#endif // CONFIG_QCTX_TPROBS
#endif // VP9_COMMON_VP9_QCTX_TOKEN_PROBS_H_


@@ -8,10 +8,190 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include <stdio.h>
#include <math.h>
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_entropy.h"
#include "vp9/common/vp9_quant_common.h"
#include "vp9/common/vp9_seg_common.h"
#if CONFIG_TX_SKIP
int tx_skip_q_thresh_inter = FOR_SCREEN_CONTENT ? 255 : 64;
int tx_skip_q_thresh_intra = 255;
#endif // CONFIG_TX_SKIP
#if CONFIG_NEW_QUANT
// Bin widths expressed as a fraction over 128 of the quant stepsize,
// for the quantization bins 0-4.
// A value x indicates that the bin width is a factor x/128 of the
// nominal quantization step. For the zero bin, the width given is
// for one side of zero only, so the actual width is twice that.
// There are four sets of values for 4 different quantizer ranges.
//
// TODO(debargha): Optimize these tables
static const uint8_t vp9_nuq_knots_lossless[COEF_BANDS][NUQ_KNOTS] = {
{64, 128, 128}, // dc, band 0
{64, 128, 128}, // band 1
{64, 128, 128}, // band 2
{64, 128, 128}, // band 3
{64, 128, 128}, // band 4
{64, 128, 128}, // band 5
#if CONFIG_TX_SKIP
{64, 128, 128}, // band 6
#endif // CONFIG_TX_SKIP
};
static const uint8_t vp9_nuq_knots[QUANT_PROFILES][COEF_BANDS][NUQ_KNOTS] = {
{
{86, 122, 134}, // dc, band 0
{78, 122, 134}, // band 1
{78, 122, 134}, // band 2
{84, 122, 133}, // band 3
{88, 122, 134}, // band 4
{88, 122, 134}, // band 5
#if CONFIG_TX_SKIP
{86, 122, 128}, // band 6
#endif // CONFIG_TX_SKIP
},
#if QUANT_PROFILES > 1
{
{86, 122, 134}, // dc, band 0
{78, 122, 134}, // band 1
{78, 122, 134}, // band 2
{84, 122, 134}, // band 3
{88, 122, 134}, // band 4
{88, 122, 134}, // band 5
#if CONFIG_TX_SKIP
{86, 122, 128}, // band 6
#endif // CONFIG_TX_SKIP
},
#if QUANT_PROFILES > 2
{
{86, 122, 134}, // dc, band 0
{78, 122, 135}, // band 1
{78, 122, 134}, // band 2
{84, 122, 133}, // band 3
{88, 122, 134}, // band 4
{88, 122, 134}, // band 5
#if CONFIG_TX_SKIP
{86, 122, 128}, // band 6
#endif // CONFIG_TX_SKIP
}
#endif // QUANT_PROFILES > 2
#endif // QUANT_PROFILES > 1
};
static const uint8_t vp9_nuq_doff_lossless[COEF_BANDS] = { 0, 0, 0, 0, 0, 0,
#if CONFIG_TX_SKIP
0
#endif // CONFIG_TX_SKIP
};
#if QUANT_PROFILES == 1
static const uint8_t vp9_nuq_doff[QUANT_PROFILES][COEF_BANDS] = {
{ 8, 15, 16, 22, 23, 24, // dq_off_index = 0
#if CONFIG_TX_SKIP
8
#endif // CONFIG_TX_SKIP
}
};
#elif QUANT_PROFILES == 2
static const uint8_t vp9_nuq_doff[QUANT_PROFILES][COEF_BANDS] = {
{ 8, 15, 16, 22, 23, 24, // dq_off_index = 0
#if CONFIG_TX_SKIP
8
#endif // CONFIG_TX_SKIP
},
{ 13, 20, 21, 27, 28, 29, // dq_off_index = 1
#if CONFIG_TX_SKIP
8
#endif // CONFIG_TX_SKIP
},
};
#else // QUANT_PROFILES == 3
static const uint8_t vp9_nuq_doff[QUANT_PROFILES][COEF_BANDS] = {
{ 6, 14, 15, 22, 23, 27, // dq_off_index = 0
#if CONFIG_TX_SKIP
8
#endif // CONFIG_TX_SKIP
},
{ 6, 15, 17, 22, 23, 23, // dq_off_index = 1
#if CONFIG_TX_SKIP
8
#endif // CONFIG_TX_SKIP
},
{ 6, 14, 16, 22, 23, 27, // dq_off_index = 2
#if CONFIG_TX_SKIP
8
#endif // CONFIG_TX_SKIP
}
};
#endif
// Allow different quantization profiles in different q ranges,
// to enable entropy constraints in scalar quantization.
static const uint8_t *get_nuq_knots(int lossless, int band, int dq_off_index) {
if (lossless)
return vp9_nuq_knots_lossless[band];
else
return vp9_nuq_knots[dq_off_index][band];
}
static INLINE int16_t quant_to_doff_fixed(int lossless, int band,
int dq_off_index) {
if (lossless)
return vp9_nuq_doff_lossless[band];
else
return vp9_nuq_doff[dq_off_index][band];
}
static INLINE void get_cumbins_nuq(int q, int lossless, int band,
tran_low_t *cumbins, int dq_off_index) {
const uint8_t *knots = get_nuq_knots(lossless, band, dq_off_index);
int16_t cumknots[NUQ_KNOTS];
int i;
cumknots[0] = knots[0];
for (i = 1; i < NUQ_KNOTS; ++i)
cumknots[i] = cumknots[i - 1] + knots[i];
for (i = 0; i < NUQ_KNOTS; ++i)
cumbins[i] = (cumknots[i] * q + 64) >> 7;
}
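The cumulative-bin computation in get_cumbins_nuq can be checked with a minimal standalone scalar sketch. This assumes NUQ_KNOTS is 3 (per the header) and borrows knot values from the first vp9_nuq_knots profile above; note that at q = 128 the bin boundaries equal the cumulative knot sums, which is exactly the x/128 convention the comment describes.

```c
#include <assert.h>
#include <stdint.h>

#define NUQ_KNOTS 3

/* Accumulate knot widths and scale by q/128 with rounding, mirroring
 * the cumulative-bin computation in get_cumbins_nuq above. */
static void cumbins_sketch(const uint8_t *knots, int q, int32_t *cumbins) {
  int32_t cum = 0;
  int i;
  for (i = 0; i < NUQ_KNOTS; ++i) {
    cum += knots[i];
    cumbins[i] = (cum * q + 64) >> 7;  /* round(cum * q / 128) */
  }
}
```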
void vp9_get_dequant_val_nuq(int q, int lossless, int band,
tran_low_t *dq, tran_low_t *cumbins,
int dq_off_index) {
const uint8_t *knots = get_nuq_knots(lossless, band, dq_off_index);
tran_low_t cumbins_[NUQ_KNOTS], *cumbins_ptr;
tran_low_t doff;
int i;
cumbins_ptr = (cumbins ? cumbins : cumbins_);
get_cumbins_nuq(q, lossless, band, cumbins_ptr, dq_off_index);
dq[0] = 0;
for (i = 1; i < NUQ_KNOTS; ++i) {
const int16_t qstep = (knots[i] * q + 64) >> 7;
doff = quant_to_doff_fixed(lossless, band, dq_off_index);
doff = (2 * doff * qstep + q) / (2 * q);
dq[i] = cumbins_ptr[i - 1] + (((knots[i] - doff * 2) * q + 128) >> 8);
}
doff = quant_to_doff_fixed(lossless, band, dq_off_index);
dq[NUQ_KNOTS] =
cumbins_ptr[NUQ_KNOTS - 1] + (((64 - doff) * q + 64) >> 7);
}
tran_low_t vp9_dequant_abscoeff_nuq(int v, int q, const tran_low_t *dq) {
if (v <= NUQ_KNOTS)
return dq[v];
else
return dq[NUQ_KNOTS] + (v - NUQ_KNOTS) * q;
}
tran_low_t vp9_dequant_coeff_nuq(int v, int q, const tran_low_t *dq) {
tran_low_t dqmag = vp9_dequant_abscoeff_nuq(abs(v), q, dq);
return (v < 0 ? -dqmag : dqmag);
}
#endif // CONFIG_NEW_QUANT
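The dequantizer above is piecewise: magnitudes up to NUQ_KNOTS come from the reconstruction table, and larger magnitudes extrapolate linearly with step q. A self-contained sketch (tran_low_t modeled as int, hypothetical dq table values):

```c
#include <assert.h>
#include <stdlib.h>

#define NUQ_KNOTS 3
typedef int tran_low_t;  /* assumption: tran_low_t fits in int here */

/* Magnitudes up to NUQ_KNOTS are read from the table dq; larger
 * magnitudes extrapolate with step q, as in vp9_dequant_abscoeff_nuq. */
static tran_low_t dequant_abs_sketch(int v, int q, const tran_low_t *dq) {
  return (v <= NUQ_KNOTS) ? dq[v] : dq[NUQ_KNOTS] + (v - NUQ_KNOTS) * q;
}

/* Signed wrapper, as in vp9_dequant_coeff_nuq. */
static tran_low_t dequant_sketch(int v, int q, const tran_low_t *dq) {
  tran_low_t mag = dequant_abs_sketch(abs(v), q, dq);
  return v < 0 ? -mag : mag;
}
```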
static const int16_t dc_qlookup[QINDEX_RANGE] = {
4, 8, 8, 9, 10, 11, 12, 12,
13, 14, 15, 16, 17, 18, 19, 19,
@@ -275,4 +455,3 @@ int vp9_get_qindex(const struct segmentation *seg, int segment_id,
return base_qindex;
}
}


@@ -11,7 +11,10 @@
#ifndef VP9_COMMON_VP9_QUANT_COMMON_H_
#define VP9_COMMON_VP9_QUANT_COMMON_H_
#include <stdio.h>
#include "vpx/vpx_codec.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_seg_common.h"
#ifdef __cplusplus
@@ -22,6 +25,12 @@ extern "C" {
#define MAXQ 255
#define QINDEX_RANGE (MAXQ - MINQ + 1)
#define QINDEX_BITS 8
#if CONFIG_TX_SKIP
#define TX_SKIP_SHIFT_THRESH 0
#define PXD_QUANT_INDEX 0
extern int tx_skip_q_thresh_inter;
extern int tx_skip_q_thresh_intra;
#endif // CONFIG_TX_SKIP
int16_t vp9_dc_quant(int qindex, int delta, vpx_bit_depth_t bit_depth);
int16_t vp9_ac_quant(int qindex, int delta, vpx_bit_depth_t bit_depth);
@@ -29,6 +38,22 @@ int16_t vp9_ac_quant(int qindex, int delta, vpx_bit_depth_t bit_depth);
int vp9_get_qindex(const struct segmentation *seg, int segment_id,
int base_qindex);
static INLINE int16_t vp9_round_factor_to_round(int16_t quant,
int16_t round_factor) {
return (round_factor * quant) >> 7;
}
#if CONFIG_NEW_QUANT
#define NUQ_KNOTS 3
typedef tran_low_t dequant_val_type_nuq[NUQ_KNOTS + 1];
typedef tran_low_t cumbins_type_nuq[NUQ_KNOTS];
void vp9_get_dequant_val_nuq(int q, int lossless, int band,
tran_low_t *dq, tran_low_t *cumbins,
int dq_off_index);
tran_low_t vp9_dequant_abscoeff_nuq(int v, int q, const tran_low_t *dq);
tran_low_t vp9_dequant_coeff_nuq(int v, int q, const tran_low_t *dq);
#endif // CONFIG_NEW_QUANT
#ifdef __cplusplus
} // extern "C"
#endif

File diff suppressed because it is too large


@@ -30,6 +30,15 @@ void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
void vp9_dec_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
BLOCK_SIZE bsize);
#if CONFIG_SUPERTX
void vp9_build_inter_predictors_sb_sub8x8(MACROBLOCKD *xd,
int mi_row, int mi_col,
BLOCK_SIZE bsize, int block);
void vp9_dec_build_inter_predictors_sb_sub8x8(MACROBLOCKD *xd,
int mi_row, int mi_col,
BLOCK_SIZE bsize, int block);
#endif
void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
const MV *mv_q3,
@@ -58,14 +67,18 @@ static INLINE int scaled_buffer_offset(int x_offset, int y_offset, int stride,
}
static INLINE void setup_pred_plane(struct buf_2d *dst,
int width, int height,
uint8_t *src, int stride,
int mi_row, int mi_col,
const struct scale_factors *scale,
int subsampling_x, int subsampling_y) {
const int x = (MI_SIZE * mi_col) >> subsampling_x;
const int y = (MI_SIZE * mi_row) >> subsampling_y;
dst->buf0 = src;
dst->buf = src + scaled_buffer_offset(x, y, stride, scale);
dst->stride = stride;
dst->width = width;
dst->height = height;
}
void vp9_setup_dst_planes(struct macroblockd_plane planes[MAX_MB_PLANE],
@@ -76,6 +89,66 @@ void vp9_setup_pre_planes(MACROBLOCKD *xd, int idx,
const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
const struct scale_factors *sf);
#if CONFIG_WEDGE_PARTITION
#define MASK_MASTER_SIZE (2 * CODING_UNIT_SIZE)
#define MASK_MASTER_STRIDE (2 * CODING_UNIT_SIZE)
void vp9_init_wedge_masks();
const uint8_t *vp9_get_soft_mask(int wedge_index,
BLOCK_SIZE sb_type,
int h, int w);
void vp9_generate_soft_mask(int wedge_index, BLOCK_SIZE sb_type,
int h, int w, uint8_t *mask, int stride);
void vp9_generate_hard_mask(int wedge_index, BLOCK_SIZE sb_type,
int h, int w, uint8_t *mask, int stride);
void vp9_build_inter_predictors_for_planes_single_buf(
MACROBLOCKD *xd, BLOCK_SIZE bsize,
int mi_row, int mi_col, int ref,
uint8_t *ext_dst[3], int ext_dst_stride[3]);
void vp9_build_wedge_inter_predictor_from_buf(
MACROBLOCKD *xd, BLOCK_SIZE bsize,
int mi_row, int mi_col,
uint8_t *ext_dst0[3], int ext_dst_stride0[3],
uint8_t *ext_dst1[3], int ext_dst_stride1[3]);
#endif // CONFIG_WEDGE_PARTITION
#if CONFIG_SUPERTX
struct macroblockd_plane;
void vp9_build_masked_inter_predictor_complex(
MACROBLOCKD *xd,
uint8_t *dst, int dst_stride, uint8_t *dst2, int dst2_stride,
const struct macroblockd_plane *pd, int mi_row, int mi_col,
int mi_row_ori, int mi_col_ori, BLOCK_SIZE bsize, BLOCK_SIZE top_bsize,
PARTITION_TYPE partition, int plane);
#if CONFIG_WEDGE_PARTITION
void vp9_build_inter_predictors_sb_extend(MACROBLOCKD *xd,
int mi_row, int mi_col,
int mi_row_ori, int mi_col_ori,
BLOCK_SIZE bsize);
void vp9_dec_build_inter_predictors_sb_extend(MACROBLOCKD *xd,
int mi_row, int mi_col,
int mi_row_ori, int mi_col_ori,
BLOCK_SIZE bsize);
void vp9_build_inter_predictors_sb_sub8x8_extend(
MACROBLOCKD *xd,
int mi_row, int mi_col,
int mi_row_ori, int mi_col_ori,
BLOCK_SIZE bsize, int block);
void vp9_dec_build_inter_predictors_sb_sub8x8_extend(
MACROBLOCKD *xd,
int mi_row, int mi_col,
int mi_row_ori, int mi_col_ori,
BLOCK_SIZE bsize, int block);
#endif // CONFIG_WEDGE_PARTITION
#endif // CONFIG_SUPERTX
#ifdef __cplusplus
} // extern "C"
#endif

File diff suppressed because it is too large


@@ -22,9 +22,32 @@ void vp9_init_intra_predictors();
void vp9_predict_intra_block(const MACROBLOCKD *xd, int block_idx, int bwl_in,
TX_SIZE tx_size, PREDICTION_MODE mode,
#if CONFIG_FILTERINTRA
int filterbit,
#endif
const uint8_t *ref, int ref_stride,
uint8_t *dst, int dst_stride,
int aoff, int loff, int plane);
#if CONFIG_INTERINTRA
void vp9_build_interintra_predictors(MACROBLOCKD *xd,
uint8_t *ypred,
uint8_t *upred,
uint8_t *vpred,
int ystride,
int ustride,
int vstride,
BLOCK_SIZE bsize);
void vp9_build_interintra_predictors_sby(MACROBLOCKD *xd,
uint8_t *ypred,
int ystride,
BLOCK_SIZE bsize);
void vp9_build_interintra_predictors_sbuv(MACROBLOCKD *xd,
uint8_t *upred,
uint8_t *vpred,
int ustride, int vstride,
BLOCK_SIZE bsize);
#endif // CONFIG_INTERINTRA
#ifdef __cplusplus
} // extern "C"
#endif

File diff suppressed because it is too large


@@ -56,7 +56,7 @@ static INLINE int vp9_is_valid_scale(const struct scale_factors *sf) {
}
static INLINE int vp9_is_scaled(const struct scale_factors *sf) {
return vp9_is_valid_scale(sf) &&
return sf && vp9_is_valid_scale(sf) &&
(sf->x_scale_fp != REF_NO_SCALE || sf->y_scale_fp != REF_NO_SCALE);
}

File diff suppressed because it is too large


@@ -30,7 +30,36 @@ typedef struct {
} scan_order;
extern const scan_order vp9_default_scan_orders[TX_SIZES];
extern const scan_order vp9_scan_orders[TX_SIZES][TX_TYPES];
extern const scan_order vp9_intra_scan_orders[TX_SIZES][TX_TYPES];
#if CONFIG_EXT_TX
extern const scan_order vp9_inter_scan_orders[TX_SIZES][TOTAL_TX_TYPES];
#endif // CONFIG_EXT_TX
#if CONFIG_TX_SKIP
// pixel domain default scan orders
extern const scan_order vp9_default_scan_orders_pxd[TX_SIZES];
extern int16_t vp9_default_scan_pxd_4x4[16];
extern int16_t vp9_default_scan_pxd_8x8[64];
extern int16_t vp9_default_scan_pxd_16x16[256];
extern int16_t vp9_default_scan_pxd_32x32[1024];
extern int16_t vp9_default_iscan_pxd_4x4[16];
extern int16_t vp9_default_iscan_pxd_8x8[64];
extern int16_t vp9_default_iscan_pxd_16x16[256];
extern int16_t vp9_default_iscan_pxd_32x32[1024];
extern int16_t vp9_default_scan_pxd_4x4_neighbors[17 * MAX_NEIGHBORS];
extern int16_t vp9_default_scan_pxd_8x8_neighbors[65 * MAX_NEIGHBORS];
extern int16_t vp9_default_scan_pxd_16x16_neighbors[257 * MAX_NEIGHBORS];
extern int16_t vp9_default_scan_pxd_32x32_neighbors[1025 * MAX_NEIGHBORS];
#if CONFIG_TX64X64
extern int16_t vp9_default_scan_pxd_64x64[4096];
extern int16_t vp9_default_iscan_pxd_64x64[4096];
extern int16_t vp9_default_scan_pxd_64x64_neighbors[4097 * MAX_NEIGHBORS];
#endif // CONFIG_TX64X64
#endif // CONFIG_TX_SKIP
static INLINE int get_coef_context(const int16_t *neighbors,
const uint8_t *token_cache, int c) {

vp9/common/vp9_sr_txfm.c (new file, 298 lines)

@@ -0,0 +1,298 @@
/*
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "vp9/common/vp9_sr_txfm.h"
#include "vp9/common/vp9_idct.h"
#include "vp9/common/vp9_idwt.h"
#if CONFIG_SR_MODE
int is_enable_srmode(BLOCK_SIZE bsize) {
TX_SIZE max_tx_size = max_txsize_lookup[bsize];
return (max_tx_size >= MIN_SR_TX_SIZE &&
max_tx_size <= MAX_SR_TX_SIZE);
}
// Extend the block border by copying the boundary pixels,
// for use in convolution
static void sr_extend(int16_t *src, int src_stride, int w, int h,
int16_t *src_ext, int src_ext_stride, int border) {
int16_t *src_ext_ori = src_ext;
int i, j;
src_ext = src_ext_ori - border;
for (i = 0; i < h; i ++) {
for (j = 0; j < border; j ++)
src_ext[j] = src[0];
vpx_memcpy(src_ext + border, src, sizeof(int16_t) * w);
for (j = 0; j < border; j ++)
src_ext[border + w + j] = src[w - 1];
src_ext += src_ext_stride;
src += src_stride;
}
src_ext = src_ext_ori - border * src_ext_stride - border;
for (i = 0; i < border; i ++)
vpx_memcpy(src_ext + i * src_ext_stride, src_ext_ori - border,
sizeof(int16_t) * (w + 2 * border));
src_ext = src_ext_ori + h * src_ext_stride - border;
for (i = 0; i < border; i ++)
vpx_memcpy(src_ext + i * src_ext_stride,
src_ext_ori + (h - 1) * src_ext_stride - border,
sizeof(int16_t) * (w + 2 * border));
}
static void convolve_horiz(int16_t *src, int src_stride, int src_offset,
int16_t *dst, int dst_stride, int dst_offset,
int *x_filter, int filter_taps, int fil_offset,
int w, int h) {
// src_offset, dst_offset: step to advance to the next value (usually 1)
// fil_offset: offset from the filter center back to the first filter tap
// (e.g. a 3-tap filter (1, 2, 1) has fil_offset = 1;
//  a 4-tap filter (1, 2, 2, 1) has fil_offset = 1)
int x, y;
int round_offset = 1 << (UPSCALE_FILTER_SHIFT - 1);
// Shift the buffer to the first pixel of filter
src -= (fil_offset * src_offset);
for (y = 0; y < h; ++y) {
int16_t *src_x = src;
for (x = 0; x < w; ++x) {
int k, sum = 0;
// If the filter is symmetric, it can be folded first, then multiplied
for (k = 0; k < filter_taps; ++k)
sum += src_x[k * src_offset] * x_filter[k];
src_x += src_offset;
dst[x * dst_offset] = (sum + round_offset) >> UPSCALE_FILTER_SHIFT;
}
src += src_stride;
dst += dst_stride;
}
}
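The fil_offset convention described above can be illustrated with a scalar 1-D sketch. This is a hypothetical helper, not the library API: a symmetric 3-tap filter that sums to 128 (matching the shift of 7 in UPSCALE_FILTER_SHIFT), with the caller passing a pointer to the first output position of a border-extended buffer.

```c
#include <assert.h>
#include <stdint.h>

#define SHIFT 7  /* matches UPSCALE_FILTER_SHIFT */

/* 1-D convolution sketch: src points at the output pixel's position;
 * fil_offset steps back to the first tap, as in convolve_horiz above.
 * The caller must provide extended borders around src. */
static void convolve_1d_sketch(const int16_t *src, int16_t *dst,
                               const int *filter, int taps,
                               int fil_offset, int n) {
  int x, k;
  const int round = 1 << (SHIFT - 1);
  src -= fil_offset;  /* shift to the first tap */
  for (x = 0; x < n; ++x) {
    int sum = 0;
    for (k = 0; k < taps; ++k) sum += src[x + k] * filter[k];
    dst[x] = (int16_t)((sum + round) >> SHIFT);
  }
}
```

Because the taps sum to 128, a linear ramp passes through unchanged, which is a quick sanity check of the offset arithmetic.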
static void convolve_vert(int16_t *src, int src_stride, int src_offset,
int16_t *dst, int dst_stride, int dst_offset,
int *y_filter, int filter_taps, int fil_offset,
int w, int h) {
int x, y;
int round_offset = 1 << (UPSCALE_FILTER_SHIFT - 1);
// Shift the buffer to the first pixel of filter
src -= src_stride * fil_offset;
for (x = 0; x < w; ++x) {
int16_t *src_y = src;
for (y = 0; y < h; ++y) {
int k, sum = 0;
for (k = 0; k < filter_taps; ++k)
sum += src_y[k * src_stride] * y_filter[k];
src_y += src_stride;
dst[y * dst_stride] = (sum + round_offset) >> UPSCALE_FILTER_SHIFT;
}
src += src_offset;
dst += dst_offset;
}
}
/* If the number of filter taps changes, some parameters below must change too */
int lp_filter[UPSCALE_FILTER_TAPS - 1] = {2, -14, -2, 43, 70, 43, -2, -14, 2};
int interpl_filter[UPSCALE_FILTER_TAPS] =
// {1, -4, 10, -23, 80, 80, -23, 10, -4, 1}; // lanczos
// {0, -1, 6, -19, 78, 78, -19, 6, -1, 0}; // laplacian
{0, -4, 11, -23, 80, 80, -23, 11, -4, 0}; // DCT
// {0, -1, -4, 14, 55, 55, 14, -4, -1, 0}; // freqmultiplier = 0.5
#if SR_USE_MULTI_F
// For multiple interpolation filter options
int post_filter[SR_USFILTER_NUM_D][UPSCALE_FILTER_TAPS - 1] = {
{2, 0, -7, 0, 137, 0, -7, 0, 2},
{1, 0, -7, 0, 139, 0, -7, 0, 1},
{4, 0, -5, 0, 134, 0, -5, 0, 4},
{-2, 0, -8, 0, 149, 0, -8, 0, -2}
};
#else
int post_filter[UPSCALE_FILTER_TAPS - 1] =
{2, 0, -7, 0, 137, 0, -7, 0, 2};
#endif
#define buf_size (64 + UPSCALE_FILTER_TAPS*2)
static void sr_convolution(int16_t *src, int src_stride, int src_offset,
int16_t *dst, int dst_stride, int dst_offset,
int * fil_hor, int * fil_ver,
int fil_offset_h, int fil_offset_v,
int filter_taps,
int w, int h) {
DECLARE_ALIGNED_ARRAY(16, int16_t, tmp_buf, buf_size * buf_size);
int tmp_buf_stride = buf_size;
int16_t *tmp_buf_ori = tmp_buf + fil_offset_v * tmp_buf_stride + fil_offset_h;
convolve_horiz(
src - fil_offset_v * src_stride, src_stride, src_offset,
tmp_buf_ori - fil_offset_v * tmp_buf_stride, tmp_buf_stride, 1,
fil_hor, filter_taps, fil_offset_h, w, h + filter_taps);
convolve_vert(
tmp_buf_ori, tmp_buf_stride, 1,
dst, dst_stride, dst_offset,
fil_ver, filter_taps, fil_offset_v, w, h);
}
void sr_lowpass(int16_t *src, int src_stride, int16_t *dst, int dst_stride,
int w, int h) {
int filter_taps = UPSCALE_FILTER_TAPS - 1; // odd number of taps
int border = (filter_taps - 1) >> 1;
int fil_offset = border; // see "fil_offset" in "convolve_horiz"
DECLARE_ALIGNED_ARRAY(16, int16_t, src_ext, buf_size * buf_size);
int src_ext_stride = buf_size;
int16_t *src_ext_ori = src_ext + border * src_ext_stride + border;
// extension
sr_extend(src, src_stride, w, h, src_ext_ori, src_ext_stride, border);
sr_convolution(src_ext_ori, src_ext_stride, 1, dst, dst_stride, 1,
lp_filter, lp_filter, fil_offset, fil_offset,
filter_taps, w, h);
}
static void sr_upsample(int16_t *src, int src_stride,
int16_t *dst, int dst_stride, int w, int h) {
// Apply interpolation filter
int filter_taps = UPSCALE_FILTER_TAPS; // even number of taps
int border = (filter_taps >> 1); // maximum number of pixels that need to be extended
int fil_offset = border - 1; // see "fil_offset" in "convolve_horiz"
int i, j;
DECLARE_ALIGNED_ARRAY(16, int16_t, src_ext, buf_size * buf_size);
DECLARE_ALIGNED_ARRAY(16, int16_t, tmp_buf, buf_size * buf_size);
int src_ext_stride = buf_size, tmp_buf_stride = buf_size;
int16_t *src_ext_ori = src_ext + border * src_ext_stride + border;
int16_t *tmp_buf_ori = tmp_buf + border * tmp_buf_stride + border;
int16_t *dst_ori = dst;
// Extend the buffer
sr_extend(src, src_stride, w, h, src_ext_ori, src_ext_stride, border);
// Keep the original pixels the same
for (i = 0; i < h; i++) {
for (j = 0; j < w; j++) {
dst[j << 1] = src[j];
}
dst += (dst_stride << 1);
src += src_stride;
}
convolve_horiz(src_ext_ori - border * src_ext_stride, src_ext_stride, 1,
tmp_buf_ori - border * tmp_buf_stride, tmp_buf_stride, 1,
interpl_filter, filter_taps, fil_offset, w, h + (2 * border));
// Set the horizontally interpolated pixels
dst = dst_ori;
tmp_buf = tmp_buf_ori;
for (i = 0; i < h; i++) {
for (j = 0; j < w; j++) {
dst[(j << 1) + 1] = tmp_buf[j];
}
dst += (dst_stride << 1);
tmp_buf += tmp_buf_stride;
}
// Set the vertically interpolated pixels
convolve_vert(src_ext_ori, src_ext_stride, 1,
dst_ori + dst_stride, 2 * dst_stride, 2,
interpl_filter, filter_taps, fil_offset, w, h);
// Set the horizontally and vertically interpolated pixels
convolve_vert(tmp_buf_ori, tmp_buf_stride, 1,
dst_ori + dst_stride + 1, 2 * dst_stride, 2,
interpl_filter, filter_taps, fil_offset, w, h);
}
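The 2x output layout of sr_upsample can be summarized in a 1-D sketch: even output positions keep the original samples and odd positions receive interpolated values. Here a simple 2-tap average stands in for the 10-tap interpl_filter, so this illustrates only the index layout, not the actual filter response.

```c
#include <assert.h>
#include <stdint.h>

/* 2x upsample layout sketch, as in sr_upsample above: even output
 * positions keep the original samples; odd positions get interpolated
 * values (a 2-tap average stands in for the real 10-tap filter; the
 * last odd sample repeats the edge, mimicking border extension). */
static void upsample2x_sketch(const int16_t *src, int w, int16_t *dst) {
  int j;
  for (j = 0; j < w; ++j) {
    dst[2 * j] = src[j];
    dst[2 * j + 1] = (j + 1 < w)
        ? (int16_t)((src[j] + src[j + 1] + 1) >> 1)
        : src[j];
  }
}
```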
#if SR_USE_MULTI_F
static void sr_post_filter(int16_t *src, int src_stride,
int16_t *dst, int dst_stride,
int w, int h, int f_hor, int f_ver) {
#else
static void sr_post_filter(int16_t *src, int src_stride,
int16_t *dst, int dst_stride, int w, int h) {
#endif
int filter_taps = UPSCALE_FILTER_TAPS - 1; // odd number of taps
int border = (filter_taps - 1) >> 1;
int fil_offset = border;
DECLARE_ALIGNED_ARRAY(16, int16_t, src_ext, buf_size * buf_size);
int src_ext_stride = buf_size;
int16_t *src_ext_ori = src_ext + border * src_ext_stride + border;
// extension
sr_extend(src, src_stride, w, h, src_ext_ori, src_ext_stride, border);
sr_convolution(src_ext_ori, src_ext_stride, 1, dst, dst_stride, 1,
#if SR_USE_MULTI_F
post_filter[f_hor], post_filter[f_ver],
fil_offset, fil_offset,
#else
post_filter, post_filter, fil_offset, fil_offset,
#endif
filter_taps, w, h);
}
#if SR_USE_MULTI_F
void sr_recon(int16_t *src, int src_stride, uint8_t *dst, int dst_stride,
int w, int h, int f_hor, int f_ver) {
#else
void sr_recon(int16_t *src, int src_stride, uint8_t *dst, int dst_stride,
int w, int h) {
#endif
DECLARE_ALIGNED_ARRAY(16, int16_t, recon, 64 * 64);
DECLARE_ALIGNED_ARRAY(16, int16_t, us_resi, 64 * 64);
int i, j, us_resi_stride = 64;
int recon_stride = 64;
#if USE_POST_F
DECLARE_ALIGNED_ARRAY(16, int16_t, enh_recon, 64 * 64);
int enh_recon_stride = 64;
#endif
// Upsample residual
sr_upsample(src, src_stride, us_resi, us_resi_stride, w/2, h/2);
// Add upsampled residual to prediction
for (i = 0; i < h; i++) {
for (j = 0; j < w; j++) {
recon[i * recon_stride + j] = dst[i * dst_stride + j] +
us_resi[i * us_resi_stride + j];
}
}
#if USE_POST_F
// Do super-resolution post processing to the reconstruction
#if SR_USE_MULTI_F
sr_post_filter(recon, recon_stride, enh_recon, enh_recon_stride, w, h,
f_hor, f_ver);
#else // SR_USE_MULTI_F
sr_post_filter(recon, recon_stride, enh_recon, enh_recon_stride, w, h);
#endif // SR_USE_MULTI_F
for (i = 0; i < h; i++)
for (j = 0; j < w; j++)
dst[i * dst_stride + j] = clip_pixel(enh_recon[i * enh_recon_stride + j]);
#else // USE_POST_F
for (i = 0; i < h; i++)
for (j = 0; j < w; j++)
dst[i * dst_stride + j] = clip_pixel(recon[i * recon_stride + j]);
#endif // USE_POST_F
}
#endif // CONFIG_SR_MODE

vp9/common/vp9_sr_txfm.h (new file, 49 lines)

@@ -0,0 +1,49 @@
/*
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP9_COMMON_VP9_SR_TXFM_H_
#define VP9_COMMON_VP9_SR_TXFM_H_
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_entropymode.h"
#if CONFIG_SR_MODE
#define MIN_SR_TX_SIZE 2
#if CONFIG_TX64X64
#define MAX_SR_TX_SIZE 4
#else
#define MAX_SR_TX_SIZE 3
#endif
#define UPSCALE_FILTER_TAPS 10
#define UPSCALE_FILTER_SHIFT 7
void sr_lowpass(int16_t *src, int src_stride, int16_t *dst, int dst_stride,
int w, int h);
#if SR_USE_MULTI_F
void sr_recon(int16_t *src, int src_stride, uint8_t *dst, int dst_stride,
int w, int h, int f_hor, int f_ver);
static INLINE int idx_to_h(int idx) {
return (idx % SR_USFILTER_NUM_D);
}
static INLINE int idx_to_v(int idx) {
return (idx / SR_USFILTER_NUM_D);
}
static INLINE int hv_to_idx(int hor, int ver) {
return hor + ver * SR_USFILTER_NUM_D;
}
#else
void sr_recon(int16_t *src, int src_stride, uint8_t *dst, int dst_stride,
int w, int h);
#endif
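The idx_to_h / idx_to_v / hv_to_idx helpers above pack the two filter choices into one index, base SR_USFILTER_NUM_D. From the post_filter table in vp9_sr_txfm.c, SR_USFILTER_NUM_D appears to be 4; a standalone sketch of the round trip under that assumption:

```c
#include <assert.h>

#define SR_USFILTER_NUM_D 4  /* assumed from the post_filter table */

/* Index packing sketch mirroring idx_to_h / idx_to_v / hv_to_idx:
 * the horizontal filter occupies the low "digit" and the vertical
 * filter the high "digit", base SR_USFILTER_NUM_D. */
static int to_h(int idx) { return idx % SR_USFILTER_NUM_D; }
static int to_v(int idx) { return idx / SR_USFILTER_NUM_D; }
static int to_idx(int hor, int ver) { return hor + ver * SR_USFILTER_NUM_D; }
```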
int is_enable_srmode(BLOCK_SIZE bsize);
#endif // CONFIG_SR_MODE
#endif  // VP9_COMMON_VP9_SR_TXFM_H_


@@ -31,10 +31,19 @@ void vp9_tile_set_col(TileInfo *tile, const VP9_COMMON *cm, int col) {
tile->mi_col_end = get_tile_offset(col + 1, cm->mi_cols, cm->log2_tile_cols);
}
#if CONFIG_ROW_TILE
void vp9_tile_init(TileInfo *tile, const VP9_COMMON *cm, int row, int col) {
tile->mi_row_start = row * cm->tile_height;
tile->mi_row_end = MIN(tile->mi_row_start + cm->tile_height, cm->mi_rows);
tile->mi_col_start = col * cm->tile_width;
tile->mi_col_end = MIN(tile->mi_col_start + cm->tile_width, cm->mi_cols);
}
#else
void vp9_tile_init(TileInfo *tile, const VP9_COMMON *cm, int row, int col) {
vp9_tile_set_row(tile, cm, row);
vp9_tile_set_col(tile, cm, col);
}
#endif
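Under CONFIG_ROW_TILE, tile bounds reduce to a fixed grid clamped at the frame edge, as the new vp9_tile_init shows. A minimal sketch of the arithmetic (hypothetical helper; all units are mi blocks, as above):

```c
#include <assert.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* Row-tile bounds sketch: tiles form a fixed-size grid, with the last
 * row/column clamped to the frame size, as in the CONFIG_ROW_TILE
 * vp9_tile_init above. */
static void tile_bounds(int row, int col, int tile_h, int tile_w,
                        int mi_rows, int mi_cols,
                        int *r0, int *r1, int *c0, int *c1) {
  *r0 = row * tile_h;
  *r1 = MIN(*r0 + tile_h, mi_rows);
  *c0 = col * tile_w;
  *c1 = MIN(*c0 + tile_w, mi_cols);
}
```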
void vp9_get_tile_n_bits(int mi_cols,
int *min_log2_tile_cols, int *max_log2_tile_cols) {


@@ -345,7 +345,7 @@ cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one
%if ARCH_X86_64
INIT_XMM sse2
cglobal highbd_tm_predictor_16x16, 5, 6, 8, dst, stride, above, left, bps, one
cglobal highbd_tm_predictor_16x16, 5, 6, 9, dst, stride, above, left, bps, one
movd m2, [aboveq-2]
mova m0, [aboveq]
mova m1, [aboveq+16]


@@ -9,6 +9,56 @@
*/
#include "vp9/common/x86/vp9_idct_intrin_sse2.h"
#include "vp9/common/vp9_idct.h"
#include "vp9/common/vp9_enums.h"
#if CONFIG_EXT_TX
// Reverse the eight 16-bit words in an __m128i
static INLINE __m128i mm_reverse_epi16(const __m128i x) {
const __m128i a = _mm_shufflelo_epi16(x, 0x1b);
const __m128i b = _mm_shufflehi_epi16(a, 0x1b);
return _mm_shuffle_epi32(b, 0x4e);
}
static INLINE void fliplr_4x4(__m128i in[2]) {
in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
in[0] = _mm_shufflehi_epi16(in[0], 0x1b);
in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
in[1] = _mm_shufflehi_epi16(in[1], 0x1b);
}
static INLINE void fliplr_8x8(__m128i in[8]) {
in[0] = mm_reverse_epi16(in[0]);
in[1] = mm_reverse_epi16(in[1]);
in[2] = mm_reverse_epi16(in[2]);
in[3] = mm_reverse_epi16(in[3]);
in[4] = mm_reverse_epi16(in[4]);
in[5] = mm_reverse_epi16(in[5]);
in[6] = mm_reverse_epi16(in[6]);
in[7] = mm_reverse_epi16(in[7]);
}
static INLINE void fliplr_16x8(__m128i in[16]) {
fliplr_8x8(&in[0]);
fliplr_8x8(&in[8]);
}
#define FLIPLR_16x16(in0, in1) do { \
__m128i *tmp; \
fliplr_16x8(in0); \
fliplr_16x8(in1); \
tmp = (in0); \
(in0) = (in1); \
(in1) = tmp; \
} while (0)
#define FLIPUD_PTR(dest, stride, size) do { \
(dest) = (dest) + ((size) - 1) * (stride); \
(stride) = - (stride); \
} while (0)
#endif
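The FLIPUD_PTR macro above implements vertical flipping purely through pointer arithmetic. A standalone sketch (re-declaring the macro so the block is self-contained) shows that pointing dest at the last row and negating the stride makes ordinary row-major writes fill the block bottom-up:

```c
#include <assert.h>

/* Same pointer trick as FLIPUD_PTR above: point dest at the last row
 * and negate the stride so row-major writes land bottom-up. */
#define FLIPUD_PTR(dest, stride, size) do {      \
    (dest) = (dest) + ((size) - 1) * (stride);   \
    (stride) = -(stride);                        \
  } while (0)

/* Write the row index into every pixel of each row. */
static void write_rows(int *dest, int stride, int rows, int cols) {
  int r, c;
  for (r = 0; r < rows; ++r)
    for (c = 0; c < cols; ++c)
      dest[r * stride + c] = r;
}
```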
#define RECON_AND_STORE4X4(dest, in_x) \
{ \
@@ -125,12 +175,12 @@ void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
// Reconstruction and Store
{
__m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
__m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 0));
__m128i d1 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 1));
__m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
d0 = _mm_unpacklo_epi32(d0,
_mm_cvtsi32_si128(*(const int *) (dest + stride)));
d2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(
*(const int *) (dest + stride * 3)), d2);
__m128i d3 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3));
d0 = _mm_unpacklo_epi32(d0, d1);
d2 = _mm_unpacklo_epi32(d3, d2);
d0 = _mm_unpacklo_epi8(d0, zero);
d2 = _mm_unpacklo_epi8(d2, zero);
d0 = _mm_add_epi16(d0, input2);
@@ -270,22 +320,50 @@ void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
in[1]= _mm_loadu_si128((const __m128i *)(input + 8));
switch (tx_type) {
case 0: // DCT_DCT
case DCT_DCT:
idct4_sse2(in);
idct4_sse2(in);
break;
case 1: // ADST_DCT
case ADST_DCT:
idct4_sse2(in);
iadst4_sse2(in);
break;
case 2: // DCT_ADST
case DCT_ADST:
iadst4_sse2(in);
idct4_sse2(in);
break;
case 3: // ADST_ADST
case ADST_ADST:
iadst4_sse2(in);
iadst4_sse2(in);
break;
#if CONFIG_EXT_TX
case FLIPADST_DCT:
idct4_sse2(in);
iadst4_sse2(in);
FLIPUD_PTR(dest, stride, 4);
break;
case DCT_FLIPADST:
iadst4_sse2(in);
idct4_sse2(in);
fliplr_4x4(in);
break;
case FLIPADST_FLIPADST:
iadst4_sse2(in);
iadst4_sse2(in);
FLIPUD_PTR(dest, stride, 4);
fliplr_4x4(in);
break;
case ADST_FLIPADST:
iadst4_sse2(in);
iadst4_sse2(in);
fliplr_4x4(in);
break;
case FLIPADST_ADST:
iadst4_sse2(in);
iadst4_sse2(in);
FLIPUD_PTR(dest, stride, 4);
break;
#endif // CONFIG_EXT_TX
default:
assert(0);
break;
@@ -874,22 +952,50 @@ void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride,
in[7] = _mm_load_si128((const __m128i *)(input + 8 * 7));
switch (tx_type) {
case 0: // DCT_DCT
case DCT_DCT:
idct8_sse2(in);
idct8_sse2(in);
break;
case 1: // ADST_DCT
case ADST_DCT:
idct8_sse2(in);
iadst8_sse2(in);
break;
case 2: // DCT_ADST
case DCT_ADST:
iadst8_sse2(in);
idct8_sse2(in);
break;
case 3: // ADST_ADST
case ADST_ADST:
iadst8_sse2(in);
iadst8_sse2(in);
break;
#if CONFIG_EXT_TX
case FLIPADST_DCT:
idct8_sse2(in);
iadst8_sse2(in);
FLIPUD_PTR(dest, stride, 8);
break;
case DCT_FLIPADST:
iadst8_sse2(in);
idct8_sse2(in);
fliplr_8x8(in);
break;
case FLIPADST_FLIPADST:
iadst8_sse2(in);
iadst8_sse2(in);
FLIPUD_PTR(dest, stride, 8);
fliplr_8x8(in);
break;
case ADST_FLIPADST:
iadst8_sse2(in);
iadst8_sse2(in);
fliplr_8x8(in);
break;
case FLIPADST_ADST:
iadst8_sse2(in);
iadst8_sse2(in);
FLIPUD_PTR(dest, stride, 8);
break;
#endif // CONFIG_EXT_TX
default:
assert(0);
break;
@@ -2330,29 +2436,59 @@ static void iadst16_sse2(__m128i *in0, __m128i *in1) {
 void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride,
                                int tx_type) {
-  __m128i in0[16], in1[16];
+  __m128i in[32];
+  __m128i *in0 = &in[0];
+  __m128i *in1 = &in[16];
   load_buffer_8x16(input, in0);
   input += 8;
   load_buffer_8x16(input, in1);
   switch (tx_type) {
-    case 0:  // DCT_DCT
+    case DCT_DCT:
       idct16_sse2(in0, in1);
       idct16_sse2(in0, in1);
       break;
-    case 1:  // ADST_DCT
+    case ADST_DCT:
       idct16_sse2(in0, in1);
       iadst16_sse2(in0, in1);
       break;
-    case 2:  // DCT_ADST
+    case DCT_ADST:
       iadst16_sse2(in0, in1);
       idct16_sse2(in0, in1);
       break;
-    case 3:  // ADST_ADST
+    case ADST_ADST:
       iadst16_sse2(in0, in1);
       iadst16_sse2(in0, in1);
       break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+      idct16_sse2(in0, in1);
+      iadst16_sse2(in0, in1);
+      FLIPUD_PTR(dest, stride, 16);
+      break;
+    case DCT_FLIPADST:
+      iadst16_sse2(in0, in1);
+      idct16_sse2(in0, in1);
+      FLIPLR_16x16(in0, in1);
+      break;
+    case FLIPADST_FLIPADST:
+      iadst16_sse2(in0, in1);
+      iadst16_sse2(in0, in1);
+      FLIPUD_PTR(dest, stride, 16);
+      FLIPLR_16x16(in0, in1);
+      break;
+    case ADST_FLIPADST:
+      iadst16_sse2(in0, in1);
+      iadst16_sse2(in0, in1);
+      FLIPLR_16x16(in0, in1);
+      break;
+    case FLIPADST_ADST:
+      iadst16_sse2(in0, in1);
+      iadst16_sse2(in0, in1);
+      FLIPUD_PTR(dest, stride, 16);
+      break;
+#endif  // CONFIG_EXT_TX
     default:
       assert(0);
      break;
@@ -3985,3 +4121,573 @@ void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
dest += 8 - (stride * 32);
}
}
#if CONFIG_VP9_HIGHBITDEPTH
static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {
__m128i ubounded, retval;
const __m128i zero = _mm_set1_epi16(0);
const __m128i one = _mm_set1_epi16(1);
const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one);
ubounded = _mm_cmpgt_epi16(value, max);
retval = _mm_andnot_si128(ubounded, value);
ubounded = _mm_and_si128(ubounded, max);
retval = _mm_or_si128(retval, ubounded);
retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero));
return retval;
}
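`clamp_high_sse2()` builds its result branchlessly out of compare masks, but per 16-bit lane it computes an ordinary clip to the valid pixel range for bit depth `bd`. A scalar model of the same computation (illustrative only; the SIMD version uses `_mm_cmpgt_epi16` masks instead of branches):

```c
#include <assert.h>

/* Scalar model of clamp_high_sse2(): clip one reconstructed value to
 * [0, 2^bd - 1]. The andnot/and/or dance in the SIMD code selects
 * between value and max; the final and-with-cmpgt zeroes negatives. */
static short clamp_high(short value, int bd) {
  const short max = (short)((1 << bd) - 1);
  if (value > max) return max;  /* lanes flagged by cmpgt(value, max) */
  if (value < 0) return 0;      /* lanes cleared by cmpgt(retval, zero) */
  return value;
}
```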
void vp9_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8,
int stride, int bd) {
tran_low_t out[4 * 4];
tran_low_t *outptr = out;
int i, j;
__m128i inptr[4];
__m128i sign_bits[2];
__m128i temp_mm, min_input, max_input;
int test;
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
int optimised_cols = 0;
const __m128i zero = _mm_set1_epi16(0);
const __m128i eight = _mm_set1_epi16(8);
const __m128i max = _mm_set1_epi16(12043);
const __m128i min = _mm_set1_epi16(-12043);
// Load input into __m128i
inptr[0] = _mm_loadu_si128((const __m128i *)input);
inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4));
inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8));
inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12));
// Pack to 16 bits
inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]);
inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]);
max_input = _mm_max_epi16(inptr[0], inptr[1]);
min_input = _mm_min_epi16(inptr[0], inptr[1]);
max_input = _mm_cmpgt_epi16(max_input, max);
min_input = _mm_cmplt_epi16(min_input, min);
temp_mm = _mm_or_si128(max_input, min_input);
test = _mm_movemask_epi8(temp_mm);
if (!test) {
// Do the row transform
idct4_sse2(inptr);
// Check the min & max values
max_input = _mm_max_epi16(inptr[0], inptr[1]);
min_input = _mm_min_epi16(inptr[0], inptr[1]);
max_input = _mm_cmpgt_epi16(max_input, max);
min_input = _mm_cmplt_epi16(min_input, min);
temp_mm = _mm_or_si128(max_input, min_input);
test = _mm_movemask_epi8(temp_mm);
if (test) {
transpose_4x4(inptr);
sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero);
sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero);
inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]);
inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]);
inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]);
inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]);
_mm_storeu_si128((__m128i*)outptr, inptr[0]);
_mm_storeu_si128((__m128i*)(outptr + 4), inptr[1]);
_mm_storeu_si128((__m128i*)(outptr + 8), inptr[2]);
_mm_storeu_si128((__m128i*)(outptr + 12), inptr[3]);
} else {
// Set to use the optimised transform for the column
optimised_cols = 1;
}
} else {
// Run the un-optimised row transform
for (i = 0; i < 4; ++i) {
vp9_highbd_idct4(input, outptr, bd);
input += 4;
outptr += 4;
}
}
if (optimised_cols) {
idct4_sse2(inptr);
// Final round and shift
inptr[0] = _mm_add_epi16(inptr[0], eight);
inptr[1] = _mm_add_epi16(inptr[1], eight);
inptr[0] = _mm_srai_epi16(inptr[0], 4);
inptr[1] = _mm_srai_epi16(inptr[1], 4);
// Reconstruction and Store
{
__m128i d0 = _mm_loadl_epi64((const __m128i *)dest);
__m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2));
d0 = _mm_unpacklo_epi64(d0,
_mm_loadl_epi64((const __m128i *)(dest + stride)));
d2 = _mm_unpacklo_epi64(d2,
_mm_loadl_epi64((const __m128i *)(dest + stride * 3)));
d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd);
d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd);
// store input0
_mm_storel_epi64((__m128i *)dest, d0);
// store input1
d0 = _mm_srli_si128(d0, 8);
_mm_storel_epi64((__m128i *)(dest + stride), d0);
// store input2
_mm_storel_epi64((__m128i *)(dest + stride * 2), d2);
// store input3
d2 = _mm_srli_si128(d2, 8);
_mm_storel_epi64((__m128i *)(dest + stride * 3), d2);
}
} else {
// Run the un-optimised column transform
tran_low_t temp_in[4], temp_out[4];
// Columns
for (i = 0; i < 4; ++i) {
for (j = 0; j < 4; ++j)
temp_in[j] = out[j * 4 + i];
vp9_highbd_idct4(temp_in, temp_out, bd);
for (j = 0; j < 4; ++j)
dest[j * stride + i] = highbd_clip_pixel_add(dest[j * stride + i],
ROUND_POWER_OF_TWO(temp_out[j], 4),
bd);
}
}
}
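The structure above repeats in every high-bitdepth wrapper: pack the 32-bit coefficients to 16 bits, then take the SSE2 16-bit transform only if every coefficient lies inside a conservative bound (12043 for this 4x4 row pass; smaller bounds for the larger transforms), falling back to the higher-precision C transform otherwise. The SIMD code does the range test with `_mm_cmpgt_epi16`/`_mm_cmplt_epi16` folded into one `_mm_movemask_epi8`; a scalar model of the guard (function name illustrative):

```c
#include <assert.h>

/* Scalar model of the fast-path guard: return 1 when all coefficients
 * fit the bound and the 16-bit SSE2 transform is safe, 0 when any
 * coefficient risks overflow and the C fallback must run. */
static int coeffs_fit_16bit(const int *coeffs, int n, int bound) {
  int i;
  for (i = 0; i < n; i++)
    if (coeffs[i] > bound || coeffs[i] < -bound)
      return 0;  /* overflow risk: un-optimised path */
  return 1;      /* safe: optimised 16-bit path */
}
```

Note the check runs twice: once on the input (rows) and again on the row-transform output before committing to the optimised column pass.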
void vp9_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8,
int stride, int bd) {
tran_low_t out[8 * 8];
tran_low_t *outptr = out;
int i, j, test;
__m128i inptr[8];
__m128i min_input, max_input, temp1, temp2, sign_bits;
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
const __m128i zero = _mm_set1_epi16(0);
const __m128i sixteen = _mm_set1_epi16(16);
const __m128i max = _mm_set1_epi16(6201);
const __m128i min = _mm_set1_epi16(-6201);
int optimised_cols = 0;
// Load input into __m128i & pack to 16 bits
for (i = 0; i < 8; i++) {
temp1 = _mm_loadu_si128((const __m128i *)(input + 8*i));
temp2 = _mm_loadu_si128((const __m128i *)(input + 8*i + 4));
inptr[i] = _mm_packs_epi32(temp1, temp2);
}
// Find the min & max for the row transform
max_input = _mm_max_epi16(inptr[0], inptr[1]);
min_input = _mm_min_epi16(inptr[0], inptr[1]);
for (i = 2; i < 8; i++) {
max_input = _mm_max_epi16(max_input, inptr[i]);
min_input = _mm_min_epi16(min_input, inptr[i]);
}
max_input = _mm_cmpgt_epi16(max_input, max);
min_input = _mm_cmplt_epi16(min_input, min);
temp1 = _mm_or_si128(max_input, min_input);
test = _mm_movemask_epi8(temp1);
if (!test) {
// Do the row transform
idct8_sse2(inptr);
// Find the min & max for the column transform
max_input = _mm_max_epi16(inptr[0], inptr[1]);
min_input = _mm_min_epi16(inptr[0], inptr[1]);
for (i = 2; i < 8; i++) {
max_input = _mm_max_epi16(max_input, inptr[i]);
min_input = _mm_min_epi16(min_input, inptr[i]);
}
max_input = _mm_cmpgt_epi16(max_input, max);
min_input = _mm_cmplt_epi16(min_input, min);
temp1 = _mm_or_si128(max_input, min_input);
test = _mm_movemask_epi8(temp1);
if (test) {
array_transpose_8x8(inptr, inptr);
for (i = 0; i < 8; i++) {
sign_bits = _mm_cmplt_epi16(inptr[i], zero);
temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
_mm_storeu_si128((__m128i*)(outptr + 4*(2*i+1)), temp1);
_mm_storeu_si128((__m128i*)(outptr + 4*(2*i)), temp2);
}
} else {
// Set to use the optimised transform for the column
optimised_cols = 1;
}
} else {
// Run the un-optimised row transform
for (i = 0; i < 8; ++i) {
vp9_highbd_idct8(input, outptr, bd);
input += 8;
outptr += 8;
}
}
if (optimised_cols) {
idct8_sse2(inptr);
// Final round & shift and Reconstruction and Store
{
__m128i d[8];
for (i = 0; i < 8; i++) {
inptr[i] = _mm_add_epi16(inptr[i], sixteen);
d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
inptr[i] = _mm_srai_epi16(inptr[i], 5);
d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
// Store
_mm_storeu_si128((__m128i *)(dest + stride*i), d[i]);
}
}
} else {
// Run the un-optimised column transform
tran_low_t temp_in[8], temp_out[8];
for (i = 0; i < 8; ++i) {
for (j = 0; j < 8; ++j)
temp_in[j] = out[j * 8 + i];
vp9_highbd_idct8(temp_in, temp_out, bd);
for (j = 0; j < 8; ++j)
dest[j * stride + i] = highbd_clip_pixel_add(dest[j * stride + i],
ROUND_POWER_OF_TWO(temp_out[j], 5),
bd);
}
}
}
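The `cmplt`/`unpack` pairs in the transpose-and-store branch widen each signed 16-bit lane back to a 32-bit `tran_low_t`: `_mm_cmplt_epi16(x, zero)` yields 0xFFFF in negative lanes and 0x0000 otherwise, so interleaving the value with that mask is exactly sign extension. A scalar sketch of one lane (illustrative):

```c
#include <assert.h>

/* One-lane model of the unpack-based widening: the low 16 bits are the
 * value, the high 16 bits are the cmplt mask (all ones iff negative),
 * which reassembles the two's-complement 32-bit representation. */
static int sign_extend_16(short v) {
  unsigned short lo = (unsigned short)v;
  unsigned short hi = (v < 0) ? 0xFFFFu : 0x0000u;  /* the cmplt mask */
  return (int)(((unsigned int)hi << 16) | lo);
}
```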
void vp9_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
int stride, int bd) {
tran_low_t out[8 * 8] = { 0 };
tran_low_t *outptr = out;
int i, j, test;
__m128i inptr[8];
__m128i min_input, max_input, temp1, temp2, sign_bits;
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
const __m128i zero = _mm_set1_epi16(0);
const __m128i sixteen = _mm_set1_epi16(16);
const __m128i max = _mm_set1_epi16(6201);
const __m128i min = _mm_set1_epi16(-6201);
int optimised_cols = 0;
// Load input into __m128i & pack to 16 bits
for (i = 0; i < 8; i++) {
temp1 = _mm_loadu_si128((const __m128i *)(input + 8*i));
temp2 = _mm_loadu_si128((const __m128i *)(input + 8*i + 4));
inptr[i] = _mm_packs_epi32(temp1, temp2);
}
// Find the min & max for the row transform
// Only the first 4 rows have non-zero coeffs
max_input = _mm_max_epi16(inptr[0], inptr[1]);
min_input = _mm_min_epi16(inptr[0], inptr[1]);
for (i = 2; i < 4; i++) {
max_input = _mm_max_epi16(max_input, inptr[i]);
min_input = _mm_min_epi16(min_input, inptr[i]);
}
max_input = _mm_cmpgt_epi16(max_input, max);
min_input = _mm_cmplt_epi16(min_input, min);
temp1 = _mm_or_si128(max_input, min_input);
test = _mm_movemask_epi8(temp1);
if (!test) {
// Do the row transform
idct8_sse2(inptr);
// Find the min & max for the column transform
// N.B. Only first 4 cols contain non-zero coeffs
max_input = _mm_max_epi16(inptr[0], inptr[1]);
min_input = _mm_min_epi16(inptr[0], inptr[1]);
for (i = 2; i < 8; i++) {
max_input = _mm_max_epi16(max_input, inptr[i]);
min_input = _mm_min_epi16(min_input, inptr[i]);
}
max_input = _mm_cmpgt_epi16(max_input, max);
min_input = _mm_cmplt_epi16(min_input, min);
temp1 = _mm_or_si128(max_input, min_input);
test = _mm_movemask_epi8(temp1);
if (test) {
// Use fact only first 4 rows contain non-zero coeffs
array_transpose_4X8(inptr, inptr);
for (i = 0; i < 4; i++) {
sign_bits = _mm_cmplt_epi16(inptr[i], zero);
temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
_mm_storeu_si128((__m128i*)(outptr + 4*(2*i+1)), temp1);
_mm_storeu_si128((__m128i*)(outptr + 4*(2*i)), temp2);
}
} else {
// Set to use the optimised transform for the column
optimised_cols = 1;
}
} else {
// Run the un-optimised row transform
for (i = 0; i < 4; ++i) {
vp9_highbd_idct8(input, outptr, bd);
input += 8;
outptr += 8;
}
}
if (optimised_cols) {
idct8_sse2(inptr);
// Final round & shift and Reconstruction and Store
{
__m128i d[8];
for (i = 0; i < 8; i++) {
inptr[i] = _mm_add_epi16(inptr[i], sixteen);
d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
inptr[i] = _mm_srai_epi16(inptr[i], 5);
d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
// Store
_mm_storeu_si128((__m128i *)(dest + stride*i), d[i]);
}
}
} else {
// Run the un-optimised column transform
tran_low_t temp_in[8], temp_out[8];
for (i = 0; i < 8; ++i) {
for (j = 0; j < 8; ++j)
temp_in[j] = out[j * 8 + i];
vp9_highbd_idct8(temp_in, temp_out, bd);
for (j = 0; j < 8; ++j)
dest[j * stride + i] = highbd_clip_pixel_add(dest[j * stride + i],
ROUND_POWER_OF_TWO(temp_out[j], 5),
bd);
}
}
}
void vp9_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8,
int stride, int bd) {
tran_low_t out[16 * 16];
tran_low_t *outptr = out;
int i, j, test;
__m128i inptr[32];
__m128i min_input, max_input, temp1, temp2, sign_bits;
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
const __m128i zero = _mm_set1_epi16(0);
const __m128i rounding = _mm_set1_epi16(32);
const __m128i max = _mm_set1_epi16(3155);
const __m128i min = _mm_set1_epi16(-3155);
int optimised_cols = 0;
// Load input into __m128i & pack to 16 bits
for (i = 0; i < 16; i++) {
temp1 = _mm_loadu_si128((const __m128i *)(input + 16*i));
temp2 = _mm_loadu_si128((const __m128i *)(input + 16*i + 4));
inptr[i] = _mm_packs_epi32(temp1, temp2);
temp1 = _mm_loadu_si128((const __m128i *)(input + 16*i + 8));
temp2 = _mm_loadu_si128((const __m128i *)(input + 16*i + 12));
inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
}
// Find the min & max for the row transform
max_input = _mm_max_epi16(inptr[0], inptr[1]);
min_input = _mm_min_epi16(inptr[0], inptr[1]);
for (i = 2; i < 32; i++) {
max_input = _mm_max_epi16(max_input, inptr[i]);
min_input = _mm_min_epi16(min_input, inptr[i]);
}
max_input = _mm_cmpgt_epi16(max_input, max);
min_input = _mm_cmplt_epi16(min_input, min);
temp1 = _mm_or_si128(max_input, min_input);
test = _mm_movemask_epi8(temp1);
if (!test) {
// Do the row transform
idct16_sse2(inptr, inptr + 16);
// Find the min & max for the column transform
max_input = _mm_max_epi16(inptr[0], inptr[1]);
min_input = _mm_min_epi16(inptr[0], inptr[1]);
for (i = 2; i < 32; i++) {
max_input = _mm_max_epi16(max_input, inptr[i]);
min_input = _mm_min_epi16(min_input, inptr[i]);
}
max_input = _mm_cmpgt_epi16(max_input, max);
min_input = _mm_cmplt_epi16(min_input, min);
temp1 = _mm_or_si128(max_input, min_input);
test = _mm_movemask_epi8(temp1);
if (test) {
array_transpose_16x16(inptr, inptr + 16);
for (i = 0; i < 16; i++) {
sign_bits = _mm_cmplt_epi16(inptr[i], zero);
temp1 = _mm_unpacklo_epi16(inptr[i ], sign_bits);
temp2 = _mm_unpackhi_epi16(inptr[i ], sign_bits);
_mm_storeu_si128((__m128i*)(outptr + 4*(i*4)), temp1);
_mm_storeu_si128((__m128i*)(outptr + 4*(i*4+1)), temp2);
sign_bits = _mm_cmplt_epi16(inptr[i+16], zero);
temp1 = _mm_unpacklo_epi16(inptr[i+16], sign_bits);
temp2 = _mm_unpackhi_epi16(inptr[i+16], sign_bits);
_mm_storeu_si128((__m128i*)(outptr + 4*(i*4+2)), temp1);
_mm_storeu_si128((__m128i*)(outptr + 4*(i*4+3)), temp2);
}
} else {
// Set to use the optimised transform for the column
optimised_cols = 1;
}
} else {
// Run the un-optimised row transform
for (i = 0; i < 16; ++i) {
vp9_highbd_idct16(input, outptr, bd);
input += 16;
outptr += 16;
}
}
if (optimised_cols) {
idct16_sse2(inptr, inptr + 16);
// Final round & shift and Reconstruction and Store
{
__m128i d[2];
for (i = 0; i < 16; i++) {
inptr[i ] = _mm_add_epi16(inptr[i ], rounding);
inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding);
d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8));
inptr[i ] = _mm_srai_epi16(inptr[i ], 6);
inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6);
d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i ]), bd);
d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd);
// Store
_mm_storeu_si128((__m128i *)(dest + stride*i), d[0]);
_mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]);
}
}
} else {
// Run the un-optimised column transform
tran_low_t temp_in[16], temp_out[16];
for (i = 0; i < 16; ++i) {
for (j = 0; j < 16; ++j)
temp_in[j] = out[j * 16 + i];
vp9_highbd_idct16(temp_in, temp_out, bd);
for (j = 0; j < 16; ++j)
dest[j * stride + i] = highbd_clip_pixel_add(dest[j * stride + i],
ROUND_POWER_OF_TWO(temp_out[j], 6),
bd);
}
}
}
void vp9_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
int stride, int bd) {
tran_low_t out[16 * 16] = { 0 };
tran_low_t *outptr = out;
int i, j, test;
__m128i inptr[32];
__m128i min_input, max_input, temp1, temp2, sign_bits;
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
const __m128i zero = _mm_set1_epi16(0);
const __m128i rounding = _mm_set1_epi16(32);
const __m128i max = _mm_set1_epi16(3155);
const __m128i min = _mm_set1_epi16(-3155);
int optimised_cols = 0;
// Load input into __m128i & pack to 16 bits
for (i = 0; i < 16; i++) {
temp1 = _mm_loadu_si128((const __m128i *)(input + 16*i));
temp2 = _mm_loadu_si128((const __m128i *)(input + 16*i + 4));
inptr[i] = _mm_packs_epi32(temp1, temp2);
temp1 = _mm_loadu_si128((const __m128i *)(input + 16*i + 8));
temp2 = _mm_loadu_si128((const __m128i *)(input + 16*i + 12));
inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
}
// Find the min & max for the row transform
// Since all non-zero dct coefficients are in upper-left 4x4 area,
// we only need to consider first 4 rows here.
max_input = _mm_max_epi16(inptr[0], inptr[1]);
min_input = _mm_min_epi16(inptr[0], inptr[1]);
for (i = 2; i < 4; i++) {
max_input = _mm_max_epi16(max_input, inptr[i]);
min_input = _mm_min_epi16(min_input, inptr[i]);
}
max_input = _mm_cmpgt_epi16(max_input, max);
min_input = _mm_cmplt_epi16(min_input, min);
temp1 = _mm_or_si128(max_input, min_input);
test = _mm_movemask_epi8(temp1);
if (!test) {
// Do the row transform (N.B. This transposes inptr)
idct16_sse2(inptr, inptr + 16);
// Find the min & max for the column transform
// N.B. Only first 4 cols contain non-zero coeffs
max_input = _mm_max_epi16(inptr[0], inptr[1]);
min_input = _mm_min_epi16(inptr[0], inptr[1]);
for (i = 2; i < 16; i++) {
max_input = _mm_max_epi16(max_input, inptr[i]);
min_input = _mm_min_epi16(min_input, inptr[i]);
}
max_input = _mm_cmpgt_epi16(max_input, max);
min_input = _mm_cmplt_epi16(min_input, min);
temp1 = _mm_or_si128(max_input, min_input);
test = _mm_movemask_epi8(temp1);
if (test) {
// Use fact only first 4 rows contain non-zero coeffs
array_transpose_8x8(inptr, inptr);
array_transpose_8x8(inptr + 8, inptr + 16);
for (i = 0; i < 4; i++) {
sign_bits = _mm_cmplt_epi16(inptr[i], zero);
temp1 = _mm_unpacklo_epi16(inptr[i ], sign_bits);
temp2 = _mm_unpackhi_epi16(inptr[i ], sign_bits);
_mm_storeu_si128((__m128i*)(outptr + 4*(i*4)), temp1);
_mm_storeu_si128((__m128i*)(outptr + 4*(i*4+1)), temp2);
sign_bits = _mm_cmplt_epi16(inptr[i+16], zero);
temp1 = _mm_unpacklo_epi16(inptr[i+16], sign_bits);
temp2 = _mm_unpackhi_epi16(inptr[i+16], sign_bits);
_mm_storeu_si128((__m128i*)(outptr + 4*(i*4+2)), temp1);
_mm_storeu_si128((__m128i*)(outptr + 4*(i*4+3)), temp2);
}
} else {
// Set to use the optimised transform for the column
optimised_cols = 1;
}
} else {
// Run the un-optimised row transform
for (i = 0; i < 4; ++i) {
vp9_highbd_idct16(input, outptr, bd);
input += 16;
outptr += 16;
}
}
if (optimised_cols) {
idct16_sse2(inptr, inptr + 16);
// Final round & shift and Reconstruction and Store
{
__m128i d[2];
for (i = 0; i < 16; i++) {
inptr[i ] = _mm_add_epi16(inptr[i ], rounding);
inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding);
d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8));
inptr[i ] = _mm_srai_epi16(inptr[i ], 6);
inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6);
d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i ]), bd);
d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd);
// Store
_mm_storeu_si128((__m128i *)(dest + stride*i), d[0]);
_mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]);
}
}
} else {
// Run the un-optimised column transform
tran_low_t temp_in[16], temp_out[16];
for (i = 0; i < 16; ++i) {
for (j = 0; j < 16; ++j)
temp_in[j] = out[j * 16 + i];
vp9_highbd_idct16(temp_in, temp_out, bd);
for (j = 0; j < 16; ++j)
dest[j * stride + i] = highbd_clip_pixel_add(dest[j * stride + i],
ROUND_POWER_OF_TWO(temp_out[j], 6),
bd);
}
}
}
#endif // CONFIG_VP9_HIGHBITDEPTH

File diff suppressed because it is too large

Some files were not shown because too many files have changed in this diff