Merge remote-tracking branch 'origin/master' into nextgenv2

Periodic merge to get master changes into nextgenv2.

Change-Id: I6f0e4b470f193da03f1a8cb8e6a93ae39395699a
This commit is contained in:
Debargha Mukherjee 2015-09-17 11:20:03 -07:00
commit 09ff5f2792
192 changed files with 17698 additions and 8622 deletions

5
.gitignore vendored
View File

@ -30,14 +30,17 @@
/examples/decode_with_partial_drops
/examples/example_xma
/examples/postproc
/examples/resize_util
/examples/set_maps
/examples/simple_decoder
/examples/simple_encoder
/examples/twopass_encoder
/examples/vp8_multi_resolution_encoder
/examples/vp8cx_set_ref
/examples/vp9_lossless_encoder
/examples/vp9_spatial_scalable_encoder
/examples/vpx_temporal_scalable_patterns
/examples/vpx_temporal_svc_encoder
/ivfdec
/ivfdec.dox
/ivfenc
@ -45,12 +48,14 @@
/libvpx.so*
/libvpx.ver
/samples.dox
/test_intra_pred_speed
/test_libvpx
/vp8_api1_migration.dox
/vp[89x]_rtcd.h
/vpx.pc
/vpx_config.c
/vpx_config.h
/vpx_dsp_rtcd.h
/vpx_scale_rtcd.h
/vpx_version.h
/vpxdec

View File

@ -140,6 +140,8 @@ $(BUILD_PFX)%_avx.c.d: CFLAGS += -mavx $(STACKREALIGN)
$(BUILD_PFX)%_avx.c.o: CFLAGS += -mavx $(STACKREALIGN)
$(BUILD_PFX)%_avx2.c.d: CFLAGS += -mavx2 $(STACKREALIGN)
$(BUILD_PFX)%_avx2.c.o: CFLAGS += -mavx2 $(STACKREALIGN)
$(BUILD_PFX)%vp9_reconintra.c.d: CFLAGS += $(STACKREALIGN)
$(BUILD_PFX)%vp9_reconintra.c.o: CFLAGS += $(STACKREALIGN)
$(BUILD_PFX)%.c.d: %.c
$(if $(quiet),@echo " [DEP] $@")
@ -285,7 +287,7 @@ define archive_template
# for creating them.
$(1):
$(if $(quiet),@echo " [AR] $$@")
$(qexec)$$(AR) $$(ARFLAGS) $$@ $$?
$(qexec)$$(AR) $$(ARFLAGS) $$@ $$^
endef
define so_template

View File

@ -428,7 +428,7 @@ NM=${NM}
CFLAGS = ${CFLAGS}
CXXFLAGS = ${CXXFLAGS}
ARFLAGS = -rus\$(if \$(quiet),c,v)
ARFLAGS = -crs\$(if \$(quiet),,v)
LDFLAGS = ${LDFLAGS}
ASFLAGS = ${ASFLAGS}
extralibs = ${extralibs}
@ -728,13 +728,6 @@ process_common_toolchain() {
# Handle darwin variants. Newer SDKs allow targeting older
# platforms, so use the newest one available.
case ${toolchain} in
arm*-darwin*)
ios_sdk_dir="$(show_darwin_sdk_path iphoneos)"
if [ -d "${ios_sdk_dir}" ]; then
add_cflags "-isysroot ${ios_sdk_dir}"
add_ldflags "-isysroot ${ios_sdk_dir}"
fi
;;
*-darwin*)
osx_sdk_dir="$(show_darwin_sdk_path macosx)"
if [ -d "${osx_sdk_dir}" ]; then
@ -810,14 +803,7 @@ process_common_toolchain() {
if disabled neon && enabled neon_asm; then
die "Disabling neon while keeping neon-asm is not supported"
fi
case ${toolchain} in
*-darwin*)
# Neon is guaranteed on iOS 6+ devices, while old media extensions
# no longer assemble with iOS 9 SDK
;;
*)
soft_enable media
esac
soft_enable media
;;
armv6)
soft_enable media
@ -1081,7 +1067,9 @@ EOF
CROSS=${CROSS:-g}
;;
os2)
disable_feature pic
AS=${AS:-nasm}
add_ldflags -Zhigh-mem
;;
esac
@ -1323,12 +1311,6 @@ EOF
add_cflags -D_LARGEFILE_SOURCE
add_cflags -D_FILE_OFFSET_BITS=64
fi
# append any user defined extra cflags
if [ -n "${extra_cflags}" ] ; then
check_add_cflags ${extra_cflags} || \
die "Requested extra CFLAGS '${extra_cflags}' not supported by compiler"
fi
}
process_toolchain() {

7
configure vendored
View File

@ -265,6 +265,7 @@ EXPERIMENT_LIST="
fp_mb_stats
emulate_hardware
ext_tx
misc_fixes
"
CONFIG_LIST="
dependency_tracking
@ -717,6 +718,12 @@ EOF
esac
# libwebm needs to be linked with C++ standard library
enabled webm_io && LD=${CXX}
# append any user defined extra cflags
if [ -n "${extra_cflags}" ] ; then
check_add_cflags ${extra_cflags} || \
die "Requested extra CFLAGS '${extra_cflags}' not supported by compiler"
fi
}

View File

@ -36,6 +36,8 @@ LIBYUV_SRCS += third_party/libyuv/include/libyuv/basic_types.h \
third_party/libyuv/source/scale_neon64.cc \
third_party/libyuv/source/scale_win.cc \
LIBWEBM_COMMON_SRCS += third_party/libwebm/webmids.hpp
LIBWEBM_MUXER_SRCS += third_party/libwebm/mkvmuxer.cpp \
third_party/libwebm/mkvmuxerutil.cpp \
third_party/libwebm/mkvwriter.cpp \
@ -43,8 +45,7 @@ LIBWEBM_MUXER_SRCS += third_party/libwebm/mkvmuxer.cpp \
third_party/libwebm/mkvmuxertypes.hpp \
third_party/libwebm/mkvmuxerutil.hpp \
third_party/libwebm/mkvparser.hpp \
third_party/libwebm/mkvwriter.hpp \
third_party/libwebm/webmids.hpp
third_party/libwebm/mkvwriter.hpp
LIBWEBM_PARSER_SRCS = third_party/libwebm/mkvparser.cpp \
third_party/libwebm/mkvreader.cpp \
@ -68,6 +69,7 @@ ifeq ($(CONFIG_LIBYUV),yes)
vpxdec.SRCS += $(LIBYUV_SRCS)
endif
ifeq ($(CONFIG_WEBM_IO),yes)
vpxdec.SRCS += $(LIBWEBM_COMMON_SRCS)
vpxdec.SRCS += $(LIBWEBM_PARSER_SRCS)
vpxdec.SRCS += webmdec.cc webmdec.h
endif
@ -89,6 +91,7 @@ ifeq ($(CONFIG_LIBYUV),yes)
vpxenc.SRCS += $(LIBYUV_SRCS)
endif
ifeq ($(CONFIG_WEBM_IO),yes)
vpxenc.SRCS += $(LIBWEBM_COMMON_SRCS)
vpxenc.SRCS += $(LIBWEBM_MUXER_SRCS)
vpxenc.SRCS += webmenc.cc webmenc.h
endif

View File

@ -25,6 +25,7 @@
#include "../tools_common.h"
#include "../video_writer.h"
#include "../vpx_ports/vpx_timer.h"
#include "vpx/svc_context.h"
#include "vpx/vp8cx.h"
#include "vpx/vpx_encoder.h"
@ -79,6 +80,8 @@ static const arg_def_t rc_end_usage_arg =
ARG_DEF(NULL, "rc-end-usage", 1, "0 - 3: VBR, CBR, CQ, Q");
static const arg_def_t speed_arg =
ARG_DEF("sp", "speed", 1, "speed configuration");
static const arg_def_t aqmode_arg =
ARG_DEF("aq", "aqmode", 1, "aq-mode off/on");
#if CONFIG_VP9_HIGHBITDEPTH
static const struct arg_enum_list bitdepth_enum[] = {
@ -100,7 +103,7 @@ static const arg_def_t *svc_args[] = {
&kf_dist_arg, &scale_factors_arg, &passes_arg, &pass_arg,
&fpf_name_arg, &min_q_arg, &max_q_arg, &min_bitrate_arg,
&max_bitrate_arg, &temporal_layers_arg, &temporal_layering_mode_arg,
&lag_in_frame_arg, &threads_arg,
&lag_in_frame_arg, &threads_arg, &aqmode_arg,
#if OUTPUT_RC_STATS
&output_rc_stats_arg,
#endif
@ -220,6 +223,8 @@ static void parse_command_line(int argc, const char **argv_,
#endif
} else if (arg_match(&arg, &speed_arg, argi)) {
svc_ctx->speed = arg_parse_uint(&arg);
} else if (arg_match(&arg, &aqmode_arg, argi)) {
svc_ctx->aqmode = arg_parse_uint(&arg);
} else if (arg_match(&arg, &threads_arg, argi)) {
svc_ctx->threads = arg_parse_uint(&arg);
} else if (arg_match(&arg, &temporal_layering_mode_arg, argi)) {
@ -564,6 +569,8 @@ int main(int argc, const char **argv) {
double sum_bitrate2 = 0.0;
double framerate = 30.0;
#endif
struct vpx_usec_timer timer;
int64_t cx_time = 0;
memset(&svc_ctx, 0, sizeof(svc_ctx));
svc_ctx.log_print = 1;
exec_name = argv[0];
@ -632,6 +639,9 @@ int main(int argc, const char **argv) {
vpx_codec_control(&codec, VP8E_SET_CPUUSED, svc_ctx.speed);
if (svc_ctx.threads)
vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, (svc_ctx.threads >> 1));
if (svc_ctx.speed >= 5 && svc_ctx.aqmode == 1)
vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 3);
// Encode frames
while (!end_of_stream) {
@ -643,9 +653,12 @@ int main(int argc, const char **argv) {
end_of_stream = 1;
}
vpx_usec_timer_start(&timer);
res = vpx_svc_encode(&svc_ctx, &codec, (end_of_stream ? NULL : &raw),
pts, frame_duration, svc_ctx.speed >= 5 ?
VPX_DL_REALTIME : VPX_DL_GOOD_QUALITY);
vpx_usec_timer_mark(&timer);
cx_time += vpx_usec_timer_elapsed(&timer);
printf("%s", vpx_svc_get_message(&svc_ctx));
if (res != VPX_CODEC_OK) {
@ -784,6 +797,10 @@ int main(int argc, const char **argv) {
}
}
#endif
printf("Frame cnt and encoding time/FPS stats for encoding: %d %f %f \n",
frame_cnt,
1000 * (float)cx_time / (double)(frame_cnt * 1000000),
1000000 * (double)frame_cnt / (double)cx_time);
vpx_img_free(&raw);
// display average size, psnr
printf("%s", vpx_svc_dump_statistics(&svc_ctx));

View File

@ -53,7 +53,7 @@ CODEC_SRCS-yes += $(addprefix vpx_dsp/,$(call enabled,DSP_SRCS))
include $(SRC_PATH_BARE)/vpx_util/vpx_util.mk
CODEC_SRCS-yes += $(addprefix vpx_util/,$(call enabled,UTIL_SRCS))
ifneq ($(CONFIG_VP8_ENCODER)$(CONFIG_VP8_DECODER),)
ifeq ($(CONFIG_VP8),yes)
VP8_PREFIX=vp8/
include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8_common.mk
endif
@ -76,7 +76,7 @@ ifeq ($(CONFIG_VP8_DECODER),yes)
CODEC_DOC_SECTIONS += vp8 vp8_decoder
endif
ifneq ($(CONFIG_VP9_ENCODER)$(CONFIG_VP9_DECODER),)
ifeq ($(CONFIG_VP9),yes)
VP9_PREFIX=vp9/
include $(SRC_PATH_BARE)/$(VP9_PREFIX)vp9_common.mk
endif
@ -110,7 +110,7 @@ VP9_PREFIX=vp9/
$(BUILD_PFX)$(VP9_PREFIX)%.c.o: CFLAGS += -Wextra
# VP10 make file
ifneq ($(CONFIG_VP10_ENCODER)$(CONFIG_VP10_DECODER),)
ifeq ($(CONFIG_VP10),yes)
VP10_PREFIX=vp10/
include $(SRC_PATH_BARE)/$(VP10_PREFIX)vp10_common.mk
endif

View File

@ -40,30 +40,6 @@ static int round(double x) {
#endif
const int kNumCoeffs = 256;
const double PI = 3.1415926535898;
void reference2_16x16_idct_2d(double *input, double *output) {
double x;
for (int l = 0; l < 16; ++l) {
for (int k = 0; k < 16; ++k) {
double s = 0;
for (int i = 0; i < 16; ++i) {
for (int j = 0; j < 16; ++j) {
x = cos(PI * j * (l + 0.5) / 16.0) *
cos(PI * i * (k + 0.5) / 16.0) *
input[i * 16 + j] / 256;
if (i != 0)
x *= sqrt(2.0);
if (j != 0)
x *= sqrt(2.0);
s += x;
}
}
output[k*16+l] = s;
}
}
}
const double C1 = 0.995184726672197;
const double C2 = 0.98078528040323;
const double C3 = 0.956940335732209;

View File

@ -195,6 +195,7 @@ void EncoderTest::RunLoop(VideoSource *video) {
video->Begin();
encoder->InitEncoder(video);
ASSERT_FALSE(::testing::Test::HasFatalFailure());
unsigned long dec_init_flags = 0; // NOLINT
// Use fragment decoder if encoder outputs partitions.

View File

@ -20,10 +20,11 @@ const int kMaxErrorFrames = 12;
const int kMaxDroppableFrames = 12;
class ErrorResilienceTestLarge : public ::libvpx_test::EncoderTest,
public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {
public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, bool> {
protected:
ErrorResilienceTestLarge()
: EncoderTest(GET_PARAM(0)),
svc_support_(GET_PARAM(2)),
psnr_(0.0),
nframes_(0),
mismatch_psnr_(0.0),
@ -193,6 +194,8 @@ class ErrorResilienceTestLarge : public ::libvpx_test::EncoderTest,
pattern_switch_ = frame_switch;
}
bool svc_support_;
private:
double psnr_;
unsigned int nframes_;
@ -302,6 +305,10 @@ TEST_P(ErrorResilienceTestLarge, DropFramesWithoutRecovery) {
// two layer temporal pattern. The base layer does not predict from the top
// layer, so successful decoding is expected.
TEST_P(ErrorResilienceTestLarge, 2LayersDropEnhancement) {
// This test doesn't run if SVC is not supported.
if (!svc_support_)
return;
const vpx_rational timebase = { 33333333, 1000000000 };
cfg_.g_timebase = timebase;
cfg_.rc_target_bitrate = 500;
@ -347,6 +354,10 @@ TEST_P(ErrorResilienceTestLarge, 2LayersDropEnhancement) {
// for a two layer temporal pattern, where at some point in the
// sequence, the LAST ref is not used anymore.
TEST_P(ErrorResilienceTestLarge, 2LayersNoRefLast) {
// This test doesn't run if SVC is not supported.
if (!svc_support_)
return;
const vpx_rational timebase = { 33333333, 1000000000 };
cfg_.g_timebase = timebase;
cfg_.rc_target_bitrate = 500;
@ -579,9 +590,13 @@ TEST_P(ErrorResilienceTestLargeCodecControls, CodecControl3TemporalLayers) {
}
}
VP8_INSTANTIATE_TEST_CASE(ErrorResilienceTestLarge, ONE_PASS_TEST_MODES);
VP8_INSTANTIATE_TEST_CASE(ErrorResilienceTestLarge, ONE_PASS_TEST_MODES,
::testing::Values(true));
VP8_INSTANTIATE_TEST_CASE(ErrorResilienceTestLargeCodecControls,
ONE_PASS_TEST_MODES);
VP9_INSTANTIATE_TEST_CASE(ErrorResilienceTestLarge, ONE_PASS_TEST_MODES);
VP10_INSTANTIATE_TEST_CASE(ErrorResilienceTestLarge, ONE_PASS_TEST_MODES);
VP9_INSTANTIATE_TEST_CASE(ErrorResilienceTestLarge, ONE_PASS_TEST_MODES,
::testing::Values(true));
// SVC-related tests don't run for VP10 since SVC is not supported.
VP10_INSTANTIATE_TEST_CASE(ErrorResilienceTestLarge, ONE_PASS_TEST_MODES,
::testing::Values(false));
} // namespace

View File

@ -74,7 +74,7 @@ TEST_F(VP9FrameSizeTestsLarge, ValidSizes) {
// size or almost 1 gig of memory.
// In total the allocations will exceed 2GiB which may cause a failure with
// mingw + wine, use a smaller size in that case.
#if defined(_WIN32) && !defined(_WIN64)
#if defined(_WIN32) && !defined(_WIN64) || defined(__OS2__)
video.SetSize(4096, 3072);
#else
video.SetSize(4096, 4096);

View File

@ -67,43 +67,6 @@ void reference_dct_2d(int16_t input[64], double output[64]) {
output[i] *= 2;
}
void reference_idct_1d(double input[8], double output[8]) {
const double kPi = 3.141592653589793238462643383279502884;
const double kSqrt2 = 1.414213562373095048801688724209698;
for (int k = 0; k < 8; k++) {
output[k] = 0.0;
for (int n = 0; n < 8; n++) {
output[k] += input[n]*cos(kPi*(2*k+1)*n/16.0);
if (n == 0)
output[k] = output[k]/kSqrt2;
}
}
}
void reference_idct_2d(double input[64], int16_t output[64]) {
double out[64], out2[64];
// First transform rows
for (int i = 0; i < 8; ++i) {
double temp_in[8], temp_out[8];
for (int j = 0; j < 8; ++j)
temp_in[j] = input[j + i*8];
reference_idct_1d(temp_in, temp_out);
for (int j = 0; j < 8; ++j)
out[j + i*8] = temp_out[j];
}
// Then transform columns
for (int i = 0; i < 8; ++i) {
double temp_in[8], temp_out[8];
for (int j = 0; j < 8; ++j)
temp_in[j] = out[j*8 + i];
reference_idct_1d(temp_in, temp_out);
for (int j = 0; j < 8; ++j)
out2[j*8 + i] = temp_out[j];
}
for (int i = 0; i < 64; ++i)
output[i] = round(out2[i]/32);
}
TEST(VP9Idct8x8Test, AccuracyCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
const int count_test_block = 10000;

View File

@ -145,7 +145,7 @@ TEST_P(InvalidFileInvalidPeekTest, ReturnCode) {
}
const DecodeParam kVP9InvalidFileInvalidPeekTests[] = {
{1, "invalid-vp90-01-v2.webm"},
{1, "invalid-vp90-01-v3.webm"},
};
VP9_INSTANTIATE_TEST_CASE(InvalidFileInvalidPeekTest,

View File

@ -590,7 +590,9 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(&vpx_lpf_horizontal_16_sse2, &vpx_lpf_horizontal_16_c, 8, 1),
make_tuple(&vpx_lpf_horizontal_16_sse2, &vpx_lpf_horizontal_16_c, 8, 2),
make_tuple(&vpx_lpf_vertical_8_sse2, &vpx_lpf_vertical_8_c, 8, 1),
make_tuple(&wrapper_vertical_16_sse2, &wrapper_vertical_16_c, 8, 1)));
make_tuple(&wrapper_vertical_16_sse2, &wrapper_vertical_16_c, 8, 1),
make_tuple(&wrapper_vertical_16_dual_sse2,
&wrapper_vertical_16_dual_c, 8, 1)));
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif

View File

@ -81,6 +81,15 @@ static void write_ivf_frame_header(const vpx_codec_cx_pkt_t *const pkt,
const unsigned int kInitialWidth = 320;
const unsigned int kInitialHeight = 240;
struct FrameInfo {
FrameInfo(vpx_codec_pts_t _pts, unsigned int _w, unsigned int _h)
: pts(_pts), w(_w), h(_h) {}
vpx_codec_pts_t pts;
unsigned int w;
unsigned int h;
};
unsigned int ScaleForFrameNumber(unsigned int frame, unsigned int val) {
if (frame < 10)
return val;
@ -120,15 +129,6 @@ class ResizeTest : public ::libvpx_test::EncoderTest,
virtual ~ResizeTest() {}
struct FrameInfo {
FrameInfo(vpx_codec_pts_t _pts, unsigned int _w, unsigned int _h)
: pts(_pts), w(_w), h(_h) {}
vpx_codec_pts_t pts;
unsigned int w;
unsigned int h;
};
virtual void SetUp() {
InitializeConfig();
SetMode(GET_PARAM(1));
@ -261,6 +261,134 @@ TEST_P(ResizeInternalTest, TestInternalResizeWorks) {
}
}
class ResizeInternalRealtimeTest : public ::libvpx_test::EncoderTest,
public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
protected:
ResizeInternalRealtimeTest() : EncoderTest(GET_PARAM(0)) {}
virtual ~ResizeInternalRealtimeTest() {}
virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
libvpx_test::Encoder *encoder) {
if (video->frame() == 0) {
encoder->Control(VP9E_SET_AQ_MODE, 3);
encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
}
if (change_bitrate_ && video->frame() == 120) {
change_bitrate_ = false;
cfg_.rc_target_bitrate = 500;
encoder->Config(&cfg_);
}
}
virtual void SetUp() {
InitializeConfig();
SetMode(GET_PARAM(1));
set_cpu_used_ = GET_PARAM(2);
}
virtual void DecompressedFrameHook(const vpx_image_t &img,
vpx_codec_pts_t pts) {
frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h));
}
void DefaultConfig() {
cfg_.g_w = 352;
cfg_.g_h = 288;
cfg_.rc_buf_initial_sz = 500;
cfg_.rc_buf_optimal_sz = 600;
cfg_.rc_buf_sz = 1000;
cfg_.rc_min_quantizer = 2;
cfg_.rc_max_quantizer = 56;
cfg_.rc_undershoot_pct = 50;
cfg_.rc_overshoot_pct = 50;
cfg_.rc_end_usage = VPX_CBR;
cfg_.kf_mode = VPX_KF_AUTO;
cfg_.g_lag_in_frames = 0;
cfg_.kf_min_dist = cfg_.kf_max_dist = 3000;
// Enable dropped frames.
cfg_.rc_dropframe_thresh = 1;
// Enable error_resilience mode.
cfg_.g_error_resilient = 1;
// Enable dynamic resizing.
cfg_.rc_resize_allowed = 1;
// Run at low bitrate.
cfg_.rc_target_bitrate = 200;
}
std::vector< FrameInfo > frame_info_list_;
int set_cpu_used_;
bool change_bitrate_;
};
// Verify the dynamic resizer behavior for real time, 1 pass CBR mode.
// Run at low bitrate, with resize_allowed = 1, and verify that we get
// one resize down event.
TEST_P(ResizeInternalRealtimeTest, TestInternalResizeDown) {
::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
30, 1, 0, 299);
DefaultConfig();
change_bitrate_ = false;
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
unsigned int last_w = cfg_.g_w;
unsigned int last_h = cfg_.g_h;
int resize_count = 0;
for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin();
info != frame_info_list_.end(); ++info) {
if (info->w != last_w || info->h != last_h) {
// Verify that resize down occurs.
ASSERT_LT(info->w, last_w);
ASSERT_LT(info->h, last_h);
last_w = info->w;
last_h = info->h;
resize_count++;
}
}
// Verify that we get 1 resize down event in this test.
ASSERT_EQ(1, resize_count) << "Resizing should occur.";
}
// Verify the dynamic resizer behavior for real time, 1 pass CBR mode.
// Start at low target bitrate, raise the bitrate in the middle of the clip,
// scaling-up should occur after bitrate changed.
TEST_P(ResizeInternalRealtimeTest, TestInternalResizeDownUpChangeBitRate) {
::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
30, 1, 0, 299);
DefaultConfig();
change_bitrate_ = true;
// Disable dropped frames.
cfg_.rc_dropframe_thresh = 0;
// Starting bitrate low.
cfg_.rc_target_bitrate = 100;
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
unsigned int last_w = cfg_.g_w;
unsigned int last_h = cfg_.g_h;
int resize_count = 0;
for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin();
info != frame_info_list_.end(); ++info) {
if (info->w != last_w || info->h != last_h) {
resize_count++;
if (resize_count == 1) {
// Verify that resize down occurs.
ASSERT_LT(info->w, last_w);
ASSERT_LT(info->h, last_h);
} else if (resize_count == 2) {
// Verify that resize up occurs.
ASSERT_GT(info->w, last_w);
ASSERT_GT(info->h, last_h);
}
last_w = info->w;
last_h = info->h;
}
}
// Verify that we get 2 resize events in this test.
ASSERT_EQ(2, resize_count) << "Resizing should occur twice.";
}
vpx_img_fmt_t CspForFrameNumber(int frame) {
if (frame < 10)
return VPX_IMG_FMT_I420;
@ -371,6 +499,9 @@ VP9_INSTANTIATE_TEST_CASE(ResizeTest,
::testing::Values(::libvpx_test::kRealTime));
VP9_INSTANTIATE_TEST_CASE(ResizeInternalTest,
::testing::Values(::libvpx_test::kOnePassBest));
VP9_INSTANTIATE_TEST_CASE(ResizeInternalRealtimeTest,
::testing::Values(::libvpx_test::kRealTime),
::testing::Range(5, 9));
VP9_INSTANTIATE_TEST_CASE(ResizeCspTest,
::testing::Values(::libvpx_test::kRealTime));
} // namespace

View File

@ -687,8 +687,8 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp93-2-20-12bit-yuv444.webm.md5
endif # CONFIG_VP9_HIGHBITDEPTH
# Invalid files for testing libvpx error checking.
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01-v2.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01-v2.webm.res
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01-v3.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01-v3.webm.res
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-02-v2.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-02-v2.webm.res
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-03-v3.webm

View File

@ -6,8 +6,8 @@ b87815bf86020c592ccc7a846ba2e28ec8043902 *hantro_odd.yuv
456d1493e52d32a5c30edf44a27debc1fa6b253a *invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf.res
c123d1f9f02fb4143abb5e271916e3a3080de8f6 *invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf
456d1493e52d32a5c30edf44a27debc1fa6b253a *invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf.res
fe346136b9b8c1e6f6084cc106485706915795e4 *invalid-vp90-01-v2.webm
25751f5d3b05ff03f0719ad42cd625348eb8961e *invalid-vp90-01-v2.webm.res
fe346136b9b8c1e6f6084cc106485706915795e4 *invalid-vp90-01-v3.webm
5d9474c0309b7ca09a182d888f73b37a8fe1362c *invalid-vp90-01-v3.webm.res
d78e2fceba5ac942246503ec8366f879c4775ca5 *invalid-vp90-02-v2.webm
8e2eff4af87d2b561cce2365713269e301457ef3 *invalid-vp90-02-v2.webm.res
df1a1453feb3c00d7d89746c7003b4163523bff3 *invalid-vp90-03-v3.webm

View File

@ -167,6 +167,10 @@ LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += sad_test.cc
TEST_INTRA_PRED_SPEED_SRCS-$(CONFIG_VP9) := test_intra_pred_speed.cc
TEST_INTRA_PRED_SPEED_SRCS-$(CONFIG_VP9) += ../md5_utils.h ../md5_utils.c
## VP10
LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += vp10_dct_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP10) += vp10_inv_txfm_test.cc
endif # CONFIG_SHARED
include $(SRC_PATH_BARE)/test/test-data.mk

View File

@ -26,6 +26,7 @@ extern void vpx_dsp_rtcd();
extern void vpx_scale_rtcd();
}
#if ARCH_X86 || ARCH_X86_64
static void append_negative_gtest_filter(const char *str) {
std::string filter = ::testing::FLAGS_gtest_filter;
// Negative patterns begin with one '-' followed by a ':' separated list.
@ -33,6 +34,7 @@ static void append_negative_gtest_filter(const char *str) {
filter += str;
::testing::FLAGS_gtest_filter = filter;
}
#endif // ARCH_X86 || ARCH_X86_64
int main(int argc, char **argv) {
::testing::InitGoogleTest(&argc, argv);
@ -55,7 +57,7 @@ int main(int argc, char **argv) {
append_negative_gtest_filter(":AVX.*:AVX/*");
if (!(simd_caps & HAS_AVX2))
append_negative_gtest_filter(":AVX2.*:AVX2/*");
#endif
#endif // ARCH_X86 || ARCH_X86_64
#if !CONFIG_SHARED
// Shared library builds don't support whitebox tests

View File

@ -19,8 +19,7 @@
// Macros
#define GET_PARAM(k) std::tr1::get< k >(GetParam())
static double compute_psnr(const vpx_image_t *img1,
const vpx_image_t *img2) {
inline double compute_psnr(const vpx_image_t *img1, const vpx_image_t *img2) {
assert((img1->fmt == img2->fmt) &&
(img1->d_w == img2->d_w) &&
(img1->d_h == img2->d_h));

View File

@ -48,7 +48,7 @@ static std::string GetDataPath() {
#undef TO_STRING
#undef STRINGIFY
static FILE *OpenTestDataFile(const std::string& file_name) {
inline FILE *OpenTestDataFile(const std::string& file_name) {
const std::string path_to_source = GetDataPath() + "/" + file_name;
return fopen(path_to_source.c_str(), "rb");
}

112
test/vp10_dct_test.cc Normal file
View File

@ -0,0 +1,112 @@
/*
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <math.h>
#include <stdlib.h>
#include <new>
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "test/acm_random.h"
#include "test/util.h"
#include "./vpx_config.h"
#include "vpx_ports/msvc.h"
#undef CONFIG_COEFFICIENT_RANGE_CHECKING
#define CONFIG_COEFFICIENT_RANGE_CHECKING 1
#include "vp10/encoder/dct.c"
using libvpx_test::ACMRandom;
namespace {
void reference_dct_1d(const double *in, double *out, int size) {
const double PI = 3.141592653589793238462643383279502884;
const double kInvSqrt2 = 0.707106781186547524400844362104;
for (int k = 0; k < size; ++k) {
out[k] = 0;
for (int n = 0; n < size; ++n) {
out[k] += in[n] * cos(PI * (2 * n + 1) * k / (2 * size));
}
if (k == 0)
out[k] = out[k] * kInvSqrt2;
}
}
typedef void (*FdctFuncRef)(const double *in, double *out, int size);
typedef void (*IdctFuncRef)(const double *in, double *out, int size);
typedef void (*FdctFunc)(const tran_low_t *in, tran_low_t *out);
typedef void (*IdctFunc)(const tran_low_t *in, tran_low_t *out);
class TransTestBase {
public:
virtual ~TransTestBase() {}
protected:
void RunFwdAccuracyCheck() {
tran_low_t *input = new tran_low_t[txfm_size_];
tran_low_t *output = new tran_low_t[txfm_size_];
double *ref_input = new double[txfm_size_];
double *ref_output = new double[txfm_size_];
ACMRandom rnd(ACMRandom::DeterministicSeed());
const int count_test_block = 5000;
for (int ti = 0; ti < count_test_block; ++ti) {
for (int ni = 0; ni < txfm_size_; ++ni) {
input[ni] = rnd.Rand8() - rnd.Rand8();
ref_input[ni] = static_cast<double>(input[ni]);
}
fwd_txfm_(input, output);
fwd_txfm_ref_(ref_input, ref_output, txfm_size_);
for (int ni = 0; ni < txfm_size_; ++ni) {
EXPECT_LE(
abs(output[ni] - static_cast<tran_low_t>(round(ref_output[ni]))),
max_error_);
}
}
delete[] input;
delete[] output;
delete[] ref_input;
delete[] ref_output;
}
double max_error_;
int txfm_size_;
FdctFunc fwd_txfm_;
FdctFuncRef fwd_txfm_ref_;
};
typedef std::tr1::tuple<FdctFunc, FdctFuncRef, int, int> FdctParam;
class Vp10FwdTxfm
: public TransTestBase,
public ::testing::TestWithParam<FdctParam> {
public:
virtual void SetUp() {
fwd_txfm_ = GET_PARAM(0);
fwd_txfm_ref_ = GET_PARAM(1);
txfm_size_ = GET_PARAM(2);
max_error_ = GET_PARAM(3);
}
virtual void TearDown() {}
};
TEST_P(Vp10FwdTxfm, RunFwdAccuracyCheck) {
RunFwdAccuracyCheck();
}
INSTANTIATE_TEST_CASE_P(
C, Vp10FwdTxfm,
::testing::Values(
FdctParam(&fdct4, &reference_dct_1d, 4, 1),
FdctParam(&fdct8, &reference_dct_1d, 8, 1),
FdctParam(&fdct16, &reference_dct_1d, 16, 2),
FdctParam(&fdct32, &reference_dct_1d, 32, 4)));
} // namespace

321
test/vp10_inv_txfm_test.cc Normal file
View File

@ -0,0 +1,321 @@
/*
* Copyright (c) 2013 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "./vp10_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "test/acm_random.h"
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
#include "test/util.h"
#include "vp10/common/blockd.h"
#include "vp10/common/scan.h"
#include "vpx/vpx_integer.h"
#include "vp10/common/vp10_inv_txfm.h"
using libvpx_test::ACMRandom;
namespace {
const double PI = 3.141592653589793238462643383279502884;
const double kInvSqrt2 = 0.707106781186547524400844362104;
void reference_idct_1d(const double *in, double *out, int size) {
for (int n = 0; n < size; ++n) {
out[n] = 0;
for (int k = 0; k < size; ++k) {
if (k == 0)
out[n] += kInvSqrt2 * in[k] * cos(PI * (2 * n + 1) * k / (2 * size));
else
out[n] += in[k] * cos(PI * (2 * n + 1) * k / (2 * size));
}
}
}
typedef void (*IdctFuncRef)(const double *in, double *out, int size);
typedef void (*IdctFunc)(const tran_low_t *in, tran_low_t *out);
class TransTestBase {
public:
virtual ~TransTestBase() {}
protected:
void RunInvAccuracyCheck() {
tran_low_t *input = new tran_low_t[txfm_size_];
tran_low_t *output = new tran_low_t[txfm_size_];
double *ref_input = new double[txfm_size_];
double *ref_output = new double[txfm_size_];
ACMRandom rnd(ACMRandom::DeterministicSeed());
const int count_test_block = 5000;
for (int ti = 0; ti < count_test_block; ++ti) {
for (int ni = 0; ni < txfm_size_; ++ni) {
input[ni] = rnd.Rand8() - rnd.Rand8();
ref_input[ni] = static_cast<double>(input[ni]);
}
fwd_txfm_(input, output);
fwd_txfm_ref_(ref_input, ref_output, txfm_size_);
for (int ni = 0; ni < txfm_size_; ++ni) {
EXPECT_LE(
abs(output[ni] - static_cast<tran_low_t>(round(ref_output[ni]))),
max_error_);
}
}
delete[] input;
delete[] output;
delete[] ref_input;
delete[] ref_output;
}
double max_error_;
int txfm_size_;
IdctFunc fwd_txfm_;
IdctFuncRef fwd_txfm_ref_;
};
typedef std::tr1::tuple<IdctFunc, IdctFuncRef, int, int> IdctParam;
class Vp10InvTxfm
: public TransTestBase,
public ::testing::TestWithParam<IdctParam> {
public:
virtual void SetUp() {
fwd_txfm_ = GET_PARAM(0);
fwd_txfm_ref_ = GET_PARAM(1);
txfm_size_ = GET_PARAM(2);
max_error_ = GET_PARAM(3);
}
virtual void TearDown() {}
};
TEST_P(Vp10InvTxfm, RunInvAccuracyCheck) {
RunInvAccuracyCheck();
}
INSTANTIATE_TEST_CASE_P(
C, Vp10InvTxfm,
::testing::Values(
IdctParam(&vp10_idct4_c, &reference_idct_1d, 4, 1),
IdctParam(&vp10_idct8_c, &reference_idct_1d, 8, 2),
IdctParam(&vp10_idct16_c, &reference_idct_1d, 16, 4),
IdctParam(&vp10_idct32_c, &reference_idct_1d, 32, 6))
);
typedef void (*FwdTxfmFunc)(const int16_t *in, tran_low_t *out, int stride);
typedef void (*InvTxfmFunc)(const tran_low_t *in, uint8_t *out, int stride);
typedef std::tr1::tuple<FwdTxfmFunc,
InvTxfmFunc,
InvTxfmFunc,
TX_SIZE, int> PartialInvTxfmParam;
const int kMaxNumCoeffs = 1024;
class Vp10PartialIDctTest
: public ::testing::TestWithParam<PartialInvTxfmParam> {
public:
virtual ~Vp10PartialIDctTest() {}
virtual void SetUp() {
ftxfm_ = GET_PARAM(0);
full_itxfm_ = GET_PARAM(1);
partial_itxfm_ = GET_PARAM(2);
tx_size_ = GET_PARAM(3);
last_nonzero_ = GET_PARAM(4);
}
virtual void TearDown() { libvpx_test::ClearSystemState(); }
protected:
int last_nonzero_;
TX_SIZE tx_size_;
FwdTxfmFunc ftxfm_;
InvTxfmFunc full_itxfm_;
InvTxfmFunc partial_itxfm_;
};
TEST_P(Vp10PartialIDctTest, RunQuantCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
int size;
switch (tx_size_) {
case TX_4X4:
size = 4;
break;
case TX_8X8:
size = 8;
break;
case TX_16X16:
size = 16;
break;
case TX_32X32:
size = 32;
break;
default:
FAIL() << "Wrong Size!";
break;
}
DECLARE_ALIGNED(16, tran_low_t, test_coef_block1[kMaxNumCoeffs]);
DECLARE_ALIGNED(16, tran_low_t, test_coef_block2[kMaxNumCoeffs]);
DECLARE_ALIGNED(16, uint8_t, dst1[kMaxNumCoeffs]);
DECLARE_ALIGNED(16, uint8_t, dst2[kMaxNumCoeffs]);
const int count_test_block = 1000;
const int block_size = size * size;
DECLARE_ALIGNED(16, int16_t, input_extreme_block[kMaxNumCoeffs]);
DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kMaxNumCoeffs]);
int max_error = 0;
for (int i = 0; i < count_test_block; ++i) {
// clear out destination buffer
memset(dst1, 0, sizeof(*dst1) * block_size);
memset(dst2, 0, sizeof(*dst2) * block_size);
memset(test_coef_block1, 0, sizeof(*test_coef_block1) * block_size);
memset(test_coef_block2, 0, sizeof(*test_coef_block2) * block_size);
ACMRandom rnd(ACMRandom::DeterministicSeed());
for (int i = 0; i < count_test_block; ++i) {
// Initialize a test block with input range [-255, 255].
if (i == 0) {
for (int j = 0; j < block_size; ++j)
input_extreme_block[j] = 255;
} else if (i == 1) {
for (int j = 0; j < block_size; ++j)
input_extreme_block[j] = -255;
} else {
for (int j = 0; j < block_size; ++j) {
input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
}
}
ftxfm_(input_extreme_block, output_ref_block, size);
// quantization with maximum allowed step sizes
test_coef_block1[0] = (output_ref_block[0] / 1336) * 1336;
for (int j = 1; j < last_nonzero_; ++j)
test_coef_block1[vp10_default_scan_orders[tx_size_].scan[j]]
= (output_ref_block[j] / 1828) * 1828;
}
ASM_REGISTER_STATE_CHECK(full_itxfm_(test_coef_block1, dst1, size));
ASM_REGISTER_STATE_CHECK(partial_itxfm_(test_coef_block1, dst2, size));
for (int j = 0; j < block_size; ++j) {
const int diff = dst1[j] - dst2[j];
const int error = diff * diff;
if (max_error < error)
max_error = error;
}
}
EXPECT_EQ(0, max_error)
<< "Error: partial inverse transform produces different results";
}
TEST_P(Vp10PartialIDctTest, ResultsMatch) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
int size;
switch (tx_size_) {
case TX_4X4:
size = 4;
break;
case TX_8X8:
size = 8;
break;
case TX_16X16:
size = 16;
break;
case TX_32X32:
size = 32;
break;
default:
FAIL() << "Wrong Size!";
break;
}
DECLARE_ALIGNED(16, tran_low_t, test_coef_block1[kMaxNumCoeffs]);
DECLARE_ALIGNED(16, tran_low_t, test_coef_block2[kMaxNumCoeffs]);
DECLARE_ALIGNED(16, uint8_t, dst1[kMaxNumCoeffs]);
DECLARE_ALIGNED(16, uint8_t, dst2[kMaxNumCoeffs]);
const int count_test_block = 1000;
const int max_coeff = 32766 / 4;
const int block_size = size * size;
int max_error = 0;
for (int i = 0; i < count_test_block; ++i) {
// clear out destination buffer
memset(dst1, 0, sizeof(*dst1) * block_size);
memset(dst2, 0, sizeof(*dst2) * block_size);
memset(test_coef_block1, 0, sizeof(*test_coef_block1) * block_size);
memset(test_coef_block2, 0, sizeof(*test_coef_block2) * block_size);
int max_energy_leftover = max_coeff * max_coeff;
for (int j = 0; j < last_nonzero_; ++j) {
int16_t coef = static_cast<int16_t>(sqrt(1.0 * max_energy_leftover) *
(rnd.Rand16() - 32768) / 65536);
max_energy_leftover -= coef * coef;
if (max_energy_leftover < 0) {
max_energy_leftover = 0;
coef = 0;
}
test_coef_block1[vp10_default_scan_orders[tx_size_].scan[j]] = coef;
}
memcpy(test_coef_block2, test_coef_block1,
sizeof(*test_coef_block2) * block_size);
ASM_REGISTER_STATE_CHECK(full_itxfm_(test_coef_block1, dst1, size));
ASM_REGISTER_STATE_CHECK(partial_itxfm_(test_coef_block2, dst2, size));
for (int j = 0; j < block_size; ++j) {
const int diff = dst1[j] - dst2[j];
const int error = diff * diff;
if (max_error < error)
max_error = error;
}
}
EXPECT_EQ(0, max_error)
<< "Error: partial inverse transform produces different results";
}
using std::tr1::make_tuple;
INSTANTIATE_TEST_CASE_P(
C, Vp10PartialIDctTest,
::testing::Values(
make_tuple(&vpx_fdct32x32_c,
&vp10_idct32x32_1024_add_c,
&vp10_idct32x32_34_add_c,
TX_32X32, 34),
make_tuple(&vpx_fdct32x32_c,
&vp10_idct32x32_1024_add_c,
&vp10_idct32x32_1_add_c,
TX_32X32, 1),
make_tuple(&vpx_fdct16x16_c,
&vp10_idct16x16_256_add_c,
&vp10_idct16x16_10_add_c,
TX_16X16, 10),
make_tuple(&vpx_fdct16x16_c,
&vp10_idct16x16_256_add_c,
&vp10_idct16x16_1_add_c,
TX_16X16, 1),
make_tuple(&vpx_fdct8x8_c,
&vp10_idct8x8_64_add_c,
&vp10_idct8x8_12_add_c,
TX_8X8, 12),
make_tuple(&vpx_fdct8x8_c,
&vp10_idct8x8_64_add_c,
&vp10_idct8x8_1_add_c,
TX_8X8, 1),
make_tuple(&vpx_fdct4x4_c,
&vp10_idct4x4_16_add_c,
&vp10_idct4x4_1_add_c,
TX_4X4, 1)));
} // namespace

View File

@ -14,38 +14,12 @@
#include "test/encode_test_driver.h"
#include "test/util.h"
#include "test/y4m_video_source.h"
#include "test/yuv_video_source.h"
#include "vp9/decoder/vp9_decoder.h"
typedef vpx_codec_stream_info_t vp9_stream_info_t;
struct vpx_codec_alg_priv {
vpx_codec_priv_t base;
vpx_codec_dec_cfg_t cfg;
vp9_stream_info_t si;
struct VP9Decoder *pbi;
int postproc_cfg_set;
vp8_postproc_cfg_t postproc_cfg;
vpx_decrypt_cb decrypt_cb;
void *decrypt_state;
vpx_image_t img;
int img_avail;
int flushed;
int invert_tile_order;
int frame_parallel_decode;
// External frame buffer info to save for VP9 common.
void *ext_priv; // Private data associated with the external frame buffers.
vpx_get_frame_buffer_cb_fn_t get_ext_fb_cb;
vpx_release_frame_buffer_cb_fn_t release_ext_fb_cb;
};
static vpx_codec_alg_priv_t *get_alg_priv(vpx_codec_ctx_t *ctx) {
return (vpx_codec_alg_priv_t *)ctx->priv;
}
#include "vp9/vp9_dx_iface.c"
namespace {
const unsigned int kFramerate = 50;
const int kCpuUsed = 2;
struct EncodePerfTestVideo {
@ -66,35 +40,26 @@ struct EncodeParameters {
int32_t lossless;
int32_t error_resilient;
int32_t frame_parallel;
int32_t color_range;
vpx_color_space_t cs;
// TODO(JBB): quantizers / bitrate
};
const EncodeParameters kVP9EncodeParameterSet[] = {
{0, 0, 0, 1, 0, VPX_CS_BT_601},
{0, 0, 0, 0, 0, VPX_CS_BT_709},
{0, 0, 1, 0, 0, VPX_CS_BT_2020},
{0, 2, 0, 0, 1, VPX_CS_UNKNOWN},
// TODO(JBB): Test profiles (requires more work).
{0, 0, 0, 1, 0, 0, VPX_CS_BT_601},
{0, 0, 0, 0, 0, 1, VPX_CS_BT_709},
{0, 0, 1, 0, 0, 1, VPX_CS_BT_2020},
{0, 2, 0, 0, 1, 0, VPX_CS_UNKNOWN},
// TODO(JBB): Test profiles (requires more work).
};
int is_extension_y4m(const char *filename) {
const char *dot = strrchr(filename, '.');
if (!dot || dot == filename)
return 0;
else
return !strcmp(dot, ".y4m");
}
class VpxEncoderParmsGetToDecoder
: public ::libvpx_test::EncoderTest,
public ::libvpx_test::CodecTestWith2Params<EncodeParameters, \
public ::libvpx_test::CodecTestWith2Params<EncodeParameters,
EncodePerfTestVideo> {
protected:
VpxEncoderParmsGetToDecoder()
: EncoderTest(GET_PARAM(0)),
encode_parms(GET_PARAM(1)) {
}
: EncoderTest(GET_PARAM(0)), encode_parms(GET_PARAM(1)) {}
virtual ~VpxEncoderParmsGetToDecoder() {}
@ -112,6 +77,7 @@ class VpxEncoderParmsGetToDecoder
::libvpx_test::Encoder *encoder) {
if (video->frame() == 1) {
encoder->Control(VP9E_SET_COLOR_SPACE, encode_parms.cs);
encoder->Control(VP9E_SET_COLOR_RANGE, encode_parms.color_range);
encoder->Control(VP9E_SET_LOSSLESS, encode_parms.lossless);
encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING,
encode_parms.frame_parallel);
@ -126,33 +92,34 @@ class VpxEncoderParmsGetToDecoder
}
virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec,
const libvpx_test::VideoSource& video,
const libvpx_test::VideoSource &video,
libvpx_test::Decoder *decoder) {
vpx_codec_ctx_t* vp9_decoder = decoder->GetDecoder();
vpx_codec_alg_priv_t* priv =
(vpx_codec_alg_priv_t*) get_alg_priv(vp9_decoder);
VP9Decoder* pbi = priv->pbi;
VP9_COMMON* common = &pbi->common;
vpx_codec_ctx_t *const vp9_decoder = decoder->GetDecoder();
vpx_codec_alg_priv_t *const priv =
reinterpret_cast<vpx_codec_alg_priv_t *>(vp9_decoder->priv);
FrameWorkerData *const worker_data =
reinterpret_cast<FrameWorkerData *>(priv->frame_workers[0].data1);
VP9_COMMON *const common = &worker_data->pbi->common;
if (encode_parms.lossless) {
EXPECT_EQ(common->base_qindex, 0);
EXPECT_EQ(common->y_dc_delta_q, 0);
EXPECT_EQ(common->uv_dc_delta_q, 0);
EXPECT_EQ(common->uv_ac_delta_q, 0);
EXPECT_EQ(common->tx_mode, ONLY_4X4);
EXPECT_EQ(0, common->base_qindex);
EXPECT_EQ(0, common->y_dc_delta_q);
EXPECT_EQ(0, common->uv_dc_delta_q);
EXPECT_EQ(0, common->uv_ac_delta_q);
EXPECT_EQ(ONLY_4X4, common->tx_mode);
}
EXPECT_EQ(common->error_resilient_mode, encode_parms.error_resilient);
EXPECT_EQ(encode_parms.error_resilient, common->error_resilient_mode);
if (encode_parms.error_resilient) {
EXPECT_EQ(common->frame_parallel_decoding_mode, 1);
EXPECT_EQ(common->use_prev_frame_mvs, 0);
EXPECT_EQ(1, common->frame_parallel_decoding_mode);
EXPECT_EQ(0, common->use_prev_frame_mvs);
} else {
EXPECT_EQ(common->frame_parallel_decoding_mode,
encode_parms.frame_parallel);
EXPECT_EQ(encode_parms.frame_parallel,
common->frame_parallel_decoding_mode);
}
EXPECT_EQ(common->color_space, encode_parms.cs);
EXPECT_EQ(common->log2_tile_cols, encode_parms.tile_cols);
EXPECT_EQ(common->log2_tile_rows, encode_parms.tile_rows);
EXPECT_EQ(encode_parms.color_range, common->color_range);
EXPECT_EQ(encode_parms.cs, common->color_space);
EXPECT_EQ(encode_parms.tile_cols, common->log2_tile_cols);
EXPECT_EQ(encode_parms.tile_rows, common->log2_tile_rows);
EXPECT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError();
return VPX_CODEC_OK == res_dec;
@ -164,35 +131,18 @@ class VpxEncoderParmsGetToDecoder
EncodeParameters encode_parms;
};
// TODO(hkuang): This test conflicts with frame parallel decode. So disable it
// for now until fix.
TEST_P(VpxEncoderParmsGetToDecoder, DISABLED_BitstreamParms) {
TEST_P(VpxEncoderParmsGetToDecoder, BitstreamParms) {
init_flags_ = VPX_CODEC_USE_PSNR;
libvpx_test::VideoSource *video;
if (is_extension_y4m(test_video_.name)) {
video = new libvpx_test::Y4mVideoSource(test_video_.name,
0, test_video_.frames);
} else {
video = new libvpx_test::YUVVideoSource(test_video_.name,
VPX_IMG_FMT_I420,
test_video_.width,
test_video_.height,
kFramerate, 1, 0,
test_video_.frames);
}
libvpx_test::VideoSource *const video =
new libvpx_test::Y4mVideoSource(test_video_.name, 0, test_video_.frames);
ASSERT_TRUE(video != NULL);
ASSERT_NO_FATAL_FAILURE(RunLoop(video));
delete(video);
delete video;
}
VP9_INSTANTIATE_TEST_CASE(
VpxEncoderParmsGetToDecoder,
::testing::ValuesIn(kVP9EncodeParameterSet),
::testing::ValuesIn(kVP9EncodePerfTestVectors));
VP10_INSTANTIATE_TEST_CASE(
VpxEncoderParmsGetToDecoder,
::testing::ValuesIn(kVP9EncodeParameterSet),
::testing::ValuesIn(kVP9EncodePerfTestVectors));
VP9_INSTANTIATE_TEST_CASE(VpxEncoderParmsGetToDecoder,
::testing::ValuesIn(kVP9EncodeParameterSet),
::testing::ValuesIn(kVP9EncodePerfTestVectors));
} // namespace

View File

@ -1,7 +1,10 @@
URL: https://chromium.googlesource.com/webm/libwebm
Version: 2dec09426ab62b794464cc9971bd135b4d313e65
Version: 476366249e1fda7710a389cd41c57db42305e0d4
License: BSD
License File: LICENSE.txt
Description:
libwebm is used to handle WebM container I/O.
Local Changes:
* <none>

View File

@ -528,7 +528,7 @@ class Tracks {
public:
// Audio and video type defined by the Matroska specs.
enum { kVideo = 0x1, kAudio = 0x2 };
// Opus, Vorbis, VP8, and VP9 codec ids defined by the Matroska specs.
static const char kOpusCodecId[];
static const char kVorbisCodecId[];
static const char kVp8CodecId[];

File diff suppressed because it is too large Load Diff

View File

@ -9,12 +9,13 @@
#ifndef MKVPARSER_HPP
#define MKVPARSER_HPP
#include <cstdlib>
#include <cstdio>
#include <cstddef>
#include <cstdio>
#include <cstdlib>
namespace mkvparser {
const int E_PARSE_FAILED = -1;
const int E_FILE_FORMAT_INVALID = -2;
const int E_BUFFER_NOT_FULL = -3;
@ -27,8 +28,11 @@ class IMkvReader {
virtual ~IMkvReader();
};
template<typename Type> Type* SafeArrayAlloc(unsigned long long num_elements,
unsigned long long element_size);
long long GetUIntLength(IMkvReader*, long long, long&);
long long ReadUInt(IMkvReader*, long long, long&);
long long ReadID(IMkvReader* pReader, long long pos, long& len);
long long UnserializeUInt(IMkvReader*, long long pos, long long size);
long UnserializeFloat(IMkvReader*, long long pos, long long size, double&);
@ -833,7 +837,7 @@ class Cues {
private:
bool Init() const;
void PreloadCuePoint(long&, long long) const;
bool PreloadCuePoint(long&, long long) const;
mutable CuePoint** m_cue_points;
mutable long m_count;
@ -999,8 +1003,8 @@ class Segment {
long DoLoadClusterUnknownSize(long long&, long&);
long DoParseNext(const Cluster*&, long long&, long&);
void AppendCluster(Cluster*);
void PreloadCluster(Cluster*, ptrdiff_t);
bool AppendCluster(Cluster*);
bool PreloadCluster(Cluster*, ptrdiff_t);
// void ParseSeekHead(long long pos, long long size);
// void ParseSeekEntry(long long pos, long long size);

View File

@ -41,6 +41,7 @@ enum MkvId {
kMkvTimecodeScale = 0x2AD7B1,
kMkvDuration = 0x4489,
kMkvDateUTC = 0x4461,
kMkvTitle = 0x7BA9,
kMkvMuxingApp = 0x4D80,
kMkvWritingApp = 0x5741,
// Cluster
@ -107,9 +108,16 @@ enum MkvId {
kMkvContentEncodingOrder = 0x5031,
kMkvContentEncodingScope = 0x5032,
kMkvContentEncodingType = 0x5033,
kMkvContentCompression = 0x5034,
kMkvContentCompAlgo = 0x4254,
kMkvContentCompSettings = 0x4255,
kMkvContentEncryption = 0x5035,
kMkvContentEncAlgo = 0x47E1,
kMkvContentEncKeyID = 0x47E2,
kMkvContentSignature = 0x47E3,
kMkvContentSigKeyID = 0x47E4,
kMkvContentSigAlgo = 0x47E5,
kMkvContentSigHashAlgo = 0x47E6,
kMkvContentEncAESSettings = 0x47E7,
kMkvAESSettingsCipherMode = 0x47E8,
kMkvAESSettingsCipherInitData = 0x47E9,

View File

@ -20,3 +20,5 @@ Copy PIC 'GLOBAL' macros from x86_abi_support.asm
Use .text instead of .rodata on macho to avoid broken tables in PIC mode.
Use .text with no alignment for aout
Only use 'hidden' visibility with Chromium
Move '%use smartalign' for nasm out of 'INIT_CPUFLAGS' and before
'ALIGNMODE'.

View File

@ -876,6 +876,10 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
%define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))
%ifdef __NASM_VER__
%use smartalign
%endif
; Takes an arbitrary number of cpuflags from the above list.
; All subsequent functions (up to the next INIT_CPUFLAGS) is built for the specified cpu.
; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co.
@ -912,7 +916,6 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
%endif
%ifdef __NASM_VER__
%use smartalign
ALIGNMODE k7
%elif ARCH_X86_64 || cpuflag(sse2)
CPU amdnop

View File

@ -14,6 +14,7 @@
#include "./vpx_config.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_ports/mem.h"
#include "vpx_scale/yv12config.h"
@ -69,6 +70,9 @@ typedef struct {
PREDICTION_MODE mode;
TX_SIZE tx_size;
int8_t skip;
#if CONFIG_MISC_FIXES
int8_t has_no_coeffs;
#endif
int8_t segment_id;
int8_t seg_id_predicted; // valid only when temporal_update is enabled
@ -178,7 +182,6 @@ typedef struct macroblockd {
int mb_to_bottom_edge;
FRAME_CONTEXT *fc;
int frame_parallel_decoding_mode;
/* pointers to reference frames */
RefBuffer *block_refs[2];
@ -286,7 +289,7 @@ static INLINE TX_SIZE get_uv_tx_size_impl(TX_SIZE y_tx_size, BLOCK_SIZE bsize,
return TX_4X4;
} else {
const BLOCK_SIZE plane_bsize = ss_size_lookup[bsize][xss][yss];
return MIN(y_tx_size, max_txsize_lookup[plane_bsize]);
return VPXMIN(y_tx_size, max_txsize_lookup[plane_bsize]);
}
}

View File

@ -13,6 +13,7 @@
#include "vp10/common/enums.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/vpx_dsp_common.h"
#ifdef __cplusplus
extern "C" {
@ -35,7 +36,7 @@ static const uint8_t num_8x8_blocks_wide_lookup[BLOCK_SIZES] =
static const uint8_t num_8x8_blocks_high_lookup[BLOCK_SIZES] =
{1, 1, 1, 1, 2, 1, 2, 4, 2, 4, 8, 4, 8};
// MIN(3, MIN(b_width_log2(bsize), b_height_log2(bsize)))
// VPXMIN(3, VPXMIN(b_width_log2(bsize), b_height_log2(bsize)))
static const uint8_t size_group_lookup[BLOCK_SIZES] =
{0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3};

View File

@ -484,12 +484,12 @@ void vp10_setup_past_independence(VP10_COMMON *cm) {
vp10_init_mv_probs(cm);
cm->fc->initialized = 1;
if (cm->frame_type == KEY_FRAME ||
cm->error_resilient_mode || cm->reset_frame_context == 3) {
if (cm->frame_type == KEY_FRAME || cm->error_resilient_mode ||
cm->reset_frame_context == RESET_FRAME_CONTEXT_ALL) {
// Reset all frame contexts.
for (i = 0; i < FRAME_CONTEXTS; ++i)
cm->frame_contexts[i] = *cm->fc;
} else if (cm->reset_frame_context == 2) {
} else if (cm->reset_frame_context == RESET_FRAME_CONTEXT_CURRENT) {
// Reset only the frame context specified in the frame header.
cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
}
@ -499,7 +499,5 @@ void vp10_setup_past_independence(VP10_COMMON *cm) {
memset(cm->prev_mip, 0,
cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->prev_mip));
vp10_zero(cm->ref_frame_sign_bias);
cm->frame_context_idx = 0;
}

View File

@ -161,17 +161,19 @@ static void inc_mv_component(int v, nmv_component_counts *comp_counts,
}
}
void vp10_inc_mv(const MV *mv, nmv_context_counts *counts) {
void vp10_inc_mv(const MV *mv, nmv_context_counts *counts, const int usehp) {
if (counts != NULL) {
const MV_JOINT_TYPE j = vp10_get_mv_joint(mv);
++counts->joints[j];
if (mv_joint_vertical(j)) {
inc_mv_component(mv->row, &counts->comps[0], 1, 1);
inc_mv_component(mv->row, &counts->comps[0], 1,
!CONFIG_MISC_FIXES || usehp);
}
if (mv_joint_horizontal(j)) {
inc_mv_component(mv->col, &counts->comps[1], 1, 1);
inc_mv_component(mv->col, &counts->comps[1], 1,
!CONFIG_MISC_FIXES || usehp);
}
}
}

View File

@ -124,7 +124,7 @@ typedef struct {
nmv_component_counts comps[2];
} nmv_context_counts;
void vp10_inc_mv(const MV *mv, nmv_context_counts *mvctx);
void vp10_inc_mv(const MV *mv, nmv_context_counts *mvctx, const int usehp);
#ifdef __cplusplus
} // extern "C"

View File

@ -13,6 +13,7 @@
#include "vp10/common/loopfilter.h"
#include "vp10/common/onyxc_int.h"
#include "vp10/common/reconinter.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/mem.h"
@ -753,8 +754,13 @@ static void build_masks(const loop_filter_info_n *const lfi_n,
// If the block has no coefficients and is not intra we skip applying
// the loop filter on block edges.
#if CONFIG_MISC_FIXES
if ((mbmi->skip || mbmi->has_no_coeffs) && is_inter_block(mbmi))
return;
#else
if (mbmi->skip && is_inter_block(mbmi))
return;
#endif
// Here we are adding a mask for the transform size. The transform
// size mask is set to be correct for a 64x64 prediction block size. We
@ -811,8 +817,13 @@ static void build_y_mask(const loop_filter_info_n *const lfi_n,
*above_y |= above_prediction_mask[block_size] << shift_y;
*left_y |= left_prediction_mask[block_size] << shift_y;
#if CONFIG_MISC_FIXES
if ((mbmi->skip || mbmi->has_no_coeffs) && is_inter_block(mbmi))
return;
#else
if (mbmi->skip && is_inter_block(mbmi))
return;
#endif
*above_y |= (size_mask[block_size] &
above_64x64_txform_mask[tx_size_y]) << shift_y;
@ -1588,7 +1599,7 @@ void vp10_loop_filter_frame(YV12_BUFFER_CONFIG *frame,
if (partial_frame && cm->mi_rows > 8) {
start_mi_row = cm->mi_rows >> 1;
start_mi_row &= 0xfffffff8;
mi_rows_to_filter = MAX(cm->mi_rows / 8, 8);
mi_rows_to_filter = VPXMAX(cm->mi_rows / 8, 8);
}
end_mi_row = start_mi_row + mi_rows_to_filter;
vp10_loop_filter_frame_init(cm, frame_filter_level);

View File

@ -125,8 +125,10 @@ static void find_mv_refs_idx(const VP10_COMMON *cm, const MACROBLOCKD *xd,
}
if (prev_frame_mvs->ref_frame[1] > INTRA_FRAME &&
prev_frame_mvs->ref_frame[1] != ref_frame &&
prev_frame_mvs->mv[1].as_int != prev_frame_mvs->mv[0].as_int) {
#if !CONFIG_MISC_FIXES
prev_frame_mvs->mv[1].as_int != prev_frame_mvs->mv[0].as_int &&
#endif
prev_frame_mvs->ref_frame[1] != ref_frame) {
int_mv mv = prev_frame_mvs->mv[1];
if (ref_sign_bias[prev_frame_mvs->ref_frame[1]] !=
ref_sign_bias[ref_frame]) {

View File

@ -180,8 +180,9 @@ static INLINE int_mv scale_mv(const MB_MODE_INFO *mbmi, int ref,
ADD_MV_REF_LIST(scale_mv((mbmi), 0, ref_frame, ref_sign_bias), \
refmv_count, mv_ref_list, Done); \
if (has_second_ref(mbmi) && \
(mbmi)->ref_frame[1] != ref_frame && \
(mbmi)->mv[1].as_int != (mbmi)->mv[0].as_int) \
(CONFIG_MISC_FIXES || \
(mbmi)->mv[1].as_int != (mbmi)->mv[0].as_int) && \
(mbmi)->ref_frame[1] != ref_frame) \
ADD_MV_REF_LIST(scale_mv((mbmi), 1, ref_frame, ref_sign_bias), \
refmv_count, mv_ref_list, Done); \
} \

View File

@ -57,6 +57,29 @@ typedef enum {
REFERENCE_MODES = 3,
} REFERENCE_MODE;
typedef enum {
RESET_FRAME_CONTEXT_NONE = 0,
RESET_FRAME_CONTEXT_CURRENT = 1,
RESET_FRAME_CONTEXT_ALL = 2,
} RESET_FRAME_CONTEXT_MODE;
typedef enum {
/**
* Don't update frame context
*/
REFRESH_FRAME_CONTEXT_OFF,
/**
* Update frame context to values resulting from forward probability
* updates signaled in the frame header
*/
REFRESH_FRAME_CONTEXT_FORWARD,
/**
* Update frame context to values resulting from backward probability
* updates based on entropy/counts in the decoded frame
*/
REFRESH_FRAME_CONTEXT_BACKWARD,
} REFRESH_FRAME_CONTEXT_MODE;
typedef struct {
int_mv mv[2];
MV_REFERENCE_FRAME ref_frame[2];
@ -106,6 +129,7 @@ typedef struct BufferPool {
typedef struct VP10Common {
struct vpx_internal_error_info error;
vpx_color_space_t color_space;
int color_range;
int width;
int height;
int display_width;
@ -161,10 +185,8 @@ typedef struct VP10Common {
int allow_high_precision_mv;
// Flag signaling that the frame context should be reset to default values.
// 0 or 1 implies don't reset, 2 reset just the context specified in the
// frame header, 3 reset all contexts.
int reset_frame_context;
// Flag signaling which frame contexts should be reset to default values.
RESET_FRAME_CONTEXT_MODE reset_frame_context;
// MBs, mb_rows/cols is in 16-pixel units; mi_rows/cols is in
// MODE_INFO (8-pixel) units.
@ -222,15 +244,15 @@ typedef struct VP10Common {
loop_filter_info_n lf_info;
int refresh_frame_context; /* Two state 0 = NO, 1 = YES */
// Flag signaling how frame contexts should be updated at the end of
// a frame decode
REFRESH_FRAME_CONTEXT_MODE refresh_frame_context;
int ref_frame_sign_bias[MAX_REF_FRAMES]; /* Two state 0, 1 */
struct loopfilter lf;
struct segmentation seg;
// TODO(hkuang): Remove this as it is the same as frame_parallel_decode
// in pbi.
int frame_parallel_decode; // frame-based threading.
// Context probabilities for reference frame prediction
@ -255,7 +277,6 @@ typedef struct VP10Common {
#endif
int error_resilient_mode;
int frame_parallel_decoding_mode;
int log2_tile_cols, log2_tile_rows;
int byte_alignment;
@ -370,7 +391,6 @@ static INLINE void vp10_init_macroblockd(VP10_COMMON *cm, MACROBLOCKD *xd,
memcpy(xd->plane[i].seg_dequant, cm->uv_dequant, sizeof(cm->uv_dequant));
}
xd->fc = cm->fc;
xd->frame_parallel_decoding_mode = cm->frame_parallel_decoding_mode;
}
xd->above_seg_context = cm->above_seg_context;

View File

@ -16,6 +16,7 @@
#include "./vpx_scale_rtcd.h"
#include "./vp10_rtcd.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_ports/mem.h"
#include "vpx_ports/system_state.h"
#include "vpx_scale/vpx_scale.h"
@ -625,7 +626,7 @@ static void swap_mi_and_prev_mi(VP10_COMMON *cm) {
int vp10_post_proc_frame(struct VP10Common *cm,
YV12_BUFFER_CONFIG *dest, vp10_ppflags_t *ppflags) {
const int q = MIN(105, cm->lf.filter_level * 2);
const int q = VPXMIN(105, cm->lf.filter_level * 2);
const int flags = ppflags->post_proc_flag;
YV12_BUFFER_CONFIG *const ppbuf = &cm->post_proc_buffer;
struct postproc_state *const ppstate = &cm->postproc_state;

View File

@ -13,6 +13,7 @@
#include "vp10/common/blockd.h"
#include "vp10/common/onyxc_int.h"
#include "vpx_dsp/vpx_dsp_common.h"
#ifdef __cplusplus
extern "C" {
@ -24,14 +25,14 @@ static INLINE int get_segment_id(const VP10_COMMON *cm,
const int mi_offset = mi_row * cm->mi_cols + mi_col;
const int bw = num_8x8_blocks_wide_lookup[bsize];
const int bh = num_8x8_blocks_high_lookup[bsize];
const int xmis = MIN(cm->mi_cols - mi_col, bw);
const int ymis = MIN(cm->mi_rows - mi_row, bh);
const int xmis = VPXMIN(cm->mi_cols - mi_col, bw);
const int ymis = VPXMIN(cm->mi_rows - mi_row, bh);
int x, y, segment_id = MAX_SEGMENTS;
for (y = 0; y < ymis; ++y)
for (x = 0; x < xmis; ++x)
segment_id = MIN(segment_id,
segment_ids[mi_offset + y * cm->mi_cols + x]);
segment_id =
VPXMIN(segment_id, segment_ids[mi_offset + y * cm->mi_cols + x]);
assert(segment_id >= 0 && segment_id < MAX_SEGMENTS);
return segment_id;

View File

@ -135,20 +135,26 @@ static void build_inter_predictors_for_planes(MACROBLOCKD *xd, BLOCK_SIZE bsize,
const int mi_x = mi_col * MI_SIZE;
const int mi_y = mi_row * MI_SIZE;
for (plane = plane_from; plane <= plane_to; ++plane) {
const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize,
&xd->plane[plane]);
const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
const int bw = 4 * num_4x4_w;
const int bh = 4 * num_4x4_h;
const struct macroblockd_plane *pd = &xd->plane[plane];
const int bw = 4 * num_4x4_blocks_wide_lookup[bsize] >> pd->subsampling_x;
const int bh = 4 * num_4x4_blocks_high_lookup[bsize] >> pd->subsampling_y;
if (xd->mi[0]->mbmi.sb_type < BLOCK_8X8) {
int i = 0, x, y;
const PARTITION_TYPE bp = bsize - xd->mi[0]->mbmi.sb_type;
const int have_vsplit = bp != PARTITION_HORZ;
const int have_hsplit = bp != PARTITION_VERT;
const int num_4x4_w = 2 >> ((!have_vsplit) | pd->subsampling_x);
const int num_4x4_h = 2 >> ((!have_hsplit) | pd->subsampling_y);
const int pw = 8 >> (have_vsplit | pd->subsampling_x);
const int ph = 8 >> (have_hsplit | pd->subsampling_y);
int x, y;
assert(bp != PARTITION_NONE && bp < PARTITION_TYPES);
assert(bsize == BLOCK_8X8);
assert(pw * num_4x4_w == bw && ph * num_4x4_h == bh);
for (y = 0; y < num_4x4_h; ++y)
for (x = 0; x < num_4x4_w; ++x)
build_inter_predictors(xd, plane, i++, bw, bh,
4 * x, 4 * y, 4, 4, mi_x, mi_y);
build_inter_predictors(xd, plane, y * 2 + x, bw, bh,
4 * x, 4 * y, pw, ph, mi_x, mi_y);
} else {
build_inter_predictors(xd, plane, 0, bw, bh,
0, 0, bw, bh, mi_x, mi_y);

View File

@ -34,14 +34,14 @@ static INLINE void inter_predictor(const uint8_t *src, int src_stride,
}
#if CONFIG_VP9_HIGHBITDEPTH
static void high_inter_predictor(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
const int subpel_x,
const int subpel_y,
const struct scale_factors *sf,
int w, int h, int ref,
const InterpKernel *kernel,
int xs, int ys, int bd) {
static INLINE void high_inter_predictor(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
const int subpel_x,
const int subpel_y,
const struct scale_factors *sf,
int w, int h, int ref,
const InterpKernel *kernel,
int xs, int ys, int bd) {
sf->highbd_predict[subpel_x != 0][subpel_y != 0][ref](
src, src_stride, dst, dst_stride,
kernel[subpel_x], xs, kernel[subpel_y], ys, w, h, bd);
@ -77,8 +77,9 @@ static MV mi_mv_pred_q2(const MODE_INFO *mi, int idx, int block0, int block1) {
}
// TODO(jkoleszar): yet another mv clamping function :-(
static MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, const MV *src_mv,
int bw, int bh, int ss_x, int ss_y) {
static INLINE MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd,
const MV *src_mv,
int bw, int bh, int ss_x, int ss_y) {
// If the MV points so far into the UMV border that no visible pixels
// are used for reconstruction, the subpel part of the MV can be
// discarded and the MV limited to 16 pixels with equivalent results.
@ -102,8 +103,8 @@ static MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, const MV *src_mv,
return clamped_mv;
}
static MV average_split_mvs(const struct macroblockd_plane *pd,
const MODE_INFO *mi, int ref, int block) {
static INLINE MV average_split_mvs(const struct macroblockd_plane *pd,
const MODE_INFO *mi, int ref, int block) {
const int ss_idx = ((pd->subsampling_x > 0) << 1) | (pd->subsampling_y > 0);
MV res = {0, 0};
switch (ss_idx) {

View File

@ -695,6 +695,13 @@ DECLARE_ALIGNED(16, static const int16_t, vp10_default_iscan_32x32[1024]) = {
1023,
};
const scan_order vp10_default_scan_orders[TX_SIZES] = {
{default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
{default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
{default_scan_16x16, vp10_default_iscan_16x16, default_scan_16x16_neighbors},
{default_scan_32x32, vp10_default_iscan_32x32, default_scan_32x32_neighbors},
};
#if CONFIG_EXT_TX
const scan_order vp10_intra_scan_orders[TX_SIZES][TX_TYPES] = {
{ // TX_4X4

View File

@ -29,6 +29,7 @@ typedef struct {
const int16_t *neighbors;
} scan_order;
extern const scan_order vp10_default_scan_orders[TX_SIZES];
extern const scan_order vp10_intra_scan_orders[TX_SIZES][TX_TYPES];
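/* Usage sketch (TX_8X8 assumed to be the 8x8 entry of TX_SIZES): a caller
 * picks one scan_order per transform size and reads its three tables.
 *   const scan_order *so = &vp10_default_scan_orders[TX_8X8];
 *   so->scan;       scan position -> coefficient index
 *   so->iscan;      coefficient index -> scan position
 *   so->neighbors;  per-position context neighbors for get_coef_context */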
static INLINE int get_coef_context(const int16_t *neighbors,

View File

@ -9,6 +9,7 @@
*/
#include "./vpx_config.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_mem/vpx_mem.h"
#include "vp10/common/entropymode.h"
#include "vp10/common/thread_common.h"
@ -165,7 +166,7 @@ static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame,
// The decoder may allocate more threads than the number of tiles, based on
// the user's input.
const int tile_cols = 1 << cm->log2_tile_cols;
const int num_workers = MIN(nworkers, tile_cols);
const int num_workers = VPXMIN(nworkers, tile_cols);
int i;
if (!lf_sync->sync_range || sb_rows != lf_sync->rows ||
@ -229,7 +230,7 @@ void vp10_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
if (partial_frame && cm->mi_rows > 8) {
start_mi_row = cm->mi_rows >> 1;
start_mi_row &= 0xfffffff8;
mi_rows_to_filter = MAX(cm->mi_rows / 8, 8);
mi_rows_to_filter = VPXMAX(cm->mi_rows / 8, 8);
}
end_mi_row = start_mi_row + mi_rows_to_filter;
vp10_loop_filter_frame_init(cm, frame_filter_level);
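/* Worked example of the partial-frame path above (numbers assumed): a
 * 720p frame has cm->mi_rows = 90, so
 *   start_mi_row      = 90 >> 1 = 45, aligned down to 40 (an SB row);
 *   mi_rows_to_filter = VPXMAX(90 / 8, 8) = 11;
 *   end_mi_row        = 40 + 11 = 51;
 * i.e. only MI rows 40..50 around the middle of the frame are filtered. */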

View File

@ -14,6 +14,10 @@
#include "vp10/common/loopfilter.h"
#include "vpx_util/vpx_thread.h"
#ifdef __cplusplus
extern "C" {
#endif
struct VP10Common;
struct FRAME_COUNTS;
@ -54,4 +58,8 @@ void vp10_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
void vp10_accumulate_frame_counts(struct VP10Common *cm,
struct FRAME_COUNTS *counts, int is_dec);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP10_COMMON_LOOPFILTER_THREAD_H_

View File

@ -9,8 +9,8 @@
*/
#include "vp10/common/tile_common.h"
#include "vp10/common/onyxc_int.h"
#include "vpx_dsp/vpx_dsp_common.h"
#define MIN_TILE_WIDTH_B64 4
#define MAX_TILE_WIDTH_B64 64
@ -18,7 +18,7 @@
static int get_tile_offset(int idx, int mis, int log2) {
const int sb_cols = mi_cols_aligned_to_sb(mis) >> MI_BLOCK_SIZE_LOG2;
const int offset = ((idx * sb_cols) >> log2) << MI_BLOCK_SIZE_LOG2;
return MIN(offset, mis);
return VPXMIN(offset, mis);
}
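/* Worked example (numbers assumed): a 1080p frame has mis = cm->mi_cols =
 * 240, already superblock-aligned, so sb_cols = 240 >> 3 = 30. With four
 * tile columns (log2 == 2), tile column 1 starts at
 *   offset = ((1 * 30) >> 2) << 3 = 7 << 3 = 56 MI units (448 pixels),
 * and the VPXMIN() clamps offsets that would land past the frame. */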
void vp10_tile_set_row(TileInfo *tile, const VP10_COMMON *cm, int row) {

824
vp10/common/vp10_fwd_txfm.c Normal file
View File

@ -0,0 +1,824 @@
/*
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vp10/common/vp10_fwd_txfm.h"
void vp10_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
// The 2D transform is done with two passes which are actually pretty
// similar. In the first one, we transform the columns and transpose
// the results. In the second one, we transform the rows. To achieve that,
// as the first pass results are transposed, we transpose the columns (that
// is, the transposed rows) and transpose the results (so that they go back
// to normal/row positions).
int pass;
// We need an intermediate buffer between passes.
tran_low_t intermediate[4 * 4];
const int16_t *in_pass0 = input;
const tran_low_t *in = NULL;
tran_low_t *out = intermediate;
// Do the two transform/transpose passes
for (pass = 0; pass < 2; ++pass) {
tran_high_t input[4]; // canbe16
tran_high_t step[4]; // canbe16
tran_high_t temp1, temp2; // needs32
int i;
for (i = 0; i < 4; ++i) {
// Load inputs.
if (0 == pass) {
input[0] = in_pass0[0 * stride] * 16;
input[1] = in_pass0[1 * stride] * 16;
input[2] = in_pass0[2 * stride] * 16;
input[3] = in_pass0[3 * stride] * 16;
if (i == 0 && input[0]) {
input[0] += 1;
}
} else {
input[0] = in[0 * 4];
input[1] = in[1 * 4];
input[2] = in[2 * 4];
input[3] = in[3 * 4];
}
// Transform.
step[0] = input[0] + input[3];
step[1] = input[1] + input[2];
step[2] = input[1] - input[2];
step[3] = input[0] - input[3];
temp1 = (step[0] + step[1]) * cospi_16_64;
temp2 = (step[0] - step[1]) * cospi_16_64;
out[0] = (tran_low_t)fdct_round_shift(temp1);
out[2] = (tran_low_t)fdct_round_shift(temp2);
temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
out[1] = (tran_low_t)fdct_round_shift(temp1);
out[3] = (tran_low_t)fdct_round_shift(temp2);
// Do next column (which is a transposed row in second/horizontal pass)
in_pass0++;
in++;
out += 4;
}
// Setup in/out for next pass.
in = intermediate;
out = output;
}
{
int i, j;
for (i = 0; i < 4; ++i) {
for (j = 0; j < 4; ++j)
output[j + i * 4] = (output[j + i * 4] + 1) >> 2;
}
}
}
void vp10_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) {
int r, c;
tran_low_t sum = 0;
for (r = 0; r < 4; ++r)
for (c = 0; c < 4; ++c)
sum += input[r * stride + c];
output[0] = sum << 1;
output[1] = 0;
}
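/* The _1 variants compute only the DC coefficient; their scaling matches
 * the DC gain of the full transforms (sum << 1 here, then sum, sum >> 1
 * and sum >> 3 for the 8x8, 16x16 and 32x32 versions below). A minimal
 * consistency sketch, assuming a non-high-bitdepth build where tran_low_t
 * is int16_t: */
#include <assert.h>
#include <stdint.h>
void vp10_fdct4x4_c(const int16_t *input, int16_t *output, int stride);
void vp10_fdct4x4_1_c(const int16_t *input, int16_t *output, int stride);
int main(void) {
  int16_t in[16], full[16], dc_only[16];
  int i;
  for (i = 0; i < 16; ++i) in[i] = 7;      /* constant 4x4 block */
  vp10_fdct4x4_c(in, full, 4);
  vp10_fdct4x4_1_c(in, dc_only, 4);
  assert(dc_only[0] == ((16 * 7) << 1));   /* sum << 1 == 224 */
  assert(full[0] == dc_only[0]);           /* DC terms agree on a flat block */
  return 0;
}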
void vp10_fdct8x8_c(const int16_t *input,
tran_low_t *final_output, int stride) {
int i, j;
tran_low_t intermediate[64];
int pass;
tran_low_t *output = intermediate;
const tran_low_t *in = NULL;
// Transform columns
for (pass = 0; pass < 2; ++pass) {
tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
tran_high_t t0, t1, t2, t3; // needs32
tran_high_t x0, x1, x2, x3; // canbe16
int i;
for (i = 0; i < 8; i++) {
// stage 1
if (pass == 0) {
s0 = (input[0 * stride] + input[7 * stride]) * 4;
s1 = (input[1 * stride] + input[6 * stride]) * 4;
s2 = (input[2 * stride] + input[5 * stride]) * 4;
s3 = (input[3 * stride] + input[4 * stride]) * 4;
s4 = (input[3 * stride] - input[4 * stride]) * 4;
s5 = (input[2 * stride] - input[5 * stride]) * 4;
s6 = (input[1 * stride] - input[6 * stride]) * 4;
s7 = (input[0 * stride] - input[7 * stride]) * 4;
++input;
} else {
s0 = in[0 * 8] + in[7 * 8];
s1 = in[1 * 8] + in[6 * 8];
s2 = in[2 * 8] + in[5 * 8];
s3 = in[3 * 8] + in[4 * 8];
s4 = in[3 * 8] - in[4 * 8];
s5 = in[2 * 8] - in[5 * 8];
s6 = in[1 * 8] - in[6 * 8];
s7 = in[0 * 8] - in[7 * 8];
++in;
}
// fdct4(step, step);
x0 = s0 + s3;
x1 = s1 + s2;
x2 = s1 - s2;
x3 = s0 - s3;
t0 = (x0 + x1) * cospi_16_64;
t1 = (x0 - x1) * cospi_16_64;
t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
output[0] = (tran_low_t)fdct_round_shift(t0);
output[2] = (tran_low_t)fdct_round_shift(t2);
output[4] = (tran_low_t)fdct_round_shift(t1);
output[6] = (tran_low_t)fdct_round_shift(t3);
// Stage 2
t0 = (s6 - s5) * cospi_16_64;
t1 = (s6 + s5) * cospi_16_64;
t2 = fdct_round_shift(t0);
t3 = fdct_round_shift(t1);
// Stage 3
x0 = s4 + t2;
x1 = s4 - t2;
x2 = s7 - t3;
x3 = s7 + t3;
// Stage 4
t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
output[1] = (tran_low_t)fdct_round_shift(t0);
output[3] = (tran_low_t)fdct_round_shift(t2);
output[5] = (tran_low_t)fdct_round_shift(t1);
output[7] = (tran_low_t)fdct_round_shift(t3);
output += 8;
}
in = intermediate;
output = final_output;
}
// Rows
for (i = 0; i < 8; ++i) {
for (j = 0; j < 8; ++j)
final_output[j + i * 8] /= 2;
}
}
void vp10_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) {
int r, c;
tran_low_t sum = 0;
for (r = 0; r < 8; ++r)
for (c = 0; c < 8; ++c)
sum += input[r * stride + c];
output[0] = sum;
output[1] = 0;
}
void vp10_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
// The 2D transform is done with two passes which are actually pretty
// similar. In the first one, we transform the columns and transpose
// the results. In the second one, we transform the rows. To achieve that,
// as the first pass results are transposed, we transpose the columns (that
// is, the transposed rows) and transpose the results (so that they go back
// to normal/row positions).
int pass;
// We need an intermediate buffer between passes.
tran_low_t intermediate[256];
const int16_t *in_pass0 = input;
const tran_low_t *in = NULL;
tran_low_t *out = intermediate;
// Do the two transform/transpose passes
for (pass = 0; pass < 2; ++pass) {
tran_high_t step1[8]; // canbe16
tran_high_t step2[8]; // canbe16
tran_high_t step3[8]; // canbe16
tran_high_t input[8]; // canbe16
tran_high_t temp1, temp2; // needs32
int i;
for (i = 0; i < 16; i++) {
if (0 == pass) {
// Calculate input for the first 8 results.
input[0] = (in_pass0[0 * stride] + in_pass0[15 * stride]) * 4;
input[1] = (in_pass0[1 * stride] + in_pass0[14 * stride]) * 4;
input[2] = (in_pass0[2 * stride] + in_pass0[13 * stride]) * 4;
input[3] = (in_pass0[3 * stride] + in_pass0[12 * stride]) * 4;
input[4] = (in_pass0[4 * stride] + in_pass0[11 * stride]) * 4;
input[5] = (in_pass0[5 * stride] + in_pass0[10 * stride]) * 4;
input[6] = (in_pass0[6 * stride] + in_pass0[ 9 * stride]) * 4;
input[7] = (in_pass0[7 * stride] + in_pass0[ 8 * stride]) * 4;
// Calculate input for the next 8 results.
step1[0] = (in_pass0[7 * stride] - in_pass0[ 8 * stride]) * 4;
step1[1] = (in_pass0[6 * stride] - in_pass0[ 9 * stride]) * 4;
step1[2] = (in_pass0[5 * stride] - in_pass0[10 * stride]) * 4;
step1[3] = (in_pass0[4 * stride] - in_pass0[11 * stride]) * 4;
step1[4] = (in_pass0[3 * stride] - in_pass0[12 * stride]) * 4;
step1[5] = (in_pass0[2 * stride] - in_pass0[13 * stride]) * 4;
step1[6] = (in_pass0[1 * stride] - in_pass0[14 * stride]) * 4;
step1[7] = (in_pass0[0 * stride] - in_pass0[15 * stride]) * 4;
} else {
// Calculate input for the first 8 results.
input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2);
input[1] = ((in[1 * 16] + 1) >> 2) + ((in[14 * 16] + 1) >> 2);
input[2] = ((in[2 * 16] + 1) >> 2) + ((in[13 * 16] + 1) >> 2);
input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2);
input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2);
input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2);
input[6] = ((in[6 * 16] + 1) >> 2) + ((in[ 9 * 16] + 1) >> 2);
input[7] = ((in[7 * 16] + 1) >> 2) + ((in[ 8 * 16] + 1) >> 2);
// Calculate input for the next 8 results.
step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[ 8 * 16] + 1) >> 2);
step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[ 9 * 16] + 1) >> 2);
step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2);
step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2);
step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2);
step1[5] = ((in[2 * 16] + 1) >> 2) - ((in[13 * 16] + 1) >> 2);
step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2);
step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2);
}
// Work on the first eight values; fdct8(input, even_results);
{
tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
tran_high_t t0, t1, t2, t3; // needs32
tran_high_t x0, x1, x2, x3; // canbe16
// stage 1
s0 = input[0] + input[7];
s1 = input[1] + input[6];
s2 = input[2] + input[5];
s3 = input[3] + input[4];
s4 = input[3] - input[4];
s5 = input[2] - input[5];
s6 = input[1] - input[6];
s7 = input[0] - input[7];
// fdct4(step, step);
x0 = s0 + s3;
x1 = s1 + s2;
x2 = s1 - s2;
x3 = s0 - s3;
t0 = (x0 + x1) * cospi_16_64;
t1 = (x0 - x1) * cospi_16_64;
t2 = x3 * cospi_8_64 + x2 * cospi_24_64;
t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
out[0] = (tran_low_t)fdct_round_shift(t0);
out[4] = (tran_low_t)fdct_round_shift(t2);
out[8] = (tran_low_t)fdct_round_shift(t1);
out[12] = (tran_low_t)fdct_round_shift(t3);
// Stage 2
t0 = (s6 - s5) * cospi_16_64;
t1 = (s6 + s5) * cospi_16_64;
t2 = fdct_round_shift(t0);
t3 = fdct_round_shift(t1);
// Stage 3
x0 = s4 + t2;
x1 = s4 - t2;
x2 = s7 - t3;
x3 = s7 + t3;
// Stage 4
t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
out[2] = (tran_low_t)fdct_round_shift(t0);
out[6] = (tran_low_t)fdct_round_shift(t2);
out[10] = (tran_low_t)fdct_round_shift(t1);
out[14] = (tran_low_t)fdct_round_shift(t3);
}
// Work on the next eight values; step1 -> odd_results
{
// step 2
temp1 = (step1[5] - step1[2]) * cospi_16_64;
temp2 = (step1[4] - step1[3]) * cospi_16_64;
step2[2] = fdct_round_shift(temp1);
step2[3] = fdct_round_shift(temp2);
temp1 = (step1[4] + step1[3]) * cospi_16_64;
temp2 = (step1[5] + step1[2]) * cospi_16_64;
step2[4] = fdct_round_shift(temp1);
step2[5] = fdct_round_shift(temp2);
// step 3
step3[0] = step1[0] + step2[3];
step3[1] = step1[1] + step2[2];
step3[2] = step1[1] - step2[2];
step3[3] = step1[0] - step2[3];
step3[4] = step1[7] - step2[4];
step3[5] = step1[6] - step2[5];
step3[6] = step1[6] + step2[5];
step3[7] = step1[7] + step2[4];
// step 4
temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64;
temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64;
step2[1] = fdct_round_shift(temp1);
step2[2] = fdct_round_shift(temp2);
temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64;
step2[5] = fdct_round_shift(temp1);
step2[6] = fdct_round_shift(temp2);
// step 5
step1[0] = step3[0] + step2[1];
step1[1] = step3[0] - step2[1];
step1[2] = step3[3] + step2[2];
step1[3] = step3[3] - step2[2];
step1[4] = step3[4] - step2[5];
step1[5] = step3[4] + step2[5];
step1[6] = step3[7] - step2[6];
step1[7] = step3[7] + step2[6];
// step 6
temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64;
temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
out[1] = (tran_low_t)fdct_round_shift(temp1);
out[9] = (tran_low_t)fdct_round_shift(temp2);
temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64;
out[5] = (tran_low_t)fdct_round_shift(temp1);
out[13] = (tran_low_t)fdct_round_shift(temp2);
temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64;
temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
out[3] = (tran_low_t)fdct_round_shift(temp1);
out[11] = (tran_low_t)fdct_round_shift(temp2);
temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;
out[7] = (tran_low_t)fdct_round_shift(temp1);
out[15] = (tran_low_t)fdct_round_shift(temp2);
}
// Do next column (which is a transposed row in second/horizontal pass)
in++;
in_pass0++;
out += 16;
}
// Setup in/out for next pass.
in = intermediate;
out = output;
}
}
void vp10_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride) {
int r, c;
tran_low_t sum = 0;
for (r = 0; r < 16; ++r)
for (c = 0; c < 16; ++c)
sum += input[r * stride + c];
output[0] = sum >> 1;
output[1] = 0;
}
static INLINE tran_high_t dct_32_round(tran_high_t input) {
tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
// TODO(debargha, peter.derivaz): Find new bounds for this assert,
// and make the bounds consts.
// assert(-131072 <= rv && rv <= 131071);
return rv;
}
static INLINE tran_high_t half_round_shift(tran_high_t input) {
tran_high_t rv = (input + 1 + (input < 0)) >> 2;
return rv;
}
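/* dct_32_round() divides by 2^DCT_CONST_BITS with round-to-nearest;
 * half_round_shift() divides by 4 with symmetric rounding, the extra
 * (input < 0) term making negative inputs round like positive ones:
 *   input =  6  ->  ( 6 + 1 + 0) >> 2 =  1    ( 1.5  ->  1)
 *   input = -6  ->  (-6 + 1 + 1) >> 2 = -1    (-1.5  -> -1)
 *   input =  7  ->  ( 7 + 1 + 0) >> 2 =  2    ( 1.75 ->  2)
 *   input = -7  ->  (-7 + 1 + 1) >> 2 = -2    (-1.75 -> -2)
 */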
void vp10_fdct32(const tran_high_t *input, tran_high_t *output, int round) {
tran_high_t step[32];
// Stage 1
step[0] = input[0] + input[(32 - 1)];
step[1] = input[1] + input[(32 - 2)];
step[2] = input[2] + input[(32 - 3)];
step[3] = input[3] + input[(32 - 4)];
step[4] = input[4] + input[(32 - 5)];
step[5] = input[5] + input[(32 - 6)];
step[6] = input[6] + input[(32 - 7)];
step[7] = input[7] + input[(32 - 8)];
step[8] = input[8] + input[(32 - 9)];
step[9] = input[9] + input[(32 - 10)];
step[10] = input[10] + input[(32 - 11)];
step[11] = input[11] + input[(32 - 12)];
step[12] = input[12] + input[(32 - 13)];
step[13] = input[13] + input[(32 - 14)];
step[14] = input[14] + input[(32 - 15)];
step[15] = input[15] + input[(32 - 16)];
step[16] = -input[16] + input[(32 - 17)];
step[17] = -input[17] + input[(32 - 18)];
step[18] = -input[18] + input[(32 - 19)];
step[19] = -input[19] + input[(32 - 20)];
step[20] = -input[20] + input[(32 - 21)];
step[21] = -input[21] + input[(32 - 22)];
step[22] = -input[22] + input[(32 - 23)];
step[23] = -input[23] + input[(32 - 24)];
step[24] = -input[24] + input[(32 - 25)];
step[25] = -input[25] + input[(32 - 26)];
step[26] = -input[26] + input[(32 - 27)];
step[27] = -input[27] + input[(32 - 28)];
step[28] = -input[28] + input[(32 - 29)];
step[29] = -input[29] + input[(32 - 30)];
step[30] = -input[30] + input[(32 - 31)];
step[31] = -input[31] + input[(32 - 32)];
// Stage 2
output[0] = step[0] + step[16 - 1];
output[1] = step[1] + step[16 - 2];
output[2] = step[2] + step[16 - 3];
output[3] = step[3] + step[16 - 4];
output[4] = step[4] + step[16 - 5];
output[5] = step[5] + step[16 - 6];
output[6] = step[6] + step[16 - 7];
output[7] = step[7] + step[16 - 8];
output[8] = -step[8] + step[16 - 9];
output[9] = -step[9] + step[16 - 10];
output[10] = -step[10] + step[16 - 11];
output[11] = -step[11] + step[16 - 12];
output[12] = -step[12] + step[16 - 13];
output[13] = -step[13] + step[16 - 14];
output[14] = -step[14] + step[16 - 15];
output[15] = -step[15] + step[16 - 16];
output[16] = step[16];
output[17] = step[17];
output[18] = step[18];
output[19] = step[19];
output[20] = dct_32_round((-step[20] + step[27]) * cospi_16_64);
output[21] = dct_32_round((-step[21] + step[26]) * cospi_16_64);
output[22] = dct_32_round((-step[22] + step[25]) * cospi_16_64);
output[23] = dct_32_round((-step[23] + step[24]) * cospi_16_64);
output[24] = dct_32_round((step[24] + step[23]) * cospi_16_64);
output[25] = dct_32_round((step[25] + step[22]) * cospi_16_64);
output[26] = dct_32_round((step[26] + step[21]) * cospi_16_64);
output[27] = dct_32_round((step[27] + step[20]) * cospi_16_64);
output[28] = step[28];
output[29] = step[29];
output[30] = step[30];
output[31] = step[31];
// Scale the magnitude down by 4 so that the intermediate values stay within
// the range of 16 bits.
if (round) {
output[0] = half_round_shift(output[0]);
output[1] = half_round_shift(output[1]);
output[2] = half_round_shift(output[2]);
output[3] = half_round_shift(output[3]);
output[4] = half_round_shift(output[4]);
output[5] = half_round_shift(output[5]);
output[6] = half_round_shift(output[6]);
output[7] = half_round_shift(output[7]);
output[8] = half_round_shift(output[8]);
output[9] = half_round_shift(output[9]);
output[10] = half_round_shift(output[10]);
output[11] = half_round_shift(output[11]);
output[12] = half_round_shift(output[12]);
output[13] = half_round_shift(output[13]);
output[14] = half_round_shift(output[14]);
output[15] = half_round_shift(output[15]);
output[16] = half_round_shift(output[16]);
output[17] = half_round_shift(output[17]);
output[18] = half_round_shift(output[18]);
output[19] = half_round_shift(output[19]);
output[20] = half_round_shift(output[20]);
output[21] = half_round_shift(output[21]);
output[22] = half_round_shift(output[22]);
output[23] = half_round_shift(output[23]);
output[24] = half_round_shift(output[24]);
output[25] = half_round_shift(output[25]);
output[26] = half_round_shift(output[26]);
output[27] = half_round_shift(output[27]);
output[28] = half_round_shift(output[28]);
output[29] = half_round_shift(output[29]);
output[30] = half_round_shift(output[30]);
output[31] = half_round_shift(output[31]);
}
// Stage 3
step[0] = output[0] + output[(8 - 1)];
step[1] = output[1] + output[(8 - 2)];
step[2] = output[2] + output[(8 - 3)];
step[3] = output[3] + output[(8 - 4)];
step[4] = -output[4] + output[(8 - 5)];
step[5] = -output[5] + output[(8 - 6)];
step[6] = -output[6] + output[(8 - 7)];
step[7] = -output[7] + output[(8 - 8)];
step[8] = output[8];
step[9] = output[9];
step[10] = dct_32_round((-output[10] + output[13]) * cospi_16_64);
step[11] = dct_32_round((-output[11] + output[12]) * cospi_16_64);
step[12] = dct_32_round((output[12] + output[11]) * cospi_16_64);
step[13] = dct_32_round((output[13] + output[10]) * cospi_16_64);
step[14] = output[14];
step[15] = output[15];
step[16] = output[16] + output[23];
step[17] = output[17] + output[22];
step[18] = output[18] + output[21];
step[19] = output[19] + output[20];
step[20] = -output[20] + output[19];
step[21] = -output[21] + output[18];
step[22] = -output[22] + output[17];
step[23] = -output[23] + output[16];
step[24] = -output[24] + output[31];
step[25] = -output[25] + output[30];
step[26] = -output[26] + output[29];
step[27] = -output[27] + output[28];
step[28] = output[28] + output[27];
step[29] = output[29] + output[26];
step[30] = output[30] + output[25];
step[31] = output[31] + output[24];
// Stage 4
output[0] = step[0] + step[3];
output[1] = step[1] + step[2];
output[2] = -step[2] + step[1];
output[3] = -step[3] + step[0];
output[4] = step[4];
output[5] = dct_32_round((-step[5] + step[6]) * cospi_16_64);
output[6] = dct_32_round((step[6] + step[5]) * cospi_16_64);
output[7] = step[7];
output[8] = step[8] + step[11];
output[9] = step[9] + step[10];
output[10] = -step[10] + step[9];
output[11] = -step[11] + step[8];
output[12] = -step[12] + step[15];
output[13] = -step[13] + step[14];
output[14] = step[14] + step[13];
output[15] = step[15] + step[12];
output[16] = step[16];
output[17] = step[17];
output[18] = dct_32_round(step[18] * -cospi_8_64 + step[29] * cospi_24_64);
output[19] = dct_32_round(step[19] * -cospi_8_64 + step[28] * cospi_24_64);
output[20] = dct_32_round(step[20] * -cospi_24_64 + step[27] * -cospi_8_64);
output[21] = dct_32_round(step[21] * -cospi_24_64 + step[26] * -cospi_8_64);
output[22] = step[22];
output[23] = step[23];
output[24] = step[24];
output[25] = step[25];
output[26] = dct_32_round(step[26] * cospi_24_64 + step[21] * -cospi_8_64);
output[27] = dct_32_round(step[27] * cospi_24_64 + step[20] * -cospi_8_64);
output[28] = dct_32_round(step[28] * cospi_8_64 + step[19] * cospi_24_64);
output[29] = dct_32_round(step[29] * cospi_8_64 + step[18] * cospi_24_64);
output[30] = step[30];
output[31] = step[31];
// Stage 5
step[0] = dct_32_round((output[0] + output[1]) * cospi_16_64);
step[1] = dct_32_round((-output[1] + output[0]) * cospi_16_64);
step[2] = dct_32_round(output[2] * cospi_24_64 + output[3] * cospi_8_64);
step[3] = dct_32_round(output[3] * cospi_24_64 - output[2] * cospi_8_64);
step[4] = output[4] + output[5];
step[5] = -output[5] + output[4];
step[6] = -output[6] + output[7];
step[7] = output[7] + output[6];
step[8] = output[8];
step[9] = dct_32_round(output[9] * -cospi_8_64 + output[14] * cospi_24_64);
step[10] = dct_32_round(output[10] * -cospi_24_64 + output[13] * -cospi_8_64);
step[11] = output[11];
step[12] = output[12];
step[13] = dct_32_round(output[13] * cospi_24_64 + output[10] * -cospi_8_64);
step[14] = dct_32_round(output[14] * cospi_8_64 + output[9] * cospi_24_64);
step[15] = output[15];
step[16] = output[16] + output[19];
step[17] = output[17] + output[18];
step[18] = -output[18] + output[17];
step[19] = -output[19] + output[16];
step[20] = -output[20] + output[23];
step[21] = -output[21] + output[22];
step[22] = output[22] + output[21];
step[23] = output[23] + output[20];
step[24] = output[24] + output[27];
step[25] = output[25] + output[26];
step[26] = -output[26] + output[25];
step[27] = -output[27] + output[24];
step[28] = -output[28] + output[31];
step[29] = -output[29] + output[30];
step[30] = output[30] + output[29];
step[31] = output[31] + output[28];
// Stage 6
output[0] = step[0];
output[1] = step[1];
output[2] = step[2];
output[3] = step[3];
output[4] = dct_32_round(step[4] * cospi_28_64 + step[7] * cospi_4_64);
output[5] = dct_32_round(step[5] * cospi_12_64 + step[6] * cospi_20_64);
output[6] = dct_32_round(step[6] * cospi_12_64 + step[5] * -cospi_20_64);
output[7] = dct_32_round(step[7] * cospi_28_64 + step[4] * -cospi_4_64);
output[8] = step[8] + step[9];
output[9] = -step[9] + step[8];
output[10] = -step[10] + step[11];
output[11] = step[11] + step[10];
output[12] = step[12] + step[13];
output[13] = -step[13] + step[12];
output[14] = -step[14] + step[15];
output[15] = step[15] + step[14];
output[16] = step[16];
output[17] = dct_32_round(step[17] * -cospi_4_64 + step[30] * cospi_28_64);
output[18] = dct_32_round(step[18] * -cospi_28_64 + step[29] * -cospi_4_64);
output[19] = step[19];
output[20] = step[20];
output[21] = dct_32_round(step[21] * -cospi_20_64 + step[26] * cospi_12_64);
output[22] = dct_32_round(step[22] * -cospi_12_64 + step[25] * -cospi_20_64);
output[23] = step[23];
output[24] = step[24];
output[25] = dct_32_round(step[25] * cospi_12_64 + step[22] * -cospi_20_64);
output[26] = dct_32_round(step[26] * cospi_20_64 + step[21] * cospi_12_64);
output[27] = step[27];
output[28] = step[28];
output[29] = dct_32_round(step[29] * cospi_28_64 + step[18] * -cospi_4_64);
output[30] = dct_32_round(step[30] * cospi_4_64 + step[17] * cospi_28_64);
output[31] = step[31];
// Stage 7
step[0] = output[0];
step[1] = output[1];
step[2] = output[2];
step[3] = output[3];
step[4] = output[4];
step[5] = output[5];
step[6] = output[6];
step[7] = output[7];
step[8] = dct_32_round(output[8] * cospi_30_64 + output[15] * cospi_2_64);
step[9] = dct_32_round(output[9] * cospi_14_64 + output[14] * cospi_18_64);
step[10] = dct_32_round(output[10] * cospi_22_64 + output[13] * cospi_10_64);
step[11] = dct_32_round(output[11] * cospi_6_64 + output[12] * cospi_26_64);
step[12] = dct_32_round(output[12] * cospi_6_64 + output[11] * -cospi_26_64);
step[13] = dct_32_round(output[13] * cospi_22_64 + output[10] * -cospi_10_64);
step[14] = dct_32_round(output[14] * cospi_14_64 + output[9] * -cospi_18_64);
step[15] = dct_32_round(output[15] * cospi_30_64 + output[8] * -cospi_2_64);
step[16] = output[16] + output[17];
step[17] = -output[17] + output[16];
step[18] = -output[18] + output[19];
step[19] = output[19] + output[18];
step[20] = output[20] + output[21];
step[21] = -output[21] + output[20];
step[22] = -output[22] + output[23];
step[23] = output[23] + output[22];
step[24] = output[24] + output[25];
step[25] = -output[25] + output[24];
step[26] = -output[26] + output[27];
step[27] = output[27] + output[26];
step[28] = output[28] + output[29];
step[29] = -output[29] + output[28];
step[30] = -output[30] + output[31];
step[31] = output[31] + output[30];
// Final stage --- outputs indices are bit-reversed.
output[0] = step[0];
output[16] = step[1];
output[8] = step[2];
output[24] = step[3];
output[4] = step[4];
output[20] = step[5];
output[12] = step[6];
output[28] = step[7];
output[2] = step[8];
output[18] = step[9];
output[10] = step[10];
output[26] = step[11];
output[6] = step[12];
output[22] = step[13];
output[14] = step[14];
output[30] = step[15];
output[1] = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64);
output[17] = dct_32_round(step[17] * cospi_15_64 + step[30] * cospi_17_64);
output[9] = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64);
output[25] = dct_32_round(step[19] * cospi_7_64 + step[28] * cospi_25_64);
output[5] = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64);
output[21] = dct_32_round(step[21] * cospi_11_64 + step[26] * cospi_21_64);
output[13] = dct_32_round(step[22] * cospi_19_64 + step[25] * cospi_13_64);
output[29] = dct_32_round(step[23] * cospi_3_64 + step[24] * cospi_29_64);
output[3] = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64);
output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64);
output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64);
output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64);
output[7] = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64);
output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64);
output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64);
output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);
}
void vp10_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
int i, j;
tran_high_t output[32 * 32];
// Columns
for (i = 0; i < 32; ++i) {
tran_high_t temp_in[32], temp_out[32];
for (j = 0; j < 32; ++j)
temp_in[j] = input[j * stride + i] * 4;
vp10_fdct32(temp_in, temp_out, 0);
for (j = 0; j < 32; ++j)
output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
}
// Rows
for (i = 0; i < 32; ++i) {
tran_high_t temp_in[32], temp_out[32];
for (j = 0; j < 32; ++j)
temp_in[j] = output[j + i * 32];
vp10_fdct32(temp_in, temp_out, 0);
for (j = 0; j < 32; ++j)
out[j + i * 32] =
(tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
}
}
// Note that although dct_32_round is used in the dct32 computation flow,
// this 2D fdct32x32 for the rate-distortion optimization loop operates
// within 16-bit precision.
void vp10_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) {
int i, j;
tran_high_t output[32 * 32];
// Columns
for (i = 0; i < 32; ++i) {
tran_high_t temp_in[32], temp_out[32];
for (j = 0; j < 32; ++j)
temp_in[j] = input[j * stride + i] * 4;
vp10_fdct32(temp_in, temp_out, 0);
for (j = 0; j < 32; ++j)
// TODO(cd): see quality impact of only doing
// output[j * 32 + i] = (temp_out[j] + 1) >> 2;
// PS: also change code in vp10_dsp/x86/vp10_dct_sse2.c
output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
}
// Rows
for (i = 0; i < 32; ++i) {
tran_high_t temp_in[32], temp_out[32];
for (j = 0; j < 32; ++j)
temp_in[j] = output[j + i * 32];
vp10_fdct32(temp_in, temp_out, 1);
for (j = 0; j < 32; ++j)
out[j + i * 32] = (tran_low_t)temp_out[j];
}
}
void vp10_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride) {
int r, c;
tran_low_t sum = 0;
for (r = 0; r < 32; ++r)
for (c = 0; c < 32; ++c)
sum += input[r * stride + c];
output[0] = sum >> 3;
output[1] = 0;
}
#if CONFIG_VP9_HIGHBITDEPTH
void vp10_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output,
int stride) {
vp10_fdct4x4_c(input, output, stride);
}
void vp10_highbd_fdct8x8_c(const int16_t *input, tran_low_t *final_output,
int stride) {
vp10_fdct8x8_c(input, final_output, stride);
}
void vp10_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *final_output,
int stride) {
vp10_fdct8x8_1_c(input, final_output, stride);
}
void vp10_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output,
int stride) {
vp10_fdct16x16_c(input, output, stride);
}
void vp10_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output,
int stride) {
vp10_fdct16x16_1_c(input, output, stride);
}
void vp10_highbd_fdct32x32_c(const int16_t *input,
tran_low_t *out, int stride) {
vp10_fdct32x32_c(input, out, stride);
}
void vp10_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *out,
int stride) {
vp10_fdct32x32_rd_c(input, out, stride);
}
void vp10_highbd_fdct32x32_1_c(const int16_t *input,
tran_low_t *out, int stride) {
vp10_fdct32x32_1_c(input, out, stride);
}
#endif // CONFIG_VP9_HIGHBITDEPTH

View File

@ -0,0 +1,18 @@
/*
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP10_COMMON_VP10_FWD_TXFM_H_
#define VP10_COMMON_VP10_FWD_TXFM_H_
#include "vpx_dsp/txfm_common.h"
#include "vpx_dsp/fwd_txfm.h"
void vp10_fdct32(const tran_high_t *input, tran_high_t *output, int round);
#endif // VP10_COMMON_VP10_FWD_TXFM_H_

2499
vp10/common/vp10_inv_txfm.c Normal file

File diff suppressed because it is too large

122
vp10/common/vp10_inv_txfm.h Normal file
View File

@ -0,0 +1,122 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP10_COMMON_VP10_INV_TXFM_H_
#define VP10_COMMON_VP10_INV_TXFM_H_
#include <assert.h>
#include "./vpx_config.h"
#include "vpx_dsp/txfm_common.h"
#include "vpx_ports/mem.h"
#ifdef __cplusplus
extern "C" {
#endif
static INLINE tran_low_t check_range(tran_high_t input) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
// For valid VP9 input streams, intermediate stage coefficients should always
// stay within the range of a signed 16 bit integer. Coefficients can go out
// of this range for invalid/corrupt VP9 streams. However, strictly checking
// this range for every intermediate coefficient can be burdensome for a decoder,
// therefore the following assertion is only enabled when configured with
// --enable-coefficient-range-checking.
assert(INT16_MIN <= input);
assert(input <= INT16_MAX);
#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
return (tran_low_t)input;
}
static INLINE tran_low_t dct_const_round_shift(tran_high_t input) {
tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
return check_range(rv);
}
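/* With DCT_CONST_BITS == 14, dct_const_round_shift() reduces to
 * (input + 8192) >> 14. Worked value, using cospi_16_64 = 11585
 * (cos(pi/4) in Q14, from vpx_dsp/txfm_common.h):
 *   100 * cospi_16_64 = 1158500
 *   (1158500 + 8192) >> 14 = 71   (~= 100 * 0.70711)
 */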
#if CONFIG_VP9_HIGHBITDEPTH
static INLINE tran_low_t highbd_check_range(tran_high_t input,
int bd) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
// For valid highbitdepth VP9 streams, intermediate stage coefficients will
// stay within the ranges:
// - 8 bit: signed 16 bit integer
// - 10 bit: signed 18 bit integer
// - 12 bit: signed 20 bit integer
const int32_t int_max = (1 << (7 + bd)) - 1;
const int32_t int_min = -int_max - 1;
assert(int_min <= input);
assert(input <= int_max);
(void) int_min;
#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
(void) bd;
return (tran_low_t)input;
}
static INLINE tran_low_t highbd_dct_const_round_shift(tran_high_t input,
int bd) {
tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
return highbd_check_range(rv, bd);
}
#endif // CONFIG_VP9_HIGHBITDEPTH
#if CONFIG_EMULATE_HARDWARE
// When CONFIG_EMULATE_HARDWARE is 1 the transform performs a
// non-normative method to handle overflows. A stream that causes
// overflows in the inverse transform is considered invalid in VP9,
// and a hardware implementer is free to choose any reasonable
// method to handle overflows. However, to aid in hardware
// verification, an implementer can use a specific implementation of the
// WRAPLOW() macro below that is identical to their intended
// hardware implementation (and also use configure options to trigger
// the C implementation of the transform).
//
// The particular WRAPLOW implementation below performs strict
// overflow wrapping to match common hardware implementations.
// bd of 8 uses tran_low_t with 16 bits: the upper 16 bits must be removed
// bd of 10 uses tran_low_t with 18 bits: the upper 14 bits must be removed
// bd of 12 uses tran_low_t with 20 bits: the upper 12 bits must be removed
// bd of x uses tran_low_t with 8 + x bits: the upper 24 - x bits must be removed
#define WRAPLOW(x, bd) ((((int32_t)(x)) << (24 - bd)) >> (24 - bd))
#else
#define WRAPLOW(x, bd) ((int32_t)(x))
#endif // CONFIG_EMULATE_HARDWARE
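/* For bd == 8 the emulate-hardware WRAPLOW() keeps only the low 16 bits
 * of the intermediate, with sign extension:
 *   WRAPLOW(x, 8) == (((int32_t)(x)) << 16) >> 16
 * so x = 40000, which overflows int16_t, wraps to 40000 - 65536 = -25536,
 * the same value a 16-bit hardware register would hold. */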
void vp10_idct4_c(const tran_low_t *input, tran_low_t *output);
void vp10_idct8_c(const tran_low_t *input, tran_low_t *output);
void vp10_idct16_c(const tran_low_t *input, tran_low_t *output);
void vp10_idct32_c(const tran_low_t *input, tran_low_t *output);
void vp10_iadst4_c(const tran_low_t *input, tran_low_t *output);
void vp10_iadst8_c(const tran_low_t *input, tran_low_t *output);
void vp10_iadst16_c(const tran_low_t *input, tran_low_t *output);
#if CONFIG_VP9_HIGHBITDEPTH
void vp10_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd);
void vp10_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd);
void vp10_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd);
void vp10_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd);
void vp10_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd);
void vp10_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd);
static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans,
int bd) {
trans = WRAPLOW(trans, bd);
return clip_pixel_highbd(WRAPLOW(dest + trans, bd), bd);
}
#endif
static INLINE uint8_t clip_pixel_add(uint8_t dest, tran_high_t trans) {
trans = WRAPLOW(trans, 8);
return clip_pixel(WRAPLOW(dest + trans, 8));
}
#ifdef __cplusplus
} // extern "C"
#endif
#endif  // VP10_COMMON_VP10_INV_TXFM_H_

View File

@ -95,6 +95,57 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vp10_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
specialize qw/vp10_iht16x16_256_add/;
add_proto qw/void vp10_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct4x4 sse2/;
add_proto qw/void vp10_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct4x4_1 sse2/;
add_proto qw/void vp10_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct8x8 sse2/;
add_proto qw/void vp10_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct8x8_1 sse2/;
add_proto qw/void vp10_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct16x16 sse2/;
add_proto qw/void vp10_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct16x16_1 sse2/;
add_proto qw/void vp10_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct32x32 sse2/;
add_proto qw/void vp10_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct32x32_rd sse2/;
add_proto qw/void vp10_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct32x32_1 sse2/;
add_proto qw/void vp10_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_highbd_fdct4x4 sse2/;
add_proto qw/void vp10_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_highbd_fdct8x8 sse2/;
add_proto qw/void vp10_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_highbd_fdct8x8_1/;
add_proto qw/void vp10_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_highbd_fdct16x16 sse2/;
add_proto qw/void vp10_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_highbd_fdct16x16_1/;
add_proto qw/void vp10_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_highbd_fdct32x32 sse2/;
add_proto qw/void vp10_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_highbd_fdct32x32_rd sse2/;
add_proto qw/void vp10_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_highbd_fdct32x32_1/;
} else {
# Force C versions if CONFIG_EMULATE_HARDWARE is 1
if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
@ -106,6 +157,33 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vp10_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
specialize qw/vp10_iht16x16_256_add/;
add_proto qw/void vp10_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct4x4/;
add_proto qw/void vp10_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct4x4_1/;
add_proto qw/void vp10_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct8x8/;
add_proto qw/void vp10_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct8x8_1/;
add_proto qw/void vp10_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct16x16/;
add_proto qw/void vp10_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct16x16_1/;
add_proto qw/void vp10_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct32x32/;
add_proto qw/void vp10_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct32x32_rd/;
add_proto qw/void vp10_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct32x32_1/;
} else {
add_proto qw/void vp10_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/vp10_iht4x4_16_add sse2 neon dspr2 msa/;
@ -115,6 +193,33 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vp10_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
specialize qw/vp10_iht16x16_256_add sse2 dspr2 msa/;
add_proto qw/void vp10_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct4x4 sse2/;
add_proto qw/void vp10_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct4x4_1 sse2/;
add_proto qw/void vp10_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct8x8 sse2/;
add_proto qw/void vp10_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct8x8_1 sse2/;
add_proto qw/void vp10_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct16x16 sse2/;
add_proto qw/void vp10_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct16x16_1 sse2/;
add_proto qw/void vp10_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct32x32 sse2/;
add_proto qw/void vp10_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct32x32_rd sse2/;
add_proto qw/void vp10_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct32x32_1 sse2/;
}
}
@ -289,6 +394,188 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp10_fwht4x4 msa/, "$mmx_x86inc";
}
# Inverse transform
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
# Note: as optimized versions of these functions are added, a check must be
# added to ensure that when CONFIG_EMULATE_HARDWARE is on, only the C
# versions are used.
add_proto qw/void vp10_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct4x4_1_add/;
add_proto qw/void vp10_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct4x4_16_add/;
add_proto qw/void vp10_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct8x8_1_add/;
add_proto qw/void vp10_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct8x8_64_add/;
add_proto qw/void vp10_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct8x8_12_add/;
add_proto qw/void vp10_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct16x16_1_add/;
add_proto qw/void vp10_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct16x16_256_add/;
add_proto qw/void vp10_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct16x16_10_add/;
add_proto qw/void vp10_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct32x32_1024_add/;
add_proto qw/void vp10_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct32x32_34_add/;
add_proto qw/void vp10_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct32x32_1_add/;
add_proto qw/void vp10_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_iwht4x4_1_add/;
add_proto qw/void vp10_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_iwht4x4_16_add/;
add_proto qw/void vp10_highbd_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp10_highbd_idct4x4_1_add/;
add_proto qw/void vp10_highbd_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp10_highbd_idct8x8_1_add/;
add_proto qw/void vp10_highbd_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp10_highbd_idct16x16_1_add/;
add_proto qw/void vp10_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp10_highbd_idct32x32_1024_add/;
add_proto qw/void vp10_highbd_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp10_highbd_idct32x32_34_add/;
add_proto qw/void vp10_highbd_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp10_highbd_idct32x32_1_add/;
add_proto qw/void vp10_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp10_highbd_iwht4x4_1_add/;
add_proto qw/void vp10_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp10_highbd_iwht4x4_16_add/;
# Force C versions if CONFIG_EMULATE_HARDWARE is 1
if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
add_proto qw/void vp10_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp10_highbd_idct4x4_16_add/;
add_proto qw/void vp10_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp10_highbd_idct8x8_64_add/;
add_proto qw/void vp10_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp10_highbd_idct8x8_10_add/;
add_proto qw/void vp10_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp10_highbd_idct16x16_256_add/;
add_proto qw/void vp10_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp10_highbd_idct16x16_10_add/;
} else {
add_proto qw/void vp10_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp10_highbd_idct4x4_16_add sse2/;
add_proto qw/void vp10_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp10_highbd_idct8x8_64_add sse2/;
add_proto qw/void vp10_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp10_highbd_idct8x8_10_add sse2/;
add_proto qw/void vp10_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp10_highbd_idct16x16_256_add sse2/;
add_proto qw/void vp10_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp10_highbd_idct16x16_10_add sse2/;
} # CONFIG_EMULATE_HARDWARE
} else {
# Force C versions if CONFIG_EMULATE_HARDWARE is 1
if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
add_proto qw/void vp10_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct4x4_1_add/;
add_proto qw/void vp10_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct4x4_16_add/;
add_proto qw/void vp10_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct8x8_1_add/;
add_proto qw/void vp10_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct8x8_64_add/;
add_proto qw/void vp10_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct8x8_12_add/;
add_proto qw/void vp10_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct16x16_1_add/;
add_proto qw/void vp10_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct16x16_256_add/;
add_proto qw/void vp10_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct16x16_10_add/;
add_proto qw/void vp10_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct32x32_1024_add/;
add_proto qw/void vp10_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct32x32_34_add/;
add_proto qw/void vp10_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct32x32_1_add/;
add_proto qw/void vp10_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_iwht4x4_1_add/;
add_proto qw/void vp10_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_iwht4x4_16_add/;
} else {
add_proto qw/void vp10_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct4x4_1_add sse2/;
add_proto qw/void vp10_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct4x4_16_add sse2/;
add_proto qw/void vp10_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct8x8_1_add sse2/;
add_proto qw/void vp10_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct8x8_64_add sse2/;
add_proto qw/void vp10_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct8x8_12_add sse2/;
add_proto qw/void vp10_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct16x16_1_add sse2/;
add_proto qw/void vp10_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct16x16_256_add sse2/;
add_proto qw/void vp10_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct16x16_10_add sse2/;
add_proto qw/void vp10_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct32x32_1024_add sse2/;
add_proto qw/void vp10_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct32x32_34_add sse2/;
add_proto qw/void vp10_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct32x32_1_add sse2/;
add_proto qw/void vp10_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_iwht4x4_1_add/;
add_proto qw/void vp10_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_iwht4x4_16_add/;
} # CONFIG_EMULATE_HARDWARE
} # CONFIG_VP9_HIGHBITDEPTH
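# For readers unfamiliar with the rtcd scripts: each add_proto line above
# declares one run-time-dispatched function slot, and specialize lists the
# per-ISA implementations that may fill it. A hand-written sketch of what
# the generated vp10_rtcd.h roughly contains for one entry (not the
# literal generator output):
#
#   void vp10_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
#   void vp10_fdct4x4_sse2(const int16_t *input, tran_low_t *output, int stride);
#   RTCD_EXTERN void (*vp10_fdct4x4)(const int16_t *input, tran_low_t *output,
#                                    int stride);
#
# setup_rtcd_internal() points vp10_fdct4x4 at the SSE2 version when the
# CPU supports it, and at the C version otherwise.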
#
# Motion search
#

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -0,0 +1,271 @@
/*
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <emmintrin.h> // SSE2
#include "./vpx_config.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_dsp/x86/fwd_txfm_sse2.h"
void vp10_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
__m128i in0, in1;
__m128i tmp;
const __m128i zero = _mm_setzero_si128();
in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
in1 = _mm_unpacklo_epi64(in1, _mm_loadl_epi64((const __m128i *)
(input + 2 * stride)));
in0 = _mm_unpacklo_epi64(in0, _mm_loadl_epi64((const __m128i *)
(input + 3 * stride)));
tmp = _mm_add_epi16(in0, in1);
in0 = _mm_unpacklo_epi16(zero, tmp);
in1 = _mm_unpackhi_epi16(zero, tmp);
in0 = _mm_srai_epi32(in0, 16);
in1 = _mm_srai_epi32(in1, 16);
tmp = _mm_add_epi32(in0, in1);
in0 = _mm_unpacklo_epi32(tmp, zero);
in1 = _mm_unpackhi_epi32(tmp, zero);
tmp = _mm_add_epi32(in0, in1);
in0 = _mm_srli_si128(tmp, 8);
in1 = _mm_add_epi32(tmp, in0);
in0 = _mm_slli_epi32(in1, 1);
store_output(&in0, output);
}
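/* The unpack-with-zero plus arithmetic right shift by 16 used in these
 * reductions is the SSE2 idiom for widening signed 16-bit lanes to 32
 * bits (there is no _mm_cvtepi16_epi32 before SSE4.1). Minimal sketch: */
#include <assert.h>
int main(void) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i v = _mm_set1_epi16(-3);
  /* Each word lands in the high half of a dword; the arithmetic shift
   * then sign-extends it down into place. */
  const __m128i lo = _mm_srai_epi32(_mm_unpacklo_epi16(zero, v), 16);
  assert(_mm_cvtsi128_si32(lo) == -3);
  return 0;
}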
void vp10_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
__m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
__m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
__m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
__m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
__m128i u0, u1, sum;
u0 = _mm_add_epi16(in0, in1);
u1 = _mm_add_epi16(in2, in3);
in0 = _mm_load_si128((const __m128i *)(input + 4 * stride));
in1 = _mm_load_si128((const __m128i *)(input + 5 * stride));
in2 = _mm_load_si128((const __m128i *)(input + 6 * stride));
in3 = _mm_load_si128((const __m128i *)(input + 7 * stride));
sum = _mm_add_epi16(u0, u1);
in0 = _mm_add_epi16(in0, in1);
in2 = _mm_add_epi16(in2, in3);
sum = _mm_add_epi16(sum, in0);
u0 = _mm_setzero_si128();
sum = _mm_add_epi16(sum, in2);
in0 = _mm_unpacklo_epi16(u0, sum);
in1 = _mm_unpackhi_epi16(u0, sum);
in0 = _mm_srai_epi32(in0, 16);
in1 = _mm_srai_epi32(in1, 16);
sum = _mm_add_epi32(in0, in1);
in0 = _mm_unpacklo_epi32(sum, u0);
in1 = _mm_unpackhi_epi32(sum, u0);
sum = _mm_add_epi32(in0, in1);
in0 = _mm_srli_si128(sum, 8);
in1 = _mm_add_epi32(sum, in0);
store_output(&in1, output);
}
void vp10_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output,
int stride) {
__m128i in0, in1, in2, in3;
__m128i u0, u1;
__m128i sum = _mm_setzero_si128();
int i;
for (i = 0; i < 2; ++i) {
input += 8 * i;
in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
u0 = _mm_add_epi16(in0, in1);
u1 = _mm_add_epi16(in2, in3);
sum = _mm_add_epi16(sum, u0);
in0 = _mm_load_si128((const __m128i *)(input + 4 * stride));
in1 = _mm_load_si128((const __m128i *)(input + 5 * stride));
in2 = _mm_load_si128((const __m128i *)(input + 6 * stride));
in3 = _mm_load_si128((const __m128i *)(input + 7 * stride));
sum = _mm_add_epi16(sum, u1);
u0 = _mm_add_epi16(in0, in1);
u1 = _mm_add_epi16(in2, in3);
sum = _mm_add_epi16(sum, u0);
in0 = _mm_load_si128((const __m128i *)(input + 8 * stride));
in1 = _mm_load_si128((const __m128i *)(input + 9 * stride));
in2 = _mm_load_si128((const __m128i *)(input + 10 * stride));
in3 = _mm_load_si128((const __m128i *)(input + 11 * stride));
sum = _mm_add_epi16(sum, u1);
u0 = _mm_add_epi16(in0, in1);
u1 = _mm_add_epi16(in2, in3);
sum = _mm_add_epi16(sum, u0);
in0 = _mm_load_si128((const __m128i *)(input + 12 * stride));
in1 = _mm_load_si128((const __m128i *)(input + 13 * stride));
in2 = _mm_load_si128((const __m128i *)(input + 14 * stride));
in3 = _mm_load_si128((const __m128i *)(input + 15 * stride));
sum = _mm_add_epi16(sum, u1);
u0 = _mm_add_epi16(in0, in1);
u1 = _mm_add_epi16(in2, in3);
sum = _mm_add_epi16(sum, u0);
sum = _mm_add_epi16(sum, u1);
}
u0 = _mm_setzero_si128();
in0 = _mm_unpacklo_epi16(u0, sum);
in1 = _mm_unpackhi_epi16(u0, sum);
in0 = _mm_srai_epi32(in0, 16);
in1 = _mm_srai_epi32(in1, 16);
sum = _mm_add_epi32(in0, in1);
in0 = _mm_unpacklo_epi32(sum, u0);
in1 = _mm_unpackhi_epi32(sum, u0);
sum = _mm_add_epi32(in0, in1);
in0 = _mm_srli_si128(sum, 8);
in1 = _mm_add_epi32(sum, in0);
in1 = _mm_srai_epi32(in1, 1);
store_output(&in1, output);
}
void vp10_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output,
int stride) {
__m128i in0, in1, in2, in3;
__m128i u0, u1;
__m128i sum = _mm_setzero_si128();
int i;
for (i = 0; i < 8; ++i) {
in0 = _mm_load_si128((const __m128i *)(input + 0));
in1 = _mm_load_si128((const __m128i *)(input + 8));
in2 = _mm_load_si128((const __m128i *)(input + 16));
in3 = _mm_load_si128((const __m128i *)(input + 24));
input += stride;
u0 = _mm_add_epi16(in0, in1);
u1 = _mm_add_epi16(in2, in3);
sum = _mm_add_epi16(sum, u0);
in0 = _mm_load_si128((const __m128i *)(input + 0));
in1 = _mm_load_si128((const __m128i *)(input + 8));
in2 = _mm_load_si128((const __m128i *)(input + 16));
in3 = _mm_load_si128((const __m128i *)(input + 24));
input += stride;
sum = _mm_add_epi16(sum, u1);
u0 = _mm_add_epi16(in0, in1);
u1 = _mm_add_epi16(in2, in3);
sum = _mm_add_epi16(sum, u0);
in0 = _mm_load_si128((const __m128i *)(input + 0));
in1 = _mm_load_si128((const __m128i *)(input + 8));
in2 = _mm_load_si128((const __m128i *)(input + 16));
in3 = _mm_load_si128((const __m128i *)(input + 24));
input += stride;
sum = _mm_add_epi16(sum, u1);
u0 = _mm_add_epi16(in0, in1);
u1 = _mm_add_epi16(in2, in3);
sum = _mm_add_epi16(sum, u0);
in0 = _mm_load_si128((const __m128i *)(input + 0));
in1 = _mm_load_si128((const __m128i *)(input + 8));
in2 = _mm_load_si128((const __m128i *)(input + 16));
in3 = _mm_load_si128((const __m128i *)(input + 24));
input += stride;
sum = _mm_add_epi16(sum, u1);
u0 = _mm_add_epi16(in0, in1);
u1 = _mm_add_epi16(in2, in3);
sum = _mm_add_epi16(sum, u0);
sum = _mm_add_epi16(sum, u1);
}
u0 = _mm_setzero_si128();
in0 = _mm_unpacklo_epi16(u0, sum);
in1 = _mm_unpackhi_epi16(u0, sum);
in0 = _mm_srai_epi32(in0, 16);
in1 = _mm_srai_epi32(in1, 16);
sum = _mm_add_epi32(in0, in1);
in0 = _mm_unpacklo_epi32(sum, u0);
in1 = _mm_unpackhi_epi32(sum, u0);
sum = _mm_add_epi32(in0, in1);
in0 = _mm_srli_si128(sum, 8);
in1 = _mm_add_epi32(sum, in0);
in1 = _mm_srai_epi32(in1, 3);
store_output(&in1, output);
}
#define DCT_HIGH_BIT_DEPTH 0
#define FDCT4x4_2D vp10_fdct4x4_sse2
#define FDCT8x8_2D vp10_fdct8x8_sse2
#define FDCT16x16_2D vp10_fdct16x16_sse2
#include "vp10/common/x86/vp10_fwd_txfm_impl_sse2.h"
#undef FDCT4x4_2D
#undef FDCT8x8_2D
#undef FDCT16x16_2D
#define FDCT32x32_2D vp10_fdct32x32_rd_sse2
#define FDCT32x32_HIGH_PRECISION 0
#include "vp10/common/x86/vp10_fwd_dct32x32_impl_sse2.h"
#undef FDCT32x32_2D
#undef FDCT32x32_HIGH_PRECISION
#define FDCT32x32_2D vp10_fdct32x32_sse2
#define FDCT32x32_HIGH_PRECISION 1
#include "vp10/common/x86/vp10_fwd_dct32x32_impl_sse2.h" // NOLINT
#undef FDCT32x32_2D
#undef FDCT32x32_HIGH_PRECISION
#undef DCT_HIGH_BIT_DEPTH
#if CONFIG_VP9_HIGHBITDEPTH
#define DCT_HIGH_BIT_DEPTH 1
#define FDCT4x4_2D vp10_highbd_fdct4x4_sse2
#define FDCT8x8_2D vp10_highbd_fdct8x8_sse2
#define FDCT16x16_2D vp10_highbd_fdct16x16_sse2
#include "vp10/common/x86/vp10_fwd_txfm_impl_sse2.h" // NOLINT
#undef FDCT4x4_2D
#undef FDCT8x8_2D
#undef FDCT16x16_2D
#define FDCT32x32_2D vp10_highbd_fdct32x32_rd_sse2
#define FDCT32x32_HIGH_PRECISION 0
#include "vp10/common/x86/vp10_fwd_dct32x32_impl_sse2.h" // NOLINT
#undef FDCT32x32_2D
#undef FDCT32x32_HIGH_PRECISION
#define FDCT32x32_2D vp10_highbd_fdct32x32_sse2
#define FDCT32x32_HIGH_PRECISION 1
#include "vp10/common/x86/vp10_fwd_dct32x32_impl_sse2.h" // NOLINT
#undef FDCT32x32_2D
#undef FDCT32x32_HIGH_PRECISION
#undef DCT_HIGH_BIT_DEPTH
#endif // CONFIG_VP9_HIGHBITDEPTH

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,184 @@
/*
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VPX_DSP_X86_INV_TXFM_SSE2_H_
#define VPX_DSP_X86_INV_TXFM_SSE2_H_
#include <emmintrin.h> // SSE2
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
#include "vp10/common/vp10_inv_txfm.h"
// perform 8x8 transpose
static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
}
#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \
{ \
const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
\
in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \
in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \
}
static INLINE void array_transpose_4X8(__m128i *in, __m128i * out) {
const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4);
out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4);
out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6);
out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6);
}
static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
__m128i tbuf[8];
array_transpose_8x8(res0, res0);
array_transpose_8x8(res1, tbuf);
array_transpose_8x8(res0 + 8, res1);
array_transpose_8x8(res1 + 8, res1 + 8);
res0[8] = tbuf[0];
res0[9] = tbuf[1];
res0[10] = tbuf[2];
res0[11] = tbuf[3];
res0[12] = tbuf[4];
res0[13] = tbuf[5];
res0[14] = tbuf[6];
res0[15] = tbuf[7];
}
static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) {
in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16));
in[1] = _mm_load_si128((const __m128i *)(input + 1 * 16));
in[2] = _mm_load_si128((const __m128i *)(input + 2 * 16));
in[3] = _mm_load_si128((const __m128i *)(input + 3 * 16));
in[4] = _mm_load_si128((const __m128i *)(input + 4 * 16));
in[5] = _mm_load_si128((const __m128i *)(input + 5 * 16));
in[6] = _mm_load_si128((const __m128i *)(input + 6 * 16));
in[7] = _mm_load_si128((const __m128i *)(input + 7 * 16));
in[8] = _mm_load_si128((const __m128i *)(input + 8 * 16));
in[9] = _mm_load_si128((const __m128i *)(input + 9 * 16));
in[10] = _mm_load_si128((const __m128i *)(input + 10 * 16));
in[11] = _mm_load_si128((const __m128i *)(input + 11 * 16));
in[12] = _mm_load_si128((const __m128i *)(input + 12 * 16));
in[13] = _mm_load_si128((const __m128i *)(input + 13 * 16));
in[14] = _mm_load_si128((const __m128i *)(input + 14 * 16));
in[15] = _mm_load_si128((const __m128i *)(input + 15 * 16));
}
#define RECON_AND_STORE(dest, in_x) \
{ \
__m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
d0 = _mm_unpacklo_epi8(d0, zero); \
d0 = _mm_add_epi16(in_x, d0); \
d0 = _mm_packus_epi16(d0, d0); \
_mm_storel_epi64((__m128i *)(dest), d0); \
}
static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
const __m128i final_rounding = _mm_set1_epi16(1<<5);
const __m128i zero = _mm_setzero_si128();
// Final rounding and shift
in[0] = _mm_adds_epi16(in[0], final_rounding);
in[1] = _mm_adds_epi16(in[1], final_rounding);
in[2] = _mm_adds_epi16(in[2], final_rounding);
in[3] = _mm_adds_epi16(in[3], final_rounding);
in[4] = _mm_adds_epi16(in[4], final_rounding);
in[5] = _mm_adds_epi16(in[5], final_rounding);
in[6] = _mm_adds_epi16(in[6], final_rounding);
in[7] = _mm_adds_epi16(in[7], final_rounding);
in[8] = _mm_adds_epi16(in[8], final_rounding);
in[9] = _mm_adds_epi16(in[9], final_rounding);
in[10] = _mm_adds_epi16(in[10], final_rounding);
in[11] = _mm_adds_epi16(in[11], final_rounding);
in[12] = _mm_adds_epi16(in[12], final_rounding);
in[13] = _mm_adds_epi16(in[13], final_rounding);
in[14] = _mm_adds_epi16(in[14], final_rounding);
in[15] = _mm_adds_epi16(in[15], final_rounding);
in[0] = _mm_srai_epi16(in[0], 6);
in[1] = _mm_srai_epi16(in[1], 6);
in[2] = _mm_srai_epi16(in[2], 6);
in[3] = _mm_srai_epi16(in[3], 6);
in[4] = _mm_srai_epi16(in[4], 6);
in[5] = _mm_srai_epi16(in[5], 6);
in[6] = _mm_srai_epi16(in[6], 6);
in[7] = _mm_srai_epi16(in[7], 6);
in[8] = _mm_srai_epi16(in[8], 6);
in[9] = _mm_srai_epi16(in[9], 6);
in[10] = _mm_srai_epi16(in[10], 6);
in[11] = _mm_srai_epi16(in[11], 6);
in[12] = _mm_srai_epi16(in[12], 6);
in[13] = _mm_srai_epi16(in[13], 6);
in[14] = _mm_srai_epi16(in[14], 6);
in[15] = _mm_srai_epi16(in[15], 6);
RECON_AND_STORE(dest + 0 * stride, in[0]);
RECON_AND_STORE(dest + 1 * stride, in[1]);
RECON_AND_STORE(dest + 2 * stride, in[2]);
RECON_AND_STORE(dest + 3 * stride, in[3]);
RECON_AND_STORE(dest + 4 * stride, in[4]);
RECON_AND_STORE(dest + 5 * stride, in[5]);
RECON_AND_STORE(dest + 6 * stride, in[6]);
RECON_AND_STORE(dest + 7 * stride, in[7]);
RECON_AND_STORE(dest + 8 * stride, in[8]);
RECON_AND_STORE(dest + 9 * stride, in[9]);
RECON_AND_STORE(dest + 10 * stride, in[10]);
RECON_AND_STORE(dest + 11 * stride, in[11]);
RECON_AND_STORE(dest + 12 * stride, in[12]);
RECON_AND_STORE(dest + 13 * stride, in[13]);
RECON_AND_STORE(dest + 14 * stride, in[14]);
RECON_AND_STORE(dest + 15 * stride, in[15]);
}
void idct4_sse2(__m128i *in);
void idct8_sse2(__m128i *in);
void idct16_sse2(__m128i *in0, __m128i *in1);
void iadst4_sse2(__m128i *in);
void iadst8_sse2(__m128i *in);
void iadst16_sse2(__m128i *in0, __m128i *in1);
#endif // VPX_DSP_X86_INV_TXFM_SSE2_H_

View File

@ -17,6 +17,7 @@
#include "vpx_dsp/bitreader_buffer.h"
#include "vpx_dsp/bitreader.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/mem.h"
#include "vpx_ports/mem_ops.h"
@ -80,12 +81,18 @@ static int decode_unsigned_max(struct vpx_read_bit_buffer *rb, int max) {
return data > max ? max : data;
}
#if CONFIG_MISC_FIXES
static TX_MODE read_tx_mode(struct vpx_read_bit_buffer *rb) {
return vpx_rb_read_bit(rb) ? TX_MODE_SELECT : vpx_rb_read_literal(rb, 2);
}
#else
static TX_MODE read_tx_mode(vpx_reader *r) {
TX_MODE tx_mode = vpx_read_literal(r, 2);
if (tx_mode == ALLOW_32X32)
tx_mode += vpx_read_bit(r);
return tx_mode;
}
#endif
static void read_tx_mode_probs(struct tx_probs *tx_probs, vpx_reader *r) {
int i, j;
@ -526,6 +533,7 @@ static void dec_build_inter_predictors(VP10Decoder *const pbi, MACROBLOCKD *xd,
struct buf_2d *dst_buf, const MV* mv,
RefCntBuffer *ref_frame_buf,
int is_scaled, int ref) {
VP10_COMMON *const cm = &pbi->common;
struct macroblockd_plane *const pd = &xd->plane[plane];
uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
MV32 scaled_mv;
@ -622,9 +630,9 @@ static void dec_build_inter_predictors(VP10Decoder *const pbi, MACROBLOCKD *xd,
// Wait until reference block is ready. Pad 7 more pixels as last 7
// pixels of each superblock row can be changed by next superblock row.
if (pbi->frame_parallel_decode)
if (cm->frame_parallel_decode)
vp10_frameworker_wait(pbi->frame_worker_owner, ref_frame_buf,
MAX(0, (y1 + 7)) << (plane == 0 ? 0 : 1));
VPXMAX(0, (y1 + 7)) << (plane == 0 ? 0 : 1));
// Skip border extension if block is inside the frame.
if (x0 < 0 || x0 > frame_width - 1 || x1 < 0 || x1 > frame_width - 1 ||
@ -649,10 +657,10 @@ static void dec_build_inter_predictors(VP10Decoder *const pbi, MACROBLOCKD *xd,
} else {
// Wait until reference block is ready. Pad 7 more pixels as last 7
// pixels of each superblock row can be changed by next superblock row.
if (pbi->frame_parallel_decode) {
if (cm->frame_parallel_decode) {
const int y1 = (y0_16 + (h - 1) * ys) >> SUBPEL_BITS;
vp10_frameworker_wait(pbi->frame_worker_owner, ref_frame_buf,
MAX(0, (y1 + 7)) << (plane == 0 ? 0 : 1));
VPXMAX(0, (y1 + 7)) << (plane == 0 ? 0 : 1));
}
}
#if CONFIG_VP9_HIGHBITDEPTH
@ -699,12 +707,19 @@ static void dec_build_inter_predictors_sb(VP10Decoder *const pbi,
const int is_scaled = vp10_is_scaled(sf);
if (sb_type < BLOCK_8X8) {
int i = 0, x, y;
const PARTITION_TYPE bp = BLOCK_8X8 - sb_type;
const int have_vsplit = bp != PARTITION_HORZ;
const int have_hsplit = bp != PARTITION_VERT;
const int num_4x4_w = 2 >> ((!have_vsplit) | pd->subsampling_x);
const int num_4x4_h = 2 >> ((!have_hsplit) | pd->subsampling_y);
const int pw = 8 >> (have_vsplit | pd->subsampling_x);
const int ph = 8 >> (have_hsplit | pd->subsampling_y);
int x, y;
for (y = 0; y < num_4x4_h; ++y) {
for (x = 0; x < num_4x4_w; ++x) {
const MV mv = average_split_mvs(pd, mi, ref, i++);
const MV mv = average_split_mvs(pd, mi, ref, y * 2 + x);
dec_build_inter_predictors(pbi, xd, plane, n4w_x4, n4h_x4,
4 * x, 4 * y, 4, 4, mi_x, mi_y, kernel,
4 * x, 4 * y, pw, ph, mi_x, mi_y, kernel,
sf, pre_buf, dst_buf, &mv,
ref_frame_buf, is_scaled, ref);
}
@ -723,8 +738,8 @@ static void dec_build_inter_predictors_sb(VP10Decoder *const pbi,
static INLINE TX_SIZE dec_get_uv_tx_size(const MB_MODE_INFO *mbmi,
int n4_wl, int n4_hl) {
// get minimum log2 num4x4s dimension
const int x = MIN(n4_wl, n4_hl);
return MIN(mbmi->tx_size, x);
const int x = VPXMIN(n4_wl, n4_hl);
return VPXMIN(mbmi->tx_size, x);
}
static INLINE void dec_reset_skip_context(MACROBLOCKD *xd) {
@ -785,8 +800,8 @@ static void decode_block(VP10Decoder *const pbi, MACROBLOCKD *const xd,
const int less8x8 = bsize < BLOCK_8X8;
const int bw = 1 << (bwl - 1);
const int bh = 1 << (bhl - 1);
const int x_mis = MIN(bw, cm->mi_cols - mi_col);
const int y_mis = MIN(bh, cm->mi_rows - mi_row);
const int x_mis = VPXMIN(bw, cm->mi_cols - mi_col);
const int y_mis = VPXMIN(bh, cm->mi_rows - mi_row);
MB_MODE_INFO *mbmi = set_offsets(cm, xd, bsize, mi_row, mi_col,
bw, bh, x_mis, y_mis, bwl, bhl);
@ -856,7 +871,11 @@ static void decode_block(VP10Decoder *const pbi, MACROBLOCKD *const xd,
}
if (!less8x8 && eobtotal == 0)
#if CONFIG_MISC_FIXES
mbmi->has_no_coeffs = 1; // skip loopfilter
#else
mbmi->skip = 1; // skip loopfilter
#endif
}
}
@ -1011,8 +1030,9 @@ static void read_coef_probs(FRAME_CONTEXT *fc, TX_MODE tx_mode,
read_coef_probs_common(fc->coef_probs[tx_size], r);
}
static void setup_segmentation(struct segmentation *seg,
static void setup_segmentation(VP10_COMMON *const cm,
struct vpx_read_bit_buffer *rb) {
struct segmentation *const seg = &cm->seg;
int i, j;
seg->update_map = 0;
@ -1023,13 +1043,21 @@ static void setup_segmentation(struct segmentation *seg,
return;
// Segmentation map update
seg->update_map = vpx_rb_read_bit(rb);
if (frame_is_intra_only(cm) || cm->error_resilient_mode) {
seg->update_map = 1;
} else {
seg->update_map = vpx_rb_read_bit(rb);
}
if (seg->update_map) {
for (i = 0; i < SEG_TREE_PROBS; i++)
seg->tree_probs[i] = vpx_rb_read_bit(rb) ? vpx_rb_read_literal(rb, 8)
: MAX_PROB;
seg->temporal_update = vpx_rb_read_bit(rb);
if (frame_is_intra_only(cm) || cm->error_resilient_mode) {
seg->temporal_update = 0;
} else {
seg->temporal_update = vpx_rb_read_bit(rb);
}
if (seg->temporal_update) {
for (i = 0; i < PREDICTION_PROBS; i++)
seg->pred_probs[i] = vpx_rb_read_bit(rb) ? vpx_rb_read_literal(rb, 8)
@ -1080,17 +1108,17 @@ static void setup_loopfilter(struct loopfilter *lf,
for (i = 0; i < MAX_REF_FRAMES; i++)
if (vpx_rb_read_bit(rb))
lf->ref_deltas[i] = vpx_rb_read_signed_literal(rb, 6);
lf->ref_deltas[i] = vpx_rb_read_inv_signed_literal(rb, 6);
for (i = 0; i < MAX_MODE_LF_DELTAS; i++)
if (vpx_rb_read_bit(rb))
lf->mode_deltas[i] = vpx_rb_read_signed_literal(rb, 6);
lf->mode_deltas[i] = vpx_rb_read_inv_signed_literal(rb, 6);
}
}
}
static INLINE int read_delta_q(struct vpx_read_bit_buffer *rb) {
return vpx_rb_read_bit(rb) ? vpx_rb_read_signed_literal(rb, 4) : 0;
return vpx_rb_read_bit(rb) ? vpx_rb_read_inv_signed_literal(rb, 4) : 0;
}
static void setup_quantization(VP10_COMMON *const cm, MACROBLOCKD *const xd,
@ -1138,12 +1166,7 @@ static void setup_segmentation_dequant(VP10_COMMON *const cm) {
}
static INTERP_FILTER read_interp_filter(struct vpx_read_bit_buffer *rb) {
const INTERP_FILTER literal_to_filter[] = { EIGHTTAP_SMOOTH,
EIGHTTAP,
EIGHTTAP_SHARP,
BILINEAR };
return vpx_rb_read_bit(rb) ? SWITCHABLE
: literal_to_filter[vpx_rb_read_literal(rb, 2)];
return vpx_rb_read_bit(rb) ? SWITCHABLE : vpx_rb_read_literal(rb, 2);
}
static void setup_display_size(VP10_COMMON *cm,
@ -1222,6 +1245,7 @@ static void setup_frame_size(VP10_COMMON *cm, struct vpx_read_bit_buffer *rb) {
pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y;
pool->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth;
pool->frame_bufs[cm->new_fb_idx].buf.color_space = cm->color_space;
pool->frame_bufs[cm->new_fb_idx].buf.color_range = cm->color_range;
}
static INLINE int valid_ref_frame_img_fmt(vpx_bit_depth_t ref_bit_depth,
@ -1303,6 +1327,7 @@ static void setup_frame_size_with_refs(VP10_COMMON *cm,
pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y;
pool->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth;
pool->frame_bufs[cm->new_fb_idx].buf.color_space = cm->color_space;
pool->frame_bufs[cm->new_fb_idx].buf.color_range = cm->color_range;
}
static void setup_tile_info(VP10_COMMON *cm, struct vpx_read_bit_buffer *rb) {
@ -1448,8 +1473,9 @@ static const uint8_t *decode_tiles(VP10Decoder *pbi,
tile_data->cm = cm;
tile_data->xd = pbi->mb;
tile_data->xd.corrupted = 0;
tile_data->xd.counts = cm->frame_parallel_decoding_mode ?
NULL : &cm->counts;
tile_data->xd.counts =
cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD ?
&cm->counts : NULL;
vp10_zero(tile_data->dqcoeff);
vp10_tile_init(&tile_data->xd.tile, tile_data->cm, tile_row, tile_col);
setup_token_decoder(buf->data, data_end, buf->size, &cm->error,
@ -1504,7 +1530,7 @@ static const uint8_t *decode_tiles(VP10Decoder *pbi,
// After loopfiltering, the last 7 row pixels in each superblock row may
// still be changed by the longest loopfilter of the next superblock
// row.
if (pbi->frame_parallel_decode)
if (cm->frame_parallel_decode)
vp10_frameworker_broadcast(pbi->cur_buf,
mi_row << MI_BLOCK_SIZE_LOG2);
}
@ -1522,7 +1548,7 @@ static const uint8_t *decode_tiles(VP10Decoder *pbi,
// Get last tile data.
tile_data = pbi->tile_data + tile_cols * tile_rows - 1;
if (pbi->frame_parallel_decode)
if (cm->frame_parallel_decode)
vp10_frameworker_broadcast(pbi->cur_buf, INT_MAX);
return vpx_reader_find_end(&tile_data->bit_reader);
}
@ -1570,7 +1596,7 @@ static const uint8_t *decode_tiles_mt(VP10Decoder *pbi,
const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
const int tile_cols = 1 << cm->log2_tile_cols;
const int tile_rows = 1 << cm->log2_tile_rows;
const int num_workers = MIN(pbi->max_threads & ~1, tile_cols);
const int num_workers = VPXMIN(pbi->max_threads & ~1, tile_cols);
TileBuffer tile_buffers[1][1 << 6];
int n;
int final_worker = -1;
@ -1637,7 +1663,7 @@ static const uint8_t *decode_tiles_mt(VP10Decoder *pbi,
int group_start = 0;
while (group_start < tile_cols) {
const TileBuffer largest = tile_buffers[0][group_start];
const int group_end = MIN(group_start + num_workers, tile_cols) - 1;
const int group_end = VPXMIN(group_start + num_workers, tile_cols) - 1;
memmove(tile_buffers[0] + group_start, tile_buffers[0] + group_start + 1,
(group_end - group_start) * sizeof(tile_buffers[0][0]));
tile_buffers[0][group_end] = largest;
@ -1646,7 +1672,7 @@ static const uint8_t *decode_tiles_mt(VP10Decoder *pbi,
}
// Initialize thread frame counts.
if (!cm->frame_parallel_decoding_mode) {
if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
int i;
for (i = 0; i < num_workers; ++i) {
@ -1668,8 +1694,9 @@ static const uint8_t *decode_tiles_mt(VP10Decoder *pbi,
tile_data->pbi = pbi;
tile_data->xd = pbi->mb;
tile_data->xd.corrupted = 0;
tile_data->xd.counts = cm->frame_parallel_decoding_mode ?
0 : &tile_data->counts;
tile_data->xd.counts =
cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD ?
&tile_data->counts : NULL;
vp10_zero(tile_data->dqcoeff);
vp10_tile_init(tile, cm, 0, buf->col);
vp10_tile_init(&tile_data->xd.tile, cm, 0, buf->col);
@ -1708,7 +1735,8 @@ static const uint8_t *decode_tiles_mt(VP10Decoder *pbi,
}
// Accumulate thread frame counts.
if (n >= tile_cols && !cm->frame_parallel_decoding_mode) {
if (n >= tile_cols &&
cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
for (i = 0; i < num_workers; ++i) {
TileWorkerData *const tile_data =
(TileWorkerData*)pbi->tile_workers[i].data1;
@ -1740,7 +1768,8 @@ static void read_bitdepth_colorspace_sampling(
}
cm->color_space = vpx_rb_read_literal(rb, 3);
if (cm->color_space != VPX_CS_SRGB) {
vpx_rb_read_bit(rb); // [16,235] (including xvycc) vs [0,255] range
// [16,235] (including xvycc) vs [0,255] range
cm->color_range = vpx_rb_read_bit(rb);
if (cm->profile == PROFILE_1 || cm->profile == PROFILE_3) {
cm->subsampling_x = vpx_rb_read_bit(rb);
cm->subsampling_y = vpx_rb_read_bit(rb);
@ -1771,6 +1800,9 @@ static void read_bitdepth_colorspace_sampling(
static size_t read_uncompressed_header(VP10Decoder *pbi,
struct vpx_read_bit_buffer *rb) {
VP10_COMMON *const cm = &pbi->common;
#if CONFIG_MISC_FIXES
MACROBLOCKD *const xd = &pbi->mb;
#endif
BufferPool *const pool = cm->buffer_pool;
RefCntBuffer *const frame_bufs = pool->frame_bufs;
int i, mask, ref_index = 0;
@ -1812,7 +1844,7 @@ static size_t read_uncompressed_header(VP10Decoder *pbi,
cm->lf.filter_level = 0;
cm->show_frame = 1;
if (pbi->frame_parallel_decode) {
if (cm->frame_parallel_decode) {
for (i = 0; i < REF_FRAMES; ++i)
cm->next_ref_frame_map[i] = cm->ref_frame_map[i];
}
@ -1844,8 +1876,33 @@ static size_t read_uncompressed_header(VP10Decoder *pbi,
} else {
cm->intra_only = cm->show_frame ? 0 : vpx_rb_read_bit(rb);
cm->reset_frame_context = cm->error_resilient_mode ?
0 : vpx_rb_read_literal(rb, 2);
if (cm->error_resilient_mode) {
cm->reset_frame_context = RESET_FRAME_CONTEXT_ALL;
} else {
#if CONFIG_MISC_FIXES
if (cm->intra_only) {
cm->reset_frame_context =
vpx_rb_read_bit(rb) ? RESET_FRAME_CONTEXT_ALL
: RESET_FRAME_CONTEXT_CURRENT;
} else {
cm->reset_frame_context =
vpx_rb_read_bit(rb) ? RESET_FRAME_CONTEXT_CURRENT
: RESET_FRAME_CONTEXT_NONE;
if (cm->reset_frame_context == RESET_FRAME_CONTEXT_CURRENT)
cm->reset_frame_context =
vpx_rb_read_bit(rb) ? RESET_FRAME_CONTEXT_ALL
: RESET_FRAME_CONTEXT_CURRENT;
}
#else
static const RESET_FRAME_CONTEXT_MODE reset_frame_context_conv_tbl[4] = {
RESET_FRAME_CONTEXT_NONE, RESET_FRAME_CONTEXT_NONE,
RESET_FRAME_CONTEXT_CURRENT, RESET_FRAME_CONTEXT_ALL
};
cm->reset_frame_context =
reset_frame_context_conv_tbl[vpx_rb_read_literal(rb, 2)];
#endif
}
if (cm->intra_only) {
if (!vp10_read_sync_code(rb))
@ -1859,6 +1916,7 @@ static size_t read_uncompressed_header(VP10Decoder *pbi,
// specifies that the default color format should be YUV 4:2:0 in this
// case (normative).
cm->color_space = VPX_CS_BT_601;
cm->color_range = 0;
cm->subsampling_y = cm->subsampling_x = 1;
cm->bit_depth = VPX_BITS_8;
#if CONFIG_VP9_HIGHBITDEPTH
@ -1909,6 +1967,7 @@ static size_t read_uncompressed_header(VP10Decoder *pbi,
get_frame_new_buffer(cm)->bit_depth = cm->bit_depth;
#endif
get_frame_new_buffer(cm)->color_space = cm->color_space;
get_frame_new_buffer(cm)->color_range = cm->color_range;
if (pbi->need_resync) {
vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
@ -1917,11 +1976,20 @@ static size_t read_uncompressed_header(VP10Decoder *pbi,
}
if (!cm->error_resilient_mode) {
cm->refresh_frame_context = vpx_rb_read_bit(rb);
cm->frame_parallel_decoding_mode = vpx_rb_read_bit(rb);
cm->refresh_frame_context =
vpx_rb_read_bit(rb) ? REFRESH_FRAME_CONTEXT_FORWARD
: REFRESH_FRAME_CONTEXT_OFF;
if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_FORWARD) {
cm->refresh_frame_context =
vpx_rb_read_bit(rb) ? REFRESH_FRAME_CONTEXT_FORWARD
: REFRESH_FRAME_CONTEXT_BACKWARD;
#if !CONFIG_MISC_FIXES
} else {
vpx_rb_read_bit(rb); // parallel decoding mode flag
#endif
}
} else {
cm->refresh_frame_context = 0;
cm->frame_parallel_decoding_mode = 1;
cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_OFF;
}
// This flag will be overridden by the call to vp10_setup_past_independence
@ -1957,8 +2025,11 @@ static size_t read_uncompressed_header(VP10Decoder *pbi,
setup_loopfilter(&cm->lf, rb);
setup_quantization(cm, &pbi->mb, rb);
setup_segmentation(&cm->seg, rb);
setup_segmentation(cm, rb);
setup_segmentation_dequant(cm);
#if CONFIG_MISC_FIXES
cm->tx_mode = xd->lossless ? ONLY_4X4 : read_tx_mode(rb);
#endif
setup_tile_info(cm, rb);
sz = vpx_rb_read_literal(rb, 16);
@ -1984,7 +2055,9 @@ static void read_ext_tx_probs(FRAME_CONTEXT *fc, vpx_reader *r) {
static int read_compressed_header(VP10Decoder *pbi, const uint8_t *data,
size_t partition_size) {
VP10_COMMON *const cm = &pbi->common;
#if !CONFIG_MISC_FIXES
MACROBLOCKD *const xd = &pbi->mb;
#endif
FRAME_CONTEXT *const fc = cm->fc;
vpx_reader r;
int k;
@ -1994,7 +2067,9 @@ static int read_compressed_header(VP10Decoder *pbi, const uint8_t *data,
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate bool decoder 0");
#if !CONFIG_MISC_FIXES
cm->tx_mode = xd->lossless ? ONLY_4X4 : read_tx_mode(&r);
#endif
if (cm->tx_mode == TX_MODE_SELECT)
read_tx_mode_probs(&fc->tx_probs, &r);
read_coef_probs(fc, cm->tx_mode, &r);
@ -2044,7 +2119,8 @@ static int read_compressed_header(VP10Decoder *pbi, const uint8_t *data,
static void debug_check_frame_counts(const VP10_COMMON *const cm) {
FRAME_COUNTS zero_counts;
vp10_zero(zero_counts);
assert(cm->frame_parallel_decoding_mode || cm->error_resilient_mode);
assert(cm->refresh_frame_context != REFRESH_FRAME_CONTEXT_BACKWARD ||
cm->error_resilient_mode);
assert(!memcmp(cm->counts.y_mode, zero_counts.y_mode,
sizeof(cm->counts.y_mode)));
assert(!memcmp(cm->counts.uv_mode, zero_counts.uv_mode,
@ -2087,7 +2163,7 @@ static struct vpx_read_bit_buffer *init_read_bit_buffer(
rb->error_handler = error_handler;
rb->error_handler_data = &pbi->common;
if (pbi->decrypt_cb) {
const int n = (int)MIN(MAX_VP9_HEADER_SIZE, data_end - data);
const int n = (int)VPXMIN(MAX_VP9_HEADER_SIZE, data_end - data);
pbi->decrypt_cb(pbi->decrypt_state, data, clear_data, n);
rb->bit_buffer = clear_data;
rb->bit_buffer_end = clear_data + n;
@ -2174,10 +2250,11 @@ void vp10_decode_frame(VP10Decoder *pbi,
// If encoded in frame parallel mode, frame context is ready after decoding
// the frame header.
if (pbi->frame_parallel_decode && cm->frame_parallel_decoding_mode) {
if (cm->frame_parallel_decode &&
cm->refresh_frame_context != REFRESH_FRAME_CONTEXT_BACKWARD) {
VPxWorker *const worker = pbi->frame_worker_owner;
FrameWorkerData *const frame_worker_data = worker->data1;
if (cm->refresh_frame_context) {
if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_FORWARD) {
context_updated = 1;
cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
}
@ -2211,7 +2288,7 @@ void vp10_decode_frame(VP10Decoder *pbi,
}
if (!xd->corrupted) {
if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode) {
if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
vp10_adapt_coef_probs(cm);
if (!frame_is_intra_only(cm)) {
@ -2227,6 +2304,7 @@ void vp10_decode_frame(VP10Decoder *pbi,
}
// Non frame parallel update frame context here.
if (cm->refresh_frame_context && !context_updated)
if (cm->refresh_frame_context != REFRESH_FRAME_CONTEXT_OFF &&
!context_updated)
cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
}

View File

@ -22,6 +22,8 @@
#include "vp10/decoder/decodemv.h"
#include "vp10/decoder/decodeframe.h"
#include "vpx_dsp/vpx_dsp_common.h"
static PREDICTION_MODE read_intra_mode(vpx_reader *r, const vpx_prob *p) {
return (PREDICTION_MODE)vpx_read_tree(r, vp10_intra_mode_tree, p);
}
@ -87,7 +89,7 @@ static TX_SIZE read_tx_size(VP10_COMMON *cm, MACROBLOCKD *xd,
if (allow_select && tx_mode == TX_MODE_SELECT && bsize >= BLOCK_8X8)
return read_selected_tx_size(cm, xd, max_tx_size, r);
else
return MIN(max_tx_size, tx_mode_to_biggest_tx_size[tx_mode]);
return VPXMIN(max_tx_size, tx_mode_to_biggest_tx_size[tx_mode]);
}
static int dec_get_segment_id(const VP10_COMMON *cm, const uint8_t *segment_ids,
@ -96,8 +98,8 @@ static int dec_get_segment_id(const VP10_COMMON *cm, const uint8_t *segment_ids,
for (y = 0; y < y_mis; y++)
for (x = 0; x < x_mis; x++)
segment_id = MIN(segment_id,
segment_ids[mi_offset + y * cm->mi_cols + x]);
segment_id =
VPXMIN(segment_id, segment_ids[mi_offset + y * cm->mi_cols + x]);
assert(segment_id >= 0 && segment_id < MAX_SEGMENTS);
return segment_id;
@ -114,6 +116,22 @@ static void set_segment_id(VP10_COMMON *cm, int mi_offset,
cm->current_frame_seg_map[mi_offset + y * cm->mi_cols + x] = segment_id;
}
static int read_intra_segment_id(VP10_COMMON *const cm, int mi_offset,
int x_mis, int y_mis,
vpx_reader *r) {
struct segmentation *const seg = &cm->seg;
int segment_id;
if (!seg->enabled)
return 0; // Default for disabled segmentation
assert(seg->update_map && !seg->temporal_update);
segment_id = read_segment_id(r, seg);
set_segment_id(cm, mi_offset, x_mis, y_mis, segment_id);
return segment_id;
}
static void copy_segment_id(const VP10_COMMON *cm,
const uint8_t *last_segment_ids,
uint8_t *current_segment_ids,
@ -126,26 +144,6 @@ static void copy_segment_id(const VP10_COMMON *cm,
last_segment_ids[mi_offset + y * cm->mi_cols + x] : 0;
}
static int read_intra_segment_id(VP10_COMMON *const cm, int mi_offset,
int x_mis, int y_mis,
vpx_reader *r) {
struct segmentation *const seg = &cm->seg;
int segment_id;
if (!seg->enabled)
return 0; // Default for disabled segmentation
if (!seg->update_map) {
copy_segment_id(cm, cm->last_frame_seg_map, cm->current_frame_seg_map,
mi_offset, x_mis, y_mis);
return 0;
}
segment_id = read_segment_id(r, seg);
set_segment_id(cm, mi_offset, x_mis, y_mis, segment_id);
return segment_id;
}
static int read_inter_segment_id(VP10_COMMON *const cm, MACROBLOCKD *const xd,
int mi_row, int mi_col, vpx_reader *r) {
struct segmentation *const seg = &cm->seg;
@ -156,8 +154,8 @@ static int read_inter_segment_id(VP10_COMMON *const cm, MACROBLOCKD *const xd,
const int bh = xd->plane[0].n4_h >> 1;
// TODO(slavarnway): move x_mis, y_mis into xd ?????
const int x_mis = MIN(cm->mi_cols - mi_col, bw);
const int y_mis = MIN(cm->mi_rows - mi_row, bh);
const int x_mis = VPXMIN(cm->mi_cols - mi_col, bw);
const int y_mis = VPXMIN(cm->mi_rows - mi_row, bh);
if (!seg->enabled)
return 0; // Default for disabled segmentation
@ -212,8 +210,8 @@ static void read_intra_frame_mode_info(VP10_COMMON *const cm,
const int bh = xd->plane[0].n4_h >> 1;
// TODO(slavarnway): move x_mis, y_mis into xd ?????
const int x_mis = MIN(cm->mi_cols - mi_col, bw);
const int y_mis = MIN(cm->mi_rows - mi_row, bh);
const int x_mis = VPXMIN(cm->mi_cols - mi_col, bw);
const int y_mis = VPXMIN(cm->mi_rows - mi_row, bh);
mbmi->segment_id = read_intra_segment_id(cm, mi_offset, x_mis, y_mis, r);
mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r);
@ -296,7 +294,7 @@ static INLINE void read_mv(vpx_reader *r, MV *mv, const MV *ref,
if (mv_joint_horizontal(joint_type))
diff.col = read_mv_component(r, &ctx->comps[1], use_hp);
vp10_inc_mv(&diff, counts);
vp10_inc_mv(&diff, counts, use_hp);
mv->row = ref->row + diff.row;
mv->col = ref->col + diff.col;
@ -604,11 +602,12 @@ static void read_inter_frame_mode_info(VP10Decoder *const pbi,
mbmi->sb_type >= BLOCK_8X8 &&
!segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP) &&
!mbmi->skip) {
FRAME_COUNTS *counts = xd->counts;
mbmi->ext_txfrm = vpx_read_tree(r,
vp10_ext_tx_tree,
cm->fc->ext_tx_prob[mbmi->tx_size]);
if (!cm->frame_parallel_decoding_mode)
++cm->counts.ext_tx[mbmi->tx_size][mbmi->ext_txfrm];
if (counts)
++counts->ext_tx[mbmi->tx_size][mbmi->ext_txfrm];
} else {
mbmi->ext_txfrm = NORM;
}

View File

@ -258,7 +258,7 @@ static void swap_frame_buffers(VP10Decoder *pbi) {
pbi->hold_ref_buf = 0;
cm->frame_to_show = get_frame_new_buffer(cm);
if (!pbi->frame_parallel_decode || !cm->show_frame) {
if (!cm->frame_parallel_decode || !cm->show_frame) {
lock_buffer_pool(pool);
--frame_bufs[cm->new_fb_idx].ref_count;
unlock_buffer_pool(pool);
@ -297,7 +297,7 @@ int vp10_receive_compressed_data(VP10Decoder *pbi,
// Check if the previous frame was a frame without any references to it.
// Release frame buffer if not decoding in frame parallel mode.
if (!pbi->frame_parallel_decode && cm->new_fb_idx >= 0
if (!cm->frame_parallel_decode && cm->new_fb_idx >= 0
&& frame_bufs[cm->new_fb_idx].ref_count == 0)
pool->release_fb_cb(pool->cb_priv,
&frame_bufs[cm->new_fb_idx].raw_frame_buffer);
@ -310,7 +310,7 @@ int vp10_receive_compressed_data(VP10Decoder *pbi,
cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx];
pbi->hold_ref_buf = 0;
if (pbi->frame_parallel_decode) {
if (cm->frame_parallel_decode) {
VPxWorker *const worker = pbi->frame_worker_owner;
vp10_frameworker_lock_stats(worker);
frame_bufs[cm->new_fb_idx].frame_worker_owner = worker;
@ -379,12 +379,12 @@ int vp10_receive_compressed_data(VP10Decoder *pbi,
if (!cm->show_existing_frame) {
cm->last_show_frame = cm->show_frame;
cm->prev_frame = cm->cur_frame;
if (cm->seg.enabled && !pbi->frame_parallel_decode)
if (cm->seg.enabled && !cm->frame_parallel_decode)
vp10_swap_current_and_last_seg_map(cm);
}
// Update progress in frame parallel decode.
if (pbi->frame_parallel_decode) {
if (cm->frame_parallel_decode) {
// Need to lock the mutex here as another thread may
// be accessing this buffer.
VPxWorker *const worker = pbi->frame_worker_owner;

View File

@ -55,8 +55,6 @@ typedef struct VP10Decoder {
int refresh_frame_flags;
int frame_parallel_decode; // frame-based threading.
// TODO(hkuang): Combine this with cur_buf in macroblockd as they are
// the same.
RefCntBuffer *cur_buf; // Current decoding frame buffer.

View File

@ -15,6 +15,10 @@
#include "vpx_util/vpx_thread.h"
#include "vpx/internal/vpx_codec_internal.h"
#ifdef __cplusplus
extern "C" {
#endif
struct VP10Common;
struct VP10Decoder;
@ -63,4 +67,8 @@ void vp10_frameworker_broadcast(RefCntBuffer *const buf, int row);
void vp10_frameworker_copy_context(VPxWorker *const dst_worker,
VPxWorker *const src_worker);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP10_DECODER_DTHREAD_H_

View File

@ -16,6 +16,7 @@
#include "vp10/encoder/encodeframe.h"
#include "vp10/common/seg_common.h"
#include "vp10/encoder/segmentation.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_ports/system_state.h"
#define AQ_C_SEGMENTS 5
@ -117,8 +118,8 @@ void vp10_caq_select_segment(VP10_COMP *cpi, MACROBLOCK *mb, BLOCK_SIZE bs,
const int mi_offset = mi_row * cm->mi_cols + mi_col;
const int bw = num_8x8_blocks_wide_lookup[BLOCK_64X64];
const int bh = num_8x8_blocks_high_lookup[BLOCK_64X64];
const int xmis = MIN(cm->mi_cols - mi_col, num_8x8_blocks_wide_lookup[bs]);
const int ymis = MIN(cm->mi_rows - mi_row, num_8x8_blocks_high_lookup[bs]);
const int xmis = VPXMIN(cm->mi_cols - mi_col, num_8x8_blocks_wide_lookup[bs]);
const int ymis = VPXMIN(cm->mi_rows - mi_row, num_8x8_blocks_high_lookup[bs]);
int x, y;
int i;
unsigned char segment;
@ -136,7 +137,7 @@ void vp10_caq_select_segment(VP10_COMP *cpi, MACROBLOCK *mb, BLOCK_SIZE bs,
vpx_clear_system_state();
low_var_thresh = (cpi->oxcf.pass == 2)
? MAX(cpi->twopass.mb_av_energy, MIN_DEFAULT_LV_THRESH)
? VPXMAX(cpi->twopass.mb_av_energy, MIN_DEFAULT_LV_THRESH)
: DEFAULT_LV_THRESH;
vp10_setup_src_planes(mb, cpi->Source, mi_row, mi_col);

View File

@ -15,6 +15,7 @@
#include "vp10/encoder/aq_cyclicrefresh.h"
#include "vp10/encoder/ratectrl.h"
#include "vp10/encoder/segmentation.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_ports/system_state.h"
struct CYCLIC_REFRESH {
@ -220,8 +221,8 @@ void vp10_cyclic_refresh_update_segment(VP10_COMP *const cpi,
CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
const int bw = num_8x8_blocks_wide_lookup[bsize];
const int bh = num_8x8_blocks_high_lookup[bsize];
const int xmis = MIN(cm->mi_cols - mi_col, bw);
const int ymis = MIN(cm->mi_rows - mi_row, bh);
const int xmis = VPXMIN(cm->mi_cols - mi_col, bw);
const int ymis = VPXMIN(cm->mi_rows - mi_row, bh);
const int block_index = mi_row * cm->mi_cols + mi_col;
const int refresh_this_block = candidate_refresh_aq(cr, mbmi, rate, dist,
bsize);
@ -291,7 +292,7 @@ void vp10_cyclic_refresh_postencode(VP10_COMP *const cpi) {
}
}
// Set golden frame update interval, for non-svc 1 pass CBR mode.
// Set golden frame update interval, for 1 pass CBR mode.
void vp10_cyclic_refresh_set_golden_update(VP10_COMP *const cpi) {
RATE_CONTROL *const rc = &cpi->rc;
CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
@ -413,10 +414,10 @@ static void cyclic_refresh_update_map(VP10_COMP *const cpi) {
assert(mi_col >= 0 && mi_col < cm->mi_cols);
bl_index = mi_row * cm->mi_cols + mi_col;
// Loop through all 8x8 blocks in superblock and update map.
xmis = MIN(cm->mi_cols - mi_col,
num_8x8_blocks_wide_lookup[BLOCK_64X64]);
ymis = MIN(cm->mi_rows - mi_row,
num_8x8_blocks_high_lookup[BLOCK_64X64]);
xmis =
VPXMIN(cm->mi_cols - mi_col, num_8x8_blocks_wide_lookup[BLOCK_64X64]);
ymis =
VPXMIN(cm->mi_rows - mi_row, num_8x8_blocks_high_lookup[BLOCK_64X64]);
for (y = 0; y < ymis; y++) {
for (x = 0; x < xmis; x++) {
const int bl_index2 = bl_index + y * cm->mi_cols + x;
@ -484,10 +485,7 @@ void vp10_cyclic_refresh_setup(VP10_COMP *const cpi) {
if (cm->current_video_frame == 0)
cr->low_content_avg = 0.0;
// Don't apply refresh on key frame or enhancement layer frames.
if (!apply_cyclic_refresh ||
(cm->frame_type == KEY_FRAME) ||
(cpi->svc.temporal_layer_id > 0) ||
(cpi->svc.spatial_layer_id > 0)) {
if (!apply_cyclic_refresh || cm->frame_type == KEY_FRAME) {
// Set segmentation map to 0 and disable.
unsigned char *const seg_map = cpi->segmentation_map;
memset(seg_map, 0, cm->mi_rows * cm->mi_cols);
@ -545,8 +543,9 @@ void vp10_cyclic_refresh_setup(VP10_COMP *const cpi) {
// Set a more aggressive (higher) q delta for segment BOOST2.
qindex_delta = compute_deltaq(
cpi, cm->base_qindex, MIN(CR_MAX_RATE_TARGET_RATIO,
0.1 * cr->rate_boost_fac * cr->rate_ratio_qdelta));
cpi, cm->base_qindex,
VPXMIN(CR_MAX_RATE_TARGET_RATIO,
0.1 * cr->rate_boost_fac * cr->rate_ratio_qdelta));
cr->qindex_delta[2] = qindex_delta;
vp10_set_segdata(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q, qindex_delta);

View File

@ -61,7 +61,7 @@ void vp10_cyclic_refresh_update__map(struct VP10_COMP *const cpi);
// Update the actual number of blocks that were applied the segment delta q.
void vp10_cyclic_refresh_postencode(struct VP10_COMP *const cpi);
// Set golden frame update interval, for non-svc 1 pass CBR mode.
// Set golden frame update interval, for 1 pass CBR mode.
void vp10_cyclic_refresh_set_golden_update(struct VP10_COMP *const cpi);
// Check if we should not update golden reference, based on past refresh stats.

View File

@ -14,6 +14,7 @@
#include "vpx/vpx_encoder.h"
#include "vpx_dsp/bitwriter_buffer.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/mem_ops.h"
#include "vpx_ports/system_state.h"
@ -776,8 +777,7 @@ static void encode_loopfilter(struct loopfilter *lf,
vpx_wb_write_bit(wb, changed);
if (changed) {
lf->last_ref_deltas[i] = delta;
vpx_wb_write_literal(wb, abs(delta) & 0x3F, 6);
vpx_wb_write_bit(wb, delta < 0);
vpx_wb_write_inv_signed_literal(wb, delta, 6);
}
}
@ -787,8 +787,7 @@ static void encode_loopfilter(struct loopfilter *lf,
vpx_wb_write_bit(wb, changed);
if (changed) {
lf->last_mode_deltas[i] = delta;
vpx_wb_write_literal(wb, abs(delta) & 0x3F, 6);
vpx_wb_write_bit(wb, delta < 0);
vpx_wb_write_inv_signed_literal(wb, delta, 6);
}
}
}
@ -798,8 +797,7 @@ static void encode_loopfilter(struct loopfilter *lf,
static void write_delta_q(struct vpx_write_bit_buffer *wb, int delta_q) {
if (delta_q != 0) {
vpx_wb_write_bit(wb, 1);
vpx_wb_write_literal(wb, abs(delta_q), 4);
vpx_wb_write_bit(wb, delta_q < 0);
vpx_wb_write_inv_signed_literal(wb, delta_q, 4);
} else {
vpx_wb_write_bit(wb, 0);
}
@ -824,7 +822,11 @@ static void encode_segmentation(VP10_COMMON *cm, MACROBLOCKD *xd,
return;
// Segmentation map
vpx_wb_write_bit(wb, seg->update_map);
if (!frame_is_intra_only(cm) && !cm->error_resilient_mode) {
vpx_wb_write_bit(wb, seg->update_map);
} else {
assert(seg->update_map == 1);
}
if (seg->update_map) {
// Select the coding strategy (temporal or spatial)
vp10_choose_segmap_coding_method(cm, xd);
@ -838,7 +840,11 @@ static void encode_segmentation(VP10_COMMON *cm, MACROBLOCKD *xd,
}
// Write out the chosen coding method.
vpx_wb_write_bit(wb, seg->temporal_update);
if (!frame_is_intra_only(cm) && !cm->error_resilient_mode) {
vpx_wb_write_bit(wb, seg->temporal_update);
} else {
assert(seg->temporal_update == 0);
}
if (seg->temporal_update) {
for (i = 0; i < PREDICTION_PROBS; i++) {
const int prob = seg->pred_probs[i];
@ -875,14 +881,25 @@ static void encode_segmentation(VP10_COMMON *cm, MACROBLOCKD *xd,
}
}
static void encode_txfm_probs(VP10_COMMON *cm, vpx_writer *w,
#if CONFIG_MISC_FIXES
static void write_txfm_mode(TX_MODE mode, struct vpx_write_bit_buffer *wb) {
vpx_wb_write_bit(wb, mode == TX_MODE_SELECT);
if (mode != TX_MODE_SELECT)
vpx_wb_write_literal(wb, mode, 2);
}
#endif
static void update_txfm_probs(VP10_COMMON *cm, vpx_writer *w,
FRAME_COUNTS *counts) {
#if !CONFIG_MISC_FIXES
// Mode
vpx_write_literal(w, MIN(cm->tx_mode, ALLOW_32X32), 2);
vpx_write_literal(w, VPXMIN(cm->tx_mode, ALLOW_32X32), 2);
if (cm->tx_mode >= ALLOW_32X32)
vpx_write_bit(w, cm->tx_mode == TX_MODE_SELECT);
// Probabilities
#endif
if (cm->tx_mode == TX_MODE_SELECT) {
int i, j;
unsigned int ct_8x8p[TX_SIZES - 3][2];
@ -914,11 +931,9 @@ static void encode_txfm_probs(VP10_COMMON *cm, vpx_writer *w,
static void write_interp_filter(INTERP_FILTER filter,
struct vpx_write_bit_buffer *wb) {
const int filter_to_literal[] = { 1, 0, 2, 3 };
vpx_wb_write_bit(wb, filter == SWITCHABLE);
if (filter != SWITCHABLE)
vpx_wb_write_literal(wb, filter_to_literal[filter], 2);
vpx_wb_write_literal(wb, filter, 2);
}
static void fix_interp_filter(VP10_COMMON *cm, FRAME_COUNTS *counts) {
@ -1059,18 +1074,7 @@ static void write_frame_size_with_refs(VP10_COMP *cpi,
for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, ref_frame);
// Set "found" to 0 for temporal svc and for spatial svc key frame
if (cpi->use_svc &&
((cpi->svc.number_temporal_layers > 1 &&
cpi->oxcf.rc_mode == VPX_CBR) ||
(cpi->svc.number_spatial_layers > 1 &&
cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame) ||
(is_two_pass_svc(cpi) &&
cpi->svc.encode_empty_frame_state == ENCODING &&
cpi->svc.layer_context[0].frames_from_key_frame <
cpi->svc.number_temporal_layers + 1))) {
found = 0;
} else if (cfg != NULL) {
if (cfg != NULL) {
found = cm->width == cfg->y_crop_width &&
cm->height == cfg->y_crop_height;
}
@ -1122,7 +1126,8 @@ static void write_bitdepth_colorspace_sampling(
}
vpx_wb_write_literal(wb, cm->color_space, 3);
if (cm->color_space != VPX_CS_SRGB) {
vpx_wb_write_bit(wb, 0); // 0: [16, 235] (i.e. xvYCC), 1: [0, 255]
// 0: [16, 235] (i.e. xvYCC), 1: [0, 255]
vpx_wb_write_bit(wb, cm->color_range);
if (cm->profile == PROFILE_1 || cm->profile == PROFILE_3) {
assert(cm->subsampling_x != 1 || cm->subsampling_y != 1);
vpx_wb_write_bit(wb, cm->subsampling_x);
@ -1156,19 +1161,28 @@ static void write_uncompressed_header(VP10_COMP *cpi,
write_bitdepth_colorspace_sampling(cm, wb);
write_frame_size(cm, wb);
} else {
// In spatial svc if it's not error_resilient_mode then we need to code all
// visible frames as invisible. But we need to keep the show_frame flag so
// that the publisher could know whether it is supposed to be visible.
// So we will code the show_frame flag as it is. Then code the intra_only
// bit here. This will make the bitstream incompatible. In the player we
// will change to show_frame flag to 0, then add an one byte frame with
// show_existing_frame flag which tells the decoder which frame we want to
// show.
if (!cm->show_frame)
vpx_wb_write_bit(wb, cm->intra_only);
if (!cm->error_resilient_mode)
vpx_wb_write_literal(wb, cm->reset_frame_context, 2);
if (!cm->error_resilient_mode) {
#if CONFIG_MISC_FIXES
if (cm->intra_only) {
vpx_wb_write_bit(wb,
cm->reset_frame_context == RESET_FRAME_CONTEXT_ALL);
} else {
vpx_wb_write_bit(wb,
cm->reset_frame_context != RESET_FRAME_CONTEXT_NONE);
if (cm->reset_frame_context != RESET_FRAME_CONTEXT_NONE)
vpx_wb_write_bit(wb,
cm->reset_frame_context == RESET_FRAME_CONTEXT_ALL);
}
#else
static const int reset_frame_context_conv_tbl[3] = { 0, 2, 3 };
vpx_wb_write_literal(wb,
reset_frame_context_conv_tbl[cm->reset_frame_context], 2);
#endif
}
if (cm->intra_only) {
write_sync_code(wb);
@ -1200,8 +1214,13 @@ static void write_uncompressed_header(VP10_COMP *cpi,
}
if (!cm->error_resilient_mode) {
vpx_wb_write_bit(wb, cm->refresh_frame_context);
vpx_wb_write_bit(wb, cm->frame_parallel_decoding_mode);
vpx_wb_write_bit(wb,
cm->refresh_frame_context != REFRESH_FRAME_CONTEXT_OFF);
#if CONFIG_MISC_FIXES
if (cm->refresh_frame_context != REFRESH_FRAME_CONTEXT_OFF)
#endif
vpx_wb_write_bit(wb, cm->refresh_frame_context !=
REFRESH_FRAME_CONTEXT_BACKWARD);
}
vpx_wb_write_literal(wb, cm->frame_context_idx, FRAME_CONTEXTS_LOG2);
@ -1209,24 +1228,32 @@ static void write_uncompressed_header(VP10_COMP *cpi,
encode_loopfilter(&cm->lf, wb);
encode_quantization(cm, wb);
encode_segmentation(cm, xd, wb);
#if CONFIG_MISC_FIXES
if (xd->lossless)
cm->tx_mode = TX_4X4;
else
write_txfm_mode(cm->tx_mode, wb);
#endif
write_tile_info(cm, wb);
}
static size_t write_compressed_header(VP10_COMP *cpi, uint8_t *data) {
VP10_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
FRAME_CONTEXT *const fc = cm->fc;
FRAME_COUNTS *counts = cpi->td.counts;
vpx_writer header_bc;
vpx_start_encode(&header_bc, data);
if (xd->lossless)
cm->tx_mode = ONLY_4X4;
#if !CONFIG_MISC_FIXES
if (cpi->td.mb.e_mbd.lossless)
cm->tx_mode = TX_4X4;
else
encode_txfm_probs(cm, &header_bc, counts);
update_txfm_probs(cm, &header_bc, counts);
#else
update_txfm_probs(cm, &header_bc, counts);
#endif
update_coef_probs(cpi, &header_bc);
update_skip_probs(cm, &header_bc, counts);

View File

@ -24,12 +24,7 @@ void vp10_encode_token_init();
static INLINE int vp10_preserve_existing_gf(VP10_COMP *cpi) {
return !cpi->multi_arf_allowed && cpi->refresh_golden_frame &&
cpi->rc.is_src_frame_alt_ref &&
(!cpi->use_svc || // Add spatial svc base layer case here
(is_two_pass_svc(cpi) &&
cpi->svc.spatial_layer_id == 0 &&
cpi->svc.layer_context[0].gold_ref_idx >=0 &&
cpi->oxcf.ss_enable_auto_arf[0]));
cpi->rc.is_src_frame_alt_ref;
}
#ifdef __cplusplus

View File

@ -115,7 +115,6 @@ struct macroblock {
// indicate if it is in the rd search loop or encoding process
int use_lp32x32fdct;
int skip_encode;
// use fast quantization process
int quant_fp;

View File

@ -14,6 +14,10 @@
#include "vp10/common/blockd.h"
#include "vp10/encoder/block.h"
#ifdef __cplusplus
extern "C" {
#endif
struct VP10_COMP;
struct VP10Common;
struct ThreadData;
@ -84,4 +88,8 @@ typedef struct PC_TREE {
void vp10_setup_pc_tree(struct VP10Common *cm, struct ThreadData *td);
void vp10_free_pc_tree(struct ThreadData *td);
#ifdef __cplusplus
} // extern "C"
#endif
#endif /* VP10_ENCODER_CONTEXT_TREE_H_ */

View File

@ -98,216 +98,704 @@ void fdst16(const tran_low_t *input, tran_low_t *output) {
}
#endif // CONFIG_EXT_TX
static INLINE void range_check(const tran_low_t *input, const int size,
const int bit) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
int i;
for (i = 0; i < size; ++i) {
assert(abs(input[i]) < (1 << bit));
}
#else
(void)input;
(void)size;
(void)bit;
#endif
}
static void fdct4(const tran_low_t *input, tran_low_t *output) {
tran_high_t step[4];
tran_high_t temp1, temp2;
tran_high_t temp;
tran_low_t step[4];
step[0] = input[0] + input[3];
step[1] = input[1] + input[2];
step[2] = input[1] - input[2];
step[3] = input[0] - input[3];
// stage 0
range_check(input, 4, 11);
temp1 = (step[0] + step[1]) * cospi_16_64;
temp2 = (step[0] - step[1]) * cospi_16_64;
output[0] = (tran_low_t)fdct_round_shift(temp1);
output[2] = (tran_low_t)fdct_round_shift(temp2);
temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
output[1] = (tran_low_t)fdct_round_shift(temp1);
output[3] = (tran_low_t)fdct_round_shift(temp2);
// stage 1
output[0] = input[0] + input[3];
output[1] = input[1] + input[2];
output[2] = input[1] - input[2];
output[3] = input[0] - input[3];
range_check(output, 4, 12);
// stage 2
temp = output[0] * cospi_16_64 + output[1] * cospi_16_64;
step[0] = (tran_low_t)fdct_round_shift(temp);
temp = output[1] * -cospi_16_64 + output[0] * cospi_16_64;
step[1] = (tran_low_t)fdct_round_shift(temp);
temp = output[2] * cospi_24_64 + output[3] * cospi_8_64;
step[2] = (tran_low_t)fdct_round_shift(temp);
temp = output[3] * cospi_24_64 + output[2] * -cospi_8_64;
step[3] = (tran_low_t)fdct_round_shift(temp);
range_check(step, 4, 13);
// stage 3
output[0] = step[0];
output[1] = step[2];
output[2] = step[1];
output[3] = step[3];
range_check(output, 4, 13);
}
static void fdct8(const tran_low_t *input, tran_low_t *output) {
tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
tran_high_t t0, t1, t2, t3; // needs32
tran_high_t x0, x1, x2, x3; // canbe16
tran_high_t temp;
tran_low_t step[8];
// stage 0
range_check(input, 8, 12);
// stage 1
s0 = input[0] + input[7];
s1 = input[1] + input[6];
s2 = input[2] + input[5];
s3 = input[3] + input[4];
s4 = input[3] - input[4];
s5 = input[2] - input[5];
s6 = input[1] - input[6];
s7 = input[0] - input[7];
output[0] = input[0] + input[7];
output[1] = input[1] + input[6];
output[2] = input[2] + input[5];
output[3] = input[3] + input[4];
output[4] = input[3] - input[4];
output[5] = input[2] - input[5];
output[6] = input[1] - input[6];
output[7] = input[0] - input[7];
// fdct4(step, step);
x0 = s0 + s3;
x1 = s1 + s2;
x2 = s1 - s2;
x3 = s0 - s3;
t0 = (x0 + x1) * cospi_16_64;
t1 = (x0 - x1) * cospi_16_64;
t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
output[0] = (tran_low_t)fdct_round_shift(t0);
output[2] = (tran_low_t)fdct_round_shift(t2);
output[4] = (tran_low_t)fdct_round_shift(t1);
output[6] = (tran_low_t)fdct_round_shift(t3);
range_check(output, 8, 13);
// Stage 2
t0 = (s6 - s5) * cospi_16_64;
t1 = (s6 + s5) * cospi_16_64;
t2 = (tran_low_t)fdct_round_shift(t0);
t3 = (tran_low_t)fdct_round_shift(t1);
// stage 2
step[0] = output[0] + output[3];
step[1] = output[1] + output[2];
step[2] = output[1] - output[2];
step[3] = output[0] - output[3];
step[4] = output[4];
temp = output[5] * -cospi_16_64 + output[6] * cospi_16_64;
step[5] = (tran_low_t)fdct_round_shift(temp);
temp = output[6] * cospi_16_64 + output[5] * cospi_16_64;
step[6] = (tran_low_t)fdct_round_shift(temp);
step[7] = output[7];
// Stage 3
x0 = s4 + t2;
x1 = s4 - t2;
x2 = s7 - t3;
x3 = s7 + t3;
range_check(step, 8, 14);
// Stage 4
t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
output[1] = (tran_low_t)fdct_round_shift(t0);
output[3] = (tran_low_t)fdct_round_shift(t2);
output[5] = (tran_low_t)fdct_round_shift(t1);
output[7] = (tran_low_t)fdct_round_shift(t3);
// stage 3
temp = step[0] * cospi_16_64 + step[1] * cospi_16_64;
output[0] = (tran_low_t)fdct_round_shift(temp);
temp = step[1] * -cospi_16_64 + step[0] * cospi_16_64;
output[1] = (tran_low_t)fdct_round_shift(temp);
temp = step[2] * cospi_24_64 + step[3] * cospi_8_64;
output[2] = (tran_low_t)fdct_round_shift(temp);
temp = step[3] * cospi_24_64 + step[2] * -cospi_8_64;
output[3] = (tran_low_t)fdct_round_shift(temp);
output[4] = step[4] + step[5];
output[5] = step[4] - step[5];
output[6] = step[7] - step[6];
output[7] = step[7] + step[6];
range_check(output, 8, 14);
// stage 4
step[0] = output[0];
step[1] = output[1];
step[2] = output[2];
step[3] = output[3];
temp = output[4] * cospi_28_64 + output[7] * cospi_4_64;
step[4] = (tran_low_t)fdct_round_shift(temp);
temp = output[5] * cospi_12_64 + output[6] * cospi_20_64;
step[5] = (tran_low_t)fdct_round_shift(temp);
temp = output[6] * cospi_12_64 + output[5] * -cospi_20_64;
step[6] = (tran_low_t)fdct_round_shift(temp);
temp = output[7] * cospi_28_64 + output[4] * -cospi_4_64;
step[7] = (tran_low_t)fdct_round_shift(temp);
range_check(step, 8, 14);
// stage 5
output[0] = step[0];
output[1] = step[4];
output[2] = step[2];
output[3] = step[6];
output[4] = step[1];
output[5] = step[5];
output[6] = step[3];
output[7] = step[7];
range_check(output, 8, 14);
}
static void fdct16(const tran_low_t in[16], tran_low_t out[16]) {
tran_high_t step1[8]; // canbe16
tran_high_t step2[8]; // canbe16
tran_high_t step3[8]; // canbe16
tran_high_t input[8]; // canbe16
tran_high_t temp1, temp2; // needs32
static void fdct16(const tran_low_t *input, tran_low_t *output) {
tran_high_t temp;
tran_low_t step[16];
// step 1
input[0] = in[0] + in[15];
input[1] = in[1] + in[14];
input[2] = in[2] + in[13];
input[3] = in[3] + in[12];
input[4] = in[4] + in[11];
input[5] = in[5] + in[10];
input[6] = in[6] + in[ 9];
input[7] = in[7] + in[ 8];
// stage 0
range_check(input, 16, 13);
step1[0] = in[7] - in[ 8];
step1[1] = in[6] - in[ 9];
step1[2] = in[5] - in[10];
step1[3] = in[4] - in[11];
step1[4] = in[3] - in[12];
step1[5] = in[2] - in[13];
step1[6] = in[1] - in[14];
step1[7] = in[0] - in[15];
// stage 1
output[0] = input[0] + input[15];
output[1] = input[1] + input[14];
output[2] = input[2] + input[13];
output[3] = input[3] + input[12];
output[4] = input[4] + input[11];
output[5] = input[5] + input[10];
output[6] = input[6] + input[9];
output[7] = input[7] + input[8];
output[8] = input[7] - input[8];
output[9] = input[6] - input[9];
output[10] = input[5] - input[10];
output[11] = input[4] - input[11];
output[12] = input[3] - input[12];
output[13] = input[2] - input[13];
output[14] = input[1] - input[14];
output[15] = input[0] - input[15];
// fdct8(step, step);
{
tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
tran_high_t t0, t1, t2, t3; // needs32
tran_high_t x0, x1, x2, x3; // canbe16
range_check(output, 16, 14);
// stage 1
s0 = input[0] + input[7];
s1 = input[1] + input[6];
s2 = input[2] + input[5];
s3 = input[3] + input[4];
s4 = input[3] - input[4];
s5 = input[2] - input[5];
s6 = input[1] - input[6];
s7 = input[0] - input[7];
// stage 2
step[0] = output[0] + output[7];
step[1] = output[1] + output[6];
step[2] = output[2] + output[5];
step[3] = output[3] + output[4];
step[4] = output[3] - output[4];
step[5] = output[2] - output[5];
step[6] = output[1] - output[6];
step[7] = output[0] - output[7];
step[8] = output[8];
step[9] = output[9];
temp = output[10] * -cospi_16_64 + output[13] * cospi_16_64;
step[10] = (tran_low_t)fdct_round_shift(temp);
temp = output[11] * -cospi_16_64 + output[12] * cospi_16_64;
step[11] = (tran_low_t)fdct_round_shift(temp);
temp = output[12] * cospi_16_64 + output[11] * cospi_16_64;
step[12] = (tran_low_t)fdct_round_shift(temp);
temp = output[13] * cospi_16_64 + output[10] * cospi_16_64;
step[13] = (tran_low_t)fdct_round_shift(temp);
step[14] = output[14];
step[15] = output[15];
// fdct4(step, step);
x0 = s0 + s3;
x1 = s1 + s2;
x2 = s1 - s2;
x3 = s0 - s3;
t0 = (x0 + x1) * cospi_16_64;
t1 = (x0 - x1) * cospi_16_64;
t2 = x3 * cospi_8_64 + x2 * cospi_24_64;
t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
out[0] = (tran_low_t)fdct_round_shift(t0);
out[4] = (tran_low_t)fdct_round_shift(t2);
out[8] = (tran_low_t)fdct_round_shift(t1);
out[12] = (tran_low_t)fdct_round_shift(t3);
range_check(step, 16, 15);
// Stage 2
t0 = (s6 - s5) * cospi_16_64;
t1 = (s6 + s5) * cospi_16_64;
t2 = fdct_round_shift(t0);
t3 = fdct_round_shift(t1);
// stage 3
output[0] = step[0] + step[3];
output[1] = step[1] + step[2];
output[2] = step[1] - step[2];
output[3] = step[0] - step[3];
output[4] = step[4];
temp = step[5] * -cospi_16_64 + step[6] * cospi_16_64;
output[5] = (tran_low_t)fdct_round_shift(temp);
temp = step[6] * cospi_16_64 + step[5] * cospi_16_64;
output[6] = (tran_low_t)fdct_round_shift(temp);
output[7] = step[7];
output[8] = step[8] + step[11];
output[9] = step[9] + step[10];
output[10] = step[9] - step[10];
output[11] = step[8] - step[11];
output[12] = step[15] - step[12];
output[13] = step[14] - step[13];
output[14] = step[14] + step[13];
output[15] = step[15] + step[12];
// Stage 3
x0 = s4 + t2;
x1 = s4 - t2;
x2 = s7 - t3;
x3 = s7 + t3;
range_check(output, 16, 16);
// Stage 4
t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
out[2] = (tran_low_t)fdct_round_shift(t0);
out[6] = (tran_low_t)fdct_round_shift(t2);
out[10] = (tran_low_t)fdct_round_shift(t1);
out[14] = (tran_low_t)fdct_round_shift(t3);
}
// stage 4
temp = output[0] * cospi_16_64 + output[1] * cospi_16_64;
step[0] = (tran_low_t)fdct_round_shift(temp);
temp = output[1] * -cospi_16_64 + output[0] * cospi_16_64;
step[1] = (tran_low_t)fdct_round_shift(temp);
temp = output[2] * cospi_24_64 + output[3] * cospi_8_64;
step[2] = (tran_low_t)fdct_round_shift(temp);
temp = output[3] * cospi_24_64 + output[2] * -cospi_8_64;
step[3] = (tran_low_t)fdct_round_shift(temp);
step[4] = output[4] + output[5];
step[5] = output[4] - output[5];
step[6] = output[7] - output[6];
step[7] = output[7] + output[6];
step[8] = output[8];
temp = output[9] * -cospi_8_64 + output[14] * cospi_24_64;
step[9] = (tran_low_t)fdct_round_shift(temp);
temp = output[10] * -cospi_24_64 + output[13] * -cospi_8_64;
step[10] = (tran_low_t)fdct_round_shift(temp);
step[11] = output[11];
step[12] = output[12];
temp = output[13] * cospi_24_64 + output[10] * -cospi_8_64;
step[13] = (tran_low_t)fdct_round_shift(temp);
temp = output[14] * cospi_8_64 + output[9] * cospi_24_64;
step[14] = (tran_low_t)fdct_round_shift(temp);
step[15] = output[15];
// step 2
temp1 = (step1[5] - step1[2]) * cospi_16_64;
temp2 = (step1[4] - step1[3]) * cospi_16_64;
step2[2] = fdct_round_shift(temp1);
step2[3] = fdct_round_shift(temp2);
temp1 = (step1[4] + step1[3]) * cospi_16_64;
temp2 = (step1[5] + step1[2]) * cospi_16_64;
step2[4] = fdct_round_shift(temp1);
step2[5] = fdct_round_shift(temp2);
range_check(step, 16, 16);
// step 3
step3[0] = step1[0] + step2[3];
step3[1] = step1[1] + step2[2];
step3[2] = step1[1] - step2[2];
step3[3] = step1[0] - step2[3];
step3[4] = step1[7] - step2[4];
step3[5] = step1[6] - step2[5];
step3[6] = step1[6] + step2[5];
step3[7] = step1[7] + step2[4];
// stage 5
output[0] = step[0];
output[1] = step[1];
output[2] = step[2];
output[3] = step[3];
temp = step[4] * cospi_28_64 + step[7] * cospi_4_64;
output[4] = (tran_low_t)fdct_round_shift(temp);
temp = step[5] * cospi_12_64 + step[6] * cospi_20_64;
output[5] = (tran_low_t)fdct_round_shift(temp);
temp = step[6] * cospi_12_64 + step[5] * -cospi_20_64;
output[6] = (tran_low_t)fdct_round_shift(temp);
temp = step[7] * cospi_28_64 + step[4] * -cospi_4_64;
output[7] = (tran_low_t)fdct_round_shift(temp);
output[8] = step[8] + step[9];
output[9] = step[8] - step[9];
output[10] = step[11] - step[10];
output[11] = step[11] + step[10];
output[12] = step[12] + step[13];
output[13] = step[12] - step[13];
output[14] = step[15] - step[14];
output[15] = step[15] + step[14];
// step 4
temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64;
temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64;
step2[1] = fdct_round_shift(temp1);
step2[2] = fdct_round_shift(temp2);
temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64;
step2[5] = fdct_round_shift(temp1);
step2[6] = fdct_round_shift(temp2);
range_check(output, 16, 16);
// step 5
step1[0] = step3[0] + step2[1];
step1[1] = step3[0] - step2[1];
step1[2] = step3[3] + step2[2];
step1[3] = step3[3] - step2[2];
step1[4] = step3[4] - step2[5];
step1[5] = step3[4] + step2[5];
step1[6] = step3[7] - step2[6];
step1[7] = step3[7] + step2[6];
// stage 6
step[0] = output[0];
step[1] = output[1];
step[2] = output[2];
step[3] = output[3];
step[4] = output[4];
step[5] = output[5];
step[6] = output[6];
step[7] = output[7];
temp = output[8] * cospi_30_64 + output[15] * cospi_2_64;
step[8] = (tran_low_t)fdct_round_shift(temp);
temp = output[9] * cospi_14_64 + output[14] * cospi_18_64;
step[9] = (tran_low_t)fdct_round_shift(temp);
temp = output[10] * cospi_22_64 + output[13] * cospi_10_64;
step[10] = (tran_low_t)fdct_round_shift(temp);
temp = output[11] * cospi_6_64 + output[12] * cospi_26_64;
step[11] = (tran_low_t)fdct_round_shift(temp);
temp = output[12] * cospi_6_64 + output[11] * -cospi_26_64;
step[12] = (tran_low_t)fdct_round_shift(temp);
temp = output[13] * cospi_22_64 + output[10] * -cospi_10_64;
step[13] = (tran_low_t)fdct_round_shift(temp);
temp = output[14] * cospi_14_64 + output[9] * -cospi_18_64;
step[14] = (tran_low_t)fdct_round_shift(temp);
temp = output[15] * cospi_30_64 + output[8] * -cospi_2_64;
step[15] = (tran_low_t)fdct_round_shift(temp);
// step 6
temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64;
temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
out[1] = (tran_low_t)fdct_round_shift(temp1);
out[9] = (tran_low_t)fdct_round_shift(temp2);
range_check(step, 16, 16);
temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64;
out[5] = (tran_low_t)fdct_round_shift(temp1);
out[13] = (tran_low_t)fdct_round_shift(temp2);
// stage 7
output[0] = step[0];
output[1] = step[8];
output[2] = step[4];
output[3] = step[12];
output[4] = step[2];
output[5] = step[10];
output[6] = step[6];
output[7] = step[14];
output[8] = step[1];
output[9] = step[9];
output[10] = step[5];
output[11] = step[13];
output[12] = step[3];
output[13] = step[11];
output[14] = step[7];
output[15] = step[15];
temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64;
temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
out[3] = (tran_low_t)fdct_round_shift(temp1);
out[11] = (tran_low_t)fdct_round_shift(temp2);
range_check(output, 16, 16);
}
temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;
out[7] = (tran_low_t)fdct_round_shift(temp1);
out[15] = (tran_low_t)fdct_round_shift(temp2);
static void fdct32(const tran_low_t *input, tran_low_t *output) {
tran_high_t temp;
tran_low_t step[32];
// stage 0
range_check(input, 32, 14);
// stage 1
output[0] = input[0] + input[31];
output[1] = input[1] + input[30];
output[2] = input[2] + input[29];
output[3] = input[3] + input[28];
output[4] = input[4] + input[27];
output[5] = input[5] + input[26];
output[6] = input[6] + input[25];
output[7] = input[7] + input[24];
output[8] = input[8] + input[23];
output[9] = input[9] + input[22];
output[10] = input[10] + input[21];
output[11] = input[11] + input[20];
output[12] = input[12] + input[19];
output[13] = input[13] + input[18];
output[14] = input[14] + input[17];
output[15] = input[15] + input[16];
output[16] = input[15] - input[16];
output[17] = input[14] - input[17];
output[18] = input[13] - input[18];
output[19] = input[12] - input[19];
output[20] = input[11] - input[20];
output[21] = input[10] - input[21];
output[22] = input[9] - input[22];
output[23] = input[8] - input[23];
output[24] = input[7] - input[24];
output[25] = input[6] - input[25];
output[26] = input[5] - input[26];
output[27] = input[4] - input[27];
output[28] = input[3] - input[28];
output[29] = input[2] - input[29];
output[30] = input[1] - input[30];
output[31] = input[0] - input[31];
range_check(output, 32, 15);
// stage 2
step[0] = output[0] + output[15];
step[1] = output[1] + output[14];
step[2] = output[2] + output[13];
step[3] = output[3] + output[12];
step[4] = output[4] + output[11];
step[5] = output[5] + output[10];
step[6] = output[6] + output[9];
step[7] = output[7] + output[8];
step[8] = output[7] - output[8];
step[9] = output[6] - output[9];
step[10] = output[5] - output[10];
step[11] = output[4] - output[11];
step[12] = output[3] - output[12];
step[13] = output[2] - output[13];
step[14] = output[1] - output[14];
step[15] = output[0] - output[15];
step[16] = output[16];
step[17] = output[17];
step[18] = output[18];
step[19] = output[19];
temp = output[20] * -cospi_16_64 + output[27] * cospi_16_64;
step[20] = (tran_low_t)fdct_round_shift(temp);
temp = output[21] * -cospi_16_64 + output[26] * cospi_16_64;
step[21] = (tran_low_t)fdct_round_shift(temp);
temp = output[22] * -cospi_16_64 + output[25] * cospi_16_64;
step[22] = (tran_low_t)fdct_round_shift(temp);
temp = output[23] * -cospi_16_64 + output[24] * cospi_16_64;
step[23] = (tran_low_t)fdct_round_shift(temp);
temp = output[24] * cospi_16_64 + output[23] * cospi_16_64;
step[24] = (tran_low_t)fdct_round_shift(temp);
temp = output[25] * cospi_16_64 + output[22] * cospi_16_64;
step[25] = (tran_low_t)fdct_round_shift(temp);
temp = output[26] * cospi_16_64 + output[21] * cospi_16_64;
step[26] = (tran_low_t)fdct_round_shift(temp);
temp = output[27] * cospi_16_64 + output[20] * cospi_16_64;
step[27] = (tran_low_t)fdct_round_shift(temp);
step[28] = output[28];
step[29] = output[29];
step[30] = output[30];
step[31] = output[31];
range_check(step, 32, 16);
// stage 3
output[0] = step[0] + step[7];
output[1] = step[1] + step[6];
output[2] = step[2] + step[5];
output[3] = step[3] + step[4];
output[4] = step[3] - step[4];
output[5] = step[2] - step[5];
output[6] = step[1] - step[6];
output[7] = step[0] - step[7];
output[8] = step[8];
output[9] = step[9];
temp = step[10] * -cospi_16_64 + step[13] * cospi_16_64;
output[10] = (tran_low_t)fdct_round_shift(temp);
temp = step[11] * -cospi_16_64 + step[12] * cospi_16_64;
output[11] = (tran_low_t)fdct_round_shift(temp);
temp = step[12] * cospi_16_64 + step[11] * cospi_16_64;
output[12] = (tran_low_t)fdct_round_shift(temp);
temp = step[13] * cospi_16_64 + step[10] * cospi_16_64;
output[13] = (tran_low_t)fdct_round_shift(temp);
output[14] = step[14];
output[15] = step[15];
output[16] = step[16] + step[23];
output[17] = step[17] + step[22];
output[18] = step[18] + step[21];
output[19] = step[19] + step[20];
output[20] = step[19] - step[20];
output[21] = step[18] - step[21];
output[22] = step[17] - step[22];
output[23] = step[16] - step[23];
output[24] = step[31] - step[24];
output[25] = step[30] - step[25];
output[26] = step[29] - step[26];
output[27] = step[28] - step[27];
output[28] = step[28] + step[27];
output[29] = step[29] + step[26];
output[30] = step[30] + step[25];
output[31] = step[31] + step[24];
range_check(output, 32, 17);
// stage 4
step[0] = output[0] + output[3];
step[1] = output[1] + output[2];
step[2] = output[1] - output[2];
step[3] = output[0] - output[3];
step[4] = output[4];
temp = output[5] * -cospi_16_64 + output[6] * cospi_16_64;
step[5] = (tran_low_t)fdct_round_shift(temp);
temp = output[6] * cospi_16_64 + output[5] * cospi_16_64;
step[6] = (tran_low_t)fdct_round_shift(temp);
step[7] = output[7];
step[8] = output[8] + output[11];
step[9] = output[9] + output[10];
step[10] = output[9] - output[10];
step[11] = output[8] - output[11];
step[12] = output[15] - output[12];
step[13] = output[14] - output[13];
step[14] = output[14] + output[13];
step[15] = output[15] + output[12];
step[16] = output[16];
step[17] = output[17];
temp = output[18] * -cospi_8_64 + output[29] * cospi_24_64;
step[18] = (tran_low_t)fdct_round_shift(temp);
temp = output[19] * -cospi_8_64 + output[28] * cospi_24_64;
step[19] = (tran_low_t)fdct_round_shift(temp);
temp = output[20] * -cospi_24_64 + output[27] * -cospi_8_64;
step[20] = (tran_low_t)fdct_round_shift(temp);
temp = output[21] * -cospi_24_64 + output[26] * -cospi_8_64;
step[21] = (tran_low_t)fdct_round_shift(temp);
step[22] = output[22];
step[23] = output[23];
step[24] = output[24];
step[25] = output[25];
temp = output[26] * cospi_24_64 + output[21] * -cospi_8_64;
step[26] = (tran_low_t)fdct_round_shift(temp);
temp = output[27] * cospi_24_64 + output[20] * -cospi_8_64;
step[27] = (tran_low_t)fdct_round_shift(temp);
temp = output[28] * cospi_8_64 + output[19] * cospi_24_64;
step[28] = (tran_low_t)fdct_round_shift(temp);
temp = output[29] * cospi_8_64 + output[18] * cospi_24_64;
step[29] = (tran_low_t)fdct_round_shift(temp);
step[30] = output[30];
step[31] = output[31];
range_check(step, 32, 18);
// stage 5
temp = step[0] * cospi_16_64 + step[1] * cospi_16_64;
output[0] = (tran_low_t)fdct_round_shift(temp);
temp = step[1] * -cospi_16_64 + step[0] * cospi_16_64;
output[1] = (tran_low_t)fdct_round_shift(temp);
temp = step[2] * cospi_24_64 + step[3] * cospi_8_64;
output[2] = (tran_low_t)fdct_round_shift(temp);
temp = step[3] * cospi_24_64 + step[2] * -cospi_8_64;
output[3] = (tran_low_t)fdct_round_shift(temp);
output[4] = step[4] + step[5];
output[5] = step[4] - step[5];
output[6] = step[7] - step[6];
output[7] = step[7] + step[6];
output[8] = step[8];
temp = step[9] * -cospi_8_64 + step[14] * cospi_24_64;
output[9] = (tran_low_t)fdct_round_shift(temp);
temp = step[10] * -cospi_24_64 + step[13] * -cospi_8_64;
output[10] = (tran_low_t)fdct_round_shift(temp);
output[11] = step[11];
output[12] = step[12];
temp = step[13] * cospi_24_64 + step[10] * -cospi_8_64;
output[13] = (tran_low_t)fdct_round_shift(temp);
temp = step[14] * cospi_8_64 + step[9] * cospi_24_64;
output[14] = (tran_low_t)fdct_round_shift(temp);
output[15] = step[15];
output[16] = step[16] + step[19];
output[17] = step[17] + step[18];
output[18] = step[17] - step[18];
output[19] = step[16] - step[19];
output[20] = step[23] - step[20];
output[21] = step[22] - step[21];
output[22] = step[22] + step[21];
output[23] = step[23] + step[20];
output[24] = step[24] + step[27];
output[25] = step[25] + step[26];
output[26] = step[25] - step[26];
output[27] = step[24] - step[27];
output[28] = step[31] - step[28];
output[29] = step[30] - step[29];
output[30] = step[30] + step[29];
output[31] = step[31] + step[28];
range_check(output, 32, 18);
// stage 6
step[0] = output[0];
step[1] = output[1];
step[2] = output[2];
step[3] = output[3];
temp = output[4] * cospi_28_64 + output[7] * cospi_4_64;
step[4] = (tran_low_t)fdct_round_shift(temp);
temp = output[5] * cospi_12_64 + output[6] * cospi_20_64;
step[5] = (tran_low_t)fdct_round_shift(temp);
temp = output[6] * cospi_12_64 + output[5] * -cospi_20_64;
step[6] = (tran_low_t)fdct_round_shift(temp);
temp = output[7] * cospi_28_64 + output[4] * -cospi_4_64;
step[7] = (tran_low_t)fdct_round_shift(temp);
step[8] = output[8] + output[9];
step[9] = output[8] - output[9];
step[10] = output[11] - output[10];
step[11] = output[11] + output[10];
step[12] = output[12] + output[13];
step[13] = output[12] - output[13];
step[14] = output[15] - output[14];
step[15] = output[15] + output[14];
step[16] = output[16];
temp = output[17] * -cospi_4_64 + output[30] * cospi_28_64;
step[17] = (tran_low_t)fdct_round_shift(temp);
temp = output[18] * -cospi_28_64 + output[29] * -cospi_4_64;
step[18] = (tran_low_t)fdct_round_shift(temp);
step[19] = output[19];
step[20] = output[20];
temp = output[21] * -cospi_20_64 + output[26] * cospi_12_64;
step[21] = (tran_low_t)fdct_round_shift(temp);
temp = output[22] * -cospi_12_64 + output[25] * -cospi_20_64;
step[22] = (tran_low_t)fdct_round_shift(temp);
step[23] = output[23];
step[24] = output[24];
temp = output[25] * cospi_12_64 + output[22] * -cospi_20_64;
step[25] = (tran_low_t)fdct_round_shift(temp);
temp = output[26] * cospi_20_64 + output[21] * cospi_12_64;
step[26] = (tran_low_t)fdct_round_shift(temp);
step[27] = output[27];
step[28] = output[28];
temp = output[29] * cospi_28_64 + output[18] * -cospi_4_64;
step[29] = (tran_low_t)fdct_round_shift(temp);
temp = output[30] * cospi_4_64 + output[17] * cospi_28_64;
step[30] = (tran_low_t)fdct_round_shift(temp);
step[31] = output[31];
range_check(step, 32, 18);
// stage 7
output[0] = step[0];
output[1] = step[1];
output[2] = step[2];
output[3] = step[3];
output[4] = step[4];
output[5] = step[5];
output[6] = step[6];
output[7] = step[7];
temp = step[8] * cospi_30_64 + step[15] * cospi_2_64;
output[8] = (tran_low_t)fdct_round_shift(temp);
temp = step[9] * cospi_14_64 + step[14] * cospi_18_64;
output[9] = (tran_low_t)fdct_round_shift(temp);
temp = step[10] * cospi_22_64 + step[13] * cospi_10_64;
output[10] = (tran_low_t)fdct_round_shift(temp);
temp = step[11] * cospi_6_64 + step[12] * cospi_26_64;
output[11] = (tran_low_t)fdct_round_shift(temp);
temp = step[12] * cospi_6_64 + step[11] * -cospi_26_64;
output[12] = (tran_low_t)fdct_round_shift(temp);
temp = step[13] * cospi_22_64 + step[10] * -cospi_10_64;
output[13] = (tran_low_t)fdct_round_shift(temp);
temp = step[14] * cospi_14_64 + step[9] * -cospi_18_64;
output[14] = (tran_low_t)fdct_round_shift(temp);
temp = step[15] * cospi_30_64 + step[8] * -cospi_2_64;
output[15] = (tran_low_t)fdct_round_shift(temp);
output[16] = step[16] + step[17];
output[17] = step[16] - step[17];
output[18] = step[19] - step[18];
output[19] = step[19] + step[18];
output[20] = step[20] + step[21];
output[21] = step[20] - step[21];
output[22] = step[23] - step[22];
output[23] = step[23] + step[22];
output[24] = step[24] + step[25];
output[25] = step[24] - step[25];
output[26] = step[27] - step[26];
output[27] = step[27] + step[26];
output[28] = step[28] + step[29];
output[29] = step[28] - step[29];
output[30] = step[31] - step[30];
output[31] = step[31] + step[30];
range_check(output, 32, 18);
// stage 8
step[0] = output[0];
step[1] = output[1];
step[2] = output[2];
step[3] = output[3];
step[4] = output[4];
step[5] = output[5];
step[6] = output[6];
step[7] = output[7];
step[8] = output[8];
step[9] = output[9];
step[10] = output[10];
step[11] = output[11];
step[12] = output[12];
step[13] = output[13];
step[14] = output[14];
step[15] = output[15];
temp = output[16] * cospi_31_64 + output[31] * cospi_1_64;
step[16] = (tran_low_t)fdct_round_shift(temp);
temp = output[17] * cospi_15_64 + output[30] * cospi_17_64;
step[17] = (tran_low_t)fdct_round_shift(temp);
temp = output[18] * cospi_23_64 + output[29] * cospi_9_64;
step[18] = (tran_low_t)fdct_round_shift(temp);
temp = output[19] * cospi_7_64 + output[28] * cospi_25_64;
step[19] = (tran_low_t)fdct_round_shift(temp);
temp = output[20] * cospi_27_64 + output[27] * cospi_5_64;
step[20] = (tran_low_t)fdct_round_shift(temp);
temp = output[21] * cospi_11_64 + output[26] * cospi_21_64;
step[21] = (tran_low_t)fdct_round_shift(temp);
temp = output[22] * cospi_19_64 + output[25] * cospi_13_64;
step[22] = (tran_low_t)fdct_round_shift(temp);
temp = output[23] * cospi_3_64 + output[24] * cospi_29_64;
step[23] = (tran_low_t)fdct_round_shift(temp);
temp = output[24] * cospi_3_64 + output[23] * -cospi_29_64;
step[24] = (tran_low_t)fdct_round_shift(temp);
temp = output[25] * cospi_19_64 + output[22] * -cospi_13_64;
step[25] = (tran_low_t)fdct_round_shift(temp);
temp = output[26] * cospi_11_64 + output[21] * -cospi_21_64;
step[26] = (tran_low_t)fdct_round_shift(temp);
temp = output[27] * cospi_27_64 + output[20] * -cospi_5_64;
step[27] = (tran_low_t)fdct_round_shift(temp);
temp = output[28] * cospi_7_64 + output[19] * -cospi_25_64;
step[28] = (tran_low_t)fdct_round_shift(temp);
temp = output[29] * cospi_23_64 + output[18] * -cospi_9_64;
step[29] = (tran_low_t)fdct_round_shift(temp);
temp = output[30] * cospi_15_64 + output[17] * -cospi_17_64;
step[30] = (tran_low_t)fdct_round_shift(temp);
temp = output[31] * cospi_31_64 + output[16] * -cospi_1_64;
step[31] = (tran_low_t)fdct_round_shift(temp);
range_check(step, 32, 18);
// stage 9
output[0] = step[0];
output[1] = step[16];
output[2] = step[8];
output[3] = step[24];
output[4] = step[4];
output[5] = step[20];
output[6] = step[12];
output[7] = step[28];
output[8] = step[2];
output[9] = step[18];
output[10] = step[10];
output[11] = step[26];
output[12] = step[6];
output[13] = step[22];
output[14] = step[14];
output[15] = step[30];
output[16] = step[1];
output[17] = step[17];
output[18] = step[9];
output[19] = step[25];
output[20] = step[5];
output[21] = step[21];
output[22] = step[13];
output[23] = step[29];
output[24] = step[3];
output[25] = step[19];
output[26] = step[11];
output[27] = step[27];
output[28] = step[7];
output[29] = step[23];
output[30] = step[15];
output[31] = step[31];
range_check(output, 32, 18);
}
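Two helpers recur through fdct16 and fdct32 above but are defined elsewhere in dct.c. A minimal sketch of both follows, assuming the usual libvpx definitions (tran_low_t/tran_high_t from vpx_dsp/vpx_dsp_common.h, ROUND_POWER_OF_TWO, DCT_CONST_BITS == 14, and <assert.h>/<stdlib.h> included); treat it as an illustration, not the canonical source.

/* Rounds away the 14-bit scale carried by the cospi_*_64 constants:
 * add half the scale factor, then shift it out. */
static INLINE tran_high_t fdct_round_shift(tran_high_t input) {
  return ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);  /* (x + 2^13) >> 14 */
}

/* Backs the range_check(output, 16, 14) style calls above: asserts each
 * value fits in 'bit' bits plus sign, and compiles away otherwise. */
static INLINE void range_check(const tran_low_t *input, const int size,
                               const int bit) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
  int i;
  for (i = 0; i < size; ++i)
    assert(abs(input[i]) < (1 << bit));
#else
  (void)input;
  (void)size;
  (void)bit;
#endif
}

Every butterfly above then follows one pattern: a 32-bit tran_high_t product sum, fdct_round_shift, and a narrowing cast back to tran_low_t.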
static void fadst4(const tran_low_t *input, tran_low_t *output) {
@ -727,19 +1215,19 @@ void vp10_fdct8x8_quant_c(const int16_t *input, int stride,
output[4 * 8] = (tran_low_t)fdct_round_shift(t1);
output[6 * 8] = (tran_low_t)fdct_round_shift(t3);
// Stage 2
// stage 2
t0 = (s6 - s5) * cospi_16_64;
t1 = (s6 + s5) * cospi_16_64;
t2 = fdct_round_shift(t0);
t3 = fdct_round_shift(t1);
// Stage 3
// stage 3
x0 = s4 + t2;
x1 = s4 - t2;
x2 = s7 - t3;
x3 = s7 + t3;
// Stage 4
// stage 4
t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;

View File

@ -11,6 +11,7 @@
#include <assert.h>
#include <limits.h>
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_scale/yv12config.h"
#include "vpx/vpx_integer.h"
#include "vp10/common/reconinter.h"
@ -124,10 +125,10 @@ int vp10_denoiser_filter_c(const uint8_t *sig, int sig_stride,
adj = adj_val[2];
}
if (diff > 0) {
avg[c] = MIN(UINT8_MAX, sig[c] + adj);
avg[c] = VPXMIN(UINT8_MAX, sig[c] + adj);
total_adj += adj;
} else {
avg[c] = MAX(0, sig[c] - adj);
avg[c] = VPXMAX(0, sig[c] - adj);
total_adj -= adj;
}
}
@ -164,13 +165,13 @@ int vp10_denoiser_filter_c(const uint8_t *sig, int sig_stride,
// Diff positive means we made positive adjustment above
// (in first try/attempt), so now make negative adjustment to bring
// denoised signal down.
avg[c] = MAX(0, avg[c] - adj);
avg[c] = VPXMAX(0, avg[c] - adj);
total_adj -= adj;
} else {
// Diff negative means we made negative adjustment above
// (in first try/attempt), so now make positive adjustment to bring
// denoised signal up.
avg[c] = MIN(UINT8_MAX, avg[c] + adj);
avg[c] = VPXMIN(UINT8_MAX, avg[c] + adj);
total_adj += adj;
}
}
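The MIN/MAX to VPXMIN/VPXMAX renames in this file are mechanical; vpx_dsp/vpx_dsp_common.h (newly included above) supplies the same ternary macros. A sketch of the macros and of the clamp they perform in the denoiser:

#define VPXMIN(x, y) (((x) < (y)) ? (x) : (y))
#define VPXMAX(x, y) (((x) > (y)) ? (x) : (y))

/* Keep the adjusted sample inside the 8-bit range:
 *   a positive adjustment may not overflow UINT8_MAX,
 *   a negative adjustment may not underflow 0. */
avg[c] = (diff > 0) ? VPXMIN(UINT8_MAX, sig[c] + adj)
                    : VPXMAX(0, sig[c] - adj);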

File diff suppressed because it is too large

View File

@ -1380,7 +1380,7 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
if (p->eobs[block])
*(args->skip) = 0;
if (x->skip_encode || p->eobs[block] == 0)
if (p->eobs[block] == 0)
return;
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
@ -1528,8 +1528,7 @@ void vp10_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
src_diff = &p->src_diff[4 * (j * diff_stride + i)];
mode = plane == 0 ? get_y_mode(xd->mi[0], block) : mbmi->uv_mode;
vp10_predict_intra_block(xd, bwl, tx_size, mode, x->skip_encode ? src : dst,
x->skip_encode ? src_stride : dst_stride,
vp10_predict_intra_block(xd, bwl, tx_size, mode, dst, dst_stride,
dst, dst_stride, i, j, plane);
#if CONFIG_VP9_HIGHBITDEPTH
@ -1546,7 +1545,7 @@ void vp10_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
qcoeff, dqcoeff, pd->dequant, eob,
scan_order->scan, scan_order->iscan);
}
if (!x->skip_encode && *eob)
if (*eob)
vp10_highbd_inv_txfm_add_32x32(dqcoeff, dst, dst_stride, *eob, xd->bd,
tx_type);
break;
@ -1560,7 +1559,7 @@ void vp10_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
pd->dequant, eob,
scan_order->scan, scan_order->iscan);
}
if (!x->skip_encode && *eob)
if (*eob)
vp10_highbd_inv_txfm_add_16x16(dqcoeff, dst, dst_stride, *eob, xd->bd,
tx_type);
break;
@ -1574,7 +1573,7 @@ void vp10_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
pd->dequant, eob,
scan_order->scan, scan_order->iscan);
}
if (!x->skip_encode && *eob)
if (*eob)
vp10_highbd_inv_txfm_add_8x8(dqcoeff, dst, dst_stride, *eob, xd->bd,
tx_type);
break;
@ -1590,7 +1589,7 @@ void vp10_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
scan_order->scan, scan_order->iscan);
}
if (!x->skip_encode && *eob)
if (*eob)
// this is like vp10_short_idct4x4 but has a special case around
// eob<=1 which is significant (not just an optimization) for the
// lossless case.
@ -1619,7 +1618,7 @@ void vp10_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
pd->dequant, eob, scan_order->scan,
scan_order->iscan);
}
if (!x->skip_encode && *eob)
if (*eob)
vp10_inv_txfm_add_32x32(dqcoeff, dst, dst_stride, *eob, tx_type);
break;
case TX_16X16:
@ -1632,7 +1631,7 @@ void vp10_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
pd->dequant, eob, scan_order->scan,
scan_order->iscan);
}
if (!x->skip_encode && *eob)
if (*eob)
vp10_inv_txfm_add_16x16(dqcoeff, dst, dst_stride, *eob, tx_type);
break;
case TX_8X8:
@ -1645,7 +1644,7 @@ void vp10_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
pd->dequant, eob, scan_order->scan,
scan_order->iscan);
}
if (!x->skip_encode && *eob)
if (*eob)
vp10_inv_txfm_add_8x8(dqcoeff, dst, dst_stride, *eob, tx_type);
break;
case TX_4X4:
@ -1659,7 +1658,7 @@ void vp10_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
scan_order->iscan);
}
if (!x->skip_encode && *eob) {
if (*eob) {
// this is like vp10_short_idct4x4 but has a special case around eob<=1
// which is significant (not just an optimization) for the lossless
// case.
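Every hunk in this file makes the same change: reconstruction of the intra block is now gated only on the end-of-block count, not on x->skip_encode. A condensed sketch of the per-transform-size flow after the patch (names follow the surrounding code; the 8x8 case stands in for all sizes):

/* Quantize the forward transform, then add the inverse transform back
 * onto the predictor only when a coefficient survived quantization;
 * an all-zero block (*eob == 0) needs no reconstruction. */
vpx_quantize_b(coeff, n_coeffs, x->skip_block, p->zbin, p->round, p->quant,
               p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
               scan_order->scan, scan_order->iscan);
if (*eob)  /* previously also required !x->skip_encode */
  vp10_inv_txfm_add_8x8(dqcoeff, dst, dst_stride, *eob, tx_type);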

View File

@ -16,6 +16,8 @@
#include "vp10/encoder/cost.h"
#include "vp10/encoder/encodemv.h"
#include "vpx_dsp/vpx_dsp_common.h"
static struct vp10_token mv_joint_encodings[MV_JOINTS];
static struct vp10_token mv_class_encodings[MV_CLASSES];
static struct vp10_token mv_fp_encodings[MV_FP_SIZE];
@ -216,8 +218,8 @@ void vp10_encode_mv(VP10_COMP* cpi, vpx_writer* w,
// If auto_mv_step_size is enabled then keep track of the largest
// motion vector component used.
if (cpi->sf.mv.auto_mv_step_size) {
unsigned int maxv = MAX(abs(mv->row), abs(mv->col)) >> 3;
cpi->max_mv_magnitude = MAX(maxv, cpi->max_mv_magnitude);
unsigned int maxv = VPXMAX(abs(mv->row), abs(mv->col)) >> 3;
cpi->max_mv_magnitude = VPXMAX(maxv, cpi->max_mv_magnitude);
}
}
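A note on the >> 3 above: motion vectors are stored in eighth-pel units, so the shift converts the largest component to whole pixels before folding it into max_mv_magnitude, which set_mv_search_params later uses to bound the search range. A sketch with illustrative values:

/* mv = {row: -36, col: 50} in 1/8-pel units, i.e. -4.5 and 6.25 pixels */
unsigned int maxv = VPXMAX(abs(-36), abs(50)) >> 3;   /* 50 >> 3 == 6 */
cpi->max_mv_magnitude = VPXMAX(maxv, cpi->max_mv_magnitude);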
@ -237,7 +239,7 @@ static void inc_mvs(const MB_MODE_INFO *mbmi, const MB_MODE_INFO_EXT *mbmi_ext,
const MV *ref = &mbmi_ext->ref_mvs[mbmi->ref_frame[i]][0].as_mv;
const MV diff = {mvs[i].as_mv.row - ref->row,
mvs[i].as_mv.col - ref->col};
vp10_inc_mv(&diff, counts);
vp10_inc_mv(&diff, counts, vp10_use_mv_hp(ref));
}
}

View File

@ -42,7 +42,6 @@
#include "vp10/encoder/segmentation.h"
#include "vp10/encoder/skin_detection.h"
#include "vp10/encoder/speed_features.h"
#include "vp10/encoder/svc_layercontext.h"
#include "vp10/encoder/temporal_filter.h"
#include "./vp10_rtcd.h"
@ -52,6 +51,7 @@
#if CONFIG_INTERNAL_STATS
#include "vpx_dsp/ssim.h"
#endif
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_dsp/vpx_filter.h"
#include "vpx_ports/mem.h"
#include "vpx_ports/system_state.h"
@ -238,13 +238,11 @@ static void setup_frame(VP10_COMP *cpi) {
if (frame_is_intra_only(cm) || cm->error_resilient_mode) {
vp10_setup_past_independence(cm);
} else {
if (!cpi->use_svc)
cm->frame_context_idx = cpi->refresh_alt_ref_frame;
cm->frame_context_idx = cpi->refresh_alt_ref_frame;
}
if (cm->frame_type == KEY_FRAME) {
if (!is_two_pass_svc(cpi))
cpi->refresh_golden_frame = 1;
cpi->refresh_golden_frame = 1;
cpi->refresh_alt_ref_frame = 1;
vp10_zero(cpi->interp_filter_selected);
} else {
@ -337,7 +335,6 @@ void vp10_initialize_enc(void) {
static void dealloc_compressor_data(VP10_COMP *cpi) {
VP10_COMMON *const cm = &cpi->common;
int i;
vpx_free(cpi->mbmi_ext_base);
cpi->mbmi_ext_base = NULL;
@ -394,26 +391,10 @@ static void dealloc_compressor_data(VP10_COMP *cpi) {
vp10_free_pc_tree(&cpi->td);
for (i = 0; i < cpi->svc.number_spatial_layers; ++i) {
LAYER_CONTEXT *const lc = &cpi->svc.layer_context[i];
vpx_free(lc->rc_twopass_stats_in.buf);
lc->rc_twopass_stats_in.buf = NULL;
lc->rc_twopass_stats_in.sz = 0;
}
if (cpi->source_diff_var != NULL) {
vpx_free(cpi->source_diff_var);
cpi->source_diff_var = NULL;
}
for (i = 0; i < MAX_LAG_BUFFERS; ++i) {
vpx_free_frame_buffer(&cpi->svc.scaled_frames[i]);
}
memset(&cpi->svc.scaled_frames[0], 0,
MAX_LAG_BUFFERS * sizeof(cpi->svc.scaled_frames[0]));
vpx_free_frame_buffer(&cpi->svc.empty_frame.img);
memset(&cpi->svc.empty_frame, 0, sizeof(cpi->svc.empty_frame));
}
static void save_coding_context(VP10_COMP *cpi) {
@ -718,16 +699,9 @@ static void set_tile_limits(VP10_COMP *cpi) {
int min_log2_tile_cols, max_log2_tile_cols;
vp10_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols);
if (is_two_pass_svc(cpi) &&
(cpi->svc.encode_empty_frame_state == ENCODING ||
cpi->svc.number_spatial_layers > 1)) {
cm->log2_tile_cols = 0;
cm->log2_tile_rows = 0;
} else {
cm->log2_tile_cols = clamp(cpi->oxcf.tile_columns,
min_log2_tile_cols, max_log2_tile_cols);
cm->log2_tile_rows = cpi->oxcf.tile_rows;
}
cm->log2_tile_cols = clamp(cpi->oxcf.tile_columns,
min_log2_tile_cols, max_log2_tile_cols);
cm->log2_tile_rows = cpi->oxcf.tile_rows;
}
static void update_frame_size(VP10_COMP *cpi) {
@ -742,19 +716,6 @@ static void update_frame_size(VP10_COMP *cpi) {
cm->mi_rows * cm->mi_cols * sizeof(*cpi->mbmi_ext_base));
set_tile_limits(cpi);
if (is_two_pass_svc(cpi)) {
if (vpx_realloc_frame_buffer(&cpi->alt_ref_buffer,
cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
#if CONFIG_VP9_HIGHBITDEPTH
cm->use_highbitdepth,
#endif
VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
NULL, NULL, NULL))
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to reallocate alt_ref_buffer");
}
}
static void init_buffer_indices(VP10_COMP *cpi) {
@ -775,28 +736,15 @@ static void init_config(struct VP10_COMP *cpi, VP10EncoderConfig *oxcf) {
cm->use_highbitdepth = oxcf->use_highbitdepth;
#endif
cm->color_space = oxcf->color_space;
cm->color_range = oxcf->color_range;
cm->width = oxcf->width;
cm->height = oxcf->height;
vp10_alloc_compressor_data(cpi);
cpi->svc.temporal_layering_mode = oxcf->temporal_layering_mode;
// Single thread case: use counts in common.
cpi->td.counts = &cm->counts;
// Spatial scalability.
cpi->svc.number_spatial_layers = oxcf->ss_number_layers;
// Temporal scalability.
cpi->svc.number_temporal_layers = oxcf->ts_number_layers;
if ((cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) ||
((cpi->svc.number_temporal_layers > 1 ||
cpi->svc.number_spatial_layers > 1) &&
cpi->oxcf.pass != 1)) {
vp10_init_layer_context(cpi);
}
// change includes all joint functionality
vp10_change_config(cpi, oxcf);
@ -1460,6 +1408,7 @@ void vp10_change_config(struct VP10_COMP *cpi, const VP10EncoderConfig *oxcf) {
cm->profile = oxcf->profile;
cm->bit_depth = oxcf->bit_depth;
cm->color_space = oxcf->color_space;
cm->color_range = oxcf->color_range;
if (cm->profile <= PROFILE_1)
assert(cm->bit_depth == VPX_BITS_8);
@ -1475,8 +1424,11 @@ void vp10_change_config(struct VP10_COMP *cpi, const VP10EncoderConfig *oxcf) {
cpi->refresh_golden_frame = 0;
cpi->refresh_last_frame = 1;
cm->refresh_frame_context = 1;
cm->reset_frame_context = 0;
cm->refresh_frame_context =
oxcf->error_resilient_mode ? REFRESH_FRAME_CONTEXT_OFF :
oxcf->frame_parallel_decoding_mode ? REFRESH_FRAME_CONTEXT_FORWARD
: REFRESH_FRAME_CONTEXT_BACKWARD;
cm->reset_frame_context = RESET_FRAME_CONTEXT_NONE;
vp10_reset_segment_features(&cm->seg);
vp10_set_high_precision_mv(cpi, 0);
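The nested ternary above encodes a three-way policy using the REFRESH_FRAME_CONTEXT_* values this patch introduces. The same decision as a standalone helper, assuming the enum type is named REFRESH_FRAME_CONTEXT_MODE (the helper itself is illustrative, not part of the change):

/* error resilient         -> OFF      (contexts never updated)
 * frame-parallel decoding -> FORWARD  (update from signalled deltas only)
 * otherwise               -> BACKWARD (adapt from decoded symbol counts) */
static REFRESH_FRAME_CONTEXT_MODE pick_refresh_mode(int error_resilient,
                                                    int frame_parallel) {
  if (error_resilient) return REFRESH_FRAME_CONTEXT_OFF;
  return frame_parallel ? REFRESH_FRAME_CONTEXT_FORWARD
                        : REFRESH_FRAME_CONTEXT_BACKWARD;
}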
@ -1493,8 +1445,8 @@ void vp10_change_config(struct VP10_COMP *cpi, const VP10EncoderConfig *oxcf) {
// Under a configuration change, where maximum_buffer_size may change,
// keep buffer level clipped to the maximum allowed buffer size.
rc->bits_off_target = MIN(rc->bits_off_target, rc->maximum_buffer_size);
rc->buffer_level = MIN(rc->buffer_level, rc->maximum_buffer_size);
rc->bits_off_target = VPXMIN(rc->bits_off_target, rc->maximum_buffer_size);
rc->buffer_level = VPXMIN(rc->buffer_level, rc->maximum_buffer_size);
// Set up frame rate and related parameters rate control values.
vp10_new_framerate(cpi, cpi->framerate);
@ -1520,15 +1472,6 @@ void vp10_change_config(struct VP10_COMP *cpi, const VP10EncoderConfig *oxcf) {
}
update_frame_size(cpi);
if ((cpi->svc.number_temporal_layers > 1 &&
cpi->oxcf.rc_mode == VPX_CBR) ||
((cpi->svc.number_temporal_layers > 1 ||
cpi->svc.number_spatial_layers > 1) &&
cpi->oxcf.pass != 1)) {
vp10_update_layer_context_change_config(cpi,
(int)cpi->oxcf.target_bandwidth);
}
cpi->alt_ref_source = NULL;
rc->is_src_frame_alt_ref = 0;
@ -1619,7 +1562,6 @@ VP10_COMP *vp10_create_compressor(VP10EncoderConfig *oxcf,
(FRAME_CONTEXT *)vpx_calloc(FRAME_CONTEXTS,
sizeof(*cm->frame_contexts)));
cpi->use_svc = 0;
cpi->resize_state = 0;
cpi->resize_avg_qp = 0;
cpi->resize_buffer_underflow = 0;
@ -1758,63 +1700,24 @@ VP10_COMP *vp10_create_compressor(VP10EncoderConfig *oxcf,
const size_t packet_sz = sizeof(FIRSTPASS_STATS);
const int packets = (int)(oxcf->two_pass_stats_in.sz / packet_sz);
if (cpi->svc.number_spatial_layers > 1
|| cpi->svc.number_temporal_layers > 1) {
FIRSTPASS_STATS *const stats = oxcf->two_pass_stats_in.buf;
FIRSTPASS_STATS *stats_copy[VPX_SS_MAX_LAYERS] = {0};
int i;
for (i = 0; i < oxcf->ss_number_layers; ++i) {
FIRSTPASS_STATS *const last_packet_for_layer =
&stats[packets - oxcf->ss_number_layers + i];
const int layer_id = (int)last_packet_for_layer->spatial_layer_id;
const int packets_in_layer = (int)last_packet_for_layer->count + 1;
if (layer_id >= 0 && layer_id < oxcf->ss_number_layers) {
LAYER_CONTEXT *const lc = &cpi->svc.layer_context[layer_id];
vpx_free(lc->rc_twopass_stats_in.buf);
lc->rc_twopass_stats_in.sz = packets_in_layer * packet_sz;
CHECK_MEM_ERROR(cm, lc->rc_twopass_stats_in.buf,
vpx_malloc(lc->rc_twopass_stats_in.sz));
lc->twopass.stats_in_start = lc->rc_twopass_stats_in.buf;
lc->twopass.stats_in = lc->twopass.stats_in_start;
lc->twopass.stats_in_end = lc->twopass.stats_in_start
+ packets_in_layer - 1;
stats_copy[layer_id] = lc->rc_twopass_stats_in.buf;
}
}
for (i = 0; i < packets; ++i) {
const int layer_id = (int)stats[i].spatial_layer_id;
if (layer_id >= 0 && layer_id < oxcf->ss_number_layers
&& stats_copy[layer_id] != NULL) {
*stats_copy[layer_id] = stats[i];
++stats_copy[layer_id];
}
}
vp10_init_second_pass_spatial_svc(cpi);
} else {
#if CONFIG_FP_MB_STATS
if (cpi->use_fp_mb_stats) {
const size_t psz = cpi->common.MBs * sizeof(uint8_t);
const int ps = (int)(oxcf->firstpass_mb_stats_in.sz / psz);
if (cpi->use_fp_mb_stats) {
const size_t psz = cpi->common.MBs * sizeof(uint8_t);
const int ps = (int)(oxcf->firstpass_mb_stats_in.sz / psz);
cpi->twopass.firstpass_mb_stats.mb_stats_start =
oxcf->firstpass_mb_stats_in.buf;
cpi->twopass.firstpass_mb_stats.mb_stats_end =
cpi->twopass.firstpass_mb_stats.mb_stats_start +
(ps - 1) * cpi->common.MBs * sizeof(uint8_t);
}
cpi->twopass.firstpass_mb_stats.mb_stats_start =
oxcf->firstpass_mb_stats_in.buf;
cpi->twopass.firstpass_mb_stats.mb_stats_end =
cpi->twopass.firstpass_mb_stats.mb_stats_start +
(ps - 1) * cpi->common.MBs * sizeof(uint8_t);
}
#endif
cpi->twopass.stats_in_start = oxcf->two_pass_stats_in.buf;
cpi->twopass.stats_in = cpi->twopass.stats_in_start;
cpi->twopass.stats_in_end = &cpi->twopass.stats_in[packets - 1];
cpi->twopass.stats_in_start = oxcf->two_pass_stats_in.buf;
cpi->twopass.stats_in = cpi->twopass.stats_in_start;
cpi->twopass.stats_in_end = &cpi->twopass.stats_in[packets - 1];
vp10_init_second_pass(cpi);
}
vp10_init_second_pass(cpi);
}
vp10_set_speed_features_framesize_independent(cpi);
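With the per-layer stats splitting gone, the whole first-pass log is consumed through a single pointer window: stats_in_start and stats_in_end bracket the FIRSTPASS_STATS packets, and stats_in advances between them as the second pass runs. An illustrative walk over that window (consume_stats is hypothetical):

const FIRSTPASS_STATS *p = cpi->twopass.stats_in_start;
while (p <= cpi->twopass.stats_in_end) {
  consume_stats(p);  /* each packet summarizes one first-pass frame */
  ++p;
}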
@ -2248,42 +2151,6 @@ typedef struct {
uint32_t samples[4]; // total/y/u/v
} PSNR_STATS;
static void calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
PSNR_STATS *psnr) {
static const double peak = 255.0;
const int widths[3] = {
a->y_crop_width, a->uv_crop_width, a->uv_crop_width};
const int heights[3] = {
a->y_crop_height, a->uv_crop_height, a->uv_crop_height};
const uint8_t *a_planes[3] = {a->y_buffer, a->u_buffer, a->v_buffer};
const int a_strides[3] = {a->y_stride, a->uv_stride, a->uv_stride};
const uint8_t *b_planes[3] = {b->y_buffer, b->u_buffer, b->v_buffer};
const int b_strides[3] = {b->y_stride, b->uv_stride, b->uv_stride};
int i;
uint64_t total_sse = 0;
uint32_t total_samples = 0;
for (i = 0; i < 3; ++i) {
const int w = widths[i];
const int h = heights[i];
const uint32_t samples = w * h;
const uint64_t sse = get_sse(a_planes[i], a_strides[i],
b_planes[i], b_strides[i],
w, h);
psnr->sse[1 + i] = sse;
psnr->samples[1 + i] = samples;
psnr->psnr[1 + i] = vpx_sse_to_psnr(samples, peak, (double)sse);
total_sse += sse;
total_samples += samples;
}
psnr->sse[0] = total_sse;
psnr->samples[0] = total_samples;
psnr->psnr[0] = vpx_sse_to_psnr((double)total_samples, peak,
(double)total_sse);
}
#if CONFIG_VP9_HIGHBITDEPTH
static void calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
const YV12_BUFFER_CONFIG *b,
@ -2336,6 +2203,44 @@ static void calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
psnr->psnr[0] = vpx_sse_to_psnr((double)total_samples, peak,
(double)total_sse);
}
#else // !CONFIG_VP9_HIGHBITDEPTH
static void calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
PSNR_STATS *psnr) {
static const double peak = 255.0;
const int widths[3] = {
a->y_crop_width, a->uv_crop_width, a->uv_crop_width};
const int heights[3] = {
a->y_crop_height, a->uv_crop_height, a->uv_crop_height};
const uint8_t *a_planes[3] = {a->y_buffer, a->u_buffer, a->v_buffer};
const int a_strides[3] = {a->y_stride, a->uv_stride, a->uv_stride};
const uint8_t *b_planes[3] = {b->y_buffer, b->u_buffer, b->v_buffer};
const int b_strides[3] = {b->y_stride, b->uv_stride, b->uv_stride};
int i;
uint64_t total_sse = 0;
uint32_t total_samples = 0;
for (i = 0; i < 3; ++i) {
const int w = widths[i];
const int h = heights[i];
const uint32_t samples = w * h;
const uint64_t sse = get_sse(a_planes[i], a_strides[i],
b_planes[i], b_strides[i],
w, h);
psnr->sse[1 + i] = sse;
psnr->samples[1 + i] = samples;
psnr->psnr[1 + i] = vpx_sse_to_psnr(samples, peak, (double)sse);
total_sse += sse;
total_samples += samples;
}
psnr->sse[0] = total_sse;
psnr->samples[0] = total_samples;
psnr->psnr[0] = vpx_sse_to_psnr((double)total_samples, peak,
(double)total_sse);
}
#endif // CONFIG_VP9_HIGHBITDEPTH
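Both calc_psnr variants reduce each plane to an SSE and convert with vpx_sse_to_psnr. That conversion is the standard formula; a self-contained sketch of what it computes (mirroring, not replacing, the library routine, which caps the result at 100 dB):

#include <math.h>

/* PSNR = 10 * log10(peak^2 * samples / sse); a zero or tiny sse is
 * capped at 100 dB, as in libvpx. */
static double sse_to_psnr_sketch(double samples, double peak, double sse) {
  if (sse > 0.0) {
    const double psnr = 10.0 * log10(samples * peak * peak / sse);
    return psnr > 100.0 ? 100.0 : psnr;
  }
  return 100.0;
}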
static void generate_psnr_packet(VP10_COMP *cpi) {
@ -2355,11 +2260,7 @@ static void generate_psnr_packet(VP10_COMP *cpi) {
pkt.data.psnr.psnr[i] = psnr.psnr[i];
}
pkt.kind = VPX_CODEC_PSNR_PKT;
if (cpi->use_svc)
cpi->svc.layer_context[cpi->svc.spatial_layer_id *
cpi->svc.number_temporal_layers].psnr_pkt = pkt.data.psnr;
else
vpx_codec_pkt_list_add(cpi->output_pkt_list, &pkt);
vpx_codec_pkt_list_add(cpi->output_pkt_list, &pkt);
}
int vp10_use_as_reference(VP10_COMP *cpi, int ref_frame_flags) {
@ -2616,7 +2517,7 @@ static int scale_down(VP10_COMP *cpi, int q) {
if (rc->frame_size_selector == UNSCALED &&
q >= rc->rf_level_maxq[gf_group->rf_level[gf_group->index]]) {
const int max_size_thresh = (int)(rate_thresh_mult[SCALE_STEP1]
* MAX(rc->this_frame_target, rc->avg_frame_bandwidth));
* VPXMAX(rc->this_frame_target, rc->avg_frame_bandwidth));
scale = rc->projected_frame_size > max_size_thresh ? 1 : 0;
}
return scale;
@ -2688,11 +2589,6 @@ void vp10_update_reference_frames(VP10_COMP *cpi) {
tmp = cpi->alt_fb_idx;
cpi->alt_fb_idx = cpi->gld_fb_idx;
cpi->gld_fb_idx = tmp;
if (is_two_pass_svc(cpi)) {
cpi->svc.layer_context[0].gold_ref_idx = cpi->gld_fb_idx;
cpi->svc.layer_context[0].alt_ref_idx = cpi->alt_fb_idx;
}
} else { /* For non key/golden frames */
if (cpi->refresh_alt_ref_frame) {
int arf_idx = cpi->alt_fb_idx;
@ -2864,7 +2760,7 @@ void vp10_scale_references(VP10_COMP *cpi) {
++buf->ref_count;
}
} else {
if (cpi->oxcf.pass != 0 || cpi->use_svc)
if (cpi->oxcf.pass != 0)
cpi->scaled_ref_idx[ref_frame - 1] = INVALID_IDX;
}
}
@ -2873,7 +2769,7 @@ void vp10_scale_references(VP10_COMP *cpi) {
static void release_scaled_references(VP10_COMP *cpi) {
VP10_COMMON *cm = &cpi->common;
int i;
if (cpi->oxcf.pass == 0 && !cpi->use_svc) {
if (cpi->oxcf.pass == 0) {
// Only release scaled references under certain conditions:
// if reference will be updated, or if scaled reference has same resolution.
int refresh[3];
@ -2999,7 +2895,7 @@ static void output_frame_level_debug_stats(VP10_COMP *cpi) {
static void set_mv_search_params(VP10_COMP *cpi) {
const VP10_COMMON *const cm = &cpi->common;
const unsigned int max_mv_def = MIN(cm->width, cm->height);
const unsigned int max_mv_def = VPXMIN(cm->width, cm->height);
// Default based on max resolution.
cpi->mv_step_param = vp10_init_search_range(max_mv_def);
@ -3014,8 +2910,8 @@ static void set_mv_search_params(VP10_COMP *cpi) {
// Allow mv_steps to correspond to twice the max mv magnitude found
// in the previous frame, capped by the default max_mv_magnitude based
// on resolution.
cpi->mv_step_param =
vp10_init_search_range(MIN(max_mv_def, 2 * cpi->max_mv_magnitude));
cpi->mv_step_param = vp10_init_search_range(
VPXMIN(max_mv_def, 2 * cpi->max_mv_magnitude));
}
cpi->max_mv_magnitude = 0;
}
@ -3107,7 +3003,6 @@ static void set_frame_size(VP10_COMP *cpi) {
if (oxcf->pass == 0 &&
oxcf->rc_mode == VPX_CBR &&
!cpi->use_svc &&
oxcf->resize_mode == RESIZE_DYNAMIC) {
if (cpi->resize_pending == 1) {
oxcf->scaled_frame_width =
@ -3130,10 +3025,7 @@ static void set_frame_size(VP10_COMP *cpi) {
}
}
if ((oxcf->pass == 2) &&
(!cpi->use_svc ||
(is_two_pass_svc(cpi) &&
cpi->svc.encode_empty_frame_state != ENCODING))) {
if (oxcf->pass == 2) {
vp10_set_target_rate(cpi);
}
@ -3240,10 +3132,9 @@ static void encode_without_recode_loop(VP10_COMP *cpi) {
vp10_encode_frame(cpi);
// Update some stats from cyclic refresh, and check if we should not update
// golden reference, for non-SVC 1 pass CBR.
// golden reference, for 1 pass CBR.
if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
cm->frame_type != KEY_FRAME &&
!cpi->use_svc &&
(cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR))
vp10_cyclic_refresh_check_golden_update(cpi);
@ -3342,8 +3233,7 @@ static void encode_with_recode_loop(VP10_COMP *cpi,
// to recode.
if (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF) {
save_coding_context(cpi);
if (!cpi->sf.use_nonrd_pick_mode)
vp10_pack_bitstream(cpi, dest, size);
vp10_pack_bitstream(cpi, dest, size);
rc->projected_frame_size = (int)(*size) << 3;
restore_coding_context(cpi);
@ -3388,7 +3278,7 @@ static void encode_with_recode_loop(VP10_COMP *cpi,
// Adjust Q
q = (int)((q * high_err_target) / kf_err);
q = MIN(q, (q_high + q_low) >> 1);
q = VPXMIN(q, (q_high + q_low) >> 1);
} else if (kf_err < low_err_target &&
rc->projected_frame_size >= frame_under_shoot_limit) {
// The key frame is much better than the previous frame
@ -3397,7 +3287,7 @@ static void encode_with_recode_loop(VP10_COMP *cpi,
// Adjust Q
q = (int)((q * low_err_target) / kf_err);
q = MIN(q, (q_high + q_low + 1) >> 1);
q = VPXMIN(q, (q_high + q_low + 1) >> 1);
}
// Clamp Q to upper and lower limits:
@ -3406,7 +3296,7 @@ static void encode_with_recode_loop(VP10_COMP *cpi,
loop = q != last_q;
} else if (recode_loop_test(
cpi, frame_over_shoot_limit, frame_under_shoot_limit,
q, MAX(q_high, top_index), bottom_index)) {
q, VPXMAX(q_high, top_index), bottom_index)) {
// Is the projected frame size out of range and are we allowed
// to attempt to recode.
int last_q = q;
@ -3448,12 +3338,12 @@ static void encode_with_recode_loop(VP10_COMP *cpi,
vp10_rc_update_rate_correction_factors(cpi);
q = vp10_rc_regulate_q(cpi, rc->this_frame_target,
bottom_index, MAX(q_high, top_index));
bottom_index, VPXMAX(q_high, top_index));
while (q < q_low && retries < 10) {
vp10_rc_update_rate_correction_factors(cpi);
q = vp10_rc_regulate_q(cpi, rc->this_frame_target,
bottom_index, MAX(q_high, top_index));
bottom_index, VPXMAX(q_high, top_index));
retries++;
}
}
@ -3525,9 +3415,7 @@ static int get_ref_frame_flags(const VP10_COMP *cpi) {
if (gold_is_last)
flags &= ~VP9_GOLD_FLAG;
if (cpi->rc.frames_till_gf_update_due == INT_MAX &&
(cpi->svc.number_temporal_layers == 1 &&
cpi->svc.number_spatial_layers == 1))
if (cpi->rc.frames_till_gf_update_due == INT_MAX)
flags &= ~VP9_GOLD_FLAG;
if (alt_is_last)
@ -3667,54 +3555,14 @@ static void encode_frame_to_data_rate(VP10_COMP *cpi,
cpi->rc.source_alt_ref_active = 0;
cm->error_resilient_mode = oxcf->error_resilient_mode;
cm->frame_parallel_decoding_mode = oxcf->frame_parallel_decoding_mode;
// By default, encoder assumes decoder can use prev_mi.
if (cm->error_resilient_mode) {
cm->frame_parallel_decoding_mode = 1;
cm->reset_frame_context = 0;
cm->refresh_frame_context = 0;
cm->reset_frame_context = RESET_FRAME_CONTEXT_NONE;
cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_OFF;
} else if (cm->intra_only) {
// Only reset the current context.
cm->reset_frame_context = 2;
}
}
if (is_two_pass_svc(cpi) && cm->error_resilient_mode == 0) {
// Use context 0 for intra only empty frame, but the last frame context
// for other empty frames.
if (cpi->svc.encode_empty_frame_state == ENCODING) {
if (cpi->svc.encode_intra_empty_frame != 0)
cm->frame_context_idx = 0;
else
cm->frame_context_idx = FRAME_CONTEXTS - 1;
} else {
cm->frame_context_idx =
cpi->svc.spatial_layer_id * cpi->svc.number_temporal_layers +
cpi->svc.temporal_layer_id;
}
cm->frame_parallel_decoding_mode = oxcf->frame_parallel_decoding_mode;
// The probs will be updated based on the frame type of its previous
// frame if frame_parallel_decoding_mode is 0. The type may vary for
// the frame after a key frame in base layer since we may drop enhancement
// layers. So set frame_parallel_decoding_mode to 1 in this case.
if (cm->frame_parallel_decoding_mode == 0) {
if (cpi->svc.number_temporal_layers == 1) {
if (cpi->svc.spatial_layer_id == 0 &&
cpi->svc.layer_context[0].last_frame_type == KEY_FRAME)
cm->frame_parallel_decoding_mode = 1;
} else if (cpi->svc.spatial_layer_id == 0) {
// Find the 2nd frame in temporal base layer and 1st frame in temporal
// enhancement layers from the key frame.
int i;
for (i = 0; i < cpi->svc.number_temporal_layers; ++i) {
if (cpi->svc.layer_context[0].frames_from_key_frame == 1 << i) {
cm->frame_parallel_decoding_mode = 1;
break;
}
}
}
cm->reset_frame_context = RESET_FRAME_CONTEXT_CURRENT;
}
}
@ -3778,6 +3626,8 @@ static void encode_frame_to_data_rate(VP10_COMP *cpi,
cpi->refresh_last_frame = 1;
cm->frame_to_show = get_frame_new_buffer(cm);
cm->frame_to_show->color_space = cm->color_space;
cm->frame_to_show->color_range = cm->color_range;
// Pick the loop filter level for the frame.
loopfilter_frame(cpi, cm);
@ -3797,11 +3647,11 @@ static void encode_frame_to_data_rate(VP10_COMP *cpi,
full_to_model_counts(cpi->td.counts->coef[t],
cpi->td.rd_counts.coef_counts[t]);
if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode)
if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD)
vp10_adapt_coef_probs(cm);
if (!frame_is_intra_only(cm)) {
if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode) {
if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
vp10_adapt_mode_probs(cm);
vp10_adapt_mv_probs(cm, cm->allow_high_precision_mv);
}
@ -3821,8 +3671,7 @@ static void encode_frame_to_data_rate(VP10_COMP *cpi,
cm->last_frame_type = cm->frame_type;
if (!(is_two_pass_svc(cpi) && cpi->svc.encode_empty_frame_state == ENCODING))
vp10_rc_postencode_update(cpi, *size);
vp10_rc_postencode_update(cpi, *size);
#if 0
output_frame_level_debug_stats(cpi);
@ -3854,22 +3703,8 @@ static void encode_frame_to_data_rate(VP10_COMP *cpi,
// Don't increment frame counters if this was an altref buffer
// update not a real frame
++cm->current_video_frame;
if (cpi->use_svc)
vp10_inc_frame_in_layer(cpi);
}
cm->prev_frame = cm->cur_frame;
if (cpi->use_svc)
cpi->svc.layer_context[cpi->svc.spatial_layer_id *
cpi->svc.number_temporal_layers +
cpi->svc.temporal_layer_id].last_frame_type =
cm->frame_type;
}
static void SvcEncode(VP10_COMP *cpi, size_t *size, uint8_t *dest,
unsigned int *frame_flags) {
vp10_rc_get_svc_params(cpi);
encode_frame_to_data_rate(cpi, size, dest, frame_flags);
}
static void Pass0Encode(VP10_COMP *cpi, size_t *size, uint8_t *dest,
@ -3887,8 +3722,7 @@ static void Pass2Encode(VP10_COMP *cpi, size_t *size,
cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED;
encode_frame_to_data_rate(cpi, size, dest, frame_flags);
if (!(is_two_pass_svc(cpi) && cpi->svc.encode_empty_frame_state == ENCODING))
vp10_twopass_postencode_update(cpi);
vp10_twopass_postencode_update(cpi);
}
static void init_ref_frame_bufs(VP10_COMMON *cm) {
@ -4000,7 +3834,7 @@ static int frame_is_reference(const VP10_COMP *cpi) {
cpi->refresh_last_frame ||
cpi->refresh_golden_frame ||
cpi->refresh_alt_ref_frame ||
cm->refresh_frame_context ||
cm->refresh_frame_context != REFRESH_FRAME_CONTEXT_OFF ||
cm->lf.mode_ref_delta_update ||
cm->seg.update_map ||
cm->seg.update_data;
@ -4032,8 +3866,8 @@ static void adjust_frame_rate(VP10_COMP *cpi,
// Average this frame's rate into the last second's average
// frame rate. If we haven't seen 1 second yet, then average
// over the whole interval seen.
const double interval = MIN((double)(source->ts_end
- cpi->first_time_stamp_ever), 10000000.0);
const double interval = VPXMIN(
(double)(source->ts_end - cpi->first_time_stamp_ever), 10000000.0);
double avg_duration = 10000000.0 / cpi->framerate;
avg_duration *= (interval - avg_duration + this_duration);
avg_duration /= interval;
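This update is a sliding average over at most one second of timestamps (10000000 ticks here). Writing d for the current average duration, t for this frame's duration, and I for the interval, it computes d' = d * (I - d + t) / I = d + d * (t - d) / I, so the average moves toward the new duration with weight d / I. For example, with I = 10^7, d = 400000 (25 fps), and t = 500000, d' = 400000 * 1.01 = 404000, nudging the derived framerate down to about 24.75 fps.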
@ -4097,7 +3931,7 @@ static void adjust_image_stat(double y, double u, double v, double all,
s->stat[U] += u;
s->stat[V] += v;
s->stat[ALL] += all;
s->worst = MIN(s->worst, all);
s->worst = VPXMIN(s->worst, all);
}
#endif // CONFIG_INTERNAL_STATS
@ -4115,68 +3949,37 @@ int vp10_get_compressed_data(VP10_COMP *cpi, unsigned int *frame_flags,
int arf_src_index;
int i;
if (is_two_pass_svc(cpi)) {
#if CONFIG_SPATIAL_SVC
vp10_svc_start_frame(cpi);
// Use a small empty frame instead of a real frame
if (cpi->svc.encode_empty_frame_state == ENCODING)
source = &cpi->svc.empty_frame;
#endif
if (oxcf->pass == 2)
vp10_restore_layer_context(cpi);
} else if (is_one_pass_cbr_svc(cpi)) {
vp10_one_pass_cbr_svc_start_layer(cpi);
}
vpx_usec_timer_start(&cmptimer);
vp10_set_high_precision_mv(cpi, ALTREF_HIGH_PRECISION_MV);
// Is multi-arf enabled.
// Note that at the moment multi_arf is only configured for 2 pass VBR and
// will not work properly with svc.
if ((oxcf->pass == 2) && !cpi->use_svc &&
(cpi->oxcf.enable_auto_arf > 1))
// Note that at the moment multi_arf is only configured for 2 pass VBR
if ((oxcf->pass == 2) && (cpi->oxcf.enable_auto_arf > 1))
cpi->multi_arf_allowed = 1;
else
cpi->multi_arf_allowed = 0;
// Normal defaults
cm->reset_frame_context = 0;
cm->refresh_frame_context = 1;
if (!is_one_pass_cbr_svc(cpi)) {
cpi->refresh_last_frame = 1;
cpi->refresh_golden_frame = 0;
cpi->refresh_alt_ref_frame = 0;
}
cm->reset_frame_context = RESET_FRAME_CONTEXT_NONE;
cm->refresh_frame_context =
oxcf->error_resilient_mode ? REFRESH_FRAME_CONTEXT_OFF :
oxcf->frame_parallel_decoding_mode ? REFRESH_FRAME_CONTEXT_FORWARD
: REFRESH_FRAME_CONTEXT_BACKWARD;
cpi->refresh_last_frame = 1;
cpi->refresh_golden_frame = 0;
cpi->refresh_alt_ref_frame = 0;
// Should we encode an arf frame.
arf_src_index = get_arf_src_index(cpi);
// Skip alt frame if we encode the empty frame
if (is_two_pass_svc(cpi) && source != NULL)
arf_src_index = 0;
if (arf_src_index) {
assert(arf_src_index <= rc->frames_to_key);
if ((source = vp10_lookahead_peek(cpi->lookahead, arf_src_index)) != NULL) {
cpi->alt_ref_source = source;
#if CONFIG_SPATIAL_SVC
if (is_two_pass_svc(cpi) && cpi->svc.spatial_layer_id > 0) {
int i;
// Reference a hidden frame from a lower layer
for (i = cpi->svc.spatial_layer_id - 1; i >= 0; --i) {
if (oxcf->ss_enable_auto_arf[i]) {
cpi->gld_fb_idx = cpi->svc.layer_context[i].alt_ref_idx;
break;
}
}
}
cpi->svc.layer_context[cpi->svc.spatial_layer_id].has_alt_frame = 1;
#endif
if (oxcf->arnr_max_frames > 0) {
// Produce the filtered ARF frame.
vp10_temporal_filter(cpi, arf_src_index);
@ -4204,21 +4007,11 @@ int vp10_get_compressed_data(VP10_COMP *cpi, unsigned int *frame_flags,
}
// Read in the source frame.
if (cpi->use_svc)
source = vp10_svc_lookahead_pop(cpi, cpi->lookahead, flush);
else
source = vp10_lookahead_pop(cpi->lookahead, flush);
source = vp10_lookahead_pop(cpi->lookahead, flush);
if (source != NULL) {
cm->show_frame = 1;
cm->intra_only = 0;
// If the flags indicate an intra frame, but the current picture is for a
// non-zero spatial layer, it should not be an intra picture.
// TODO(Won Kap): this needs to change if per-layer intra frame is
// allowed.
if ((source->flags & VPX_EFLAG_FORCE_KF) && cpi->svc.spatial_layer_id) {
source->flags &= ~(unsigned int)(VPX_EFLAG_FORCE_KF);
}
// Check to see if the frame should be encoded as an arf overlay.
check_src_altref(cpi, source);
@ -4257,11 +4050,6 @@ int vp10_get_compressed_data(VP10_COMP *cpi, unsigned int *frame_flags,
adjust_frame_rate(cpi, source);
}
if (is_one_pass_cbr_svc(cpi)) {
vp10_update_temporal_layer_framerate(cpi);
vp10_restore_layer_context(cpi);
}
// Find a free buffer for the new frame, releasing the reference previously
// held.
if (cm->new_fb_idx != INVALID_IDX) {
@ -4274,7 +4062,7 @@ int vp10_get_compressed_data(VP10_COMP *cpi, unsigned int *frame_flags,
cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx];
if (!cpi->use_svc && cpi->multi_arf_allowed) {
if (cpi->multi_arf_allowed) {
if (cm->frame_type == KEY_FRAME) {
init_buffer_indices(cpi);
} else if (oxcf->pass == 2) {
@ -4288,24 +4076,18 @@ int vp10_get_compressed_data(VP10_COMP *cpi, unsigned int *frame_flags,
cpi->frame_flags = *frame_flags;
if ((oxcf->pass == 2) &&
(!cpi->use_svc ||
(is_two_pass_svc(cpi) &&
cpi->svc.encode_empty_frame_state != ENCODING))) {
if (oxcf->pass == 2) {
vp10_rc_get_second_pass_params(cpi);
} else if (oxcf->pass == 1) {
set_frame_size(cpi);
}
if (cpi->oxcf.pass != 0 ||
cpi->use_svc ||
frame_is_intra_only(cm) == 1) {
if (cpi->oxcf.pass != 0 || frame_is_intra_only(cm) == 1) {
for (i = 0; i < MAX_REF_FRAMES; ++i)
cpi->scaled_ref_idx[i] = INVALID_IDX;
}
if (oxcf->pass == 1 &&
(!cpi->use_svc || is_two_pass_svc(cpi))) {
if (oxcf->pass == 1) {
const int lossless = is_lossless_requested(oxcf);
#if CONFIG_VP9_HIGHBITDEPTH
if (cpi->oxcf.use_highbitdepth)
@ -4320,17 +4102,14 @@ int vp10_get_compressed_data(VP10_COMP *cpi, unsigned int *frame_flags,
#endif // CONFIG_VP9_HIGHBITDEPTH
cpi->td.mb.itxm_add = lossless ? vp10_iwht4x4_add : vp10_idct4x4_add;
vp10_first_pass(cpi, source);
} else if (oxcf->pass == 2 &&
(!cpi->use_svc || is_two_pass_svc(cpi))) {
} else if (oxcf->pass == 2) {
Pass2Encode(cpi, size, dest, frame_flags);
} else if (cpi->use_svc) {
SvcEncode(cpi, size, dest, frame_flags);
} else {
// One pass encode
Pass0Encode(cpi, size, dest, frame_flags);
}
if (cm->refresh_frame_context)
if (cm->refresh_frame_context != REFRESH_FRAME_CONTEXT_OFF)
cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
// No frame encoded, or frame was dropped, release scaled references.
@ -4342,14 +4121,6 @@ int vp10_get_compressed_data(VP10_COMP *cpi, unsigned int *frame_flags,
cpi->droppable = !frame_is_reference(cpi);
}
// Save layer specific state.
if (is_one_pass_cbr_svc(cpi) ||
((cpi->svc.number_temporal_layers > 1 ||
cpi->svc.number_spatial_layers > 1) &&
oxcf->pass == 2)) {
vp10_save_layer_context(cpi);
}
vpx_usec_timer_mark(&cmptimer);
cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer);
@ -4427,7 +4198,7 @@ int vp10_get_compressed_data(VP10_COMP *cpi, unsigned int *frame_flags,
frame_ssim2 = vpx_calc_ssim(orig, recon, &weight);
#endif // CONFIG_VP9_HIGHBITDEPTH
cpi->worst_ssim= MIN(cpi->worst_ssim, frame_ssim2);
cpi->worst_ssim= VPXMIN(cpi->worst_ssim, frame_ssim2);
cpi->summed_quality += frame_ssim2 * weight;
cpi->summed_weights += weight;
@ -4464,7 +4235,8 @@ int vp10_get_compressed_data(VP10_COMP *cpi, unsigned int *frame_flags,
cpi->Source->y_buffer, cpi->Source->y_stride,
cm->frame_to_show->y_buffer, cm->frame_to_show->y_stride,
cpi->Source->y_width, cpi->Source->y_height);
cpi->worst_blockiness = MAX(cpi->worst_blockiness, frame_blockiness);
cpi->worst_blockiness =
VPXMAX(cpi->worst_blockiness, frame_blockiness);
cpi->total_blockiness += frame_blockiness;
}
}
@ -4484,8 +4256,8 @@ int vp10_get_compressed_data(VP10_COMP *cpi, unsigned int *frame_flags,
double consistency = vpx_sse_to_psnr(samples, peak,
(double)cpi->total_inconsistency);
if (consistency > 0.0)
cpi->worst_consistency = MIN(cpi->worst_consistency,
consistency);
cpi->worst_consistency =
VPXMIN(cpi->worst_consistency, consistency);
cpi->total_inconsistency += this_inconsistency;
}
}
@ -4527,27 +4299,6 @@ int vp10_get_compressed_data(VP10_COMP *cpi, unsigned int *frame_flags,
}
#endif
if (is_two_pass_svc(cpi)) {
if (cpi->svc.encode_empty_frame_state == ENCODING) {
cpi->svc.encode_empty_frame_state = ENCODED;
cpi->svc.encode_intra_empty_frame = 0;
}
if (cm->show_frame) {
++cpi->svc.spatial_layer_to_encode;
if (cpi->svc.spatial_layer_to_encode >= cpi->svc.number_spatial_layers)
cpi->svc.spatial_layer_to_encode = 0;
// May need the empty frame after a visible frame.
cpi->svc.encode_empty_frame_state = NEED_TO_ENCODE;
}
} else if (is_one_pass_cbr_svc(cpi)) {
if (cm->show_frame) {
++cpi->svc.spatial_layer_to_encode;
if (cpi->svc.spatial_layer_to_encode >= cpi->svc.number_spatial_layers)
cpi->svc.spatial_layer_to_encode = 0;
}
}
vpx_clear_system_state();
return 0;
}
@ -4640,11 +4391,6 @@ int vp10_set_size_literal(VP10_COMP *cpi, unsigned int width,
return 0;
}
void vp10_set_svc(VP10_COMP *cpi, int use_svc) {
cpi->use_svc = use_svc;
return;
}
int64_t vp10_get_y_sse(const YV12_BUFFER_CONFIG *a,
const YV12_BUFFER_CONFIG *b) {
assert(a->y_crop_width == b->y_crop_width);

View File

@ -33,7 +33,6 @@
#include "vp10/encoder/ratectrl.h"
#include "vp10/encoder/rd.h"
#include "vp10/encoder/speed_features.h"
#include "vp10/encoder/svc_layercontext.h"
#include "vp10/encoder/tokenize.h"
#if CONFIG_VP9_TEMPORAL_DENOISING
@ -116,7 +115,7 @@ typedef enum {
} AQ_MODE;
typedef enum {
RESIZE_NONE = 0, // No frame resizing allowed (except for SVC).
RESIZE_NONE = 0, // No frame resizing allowed.
RESIZE_FIXED = 1, // All frames are coded at the specified dimension.
RESIZE_DYNAMIC = 2 // Coded size of each frame is determined by the codec.
} RESIZE_TYPE;
@ -189,16 +188,6 @@ typedef struct VP10EncoderConfig {
// END DATARATE CONTROL OPTIONS
// ----------------------------------------------------------------
// Spatial and temporal scalability.
int ss_number_layers; // Number of spatial layers.
int ts_number_layers; // Number of temporal layers.
// Bitrate allocation for spatial layers.
int layer_target_bitrate[VPX_MAX_LAYERS];
int ss_target_bitrate[VPX_SS_MAX_LAYERS];
int ss_enable_auto_arf[VPX_SS_MAX_LAYERS];
// Bitrate allocation (CBR mode) and framerate factor, for temporal layers.
int ts_rate_decimator[VPX_TS_MAX_LAYERS];
int enable_auto_arf;
int encode_breakout; // early breakout : for video conf recommend 800
@ -239,7 +228,7 @@ typedef struct VP10EncoderConfig {
int use_highbitdepth;
#endif
vpx_color_space_t color_space;
VP9E_TEMPORAL_LAYERING_MODE temporal_layering_mode;
int color_range;
} VP10EncoderConfig;
static INLINE int is_lossless_requested(const VP10EncoderConfig *cfg) {
@ -451,10 +440,6 @@ typedef struct VP10_COMP {
// number of MBs in the current frame when the frame is
// scaled.
int use_svc;
SVC svc;
// Store frame variance info in SOURCE_VAR_BASED_PARTITION search type.
diff *source_diff_var;
// The threshold used in SOURCE_VAR_BASED_PARTITION search type.
@ -549,8 +534,6 @@ int vp10_set_internal_size(VP10_COMP *cpi,
int vp10_set_size_literal(VP10_COMP *cpi, unsigned int width,
unsigned int height);
void vp10_set_svc(VP10_COMP *cpi, int use_svc);
int vp10_get_quantizer(struct VP10_COMP *cpi);
static INLINE int frame_is_kf_gf_arf(const VP10_COMP *cpi) {
@ -627,19 +610,9 @@ YV12_BUFFER_CONFIG *vp10_scale_if_required(VP10_COMMON *cm,
void vp10_apply_encoding_flags(VP10_COMP *cpi, vpx_enc_frame_flags_t flags);
static INLINE int is_two_pass_svc(const struct VP10_COMP *const cpi) {
return cpi->use_svc && cpi->oxcf.pass != 0;
}
static INLINE int is_one_pass_cbr_svc(const struct VP10_COMP *const cpi) {
return (cpi->use_svc && cpi->oxcf.pass == 0);
}
static INLINE int is_altref_enabled(const VP10_COMP *const cpi) {
return cpi->oxcf.mode != REALTIME && cpi->oxcf.lag_in_frames > 0 &&
(cpi->oxcf.enable_auto_arf &&
(!is_two_pass_svc(cpi) ||
cpi->oxcf.ss_enable_auto_arf[cpi->svc.spatial_layer_id]));
cpi->oxcf.enable_auto_arf;
}
static INLINE void set_ref_ptrs(VP10_COMMON *cm, MACROBLOCKD *xd,

vp10/encoder/ethread.c

@ -11,6 +11,7 @@
#include "vp10/encoder/encodeframe.h"
#include "vp10/encoder/encoder.h"
#include "vp10/encoder/ethread.h"
#include "vpx_dsp/vpx_dsp_common.h"
static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) {
int i, j, k, l, m, n;
@ -51,23 +52,11 @@ static int enc_worker_hook(EncWorkerData *const thread_data, void *unused) {
return 0;
}
static int get_max_tile_cols(VP10_COMP *cpi) {
const int aligned_width = ALIGN_POWER_OF_TWO(cpi->oxcf.width, MI_SIZE_LOG2);
int mi_cols = aligned_width >> MI_SIZE_LOG2;
int min_log2_tile_cols, max_log2_tile_cols;
int log2_tile_cols;
vp10_get_tile_n_bits(mi_cols, &min_log2_tile_cols, &max_log2_tile_cols);
log2_tile_cols = clamp(cpi->oxcf.tile_columns,
min_log2_tile_cols, max_log2_tile_cols);
return (1 << log2_tile_cols);
}
void vp10_encode_tiles_mt(VP10_COMP *cpi) {
VP10_COMMON *const cm = &cpi->common;
const int tile_cols = 1 << cm->log2_tile_cols;
const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
const int num_workers = MIN(cpi->oxcf.max_threads, tile_cols);
const int num_workers = VPXMIN(cpi->oxcf.max_threads, tile_cols);
int i;
vp10_init_tile_data(cpi);
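With the SVC sizing gone, the worker pool is clamped purely by the frame's tile columns; a hypothetical instance:

/* --threads=8 with log2_tile_cols = 2:
   tile_cols   = 1 << 2 = 4
   num_workers = VPXMIN(8, 4) = 4, so only four workers are created
   even though eight threads were requested. */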
@ -76,13 +65,6 @@ void vp10_encode_tiles_mt(VP10_COMP *cpi) {
if (cpi->num_workers == 0) {
int allocated_workers = num_workers;
// While using SVC, we need to allocate threads according to the highest
// resolution.
if (cpi->use_svc) {
int max_tile_cols = get_max_tile_cols(cpi);
allocated_workers = MIN(cpi->oxcf.max_threads, max_tile_cols);
}
CHECK_MEM_ERROR(cm, cpi->workers,
vpx_malloc(allocated_workers * sizeof(*cpi->workers)));
@ -146,23 +128,6 @@ void vp10_encode_tiles_mt(VP10_COMP *cpi) {
memcpy(thread_data->td->counts, &cpi->common.counts,
sizeof(cpi->common.counts));
}
// Handle use_nonrd_pick_mode case.
if (cpi->sf.use_nonrd_pick_mode) {
MACROBLOCK *const x = &thread_data->td->mb;
MACROBLOCKD *const xd = &x->e_mbd;
struct macroblock_plane *const p = x->plane;
struct macroblockd_plane *const pd = xd->plane;
PICK_MODE_CONTEXT *ctx = &thread_data->td->pc_root->none;
int j;
for (j = 0; j < MAX_MB_PLANE; ++j) {
p[j].coeff = ctx->coeff_pbuf[j][0];
p[j].qcoeff = ctx->qcoeff_pbuf[j][0];
pd[j].dqcoeff = ctx->dqcoeff_pbuf[j][0];
p[j].eobs = ctx->eobs_pbuf[j][0];
}
}
}
// Encode a frame

vp10/encoder/ethread.h

@ -11,6 +11,10 @@
#ifndef VP10_ENCODER_ETHREAD_H_
#define VP10_ENCODER_ETHREAD_H_
#ifdef __cplusplus
extern "C" {
#endif
struct VP10_COMP;
struct ThreadData;
@ -22,4 +26,8 @@ typedef struct EncWorkerData {
void vp10_encode_tiles_mt(struct VP10_COMP *cpi);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP10_ENCODER_ETHREAD_H_

vp10/encoder/extend.c

@ -8,6 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/mem.h"
@ -111,10 +112,12 @@ void vp10_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src,
  // Motion estimation may use src block variance with block sizes up
  // to 64x64, so the right and bottom edges need to be extended to a
  // multiple of 64, or by at least 16, whichever is greater.
const int er_y = MAX(src->y_width + 16, ALIGN_POWER_OF_TWO(src->y_width, 6))
- src->y_crop_width;
const int eb_y = MAX(src->y_height + 16, ALIGN_POWER_OF_TWO(src->y_height, 6))
- src->y_crop_height;
const int er_y =
VPXMAX(src->y_width + 16, ALIGN_POWER_OF_TWO(src->y_width, 6)) -
src->y_crop_width;
const int eb_y =
VPXMAX(src->y_height + 16, ALIGN_POWER_OF_TWO(src->y_height, 6)) -
src->y_crop_height;
const int uv_width_subsampling = (src->uv_width != src->y_width);
const int uv_height_subsampling = (src->uv_height != src->y_height);
const int et_uv = et_y >> uv_height_subsampling;
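The edge math above is easiest to check with numbers; a sketch using the ALIGN_POWER_OF_TWO macro (from vpx_ports/mem.h) and a hypothetical 1900-pixel-wide source:

/* ALIGN_POWER_OF_TWO(v, n) rounds v up to a multiple of 1 << n.
   y_width = y_crop_width = 1900:
     ALIGN_POWER_OF_TWO(1900, 6) = 1920
     1900 + 16                   = 1916
     er_y = VPXMAX(1916, 1920) - 1900 = 20
   so the right edge is padded to the next multiple of 64 and never
   by less than 16 pixels. */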

vp10/encoder/firstpass.c

@ -15,6 +15,7 @@
#include "./vpx_dsp_rtcd.h"
#include "./vpx_scale_rtcd.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/mem.h"
#include "vpx_ports/system_state.h"
@ -55,7 +56,6 @@
#define MIN_DECAY_FACTOR 0.01
#define MIN_KF_BOOST 300
#define NEW_MV_MODE_PENALTY 32
#define SVC_FACTOR_PT_LOW 0.45
#define DARK_THRESH 64
#define DEFAULT_GRP_WEIGHT 1.0
#define RC_FACTOR_MIN 0.75
@ -177,14 +177,12 @@ static void zero_stats(FIRSTPASS_STATS *section) {
section->new_mv_count = 0.0;
section->count = 0.0;
section->duration = 1.0;
section->spatial_layer_id = 0;
}
static void accumulate_stats(FIRSTPASS_STATS *section,
const FIRSTPASS_STATS *frame) {
section->frame += frame->frame;
section->weight += frame->weight;
section->spatial_layer_id = frame->spatial_layer_id;
section->intra_error += frame->intra_error;
section->coded_error += frame->coded_error;
section->sr_coded_error += frame->sr_coded_error;
@ -292,15 +290,7 @@ void vp10_init_first_pass(VP10_COMP *cpi) {
}
void vp10_end_first_pass(VP10_COMP *cpi) {
if (is_two_pass_svc(cpi)) {
int i;
for (i = 0; i < cpi->svc.number_spatial_layers; ++i) {
output_stats(&cpi->svc.layer_context[i].twopass.total_stats,
cpi->output_pkt_list);
}
} else {
output_stats(&cpi->twopass.total_stats, cpi->output_pkt_list);
}
output_stats(&cpi->twopass.total_stats, cpi->output_pkt_list);
}
static vpx_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) {
@ -383,7 +373,7 @@ static unsigned int highbd_get_prediction_error(BLOCK_SIZE bsize,
// for first pass test.
static int get_search_range(const VP10_COMP *cpi) {
int sr = 0;
const int dim = MIN(cpi->initial_width, cpi->initial_height);
const int dim = VPXMIN(cpi->initial_width, cpi->initial_height);
while ((dim << sr) < MAX_FULL_PEL_VAL)
++sr;
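A worked instance of the loop above, assuming MAX_FULL_PEL_VAL is 1023 ((1 << (MAX_MVSEARCH_STEPS - 1)) - 1) as in the sibling vp9 tree:

/* 640x360 input: dim = VPXMIN(640, 360) = 360
   sr = 0: 360 << 0 = 360  < 1023 -> ++sr
   sr = 1: 360 << 1 = 720  < 1023 -> ++sr
   sr = 2: 360 << 2 = 1440 >= 1023 -> stop; search range sr = 2 */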
@ -530,16 +520,13 @@ void vp10_first_pass(VP10_COMP *cpi, const struct lookahead_entry *source) {
YV12_BUFFER_CONFIG *gld_yv12 = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
YV12_BUFFER_CONFIG *const new_yv12 = get_frame_new_buffer(cm);
const YV12_BUFFER_CONFIG *first_ref_buf = lst_yv12;
LAYER_CONTEXT *const lc = is_two_pass_svc(cpi) ?
&cpi->svc.layer_context[cpi->svc.spatial_layer_id] : NULL;
double intra_factor;
double brightness_factor;
BufferPool *const pool = cm->buffer_pool;
// First pass code requires valid last and new frame buffers.
assert(new_yv12 != NULL);
assert((lc != NULL) || frame_is_intra_only(cm) || (lst_yv12 != NULL));
assert(frame_is_intra_only(cm) || (lst_yv12 != NULL));
#if CONFIG_FP_MB_STATS
if (cpi->use_fp_mb_stats) {
@ -556,51 +543,6 @@ void vp10_first_pass(VP10_COMP *cpi, const struct lookahead_entry *source) {
set_first_pass_params(cpi);
vp10_set_quantizer(cm, find_fp_qindex(cm->bit_depth));
if (lc != NULL) {
twopass = &lc->twopass;
cpi->lst_fb_idx = cpi->svc.spatial_layer_id;
cpi->ref_frame_flags = VP9_LAST_FLAG;
if (cpi->svc.number_spatial_layers + cpi->svc.spatial_layer_id <
REF_FRAMES) {
cpi->gld_fb_idx =
cpi->svc.number_spatial_layers + cpi->svc.spatial_layer_id;
cpi->ref_frame_flags |= VP9_GOLD_FLAG;
cpi->refresh_golden_frame = (lc->current_video_frame_in_layer == 0);
} else {
cpi->refresh_golden_frame = 0;
}
if (lc->current_video_frame_in_layer == 0)
cpi->ref_frame_flags = 0;
vp10_scale_references(cpi);
// Use either last frame or alt frame for motion search.
if (cpi->ref_frame_flags & VP9_LAST_FLAG) {
first_ref_buf = vp10_get_scaled_ref_frame(cpi, LAST_FRAME);
if (first_ref_buf == NULL)
first_ref_buf = get_ref_frame_buffer(cpi, LAST_FRAME);
}
if (cpi->ref_frame_flags & VP9_GOLD_FLAG) {
gld_yv12 = vp10_get_scaled_ref_frame(cpi, GOLDEN_FRAME);
if (gld_yv12 == NULL) {
gld_yv12 = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
}
} else {
gld_yv12 = NULL;
}
set_ref_ptrs(cm, xd,
(cpi->ref_frame_flags & VP9_LAST_FLAG) ? LAST_FRAME: NONE,
(cpi->ref_frame_flags & VP9_GOLD_FLAG) ? GOLDEN_FRAME : NONE);
cpi->Source = vp10_scale_if_required(cm, cpi->un_scaled_source,
&cpi->scaled_source);
}
vp10_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
vp10_setup_src_planes(x, cpi->Source, 0, 0);
@ -672,7 +614,6 @@ void vp10_first_pass(VP10_COMP *cpi, const struct lookahead_entry *source) {
cm->mi_rows, cm->mi_cols);
// Do intra 16x16 prediction.
x->skip_encode = 0;
xd->mi[0]->mbmi.mode = DC_PRED;
xd->mi[0]->mbmi.tx_size = use_dc_pred ?
(bsize >= BLOCK_16X16 ? TX_16X16 : TX_8X8) : TX_4X4;
@ -754,8 +695,7 @@ void vp10_first_pass(VP10_COMP *cpi, const struct lookahead_entry *source) {
x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) + BORDER_MV_PIXELS_B16;
// Other than for the first frame do a motion search.
if ((lc == NULL && cm->current_video_frame > 0) ||
(lc != NULL && lc->current_video_frame_in_layer > 0)) {
if (cm->current_video_frame > 0) {
int tmp_err, motion_error, raw_motion_error;
// Assume 0,0 motion with no mv overhead.
MV mv = {0, 0} , tmp_mv = {0, 0};
@ -796,7 +736,7 @@ void vp10_first_pass(VP10_COMP *cpi, const struct lookahead_entry *source) {
#endif // CONFIG_VP9_HIGHBITDEPTH
// TODO(pengchong): Replace the hard-coded threshold
if (raw_motion_error > 25 || lc != NULL) {
if (raw_motion_error > 25) {
// Test last reference frame using the previous best mv as the
// starting point (best reference) for the search.
first_pass_motion_search(cpi, x, &best_ref_mv, &mv, &motion_error);
@ -814,9 +754,7 @@ void vp10_first_pass(VP10_COMP *cpi, const struct lookahead_entry *source) {
}
// Search in an older reference frame.
if (((lc == NULL && cm->current_video_frame > 1) ||
(lc != NULL && lc->current_video_frame_in_layer > 1))
&& gld_yv12 != NULL) {
if ((cm->current_video_frame > 1) && gld_yv12 != NULL) {
// Assume 0,0 motion with no mv overhead.
int gf_motion_error;
@ -1026,7 +964,7 @@ void vp10_first_pass(VP10_COMP *cpi, const struct lookahead_entry *source) {
// Exclude any image dead zone
if (image_data_start_row > 0) {
intra_skip_count =
MAX(0, intra_skip_count - (image_data_start_row * cm->mb_cols * 2));
VPXMAX(0, intra_skip_count - (image_data_start_row * cm->mb_cols * 2));
}
{
@ -1045,7 +983,6 @@ void vp10_first_pass(VP10_COMP *cpi, const struct lookahead_entry *source) {
fps.weight = intra_factor * brightness_factor;
fps.frame = cm->current_video_frame;
fps.spatial_layer_id = cpi->svc.spatial_layer_id;
fps.coded_error = (double)(coded_error >> 8) + min_err;
fps.sr_coded_error = (double)(sr_coded_error >> 8) + min_err;
fps.intra_error = (double)(intra_error >> 8) + min_err;
@ -1116,18 +1053,13 @@ void vp10_first_pass(VP10_COMP *cpi, const struct lookahead_entry *source) {
vpx_extend_frame_borders(new_yv12);
if (lc != NULL) {
vp10_update_reference_frames(cpi);
} else {
// The frame we just compressed now becomes the last frame.
ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->lst_fb_idx],
cm->new_fb_idx);
}
// The frame we just compressed now becomes the last frame.
ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->lst_fb_idx],
cm->new_fb_idx);
// Special case for the first frame. Copy into the GF buffer as a second
// reference.
if (cm->current_video_frame == 0 && cpi->gld_fb_idx != INVALID_IDX &&
lc == NULL) {
if (cm->current_video_frame == 0 && cpi->gld_fb_idx != INVALID_IDX) {
ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx],
cm->ref_frame_map[cpi->lst_fb_idx]);
}
@ -1149,8 +1081,6 @@ void vp10_first_pass(VP10_COMP *cpi, const struct lookahead_entry *source) {
}
++cm->current_video_frame;
if (cpi->use_svc)
vp10_inc_frame_in_layer(cpi);
}
static double calc_correction_factor(double err_per_mb,
@ -1163,7 +1093,7 @@ static double calc_correction_factor(double err_per_mb,
// Adjustment based on actual quantizer to power term.
const double power_term =
MIN(vp10_convert_qindex_to_q(q, bit_depth) * 0.01 + pt_low, pt_high);
VPXMIN(vp10_convert_qindex_to_q(q, bit_depth) * 0.01 + pt_low, pt_high);
// Calculate correction factor.
if (power_term < 1.0)
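For orientation, since the hunk above cuts off mid-function: the factor produced by calc_correction_factor() has roughly this shape (a hedged restatement; the exact clamp bounds sit outside the hunk):

/* power_term = VPXMIN(q_in_quantizer_units * 0.01 + pt_low, pt_high);
   factor     ~ pow(err_per_mb / err_divisor, power_term),
   clamped to a fixed range later in the function. */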
@ -1192,7 +1122,7 @@ static int get_twopass_worst_quality(const VP10_COMP *cpi,
} else {
const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
? cpi->initial_mbs : cpi->common.MBs;
const int active_mbs = MAX(1, num_mbs - (int)(num_mbs * inactive_zone));
const int active_mbs = VPXMAX(1, num_mbs - (int)(num_mbs * inactive_zone));
const double av_err_per_mb = section_err / active_mbs;
const double speed_term = 1.0 + 0.04 * oxcf->speed;
const double ediv_size_correction = (double)num_mbs / EDIV_SIZE_FACTOR;
@ -1200,11 +1130,6 @@ static int get_twopass_worst_quality(const VP10_COMP *cpi,
BPER_MB_NORMBITS) / active_mbs;
int q;
int is_svc_upper_layer = 0;
if (is_two_pass_svc(cpi) && cpi->svc.spatial_layer_id > 0)
is_svc_upper_layer = 1;
// Try and pick a max Q that will be high enough to encode the
// content at the given rate.
@ -1212,7 +1137,6 @@ static int get_twopass_worst_quality(const VP10_COMP *cpi,
const double factor =
calc_correction_factor(av_err_per_mb,
ERR_DIVISOR - ediv_size_correction,
is_svc_upper_layer ? SVC_FACTOR_PT_LOW :
FACTOR_PT_LOW, FACTOR_PT_HIGH, q,
cpi->common.bit_depth);
const int bits_per_mb =
@ -1225,7 +1149,7 @@ static int get_twopass_worst_quality(const VP10_COMP *cpi,
// Restriction on active max q for constrained quality mode.
if (cpi->oxcf.rc_mode == VPX_CQ)
q = MAX(q, oxcf->cq_level);
q = VPXMAX(q, oxcf->cq_level);
return q;
}
}
@ -1235,7 +1159,7 @@ static void setup_rf_level_maxq(VP10_COMP *cpi) {
RATE_CONTROL *const rc = &cpi->rc;
for (i = INTER_NORMAL; i < RATE_FACTOR_LEVELS; ++i) {
int qdelta = vp10_frame_type_qdelta(cpi, i, rc->worst_quality);
rc->rf_level_maxq[i] = MAX(rc->worst_quality + qdelta, rc->best_quality);
rc->rf_level_maxq[i] = VPXMAX(rc->worst_quality + qdelta, rc->best_quality);
}
}
@ -1264,12 +1188,8 @@ void vp10_calculate_coded_size(VP10_COMP *cpi,
}
void vp10_init_second_pass(VP10_COMP *cpi) {
SVC *const svc = &cpi->svc;
const VP10EncoderConfig *const oxcf = &cpi->oxcf;
const int is_two_pass_svc = (svc->number_spatial_layers > 1) ||
(svc->number_temporal_layers > 1);
TWO_PASS *const twopass = is_two_pass_svc ?
&svc->layer_context[svc->spatial_layer_id].twopass : &cpi->twopass;
TWO_PASS *const twopass = &cpi->twopass;
double frame_rate;
FIRSTPASS_STATS *stats;
@ -1290,17 +1210,9 @@ void vp10_init_second_pass(VP10_COMP *cpi) {
// encoded in the second pass is a guess. However, the sum duration is not.
// It is calculated based on the actual durations of all frames from the
// first pass.
if (is_two_pass_svc) {
vp10_update_spatial_layer_framerate(cpi, frame_rate);
twopass->bits_left = (int64_t)(stats->duration *
svc->layer_context[svc->spatial_layer_id].target_bandwidth /
10000000.0);
} else {
vp10_new_framerate(cpi, frame_rate);
twopass->bits_left = (int64_t)(stats->duration * oxcf->target_bandwidth /
10000000.0);
}
vp10_new_framerate(cpi, frame_rate);
twopass->bits_left = (int64_t)(stats->duration * oxcf->target_bandwidth /
10000000.0);
// This variable monitors how far behind the second ref update is lagging.
twopass->sr_update_lag = 1;
@ -1366,12 +1278,12 @@ static double get_sr_decay_rate(const VP10_COMP *cpi,
if ((sr_diff > LOW_SR_DIFF_TRHESH)) {
sr_diff = MIN(sr_diff, SR_DIFF_MAX);
sr_diff = VPXMIN(sr_diff, SR_DIFF_MAX);
sr_decay = 1.0 - (SR_DIFF_PART * sr_diff) -
(MOTION_AMP_PART * motion_amplitude_factor) -
(INTRA_PART * modified_pcnt_intra);
}
return MAX(sr_decay, MIN(DEFAULT_DECAY_LIMIT, modified_pct_inter));
return VPXMAX(sr_decay, VPXMIN(DEFAULT_DECAY_LIMIT, modified_pct_inter));
}
// This function gives an estimate of how badly we believe the prediction
@ -1381,7 +1293,7 @@ static double get_zero_motion_factor(const VP10_COMP *cpi,
const double zero_motion_pct = frame->pcnt_inter -
frame->pcnt_motion;
double sr_decay = get_sr_decay_rate(cpi, frame);
return MIN(sr_decay, zero_motion_pct);
return VPXMIN(sr_decay, zero_motion_pct);
}
#define ZM_POWER_FACTOR 0.75
@ -1393,8 +1305,8 @@ static double get_prediction_decay_rate(const VP10_COMP *cpi,
(0.95 * pow((next_frame->pcnt_inter - next_frame->pcnt_motion),
ZM_POWER_FACTOR));
return MAX(zero_motion_factor,
(sr_decay_rate + ((1.0 - sr_decay_rate) * zero_motion_factor)));
return VPXMAX(zero_motion_factor,
(sr_decay_rate + ((1.0 - sr_decay_rate) * zero_motion_factor)));
}
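Restating the decay math above as one expression (ZM_POWER_FACTOR is defined as 0.75 just before this hunk):

/* zmf   = 0.95 * pow(pcnt_inter - pcnt_motion, 0.75);
   decay = VPXMAX(zmf, sr_decay + (1.0 - sr_decay) * zmf);
   i.e. strongly static content (zmf near 1) keeps the decay near 1. */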
// Function to test for a condition where a complex transition is followed
@ -1485,12 +1397,12 @@ static double calc_frame_boost(VP10_COMP *cpi,
const double lq =
vp10_convert_qindex_to_q(cpi->rc.avg_frame_qindex[INTER_FRAME],
cpi->common.bit_depth);
const double boost_q_correction = MIN((0.5 + (lq * 0.015)), 1.5);
const double boost_q_correction = VPXMIN((0.5 + (lq * 0.015)), 1.5);
int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
? cpi->initial_mbs : cpi->common.MBs;
// Correct for any inactive region in the image
num_mbs = (int)MAX(1, num_mbs * calculate_active_area(cpi, this_frame));
num_mbs = (int)VPXMAX(1, num_mbs * calculate_active_area(cpi, this_frame));
// Underlying boost factor is based on inter error ratio.
frame_boost = (BASELINE_ERR_PER_MB * num_mbs) /
@ -1506,7 +1418,7 @@ static double calc_frame_boost(VP10_COMP *cpi,
else
frame_boost += frame_boost * (this_frame_mv_in_out / 2.0);
return MIN(frame_boost, max_boost * boost_q_correction);
return VPXMIN(frame_boost, max_boost * boost_q_correction);
}
static int calc_arf_boost(VP10_COMP *cpi, int offset,
@ -1595,7 +1507,7 @@ static int calc_arf_boost(VP10_COMP *cpi, int offset,
arf_boost = (*f_boost + *b_boost);
if (arf_boost < ((b_frames + f_frames) * 20))
arf_boost = ((b_frames + f_frames) * 20);
arf_boost = MAX(arf_boost, MIN_ARF_GF_BOOST);
arf_boost = VPXMAX(arf_boost, MIN_ARF_GF_BOOST);
return arf_boost;
}
@ -1666,7 +1578,8 @@ static int calculate_boost_bits(int frame_count,
}
// Calculate the number of extra bits for use in the boosted frame or frames.
return MAX((int)(((int64_t)boost * total_group_bits) / allocation_chunks), 0);
return VPXMAX((int)(((int64_t)boost * total_group_bits) / allocation_chunks),
0);
}
// Current limit on maximum number of active arfs in a GF/ARF group.
@ -1700,15 +1613,8 @@ static void allocate_gf_group_bits(VP10_COMP *cpi, int64_t gf_group_bits,
int mid_frame_idx;
unsigned char arf_buffer_indices[MAX_ACTIVE_ARFS];
int alt_frame_index = frame_index;
int has_temporal_layers = is_two_pass_svc(cpi) &&
cpi->svc.number_temporal_layers > 1;
// Only encode alt reference frame in temporal base layer.
if (has_temporal_layers)
alt_frame_index = cpi->svc.number_temporal_layers;
key_frame = cpi->common.frame_type == KEY_FRAME ||
vp10_is_upper_layer_key_frame(cpi);
key_frame = cpi->common.frame_type == KEY_FRAME;
get_arf_buffer_indices(arf_buffer_indices);
@ -1745,20 +1651,14 @@ static void allocate_gf_group_bits(VP10_COMP *cpi, int64_t gf_group_bits,
gf_group->rf_level[alt_frame_index] = GF_ARF_STD;
gf_group->bit_allocation[alt_frame_index] = gf_arf_bits;
if (has_temporal_layers)
gf_group->arf_src_offset[alt_frame_index] =
(unsigned char)(rc->baseline_gf_interval -
cpi->svc.number_temporal_layers);
else
gf_group->arf_src_offset[alt_frame_index] =
(unsigned char)(rc->baseline_gf_interval - 1);
gf_group->arf_src_offset[alt_frame_index] =
(unsigned char)(rc->baseline_gf_interval - 1);
gf_group->arf_update_idx[alt_frame_index] = arf_buffer_indices[0];
gf_group->arf_ref_idx[alt_frame_index] =
arf_buffer_indices[cpi->multi_arf_last_grp_enabled &&
rc->source_alt_ref_active];
if (!has_temporal_layers)
++frame_index;
++frame_index;
if (cpi->multi_arf_enabled) {
// Set aside a slot for a level 1 arf.
@ -1781,10 +1681,6 @@ static void allocate_gf_group_bits(VP10_COMP *cpi, int64_t gf_group_bits,
if (EOF == input_stats(twopass, &frame_stats))
break;
if (has_temporal_layers && frame_index == alt_frame_index) {
++frame_index;
}
modified_err = calculate_modified_err(cpi, twopass, oxcf, &frame_stats);
if (group_error > 0)
@ -1805,7 +1701,7 @@ static void allocate_gf_group_bits(VP10_COMP *cpi, int64_t gf_group_bits,
gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[arf_idx];
target_frame_size = clamp(target_frame_size, 0,
MIN(max_bits, (int)total_group_bits));
VPXMIN(max_bits, (int)total_group_bits));
gf_group->update_type[frame_index] = LF_UPDATE;
gf_group->rf_level[frame_index] = INTER_NORMAL;
@ -1926,7 +1822,7 @@ static void define_gf_group(VP10_COMP *cpi, FIRSTPASS_STATS *this_frame) {
int int_lbq =
(int)(vp10_convert_qindex_to_q(rc->last_boosted_qindex,
cpi->common.bit_depth));
active_min_gf_interval = rc->min_gf_interval + MIN(2, int_max_q / 200);
active_min_gf_interval = rc->min_gf_interval + VPXMIN(2, int_max_q / 200);
if (active_min_gf_interval > rc->max_gf_interval)
active_min_gf_interval = rc->max_gf_interval;
@ -1937,7 +1833,7 @@ static void define_gf_group(VP10_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// bits to spare and are better with a smaller interval and smaller boost.
// At high Q when there are few bits to spare we are better with a longer
// interval to spread the cost of the GF.
active_max_gf_interval = 12 + MIN(4, (int_lbq / 6));
active_max_gf_interval = 12 + VPXMIN(4, (int_lbq / 6));
if (active_max_gf_interval < active_min_gf_interval)
active_max_gf_interval = active_min_gf_interval;
@ -1982,8 +1878,8 @@ static void define_gf_group(VP10_COMP *cpi, FIRSTPASS_STATS *this_frame) {
decay_accumulator = decay_accumulator * loop_decay_rate;
// Monitor for static sections.
zero_motion_accumulator =
MIN(zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame));
zero_motion_accumulator = VPXMIN(
zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame));
// Break clause to detect very still sections after motion. For example,
// a static image after a fade or other transition.
@ -2039,33 +1935,13 @@ static void define_gf_group(VP10_COMP *cpi, FIRSTPASS_STATS *this_frame) {
(cpi->multi_arf_allowed && (rc->baseline_gf_interval >= 6) &&
(zero_motion_accumulator < 0.995)) ? 1 : 0;
} else {
rc->gfu_boost = MAX((int)boost_score, MIN_ARF_GF_BOOST);
rc->gfu_boost = VPXMAX((int)boost_score, MIN_ARF_GF_BOOST);
rc->source_alt_ref_pending = 0;
}
// Set the interval until the next gf.
rc->baseline_gf_interval = i - (is_key_frame || rc->source_alt_ref_pending);
  // Only encode the alt reference frame in the temporal base layer, so
  // baseline_gf_interval should be a multiple of a temporal layer group
  // (typically the frame distance between two base layer frames).
if (is_two_pass_svc(cpi) && cpi->svc.number_temporal_layers > 1) {
int count = (1 << (cpi->svc.number_temporal_layers - 1)) - 1;
int new_gf_interval = (rc->baseline_gf_interval + count) & (~count);
int j;
for (j = 0; j < new_gf_interval - rc->baseline_gf_interval; ++j) {
if (EOF == input_stats(twopass, this_frame))
break;
gf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame);
#if GROUP_ADAPTIVE_MAXQ
gf_group_raw_error += this_frame->coded_error;
#endif
gf_group_skip_pct += this_frame->intra_skip_pct;
gf_group_inactive_zone_rows += this_frame->inactive_zone_rows;
}
rc->baseline_gf_interval = new_gf_interval;
}
rc->frames_till_gf_update_due = rc->baseline_gf_interval;
// Reset the file position.
@ -2094,11 +1970,11 @@ static void define_gf_group(VP10_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// rc factor is a weight factor that corrects for local rate control drift.
double rc_factor = 1.0;
if (rc->rate_error_estimate > 0) {
rc_factor = MAX(RC_FACTOR_MIN,
(double)(100 - rc->rate_error_estimate) / 100.0);
rc_factor = VPXMAX(RC_FACTOR_MIN,
(double)(100 - rc->rate_error_estimate) / 100.0);
} else {
rc_factor = MIN(RC_FACTOR_MAX,
(double)(100 - rc->rate_error_estimate) / 100.0);
rc_factor = VPXMIN(RC_FACTOR_MAX,
(double)(100 - rc->rate_error_estimate) / 100.0);
}
tmp_q =
get_twopass_worst_quality(cpi, group_av_err,
@ -2106,7 +1982,7 @@ static void define_gf_group(VP10_COMP *cpi, FIRSTPASS_STATS *this_frame) {
vbr_group_bits_per_frame,
twopass->kfgroup_inter_fraction * rc_factor);
twopass->active_worst_quality =
MAX(tmp_q, twopass->active_worst_quality >> 1);
VPXMAX(tmp_q, twopass->active_worst_quality >> 1);
}
#endif
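A worked instance of the rc_factor damping above, using RC_FACTOR_MIN = 0.75 from the defines at the top of this file and an assumed drift value:

/* assumed: rate_error_estimate = 40 (percent)
   rc_factor = VPXMAX(0.75, (100 - 40) / 100.0)
             = VPXMAX(0.75, 0.60) = 0.75
   so the factor is floored at RC_FACTOR_MIN even for large drift. */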
@ -2385,18 +2261,6 @@ static void find_next_key_frame(VP10_COMP *cpi, FIRSTPASS_STATS *this_frame) {
rc->next_key_frame_forced = 0;
}
if (is_two_pass_svc(cpi) && cpi->svc.number_temporal_layers > 1) {
int count = (1 << (cpi->svc.number_temporal_layers - 1)) - 1;
int new_frame_to_key = (rc->frames_to_key + count) & (~count);
int j;
for (j = 0; j < new_frame_to_key - rc->frames_to_key; ++j) {
if (EOF == input_stats(twopass, this_frame))
break;
kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame);
}
rc->frames_to_key = new_frame_to_key;
}
// Special case for the last key frame of the file.
if (twopass->stats_in >= twopass->stats_in_end) {
// Accumulate kf group error.
@ -2423,7 +2287,7 @@ static void find_next_key_frame(VP10_COMP *cpi, FIRSTPASS_STATS *this_frame) {
} else {
twopass->kf_group_bits = 0;
}
twopass->kf_group_bits = MAX(0, twopass->kf_group_bits);
twopass->kf_group_bits = VPXMAX(0, twopass->kf_group_bits);
// Reset the first pass file position.
reset_fpf_position(twopass, start_position);
@ -2437,9 +2301,8 @@ static void find_next_key_frame(VP10_COMP *cpi, FIRSTPASS_STATS *this_frame) {
break;
// Monitor for static sections.
zero_motion_accumulator =
MIN(zero_motion_accumulator,
get_zero_motion_factor(cpi, &next_frame));
zero_motion_accumulator = VPXMIN(
zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame));
// Not all frames in the group are necessarily used in calculating boost.
if ((i <= rc->max_gf_interval) ||
@ -2452,7 +2315,7 @@ static void find_next_key_frame(VP10_COMP *cpi, FIRSTPASS_STATS *this_frame) {
const double loop_decay_rate =
get_prediction_decay_rate(cpi, &next_frame);
decay_accumulator *= loop_decay_rate;
decay_accumulator = MAX(decay_accumulator, MIN_DECAY_FACTOR);
decay_accumulator = VPXMAX(decay_accumulator, MIN_DECAY_FACTOR);
av_decay_accumulator += decay_accumulator;
++loop_decay_counter;
}
@ -2473,8 +2336,8 @@ static void find_next_key_frame(VP10_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// Apply various clamps for min and max boost
rc->kf_boost = (int)(av_decay_accumulator * boost_score);
rc->kf_boost = MAX(rc->kf_boost, (rc->frames_to_key * 3));
rc->kf_boost = MAX(rc->kf_boost, MIN_KF_BOOST);
rc->kf_boost = VPXMAX(rc->kf_boost, (rc->frames_to_key * 3));
rc->kf_boost = VPXMAX(rc->kf_boost, MIN_KF_BOOST);
// Work out how many bits to allocate for the key frame itself.
kf_bits = calculate_boost_bits((rc->frames_to_key - 1),
@ -2547,16 +2410,6 @@ static void configure_buffer_updates(VP10_COMP *cpi) {
assert(0);
break;
}
if (is_two_pass_svc(cpi)) {
if (cpi->svc.temporal_layer_id > 0) {
cpi->refresh_last_frame = 0;
cpi->refresh_golden_frame = 0;
}
if (cpi->svc.layer_context[cpi->svc.spatial_layer_id].gold_ref_idx < 0)
cpi->refresh_golden_frame = 0;
if (cpi->alt_ref_source == NULL)
cpi->refresh_alt_ref_frame = 0;
}
}
static int is_skippable_frame(const VP10_COMP *cpi) {
@ -2564,9 +2417,7 @@ static int is_skippable_frame(const VP10_COMP *cpi) {
// first pass, and so do its previous and forward frames, then this frame
// can be skipped for partition check, and the partition size is assigned
// according to the variance
const SVC *const svc = &cpi->svc;
const TWO_PASS *const twopass = is_two_pass_svc(cpi) ?
&svc->layer_context[svc->spatial_layer_id].twopass : &cpi->twopass;
const TWO_PASS *const twopass = &cpi->twopass;
return (!frame_is_intra_only(&cpi->common) &&
twopass->stats_in - 2 > twopass->stats_in_start &&
@ -2587,16 +2438,9 @@ void vp10_rc_get_second_pass_params(VP10_COMP *cpi) {
FIRSTPASS_STATS this_frame;
int target_rate;
LAYER_CONTEXT *const lc = is_two_pass_svc(cpi) ?
&cpi->svc.layer_context[cpi->svc.spatial_layer_id] : 0;
if (lc != NULL) {
frames_left = (int)(twopass->total_stats.count -
lc->current_video_frame_in_layer);
} else {
frames_left = (int)(twopass->total_stats.count -
cm->current_video_frame);
}
frames_left = (int)(twopass->total_stats.count -
cm->current_video_frame);
if (!twopass->stats_in)
return;
@ -2612,21 +2456,9 @@ void vp10_rc_get_second_pass_params(VP10_COMP *cpi) {
cm->frame_type = INTER_FRAME;
if (lc != NULL) {
if (cpi->svc.spatial_layer_id == 0) {
lc->is_key_frame = 0;
} else {
lc->is_key_frame = cpi->svc.layer_context[0].is_key_frame;
if (lc->is_key_frame)
cpi->ref_frame_flags &= (~VP9_LAST_FLAG);
}
}
// Do the firstpass stats indicate that this frame is skippable for the
// partition search?
if (cpi->sf.allow_partition_search_skip &&
cpi->oxcf.pass == 2 && (!cpi->use_svc || is_two_pass_svc(cpi))) {
if (cpi->sf.allow_partition_search_skip && cpi->oxcf.pass == 2) {
cpi->partition_search_skippable_frame = is_skippable_frame(cpi);
}
@ -2637,8 +2469,7 @@ void vp10_rc_get_second_pass_params(VP10_COMP *cpi) {
if (cpi->oxcf.rc_mode == VPX_Q) {
twopass->active_worst_quality = cpi->oxcf.cq_level;
} else if (cm->current_video_frame == 0 ||
(lc != NULL && lc->current_video_frame_in_layer == 0)) {
} else if (cm->current_video_frame == 0) {
// Special case code for first frame.
const int section_target_bandwidth = (int)(twopass->bits_left /
frames_left);
@ -2685,34 +2516,11 @@ void vp10_rc_get_second_pass_params(VP10_COMP *cpi) {
cm->frame_type = INTER_FRAME;
}
if (lc != NULL) {
if (cpi->svc.spatial_layer_id == 0) {
lc->is_key_frame = (cm->frame_type == KEY_FRAME);
if (lc->is_key_frame) {
cpi->ref_frame_flags &=
(~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG);
lc->frames_from_key_frame = 0;
// Encode an intra only empty frame since we have a key frame.
cpi->svc.encode_intra_empty_frame = 1;
}
} else {
cm->frame_type = INTER_FRAME;
lc->is_key_frame = cpi->svc.layer_context[0].is_key_frame;
if (lc->is_key_frame) {
cpi->ref_frame_flags &= (~VP9_LAST_FLAG);
lc->frames_from_key_frame = 0;
}
}
}
// Define a new GF/ARF group. (Should always enter here for key frames).
if (rc->frames_till_gf_update_due == 0) {
define_gf_group(cpi, &this_frame);
rc->frames_till_gf_update_due = rc->baseline_gf_interval;
if (lc != NULL)
cpi->refresh_golden_frame = 1;
#if ARF_STATS_OUTPUT
{
@ -2732,8 +2540,7 @@ void vp10_rc_get_second_pass_params(VP10_COMP *cpi) {
// Do the firstpass stats indicate that this frame is skippable for the
// partition search?
if (cpi->sf.allow_partition_search_skip && cpi->oxcf.pass == 2 &&
(!cpi->use_svc || is_two_pass_svc(cpi))) {
if (cpi->sf.allow_partition_search_skip && cpi->oxcf.pass == 2) {
cpi->partition_search_skippable_frame = is_skippable_frame(cpi);
}
@ -2772,7 +2579,7 @@ void vp10_twopass_postencode_update(VP10_COMP *cpi) {
// is designed to prevent extreme behaviour at the end of a clip
// or group of frames.
rc->vbr_bits_off_target += rc->base_frame_target - rc->projected_frame_size;
twopass->bits_left = MAX(twopass->bits_left - bits_used, 0);
twopass->bits_left = VPXMAX(twopass->bits_left - bits_used, 0);
// Calculate the pct rc error.
if (rc->total_actual_bits) {
@ -2783,12 +2590,11 @@ void vp10_twopass_postencode_update(VP10_COMP *cpi) {
rc->rate_error_estimate = 0;
}
if (cpi->common.frame_type != KEY_FRAME &&
!vp10_is_upper_layer_key_frame(cpi)) {
if (cpi->common.frame_type != KEY_FRAME) {
twopass->kf_group_bits -= bits_used;
twopass->last_kfgroup_zeromotion_pct = twopass->kf_zeromotion_pct;
}
twopass->kf_group_bits = MAX(twopass->kf_group_bits, 0);
twopass->kf_group_bits = VPXMAX(twopass->kf_group_bits, 0);
// Increment the gf group index ready for the next frame.
++twopass->gf_group.index;
@ -2838,18 +2644,18 @@ void vp10_twopass_postencode_update(VP10_COMP *cpi) {
rc->vbr_bits_off_target_fast +=
fast_extra_thresh - rc->projected_frame_size;
rc->vbr_bits_off_target_fast =
MIN(rc->vbr_bits_off_target_fast, (4 * rc->avg_frame_bandwidth));
VPXMIN(rc->vbr_bits_off_target_fast, (4 * rc->avg_frame_bandwidth));
// Fast adaptation of minQ if necessary to use up the extra bits.
if (rc->avg_frame_bandwidth) {
twopass->extend_minq_fast =
(int)(rc->vbr_bits_off_target_fast * 8 / rc->avg_frame_bandwidth);
}
twopass->extend_minq_fast = MIN(twopass->extend_minq_fast,
minq_adj_limit - twopass->extend_minq);
twopass->extend_minq_fast = VPXMIN(
twopass->extend_minq_fast, minq_adj_limit - twopass->extend_minq);
} else if (rc->vbr_bits_off_target_fast) {
twopass->extend_minq_fast = MIN(twopass->extend_minq_fast,
minq_adj_limit - twopass->extend_minq);
twopass->extend_minq_fast = VPXMIN(
twopass->extend_minq_fast, minq_adj_limit - twopass->extend_minq);
} else {
twopass->extend_minq_fast = 0;
}

vp10/encoder/firstpass.h

@ -64,7 +64,6 @@ typedef struct {
double new_mv_count;
double duration;
double count;
int64_t spatial_layer_id;
} FIRSTPASS_STATS;
typedef enum {

vp10/encoder/lookahead.h

@ -14,11 +14,6 @@
#include "vpx_scale/yv12config.h"
#include "vpx/vpx_integer.h"
#if CONFIG_SPATIAL_SVC
#include "vpx/vp8cx.h"
#include "vpx/vpx_encoder.h"
#endif
#ifdef __cplusplus
extern "C" {
#endif

vp10/encoder/mbgraph.c

@ -13,6 +13,7 @@
#include "./vp10_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/system_state.h"
#include "vp10/encoder/segmentation.h"
@ -41,7 +42,7 @@ static unsigned int do_16x16_motion_iteration(VP10_COMP *cpi,
// Further step/diamond searches as necessary
int step_param = mv_sf->reduce_first_step_size;
step_param = MIN(step_param, MAX_MVSEARCH_STEPS - 2);
step_param = VPXMIN(step_param, MAX_MVSEARCH_STEPS - 2);
vp10_set_mv_search_range(x, ref_mv);

vp10/encoder/mcomp.c

@ -15,6 +15,7 @@
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/mem.h"
@ -37,10 +38,10 @@ void vp10_set_mv_search_range(MACROBLOCK *x, const MV *mv) {
int col_max = (mv->col >> 3) + MAX_FULL_PEL_VAL;
int row_max = (mv->row >> 3) + MAX_FULL_PEL_VAL;
col_min = MAX(col_min, (MV_LOW >> 3) + 1);
row_min = MAX(row_min, (MV_LOW >> 3) + 1);
col_max = MIN(col_max, (MV_UPP >> 3) - 1);
row_max = MIN(row_max, (MV_UPP >> 3) - 1);
col_min = VPXMAX(col_min, (MV_LOW >> 3) + 1);
row_min = VPXMAX(row_min, (MV_LOW >> 3) + 1);
col_max = VPXMIN(col_max, (MV_UPP >> 3) - 1);
row_max = VPXMIN(row_max, (MV_UPP >> 3) - 1);
// Get intersection of UMV window and valid MV window to reduce # of checks
// in diamond search.
@ -57,12 +58,12 @@ void vp10_set_mv_search_range(MACROBLOCK *x, const MV *mv) {
int vp10_init_search_range(int size) {
int sr = 0;
// Minimum search size no matter what the passed in value.
size = MAX(16, size);
size = VPXMAX(16, size);
while ((size << sr) < MAX_FULL_PEL_VAL)
sr++;
sr = MIN(sr, MAX_MVSEARCH_STEPS - 2);
sr = VPXMIN(sr, MAX_MVSEARCH_STEPS - 2);
return sr;
}
@ -297,10 +298,10 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) {
int br = bestmv->row * 8; \
int bc = bestmv->col * 8; \
int hstep = 4; \
const int minc = MAX(x->mv_col_min * 8, ref_mv->col - MV_MAX); \
const int maxc = MIN(x->mv_col_max * 8, ref_mv->col + MV_MAX); \
const int minr = MAX(x->mv_row_min * 8, ref_mv->row - MV_MAX); \
const int maxr = MIN(x->mv_row_max * 8, ref_mv->row + MV_MAX); \
const int minc = VPXMAX(x->mv_col_min * 8, ref_mv->col - MV_MAX); \
const int maxc = VPXMIN(x->mv_col_max * 8, ref_mv->col + MV_MAX); \
const int minr = VPXMAX(x->mv_row_min * 8, ref_mv->row - MV_MAX); \
const int maxr = VPXMIN(x->mv_row_max * 8, ref_mv->row + MV_MAX); \
int tr = br; \
int tc = bc; \
\
@ -668,10 +669,10 @@ int vp10_find_best_sub_pixel_tree(const MACROBLOCK *x,
int bc = bestmv->col * 8;
int hstep = 4;
int iter, round = 3 - forced_stop;
const int minc = MAX(x->mv_col_min * 8, ref_mv->col - MV_MAX);
const int maxc = MIN(x->mv_col_max * 8, ref_mv->col + MV_MAX);
const int minr = MAX(x->mv_row_min * 8, ref_mv->row - MV_MAX);
const int maxr = MIN(x->mv_row_max * 8, ref_mv->row + MV_MAX);
const int minc = VPXMAX(x->mv_col_min * 8, ref_mv->col - MV_MAX);
const int maxc = VPXMIN(x->mv_col_max * 8, ref_mv->col + MV_MAX);
const int minr = VPXMAX(x->mv_row_min * 8, ref_mv->row - MV_MAX);
const int maxr = VPXMIN(x->mv_row_max * 8, ref_mv->row + MV_MAX);
int tr = br;
int tc = bc;
const MV *search_step = search_step_table;
@ -1500,9 +1501,9 @@ int vp10_fast_hex_search(const MACROBLOCK *x,
int use_mvcost,
const MV *center_mv,
MV *best_mv) {
return vp10_hex_search(x, ref_mv, MAX(MAX_MVSEARCH_STEPS - 2, search_param),
sad_per_bit, do_init_search, cost_list, vfp, use_mvcost,
center_mv, best_mv);
return vp10_hex_search(
x, ref_mv, VPXMAX(MAX_MVSEARCH_STEPS - 2, search_param), sad_per_bit,
do_init_search, cost_list, vfp, use_mvcost, center_mv, best_mv);
}
int vp10_fast_dia_search(const MACROBLOCK *x,
@ -1515,9 +1516,9 @@ int vp10_fast_dia_search(const MACROBLOCK *x,
int use_mvcost,
const MV *center_mv,
MV *best_mv) {
return vp10_bigdia_search(x, ref_mv, MAX(MAX_MVSEARCH_STEPS - 2, search_param),
sad_per_bit, do_init_search, cost_list, vfp,
use_mvcost, center_mv, best_mv);
return vp10_bigdia_search(
x, ref_mv, VPXMAX(MAX_MVSEARCH_STEPS - 2, search_param), sad_per_bit,
do_init_search, cost_list, vfp, use_mvcost, center_mv, best_mv);
}
#undef CHECK_BETTER
@ -1547,10 +1548,10 @@ int vp10_full_range_search_c(const MACROBLOCK *x,
best_sad = fn_ptr->sdf(what->buf, what->stride,
get_buf_from_mv(in_what, ref_mv), in_what->stride) +
mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
start_row = MAX(-range, x->mv_row_min - ref_mv->row);
start_col = MAX(-range, x->mv_col_min - ref_mv->col);
end_row = MIN(range, x->mv_row_max - ref_mv->row);
end_col = MIN(range, x->mv_col_max - ref_mv->col);
start_row = VPXMAX(-range, x->mv_row_min - ref_mv->row);
start_col = VPXMAX(-range, x->mv_col_min - ref_mv->col);
end_row = VPXMIN(range, x->mv_row_max - ref_mv->row);
end_col = VPXMIN(range, x->mv_col_max - ref_mv->col);
for (r = start_row; r <= end_row; ++r) {
for (c = start_col; c <= end_col; c += 4) {
@ -2021,10 +2022,10 @@ int vp10_full_search_sad_c(const MACROBLOCK *x, const MV *ref_mv,
const MACROBLOCKD *const xd = &x->e_mbd;
const struct buf_2d *const what = &x->plane[0].src;
const struct buf_2d *const in_what = &xd->plane[0].pre[0];
const int row_min = MAX(ref_mv->row - distance, x->mv_row_min);
const int row_max = MIN(ref_mv->row + distance, x->mv_row_max);
const int col_min = MAX(ref_mv->col - distance, x->mv_col_min);
const int col_max = MIN(ref_mv->col + distance, x->mv_col_max);
const int row_min = VPXMAX(ref_mv->row - distance, x->mv_row_min);
const int row_max = VPXMIN(ref_mv->row + distance, x->mv_row_max);
const int col_min = VPXMAX(ref_mv->col - distance, x->mv_col_min);
const int col_max = VPXMIN(ref_mv->col + distance, x->mv_col_max);
const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
int best_sad = fn_ptr->sdf(what->buf, what->stride,
get_buf_from_mv(in_what, ref_mv), in_what->stride) +
@ -2054,10 +2055,10 @@ int vp10_full_search_sadx3(const MACROBLOCK *x, const MV *ref_mv,
const MACROBLOCKD *const xd = &x->e_mbd;
const struct buf_2d *const what = &x->plane[0].src;
const struct buf_2d *const in_what = &xd->plane[0].pre[0];
const int row_min = MAX(ref_mv->row - distance, x->mv_row_min);
const int row_max = MIN(ref_mv->row + distance, x->mv_row_max);
const int col_min = MAX(ref_mv->col - distance, x->mv_col_min);
const int col_max = MIN(ref_mv->col + distance, x->mv_col_max);
const int row_min = VPXMAX(ref_mv->row - distance, x->mv_row_min);
const int row_max = VPXMIN(ref_mv->row + distance, x->mv_row_max);
const int col_min = VPXMAX(ref_mv->col - distance, x->mv_col_min);
const int col_max = VPXMIN(ref_mv->col + distance, x->mv_col_max);
const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
unsigned int best_sad = fn_ptr->sdf(what->buf, what->stride,
get_buf_from_mv(in_what, ref_mv), in_what->stride) +
@ -2119,10 +2120,10 @@ int vp10_full_search_sadx8(const MACROBLOCK *x, const MV *ref_mv,
const MACROBLOCKD *const xd = &x->e_mbd;
const struct buf_2d *const what = &x->plane[0].src;
const struct buf_2d *const in_what = &xd->plane[0].pre[0];
const int row_min = MAX(ref_mv->row - distance, x->mv_row_min);
const int row_max = MIN(ref_mv->row + distance, x->mv_row_max);
const int col_min = MAX(ref_mv->col - distance, x->mv_col_min);
const int col_max = MIN(ref_mv->col + distance, x->mv_col_max);
const int row_min = VPXMAX(ref_mv->row - distance, x->mv_row_min);
const int row_max = VPXMIN(ref_mv->row + distance, x->mv_row_max);
const int col_min = VPXMAX(ref_mv->col - distance, x->mv_col_min);
const int col_max = VPXMIN(ref_mv->col + distance, x->mv_col_max);
const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
unsigned int best_sad = fn_ptr->sdf(what->buf, what->stride,
get_buf_from_mv(in_what, ref_mv), in_what->stride) +

vp10/encoder/picklpf.c

@ -13,6 +13,7 @@
#include "./vpx_scale_rtcd.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/mem.h"
@ -92,8 +93,8 @@ static int search_filter_level(const YV12_BUFFER_CONFIG *sd, VP10_COMP *cpi,
ss_err[filt_mid] = best_err;
while (filter_step > 0) {
const int filt_high = MIN(filt_mid + filter_step, max_filter_level);
const int filt_low = MAX(filt_mid - filter_step, min_filter_level);
const int filt_high = VPXMIN(filt_mid + filter_step, max_filter_level);
const int filt_low = VPXMAX(filt_mid - filter_step, min_filter_level);
// Bias against raising loop filter in favor of lowering it.
int64_t bias = (best_err >> (15 - (filt_mid / 8))) * filter_step;

vp10/encoder/pickmode.c (diff suppressed because it is too large)

vp10/encoder/pickmode.h

@ -1,38 +0,0 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP10_ENCODER_PICKMODE_H_
#define VP10_ENCODER_PICKMODE_H_
#include "vp10/encoder/encoder.h"
#ifdef __cplusplus
extern "C" {
#endif
void vp10_pick_intra_mode(VP10_COMP *cpi, MACROBLOCK *x, RD_COST *rd_cost,
BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx);
void vp10_pick_inter_mode(VP10_COMP *cpi, MACROBLOCK *x,
TileDataEnc *tile_data,
int mi_row, int mi_col, RD_COST *rd_cost,
BLOCK_SIZE bsize,
PICK_MODE_CONTEXT *ctx);
void vp10_pick_inter_mode_sub8x8(VP10_COMP *cpi, MACROBLOCK *x,
int mi_row, int mi_col, RD_COST *rd_cost,
BLOCK_SIZE bsize,
PICK_MODE_CONTEXT *ctx);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP10_ENCODER_PICKMODE_H_

vp10/encoder/ratectrl.c

@ -15,6 +15,7 @@
#include <stdlib.h>
#include <string.h>
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/mem.h"
#include "vpx_ports/system_state.h"
@ -106,8 +107,7 @@ static int kf_low = 400;
static int get_minq_index(double maxq, double x3, double x2, double x1,
vpx_bit_depth_t bit_depth) {
int i;
const double minqtarget = MIN(((x3 * maxq + x2) * maxq + x1) * maxq,
maxq);
const double minqtarget = VPXMIN(((x3 * maxq + x2) * maxq + x1) * maxq, maxq);
// Special case handling to deal with the step from q2.0
// down to lossless mode represented by q 1.0.
@ -192,15 +192,15 @@ int vp10_estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs,
vpx_bit_depth_t bit_depth) {
const int bpm = (int)(vp10_rc_bits_per_mb(frame_type, q, correction_factor,
bit_depth));
return MAX(FRAME_OVERHEAD_BITS,
(int)((uint64_t)bpm * mbs) >> BPER_MB_NORMBITS);
return VPXMAX(FRAME_OVERHEAD_BITS,
(int)((uint64_t)bpm * mbs) >> BPER_MB_NORMBITS);
}
int vp10_rc_clamp_pframe_target_size(const VP10_COMP *const cpi, int target) {
const RATE_CONTROL *rc = &cpi->rc;
const VP10EncoderConfig *oxcf = &cpi->oxcf;
const int min_frame_target = MAX(rc->min_frame_bandwidth,
rc->avg_frame_bandwidth >> 5);
const int min_frame_target = VPXMAX(rc->min_frame_bandwidth,
rc->avg_frame_bandwidth >> 5);
if (target < min_frame_target)
target = min_frame_target;
if (cpi->refresh_golden_frame && rc->is_src_frame_alt_ref) {
@ -216,7 +216,7 @@ int vp10_rc_clamp_pframe_target_size(const VP10_COMP *const cpi, int target) {
if (oxcf->rc_max_inter_bitrate_pct) {
const int max_rate = rc->avg_frame_bandwidth *
oxcf->rc_max_inter_bitrate_pct / 100;
target = MIN(target, max_rate);
target = VPXMIN(target, max_rate);
}
return target;
}
@ -227,34 +227,13 @@ int vp10_rc_clamp_iframe_target_size(const VP10_COMP *const cpi, int target) {
if (oxcf->rc_max_intra_bitrate_pct) {
const int max_rate = rc->avg_frame_bandwidth *
oxcf->rc_max_intra_bitrate_pct / 100;
target = MIN(target, max_rate);
target = VPXMIN(target, max_rate);
}
if (target > rc->max_frame_bandwidth)
target = rc->max_frame_bandwidth;
return target;
}
// Update the buffer level for higher temporal layers, given the size of
// the frame just encoded in the current temporal layer.
static void update_layer_buffer_level(SVC *svc, int encoded_frame_size) {
int i = 0;
int current_temporal_layer = svc->temporal_layer_id;
for (i = current_temporal_layer + 1;
i < svc->number_temporal_layers; ++i) {
const int layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, i,
svc->number_temporal_layers);
LAYER_CONTEXT *lc = &svc->layer_context[layer];
RATE_CONTROL *lrc = &lc->rc;
int bits_off_for_this_layer = (int)(lc->target_bandwidth / lc->framerate -
encoded_frame_size);
lrc->bits_off_target += bits_off_for_this_layer;
// Clip buffer level to maximum buffer size for the layer.
lrc->bits_off_target = MIN(lrc->bits_off_target, lrc->maximum_buffer_size);
lrc->buffer_level = lrc->bits_off_target;
}
}
// Update the buffer level: leaky bucket model.
static void update_buffer_level(VP10_COMP *cpi, int encoded_frame_size) {
const VP10_COMMON *const cm = &cpi->common;
@ -268,12 +247,8 @@ static void update_buffer_level(VP10_COMP *cpi, int encoded_frame_size) {
}
// Clip the buffer level to the maximum specified buffer size.
rc->bits_off_target = MIN(rc->bits_off_target, rc->maximum_buffer_size);
rc->bits_off_target = VPXMIN(rc->bits_off_target, rc->maximum_buffer_size);
rc->buffer_level = rc->bits_off_target;
if (is_one_pass_cbr_svc(cpi)) {
update_layer_buffer_level(&cpi->svc, encoded_frame_size);
}
}
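The hunk above keeps only the clip; for context, a minimal sketch of the leaky-bucket update that precedes it (assumed shape, not verbatim from this tree):

/* Refill by the per-frame bandwidth, drain by the bits actually spent,
   then clip to the bucket size (the clip is the line shown above). */
rc->bits_off_target += rc->avg_frame_bandwidth - encoded_frame_size;
rc->bits_off_target = VPXMIN(rc->bits_off_target, rc->maximum_buffer_size);
rc->buffer_level = rc->bits_off_target;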
int vp10_rc_get_default_min_gf_interval(
@ -287,8 +262,8 @@ int vp10_rc_get_default_min_gf_interval(
if (factor <= factor_safe)
return default_interval;
else
return MAX(default_interval,
(int)(MIN_GF_INTERVAL * factor / factor_safe + 0.5));
return VPXMAX(default_interval,
(int)(MIN_GF_INTERVAL * factor / factor_safe + 0.5));
// Note this logic makes:
// 4K24: 5
// 4K30: 6
@ -296,9 +271,9 @@ int vp10_rc_get_default_min_gf_interval(
}
int vp10_rc_get_default_max_gf_interval(double framerate, int min_gf_interval) {
int interval = MIN(MAX_GF_INTERVAL, (int)(framerate * 0.75));
int interval = VPXMIN(MAX_GF_INTERVAL, (int)(framerate * 0.75));
interval += (interval & 0x01); // Round to even value
return MAX(interval, min_gf_interval);
return VPXMAX(interval, min_gf_interval);
}
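A worked instance of the default max GF interval, assuming MAX_GF_INTERVAL is 16 in this tree:

/* framerate = 30.0, min_gf_interval = 5 (hypothetical inputs)
   interval = VPXMIN(16, (int)(30.0 * 0.75)) = VPXMIN(16, 22) = 16
   16 is already even, so the result is VPXMAX(16, 5) = 16. */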
void vp10_rc_init(const VP10EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) {
@ -408,7 +383,7 @@ static double get_rate_correction_factor(const VP10_COMP *cpi) {
rcf = rc->rate_correction_factors[rf_lvl];
} else {
if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) &&
!rc->is_src_frame_alt_ref && !cpi->use_svc &&
!rc->is_src_frame_alt_ref &&
(cpi->oxcf.rc_mode != VPX_CBR || cpi->oxcf.gf_cbr_boost_pct > 20))
rcf = rc->rate_correction_factors[GF_ARF_STD];
else
@ -434,7 +409,7 @@ static void set_rate_correction_factor(VP10_COMP *cpi, double factor) {
rc->rate_correction_factors[rf_lvl] = factor;
} else {
if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) &&
!rc->is_src_frame_alt_ref && !cpi->use_svc &&
!rc->is_src_frame_alt_ref &&
(cpi->oxcf.rc_mode != VPX_CBR || cpi->oxcf.gf_cbr_boost_pct > 20))
rc->rate_correction_factors[GF_ARF_STD] = factor;
else
@ -478,7 +453,7 @@ void vp10_rc_update_rate_correction_factors(VP10_COMP *cpi) {
// More heavily damped adjustment used if we have been oscillating either side
// of target.
adjustment_limit = 0.25 +
0.5 * MIN(1, fabs(log10(0.01 * correction_factor)));
0.5 * VPXMIN(1, fabs(log10(0.01 * correction_factor)));
cpi->rc.q_2_frame = cpi->rc.q_1_frame;
cpi->rc.q_1_frame = cm->base_qindex;
@ -529,10 +504,7 @@ int vp10_rc_regulate_q(const VP10_COMP *cpi, int target_bits_per_frame,
i = active_best_quality;
do {
if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
cm->seg.enabled &&
cpi->svc.temporal_layer_id == 0 &&
cpi->svc.spatial_layer_id == 0) {
if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) {
bits_per_mb_at_this_q =
(int)vp10_cyclic_refresh_rc_bits_per_mb(cpi, i, correction_factor);
} else {
@ -558,8 +530,8 @@ int vp10_rc_regulate_q(const VP10_COMP *cpi, int target_bits_per_frame,
if (cpi->oxcf.rc_mode == VPX_CBR &&
(cpi->rc.rc_1_frame * cpi->rc.rc_2_frame == -1) &&
cpi->rc.q_1_frame != cpi->rc.q_2_frame) {
q = clamp(q, MIN(cpi->rc.q_1_frame, cpi->rc.q_2_frame),
MAX(cpi->rc.q_1_frame, cpi->rc.q_2_frame));
q = clamp(q, VPXMIN(cpi->rc.q_1_frame, cpi->rc.q_2_frame),
VPXMAX(cpi->rc.q_1_frame, cpi->rc.q_2_frame));
}
return q;
}
@ -617,7 +589,7 @@ static int calc_active_worst_quality_one_pass_vbr(const VP10_COMP *cpi) {
: rc->last_q[INTER_FRAME] * 2;
}
}
return MIN(active_worst_quality, rc->worst_quality);
return VPXMIN(active_worst_quality, rc->worst_quality);
}
// Adjust active_worst_quality level based on buffer level.
@ -643,10 +615,10 @@ static int calc_active_worst_quality_one_pass_cbr(const VP10_COMP *cpi) {
// So for the first few frames following a key frame, the qp of that key
// frame is weighted into the active_worst_quality setting.
ambient_qp = (cm->current_video_frame < 5) ?
MIN(rc->avg_frame_qindex[INTER_FRAME], rc->avg_frame_qindex[KEY_FRAME]) :
rc->avg_frame_qindex[INTER_FRAME];
active_worst_quality = MIN(rc->worst_quality,
ambient_qp * 5 / 4);
VPXMIN(rc->avg_frame_qindex[INTER_FRAME],
rc->avg_frame_qindex[KEY_FRAME]) :
rc->avg_frame_qindex[INTER_FRAME];
active_worst_quality = VPXMIN(rc->worst_quality, ambient_qp * 5 / 4);
if (rc->buffer_level > rc->optimal_buffer_level) {
// Adjust down.
// Maximum limit for down adjustment, ~30%.
@ -699,7 +671,7 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const VP10_COMP *cpi,
int delta_qindex = vp10_compute_qdelta(rc, last_boosted_q,
(last_boosted_q * 0.75),
cm->bit_depth);
active_best_quality = MAX(qindex + delta_qindex, rc->best_quality);
active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality);
} else if (cm->current_video_frame > 0) {
// not first frame of one pass and kf_boost is set
double q_adj_factor = 1.0;
@ -722,7 +694,6 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const VP10_COMP *cpi,
cm->bit_depth);
}
} else if (!rc->is_src_frame_alt_ref &&
!cpi->use_svc &&
(cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
// Use the lower of active_worst_quality and recent
// average Q as basis for GF/ARF best Q limit unless last frame was
@ -833,7 +804,7 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP10_COMP *cpi,
int delta_qindex = vp10_compute_qdelta(rc, last_boosted_q,
last_boosted_q * 0.75,
cm->bit_depth);
active_best_quality = MAX(qindex + delta_qindex, rc->best_quality);
active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality);
} else {
// not first frame of one pass and kf_boost is set
double q_adj_factor = 1.0;
@ -992,7 +963,7 @@ static int rc_pick_q_and_bounds_two_pass(const VP10_COMP *cpi,
int *inter_minq;
ASSIGN_MINQ_TABLE(cm->bit_depth, inter_minq);
if (frame_is_intra_only(cm) || vp10_is_upper_layer_key_frame(cpi)) {
if (frame_is_intra_only(cm)) {
// Handle the special case for key frames forced when we have reached
// the maximum key frame interval. Here force the Q to a range
// based on the ambient Q to reduce the risk of popping.
@ -1002,21 +973,21 @@ static int rc_pick_q_and_bounds_two_pass(const VP10_COMP *cpi,
int qindex;
if (cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) {
qindex = MIN(rc->last_kf_qindex, rc->last_boosted_qindex);
qindex = VPXMIN(rc->last_kf_qindex, rc->last_boosted_qindex);
active_best_quality = qindex;
last_boosted_q = vp10_convert_qindex_to_q(qindex, cm->bit_depth);
delta_qindex = vp10_compute_qdelta(rc, last_boosted_q,
last_boosted_q * 1.25,
cm->bit_depth);
active_worst_quality = MIN(qindex + delta_qindex, active_worst_quality);
active_worst_quality =
VPXMIN(qindex + delta_qindex, active_worst_quality);
} else {
qindex = rc->last_boosted_qindex;
last_boosted_q = vp10_convert_qindex_to_q(qindex, cm->bit_depth);
delta_qindex = vp10_compute_qdelta(rc, last_boosted_q,
last_boosted_q * 0.75,
cm->bit_depth);
active_best_quality = MAX(qindex + delta_qindex, rc->best_quality);
active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality);
}
} else {
// Not forced keyframe.
@ -1111,13 +1082,13 @@ static int rc_pick_q_and_bounds_two_pass(const VP10_COMP *cpi,
#if LIMIT_QRANGE_FOR_ALTREF_AND_KEY
vpx_clear_system_state();
// Static forced key frames Q restrictions dealt with elsewhere.
if (!((frame_is_intra_only(cm) || vp10_is_upper_layer_key_frame(cpi))) ||
if (!(frame_is_intra_only(cm)) ||
!rc->this_key_frame_forced ||
(cpi->twopass.last_kfgroup_zeromotion_pct < STATIC_MOTION_THRESH)) {
int qdelta = vp10_frame_type_qdelta(cpi, gf_group->rf_level[gf_group->index],
active_worst_quality);
active_worst_quality = MAX(active_worst_quality + qdelta,
active_best_quality);
active_worst_quality = VPXMAX(active_worst_quality + qdelta,
active_best_quality);
}
#endif
@ -1126,7 +1097,8 @@ static int rc_pick_q_and_bounds_two_pass(const VP10_COMP *cpi,
int qdelta = vp10_compute_qdelta_by_rate(rc, cm->frame_type,
active_best_quality, 2.0,
cm->bit_depth);
active_best_quality = MAX(active_best_quality + qdelta, rc->best_quality);
active_best_quality =
VPXMAX(active_best_quality + qdelta, rc->best_quality);
}
active_best_quality = clamp(active_best_quality,
@ -1137,11 +1109,10 @@ static int rc_pick_q_and_bounds_two_pass(const VP10_COMP *cpi,
if (oxcf->rc_mode == VPX_Q) {
q = active_best_quality;
// Special case code to try and match quality with forced key frames.
} else if ((frame_is_intra_only(cm) || vp10_is_upper_layer_key_frame(cpi)) &&
rc->this_key_frame_forced) {
} else if (frame_is_intra_only(cm) && rc->this_key_frame_forced) {
// If static since last kf use better of last boosted and last kf q.
if (cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) {
q = MIN(rc->last_kf_qindex, rc->last_boosted_qindex);
q = VPXMIN(rc->last_kf_qindex, rc->last_boosted_qindex);
} else {
q = rc->last_boosted_qindex;
}
@ -1180,15 +1151,7 @@ int vp10_rc_pick_q_and_bounds(const VP10_COMP *cpi,
} else {
q = rc_pick_q_and_bounds_two_pass(cpi, bottom_index, top_index);
}
if (cpi->sf.use_nonrd_pick_mode) {
if (cpi->sf.force_frame_boost == 1)
q -= cpi->sf.max_delta_qindex;
if (q < *bottom_index)
*bottom_index = q;
else if (q > *top_index)
*top_index = q;
}
return q;
}
@ -1203,9 +1166,9 @@ void vp10_rc_compute_frame_size_bounds(const VP10_COMP *cpi,
    // For very small rate targets where the fractional adjustment
    // may be tiny, make sure there is at least a minimum range.
const int tolerance = (cpi->sf.recode_tolerance * frame_target) / 100;
*frame_under_shoot_limit = MAX(frame_target - tolerance - 200, 0);
*frame_over_shoot_limit = MIN(frame_target + tolerance + 200,
cpi->rc.max_frame_bandwidth);
*frame_under_shoot_limit = VPXMAX(frame_target - tolerance - 200, 0);
*frame_over_shoot_limit = VPXMIN(frame_target + tolerance + 200,
cpi->rc.max_frame_bandwidth);
}
}
@ -1288,8 +1251,7 @@ void vp10_rc_postencode_update(VP10_COMP *cpi, uint64_t bytes_used) {
ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[KEY_FRAME] + qindex, 2);
} else {
if (rc->is_src_frame_alt_ref ||
!(cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) ||
(cpi->use_svc && oxcf->rc_mode == VPX_CBR)) {
!(cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
rc->last_q[INTER_FRAME] = qindex;
rc->avg_frame_qindex[INTER_FRAME] =
ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[INTER_FRAME] + qindex, 2);
@ -1441,10 +1403,10 @@ void vp10_rc_get_one_pass_vbr_params(VP10_COMP *cpi) {
static int calc_pframe_target_size_one_pass_cbr(const VP10_COMP *cpi) {
const VP10EncoderConfig *oxcf = &cpi->oxcf;
const RATE_CONTROL *rc = &cpi->rc;
const SVC *const svc = &cpi->svc;
const int64_t diff = rc->optimal_buffer_level - rc->buffer_level;
const int64_t one_pct_bits = 1 + rc->optimal_buffer_level / 100;
int min_frame_target = MAX(rc->avg_frame_bandwidth >> 4, FRAME_OVERHEAD_BITS);
int min_frame_target =
VPXMAX(rc->avg_frame_bandwidth >> 4, FRAME_OVERHEAD_BITS);
int target;
if (oxcf->gf_cbr_boost_pct) {
@ -1457,38 +1419,27 @@ static int calc_pframe_target_size_one_pass_cbr(const VP10_COMP *cpi) {
} else {
target = rc->avg_frame_bandwidth;
}
if (is_one_pass_cbr_svc(cpi)) {
// Note that for layers, avg_frame_bandwidth is the cumulative
// per-frame-bandwidth. For the target size of this frame, use the
// layer average frame size (i.e., non-cumulative per-frame-bw).
int layer =
LAYER_IDS_TO_IDX(svc->spatial_layer_id,
svc->temporal_layer_id, svc->number_temporal_layers);
const LAYER_CONTEXT *lc = &svc->layer_context[layer];
target = lc->avg_frame_size;
min_frame_target = MAX(lc->avg_frame_size >> 4, FRAME_OVERHEAD_BITS);
}
if (diff > 0) {
// Lower the target bandwidth for this frame.
const int pct_low = (int)MIN(diff / one_pct_bits, oxcf->under_shoot_pct);
const int pct_low = (int)VPXMIN(diff / one_pct_bits, oxcf->under_shoot_pct);
target -= (target * pct_low) / 200;
} else if (diff < 0) {
// Increase the target bandwidth for this frame.
const int pct_high = (int)MIN(-diff / one_pct_bits, oxcf->over_shoot_pct);
const int pct_high =
(int)VPXMIN(-diff / one_pct_bits, oxcf->over_shoot_pct);
target += (target * pct_high) / 200;
}
if (oxcf->rc_max_inter_bitrate_pct) {
const int max_rate = rc->avg_frame_bandwidth *
oxcf->rc_max_inter_bitrate_pct / 100;
target = MIN(target, max_rate);
target = VPXMIN(target, max_rate);
}
return MAX(min_frame_target, target);
return VPXMAX(min_frame_target, target);
}
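A worked illustration of the buffer feedback above (all numbers hypothetical):

// Suppose optimal_buffer_level = 600000 bits and buffer_level = 450000:
//   diff = 150000, one_pct_bits = 1 + 600000 / 100 = 6001
//   pct_low = VPXMIN(150000 / 6001, under_shoot_pct) = VPXMIN(24, 50) = 24
//   target -= target * 24 / 200, a 12% cut; dividing by 200 rather than 100
//   caps the correction at half the configured shoot percentage.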
static int calc_iframe_target_size_one_pass_cbr(const VP10_COMP *cpi) {
const RATE_CONTROL *rc = &cpi->rc;
const VP10EncoderConfig *oxcf = &cpi->oxcf;
const SVC *const svc = &cpi->svc;
int target;
if (cpi->common.current_video_frame == 0) {
target = ((rc->starting_buffer_level / 2) > INT_MAX)
@ -1496,15 +1447,8 @@ static int calc_iframe_target_size_one_pass_cbr(const VP10_COMP *cpi) {
} else {
int kf_boost = 32;
double framerate = cpi->framerate;
if (svc->number_temporal_layers > 1 &&
oxcf->rc_mode == VPX_CBR) {
// Use the layer framerate for temporal layers CBR mode.
const int layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id,
svc->temporal_layer_id, svc->number_temporal_layers);
const LAYER_CONTEXT *lc = &svc->layer_context[layer];
framerate = lc->framerate;
}
kf_boost = MAX(kf_boost, (int)(2 * framerate - 16));
kf_boost = VPXMAX(kf_boost, (int)(2 * framerate - 16));
if (rc->frames_since_key < framerate / 2) {
kf_boost = (int)(kf_boost * rc->frames_since_key /
(framerate / 2));
@ -1514,82 +1458,6 @@ static int calc_iframe_target_size_one_pass_cbr(const VP10_COMP *cpi) {
return vp10_rc_clamp_iframe_target_size(cpi, target);
}
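A quick numeric sketch of the key-frame boost logic above (30 fps is a hypothetical input):

// kf_boost = VPXMAX(32, (int)(2 * 30 - 16)) = 44 at 30 fps.
// A key frame arriving only 5 frames after the last one (5 < 30 / 2) is
// scaled down: kf_boost = 44 * 5 / (30 / 2) = 14, so closely spaced key
// frames receive a smaller boost.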
// Reset information needed to set proper reference frames and buffer updates
// for temporal layering. This is called when a key frame is encoded.
static void reset_temporal_layer_to_zero(VP10_COMP *cpi) {
int sl;
LAYER_CONTEXT *lc = NULL;
cpi->svc.temporal_layer_id = 0;
for (sl = 0; sl < cpi->svc.number_spatial_layers; ++sl) {
lc = &cpi->svc.layer_context[sl * cpi->svc.number_temporal_layers];
lc->current_video_frame_in_layer = 0;
lc->frames_from_key_frame = 0;
}
}
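For context, the LAYER_IDS_TO_IDX() macro used by the SVC code being removed here flattens a (spatial, temporal) layer pair into a single layer_context[] index; assuming it matches the vp9 original, it reduces to:

#define LAYER_IDS_TO_IDX(sl, tl, num_tl) ((sl) * (num_tl) + (tl))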
void vp10_rc_get_svc_params(VP10_COMP *cpi) {
VP10_COMMON *const cm = &cpi->common;
RATE_CONTROL *const rc = &cpi->rc;
int target = rc->avg_frame_bandwidth;
const int layer = LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id,
cpi->svc.temporal_layer_id, cpi->svc.number_temporal_layers);
if ((cm->current_video_frame == 0) ||
(cpi->frame_flags & FRAMEFLAGS_KEY) ||
(cpi->oxcf.auto_key && (rc->frames_since_key %
cpi->oxcf.key_freq == 0))) {
cm->frame_type = KEY_FRAME;
rc->source_alt_ref_active = 0;
if (is_two_pass_svc(cpi)) {
cpi->svc.layer_context[layer].is_key_frame = 1;
cpi->ref_frame_flags &=
(~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG);
} else if (is_one_pass_cbr_svc(cpi)) {
cpi->svc.layer_context[layer].is_key_frame = 1;
reset_temporal_layer_to_zero(cpi);
cpi->ref_frame_flags &=
(~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG);
// Assumption here is that LAST_FRAME is being updated for a keyframe.
// Thus no change in update flags.
target = calc_iframe_target_size_one_pass_cbr(cpi);
}
} else {
cm->frame_type = INTER_FRAME;
if (is_two_pass_svc(cpi)) {
LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer];
if (cpi->svc.spatial_layer_id == 0) {
lc->is_key_frame = 0;
} else {
lc->is_key_frame =
cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame;
if (lc->is_key_frame)
cpi->ref_frame_flags &= (~VP9_LAST_FLAG);
}
cpi->ref_frame_flags &= (~VP9_ALT_FLAG);
} else if (is_one_pass_cbr_svc(cpi)) {
LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer];
if (cpi->svc.spatial_layer_id == 0) {
lc->is_key_frame = 0;
} else {
lc->is_key_frame =
cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame;
}
target = calc_pframe_target_size_one_pass_cbr(cpi);
}
}
// Any update/change of global cyclic refresh parameters (amount/delta-qp)
// should be done here, before the frame qp is selected.
if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
vp10_cyclic_refresh_update_parameters(cpi);
vp10_rc_set_frame_target(cpi, target);
rc->frames_till_gf_update_due = INT_MAX;
rc->baseline_gf_interval = INT_MAX;
}
void vp10_rc_get_one_pass_cbr_params(VP10_COMP *cpi) {
VP10_COMMON *const cm = &cpi->common;
RATE_CONTROL *const rc = &cpi->rc;
@ -1712,7 +1580,7 @@ void vp10_rc_set_gf_interval_range(const VP10_COMP *const cpi,
rc->max_gf_interval = rc->static_scene_max_gf_interval;
// Clamp min to max
rc->min_gf_interval = MIN(rc->min_gf_interval, rc->max_gf_interval);
rc->min_gf_interval = VPXMIN(rc->min_gf_interval, rc->max_gf_interval);
}
void vp10_rc_update_framerate(VP10_COMP *cpi) {
@ -1725,7 +1593,8 @@ void vp10_rc_update_framerate(VP10_COMP *cpi) {
rc->min_frame_bandwidth = (int)(rc->avg_frame_bandwidth *
oxcf->two_pass_vbrmin_section / 100);
rc->min_frame_bandwidth = MAX(rc->min_frame_bandwidth, FRAME_OVERHEAD_BITS);
rc->min_frame_bandwidth =
VPXMAX(rc->min_frame_bandwidth, FRAME_OVERHEAD_BITS);
// A maximum bitrate for a frame is defined.
// The baseline for this aligns with HW implementations that
@ -1736,8 +1605,8 @@ void vp10_rc_update_framerate(VP10_COMP *cpi) {
// specifies lossless encode.
vbr_max_bits = (int)(((int64_t)rc->avg_frame_bandwidth *
oxcf->two_pass_vbrmax_section) / 100);
rc->max_frame_bandwidth = MAX(MAX((cm->MBs * MAX_MB_RATE), MAXRATE_1080P),
vbr_max_bits);
rc->max_frame_bandwidth =
VPXMAX(VPXMAX((cm->MBs * MAX_MB_RATE), MAXRATE_1080P), vbr_max_bits);
vp10_rc_set_gf_interval_range(cpi, rc);
}
@ -1775,12 +1644,12 @@ static void vbr_rate_correction(VP10_COMP *cpi, int *this_frame_target) {
// Don't do it for kf, arf, gf or overlay frames.
if (!frame_is_kf_gf_arf(cpi) && !rc->is_src_frame_alt_ref &&
rc->vbr_bits_off_target_fast) {
int one_frame_bits = MAX(rc->avg_frame_bandwidth, *this_frame_target);
int one_frame_bits = VPXMAX(rc->avg_frame_bandwidth, *this_frame_target);
int fast_extra_bits;
fast_extra_bits =
(int)MIN(rc->vbr_bits_off_target_fast, one_frame_bits);
fast_extra_bits = (int)MIN(fast_extra_bits,
MAX(one_frame_bits / 8, rc->vbr_bits_off_target_fast / 8));
fast_extra_bits = (int)VPXMIN(rc->vbr_bits_off_target_fast, one_frame_bits);
fast_extra_bits = (int)VPXMIN(
fast_extra_bits,
VPXMAX(one_frame_bits / 8, rc->vbr_bits_off_target_fast / 8));
*this_frame_target += (int)fast_extra_bits;
rc->vbr_bits_off_target_fast -= fast_extra_bits;
}
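A worked example of the two-stage clamp above (numbers hypothetical):

// Suppose vbr_bits_off_target_fast = 100000 and one_frame_bits = 40000:
//   fast_extra_bits = VPXMIN(100000, 40000) = 40000
//   then VPXMIN(40000, VPXMAX(40000 / 8, 100000 / 8)) = 12500,
// so at most roughly 1/8 of the outstanding deficit is repaid per frame.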


@ -169,7 +169,6 @@ int vp10_rc_get_default_max_gf_interval(double framerate, int min_frame_rate);
// First call per frame, one of:
// vp10_rc_get_one_pass_vbr_params()
// vp10_rc_get_one_pass_cbr_params()
// vp10_rc_get_svc_params()
// vp10_rc_get_first_pass_params()
// vp10_rc_get_second_pass_params()
// depending on the usage to set the rate control encode parameters desired.
@ -190,7 +189,6 @@ int vp10_rc_get_default_max_gf_interval(double framerate, int min_frame_rate);
// encode_frame_to_data_rate() function.
void vp10_rc_get_one_pass_vbr_params(struct VP10_COMP *cpi);
void vp10_rc_get_one_pass_cbr_params(struct VP10_COMP *cpi);
void vp10_rc_get_svc_params(struct VP10_COMP *cpi);
// Post encode update of the rate control parameters based
// on bytes used
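Putting the comments above together, a typical one-pass CBR frame follows this call order (a sketch only; the encode step is elided):

// vp10_rc_get_one_pass_cbr_params(cpi);              // set frame target
// q = vp10_rc_pick_q_and_bounds(cpi, &bottom, &top); // choose quantizer
// ... encode the frame ...
// vp10_rc_postencode_update(cpi, bytes_used);        // feed back bytes used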


@ -14,6 +14,7 @@
#include "./vp10_rtcd.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/bitops.h"
#include "vpx_ports/mem.h"
@ -177,7 +178,7 @@ int vp10_compute_rd_mult(const VP10_COMP *cpi, int qindex) {
if (cpi->oxcf.pass == 2 && (cpi->common.frame_type != KEY_FRAME)) {
const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
const FRAME_UPDATE_TYPE frame_type = gf_group->update_type[gf_group->index];
const int boost_index = MIN(15, (cpi->rc.gfu_boost / 100));
const int boost_index = VPXMIN(15, (cpi->rc.gfu_boost / 100));
rdmult = (rdmult * rd_frame_type_factor[frame_type]) >> 7;
rdmult += ((rdmult * rd_boost_factor[boost_index]) >> 7);
@ -209,7 +210,7 @@ static int compute_rd_thresh_factor(int qindex, vpx_bit_depth_t bit_depth) {
q = vp10_dc_quant(qindex, 0, VPX_BITS_8) / 4.0;
#endif // CONFIG_VP9_HIGHBITDEPTH
// TODO(debargha): Adjust the function below.
return MAX((int)(pow(q, RD_THRESH_POW) * 5.12), 8);
return VPXMAX((int)(pow(q, RD_THRESH_POW) * 5.12), 8);
}
void vp10_initialize_me_consts(VP10_COMP *cpi, MACROBLOCK *x, int qindex) {
@ -290,8 +291,7 @@ void vp10_initialize_rd_consts(VP10_COMP *cpi) {
set_block_thresholds(cm, rd);
set_partition_probs(cm, xd);
if (!cpi->sf.use_nonrd_pick_mode || cm->frame_type == KEY_FRAME)
fill_token_costs(x->token_costs, cm->fc->coef_probs);
fill_token_costs(x->token_costs, cm->fc->coef_probs);
if (cpi->sf.partition_search_type != VAR_BASED_PARTITION ||
cm->frame_type == KEY_FRAME) {
@ -300,20 +300,17 @@ void vp10_initialize_rd_consts(VP10_COMP *cpi) {
vp10_partition_tree);
}
if (!cpi->sf.use_nonrd_pick_mode || (cm->current_video_frame & 0x07) == 1 ||
cm->frame_type == KEY_FRAME) {
fill_mode_costs(cpi);
fill_mode_costs(cpi);
if (!frame_is_intra_only(cm)) {
vp10_build_nmv_cost_table(x->nmvjointcost,
cm->allow_high_precision_mv ? x->nmvcost_hp
: x->nmvcost,
&cm->fc->nmvc, cm->allow_high_precision_mv);
if (!frame_is_intra_only(cm)) {
vp10_build_nmv_cost_table(x->nmvjointcost,
cm->allow_high_precision_mv ? x->nmvcost_hp
: x->nmvcost,
&cm->fc->nmvc, cm->allow_high_precision_mv);
for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
vp10_cost_tokens((int *)cpi->inter_mode_cost[i],
cm->fc->inter_mode_probs[i], vp10_inter_mode_tree);
}
for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
vp10_cost_tokens((int *)cpi->inter_mode_cost[i],
cm->fc->inter_mode_probs[i], vp10_inter_mode_tree);
}
}
@ -409,7 +406,7 @@ void vp10_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2,
static const uint32_t MAX_XSQ_Q10 = 245727;
const uint64_t xsq_q10_64 =
(((uint64_t)qstep * qstep << (n_log2 + 10)) + (var >> 1)) / var;
const int xsq_q10 = (int)MIN(xsq_q10_64, MAX_XSQ_Q10);
const int xsq_q10 = (int)VPXMIN(xsq_q10_64, MAX_XSQ_Q10);
model_rd_norm(xsq_q10, &r_q10, &d_q10);
*rate = ((r_q10 << n_log2) + 2) >> 2;
*dist = (var * (int64_t)d_q10 + 512) >> 10;
@ -490,7 +487,7 @@ void vp10_mv_pred(VP10_COMP *cpi, MACROBLOCK *x,
continue;
fp_row = (this_mv->row + 3 + (this_mv->row >= 0)) >> 3;
fp_col = (this_mv->col + 3 + (this_mv->col >= 0)) >> 3;
max_mv = MAX(max_mv, MAX(abs(this_mv->row), abs(this_mv->col)) >> 3);
max_mv = VPXMAX(max_mv, VPXMAX(abs(this_mv->row), abs(this_mv->col)) >> 3);
if (fp_row == 0 && fp_col == 0 && zero_seen)
continue;
@ -635,16 +632,15 @@ void vp10_update_rd_thresh_fact(int (*factor_buf)[MAX_MODES], int rd_thresh,
const int top_mode = bsize < BLOCK_8X8 ? MAX_REFS : MAX_MODES;
int mode;
for (mode = 0; mode < top_mode; ++mode) {
const BLOCK_SIZE min_size = MAX(bsize - 1, BLOCK_4X4);
const BLOCK_SIZE max_size = MIN(bsize + 2, BLOCK_64X64);
const BLOCK_SIZE min_size = VPXMAX(bsize - 1, BLOCK_4X4);
const BLOCK_SIZE max_size = VPXMIN(bsize + 2, BLOCK_64X64);
BLOCK_SIZE bs;
for (bs = min_size; bs <= max_size; ++bs) {
int *const fact = &factor_buf[bs][mode];
if (mode == best_mode_index) {
*fact -= (*fact >> 4);
} else {
*fact = MIN(*fact + RD_THRESH_INC,
rd_thresh * RD_THRESH_MAX_FACT);
*fact = VPXMIN(*fact + RD_THRESH_INC, rd_thresh * RD_THRESH_MAX_FACT);
}
}
}


@ -14,6 +14,7 @@
#include "./vp10_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/mem.h"
#include "vpx_ports/system_state.h"
@ -196,8 +197,8 @@ static void model_rd_for_sb(VP10_COMP *cpi, BLOCK_SIZE bsize,
const int64_t ac_thr = p->quant_thred[1] >> shift;
// The low thresholds are used to measure if the prediction errors are
// low enough so that we can skip the mode search.
const int64_t low_dc_thr = MIN(50, dc_thr >> 2);
const int64_t low_ac_thr = MIN(80, ac_thr >> 2);
const int64_t low_dc_thr = VPXMIN(50, dc_thr >> 2);
const int64_t low_ac_thr = VPXMIN(80, ac_thr >> 2);
int bw = 1 << (b_width_log2_lookup[bs] - b_width_log2_lookup[unit_size]);
int bh = 1 << (b_height_log2_lookup[bs] - b_width_log2_lookup[unit_size]);
int idx, idy;
@ -441,19 +442,6 @@ static void dist_block(MACROBLOCK *x, int plane, int block, TX_SIZE tx_size,
&this_sse) >> shift;
#endif // CONFIG_VP9_HIGHBITDEPTH
*out_sse = this_sse >> shift;
if (x->skip_encode && !is_inter_block(&xd->mi[0]->mbmi)) {
// TODO(jingning): tune the model to better capture the distortion.
int64_t p = (pd->dequant[1] * pd->dequant[1] *
(1 << ss_txfrm_size)) >>
#if CONFIG_VP9_HIGHBITDEPTH
(shift + 2 + (bd - 8) * 2);
#else
(shift + 2);
#endif // CONFIG_VP9_HIGHBITDEPTH
*out_dist += (p >> 4);
*out_sse += p;
}
}
static int rate_block(int plane, int block, BLOCK_SIZE plane_bsize,
@ -509,7 +497,7 @@ static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
if (tx_size != TX_32X32)
dc_correct >>= 2;
dist = MAX(0, sse - dc_correct);
dist = VPXMAX(0, sse - dc_correct);
}
} else {
// SKIP_TXFM_AC_DC
@ -535,7 +523,7 @@ static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
rd2 = RDCOST(x->rdmult, x->rddiv, 0, sse);
// TODO(jingning): temporarily enabled only for luma component
rd = MIN(rd1, rd2);
rd = VPXMIN(rd1, rd2);
if (plane == 0)
x->zcoeff_blk[tx_size][block] = !x->plane[plane].eobs[block] ||
(rd1 > rd2 && !xd->lossless);
@ -603,7 +591,7 @@ static void choose_largest_tx_size(VP10_COMP *cpi, MACROBLOCK *x,
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
mbmi->tx_size = MIN(max_tx_size, largest_tx_size);
mbmi->tx_size = VPXMIN(max_tx_size, largest_tx_size);
#if CONFIG_EXT_TX
if (mbmi->ext_txfrm >= GET_EXT_TX_TYPES(mbmi->tx_size)) {
*rate = INT_MAX;
@ -659,8 +647,8 @@ static void choose_tx_size_from_rd(VP10_COMP *cpi, MACROBLOCK *x,
start_tx = max_tx_size;
end_tx = 0;
} else {
TX_SIZE chosen_tx_size = MIN(max_tx_size,
tx_mode_to_biggest_tx_size[cm->tx_mode]);
TX_SIZE chosen_tx_size = VPXMIN(max_tx_size,
tx_mode_to_biggest_tx_size[cm->tx_mode]);
start_tx = chosen_tx_size;
end_tx = chosen_tx_size;
}
@ -832,9 +820,7 @@ static int64_t rd_pick_intra4x4block(VP10_COMP *cpi, MACROBLOCK *x,
p->src_diff);
tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
xd->mi[0]->bmi[block].as_mode = mode;
vp10_predict_intra_block(xd, 1, TX_4X4, mode,
x->skip_encode ? src : dst,
x->skip_encode ? src_stride : dst_stride,
vp10_predict_intra_block(xd, 1, TX_4X4, mode, dst, dst_stride,
dst, dst_stride,
col + idx, row + idy, 0);
vpx_highbd_subtract_block(4, 4, src_diff, 8, src, src_stride,
@ -897,7 +883,7 @@ static int64_t rd_pick_intra4x4block(VP10_COMP *cpi, MACROBLOCK *x,
next_highbd:
{}
}
if (best_rd >= rd_thresh || x->skip_encode)
if (best_rd >= rd_thresh)
return best_rd;
for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy) {
@ -938,9 +924,7 @@ static int64_t rd_pick_intra4x4block(VP10_COMP *cpi, MACROBLOCK *x,
vp10_raster_block_offset_int16(BLOCK_8X8, block, p->src_diff);
tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
xd->mi[0]->bmi[block].as_mode = mode;
vp10_predict_intra_block(xd, 1, TX_4X4, mode,
x->skip_encode ? src : dst,
x->skip_encode ? src_stride : dst_stride,
vp10_predict_intra_block(xd, 1, TX_4X4, mode, dst, dst_stride,
dst, dst_stride, col + idx, row + idy, 0);
vpx_subtract_block(4, 4, src_diff, 8, src, src_stride, dst, dst_stride);
@ -996,7 +980,7 @@ static int64_t rd_pick_intra4x4block(VP10_COMP *cpi, MACROBLOCK *x,
{}
}
if (best_rd >= rd_thresh || x->skip_encode)
if (best_rd >= rd_thresh)
return best_rd;
for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy)
@ -1096,16 +1080,6 @@ static int64_t rd_pick_intra_sby_mode(VP10_COMP *cpi, MACROBLOCK *x,
memset(x->skip_txfm, SKIP_TXFM_NONE, sizeof(x->skip_txfm));
/* Y Search for intra prediction mode */
for (mode = DC_PRED; mode <= TM_PRED; mode++) {
if (cpi->sf.use_nonrd_pick_mode) {
// These speed features are turned on in hybrid non-RD and RD mode
// for key frame coding in the context of real-time setting.
if (conditional_skipintra(mode, mode_selected))
continue;
if (*skippable)
break;
}
mic->mbmi.mode = mode;
super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion,
@ -1435,7 +1409,7 @@ static int64_t encode_inter_mb_segment(VP10_COMP *cpi,
cpi->sf.use_fast_coef_costing);
rd1 = RDCOST(x->rdmult, x->rddiv, thisrate, thisdistortion >> 2);
rd2 = RDCOST(x->rdmult, x->rddiv, 0, thissse >> 2);
rd = MIN(rd1, rd2);
rd = VPXMIN(rd1, rd2);
if (rd >= best_yrd)
return INT64_MAX;
}
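For context, RDCOST() used in these comparisons combines rate and distortion on one scale; a sketch of its definition as of this era of the code (rd.h), where rdmult is a Q8 rate multiplier and rddiv a distortion shift:

#define RDCOST(RM, DM, R, D) \
  (((128 + ((int64_t)R) * (RM)) >> 8) + (D << DM))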
@ -1854,7 +1828,8 @@ static int64_t rd_pick_best_sub8x8_mode(VP10_COMP *cpi, MACROBLOCK *x,
if (i == 0)
max_mv = x->max_mv_context[mbmi->ref_frame[0]];
else
max_mv = MAX(abs(bsi->mvp.as_mv.row), abs(bsi->mvp.as_mv.col)) >> 3;
max_mv =
VPXMAX(abs(bsi->mvp.as_mv.row), abs(bsi->mvp.as_mv.col)) >> 3;
if (cpi->sf.mv.auto_mv_step_size && cm->show_frame) {
// Take wtd average of the step_params based on the last frame's
@ -1872,7 +1847,7 @@ static int64_t rd_pick_best_sub8x8_mode(VP10_COMP *cpi, MACROBLOCK *x,
if (cpi->sf.adaptive_motion_search) {
mvp_full.row = x->pred_mv[mbmi->ref_frame[0]].row >> 3;
mvp_full.col = x->pred_mv[mbmi->ref_frame[0]].col >> 3;
step_param = MAX(step_param, 8);
step_param = VPXMAX(step_param, 8);
}
// adjust src pointer for this block
@ -2277,7 +2252,7 @@ static void single_motion_search(VP10_COMP *cpi, MACROBLOCK *x,
vp10_set_mv_search_range(x, &ref_mv);
// Work out the size of the first step in the mv step search.
// 0 here is maximum length first step. 1 is MAX >> 1 etc.
// 0 here is the maximum length first step. 1 is MAX >> 1, etc.
if (cpi->sf.mv.auto_mv_step_size && cm->show_frame) {
// Take wtd average of the step_params based on the last frame's
// max mv magnitude and that based on the best ref mvs of the current
@ -2289,9 +2264,10 @@ static void single_motion_search(VP10_COMP *cpi, MACROBLOCK *x,
}
if (cpi->sf.adaptive_motion_search && bsize < BLOCK_64X64) {
int boffset = 2 * (b_width_log2_lookup[BLOCK_64X64] -
MIN(b_height_log2_lookup[bsize], b_width_log2_lookup[bsize]));
step_param = MAX(step_param, boffset);
int boffset =
2 * (b_width_log2_lookup[BLOCK_64X64] -
VPXMIN(b_height_log2_lookup[bsize], b_width_log2_lookup[bsize]));
step_param = VPXMAX(step_param, boffset);
}
if (cpi->sf.adaptive_motion_search) {
@ -2512,7 +2488,7 @@ static int64_t handle_inter_mode(VP10_COMP *cpi, MACROBLOCK *x,
// motion field, where the distortion gain for a single block may not
// be enough to overcome the cost of a new mv.
if (discount_newmv_test(cpi, this_mode, tmp_mv, mode_mv, refs[0])) {
*rate2 += MAX((rate_mv / NEW_MV_DISCOUNT_FACTOR), 1);
*rate2 += VPXMAX((rate_mv / NEW_MV_DISCOUNT_FACTOR), 1);
} else {
*rate2 += rate_mv;
}
@ -2549,10 +2525,10 @@ static int64_t handle_inter_mode(VP10_COMP *cpi, MACROBLOCK *x,
// initiation of a motion field.
if (discount_newmv_test(cpi, this_mode, frame_mv[refs[0]],
mode_mv, refs[0])) {
*rate2 += MIN(cost_mv_ref(cpi, this_mode,
mbmi_ext->mode_context[refs[0]]),
cost_mv_ref(cpi, NEARESTMV,
mbmi_ext->mode_context[refs[0]]));
*rate2 += VPXMIN(cost_mv_ref(cpi, this_mode,
mbmi_ext->mode_context[refs[0]]),
cost_mv_ref(cpi, NEARESTMV,
mbmi_ext->mode_context[refs[0]]));
} else {
*rate2 += cost_mv_ref(cpi, this_mode, mbmi_ext->mode_context[refs[0]]);
}
@ -2594,10 +2570,10 @@ static int64_t handle_inter_mode(VP10_COMP *cpi, MACROBLOCK *x,
rd = RDCOST(x->rdmult, x->rddiv, tmp_rate_sum, tmp_dist_sum);
filter_cache[i] = rd;
filter_cache[SWITCHABLE_FILTERS] =
MIN(filter_cache[SWITCHABLE_FILTERS], rd + rs_rd);
VPXMIN(filter_cache[SWITCHABLE_FILTERS], rd + rs_rd);
if (cm->interp_filter == SWITCHABLE)
rd += rs_rd;
*mask_filter = MAX(*mask_filter, rd);
*mask_filter = VPXMAX(*mask_filter, rd);
} else {
int rate_sum = 0;
int64_t dist_sum = 0;
@ -2627,10 +2603,10 @@ static int64_t handle_inter_mode(VP10_COMP *cpi, MACROBLOCK *x,
rd = RDCOST(x->rdmult, x->rddiv, rate_sum, dist_sum);
filter_cache[i] = rd;
filter_cache[SWITCHABLE_FILTERS] =
MIN(filter_cache[SWITCHABLE_FILTERS], rd + rs_rd);
VPXMIN(filter_cache[SWITCHABLE_FILTERS], rd + rs_rd);
if (cm->interp_filter == SWITCHABLE)
rd += rs_rd;
*mask_filter = MAX(*mask_filter, rd);
*mask_filter = VPXMAX(*mask_filter, rd);
if (i == 0 && intpel_mv) {
tmp_rate_sum = rate_sum;
@ -2745,7 +2721,7 @@ static int64_t handle_inter_mode(VP10_COMP *cpi, MACROBLOCK *x,
assert(rate_y_tx != INT_MAX);
assert(rate_y_tx >= 0);
rdcost_tx = RDCOST(x->rdmult, x->rddiv, rate_y_tx, distortion_y_tx);
rdcost_tx = MIN(rdcost_tx, RDCOST(x->rdmult, x->rddiv, 0, *psse));
rdcost_tx = VPXMIN(rdcost_tx, RDCOST(x->rdmult, x->rddiv, 0, *psse));
assert(rdcost_tx >= 0);
if (rdcost_tx <
(best_ext_tx == NORM ? ext_tx_th : 1) * best_rdcost_tx) {
@ -2773,7 +2749,7 @@ static int64_t handle_inter_mode(VP10_COMP *cpi, MACROBLOCK *x,
*distortion += distortion_y;
rdcosty = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion);
rdcosty = MIN(rdcosty, RDCOST(x->rdmult, x->rddiv, 0, *psse));
rdcosty = VPXMIN(rdcosty, RDCOST(x->rdmult, x->rddiv, 0, *psse));
if (!super_block_uvrd(cpi, x, rate_uv, &distortion_uv, &skippable_uv,
&sseuv, bsize, ref_best_rd - rdcosty)) {
@ -2814,7 +2790,6 @@ void vp10_rd_pick_intra_mode_sb(VP10_COMP *cpi, MACROBLOCK *x,
int y_skip = 0, uv_skip = 0;
int64_t dist_y = 0, dist_uv = 0;
TX_SIZE max_uv_tx_size;
x->skip_encode = 0;
ctx->skip = 0;
xd->mi[0]->mbmi.ref_frame[0] = INTRA_FRAME;
xd->mi[0]->mbmi.ref_frame[1] = NONE;
@ -2838,7 +2813,7 @@ void vp10_rd_pick_intra_mode_sb(VP10_COMP *cpi, MACROBLOCK *x,
pd[1].subsampling_x,
pd[1].subsampling_y);
rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv, &rate_uv_tokenonly,
&dist_uv, &uv_skip, MAX(BLOCK_8X8, bsize),
&dist_uv, &uv_skip, VPXMAX(BLOCK_8X8, bsize),
max_uv_tx_size);
if (y_skip && uv_skip) {
@ -2905,12 +2880,12 @@ static void rd_variance_adjustment(VP10_COMP *cpi,
// to a predictor with a low spatial complexity compared to the source.
if ((source_variance > LOW_VAR_THRESH) && (ref_frame == INTRA_FRAME) &&
(source_variance > recon_variance)) {
var_factor = MIN(absvar_diff, MIN(VLOW_ADJ_MAX, var_error));
var_factor = VPXMIN(absvar_diff, VPXMIN(VLOW_ADJ_MAX, var_error));
// A second possible case of interest is where the source variance
// is very low and we wish to discourage false texture or motion trails.
} else if ((source_variance < (LOW_VAR_THRESH >> 1)) &&
(recon_variance > source_variance)) {
var_factor = MIN(absvar_diff, MIN(VHIGH_ADJ_MAX, var_error));
var_factor = VPXMIN(absvar_diff, VPXMIN(VHIGH_ADJ_MAX, var_error));
}
*this_rd += (*this_rd * var_factor) / 100;
}
@ -2940,7 +2915,7 @@ int vp10_active_h_edge(VP10_COMP *cpi, int mi_row, int mi_step) {
top_edge += (int)(twopass->this_frame_stats.inactive_zone_rows * 2);
bottom_edge -= (int)(twopass->this_frame_stats.inactive_zone_rows * 2);
bottom_edge = MAX(top_edge, bottom_edge);
bottom_edge = VPXMAX(top_edge, bottom_edge);
}
if (((top_edge >= mi_row) && (top_edge < (mi_row + mi_step))) ||
@ -2967,7 +2942,7 @@ int vp10_active_v_edge(VP10_COMP *cpi, int mi_col, int mi_step) {
left_edge += (int)(twopass->this_frame_stats.inactive_zone_cols * 2);
right_edge -= (int)(twopass->this_frame_stats.inactive_zone_cols * 2);
right_edge = MAX(left_edge, right_edge);
right_edge = VPXMAX(left_edge, right_edge);
}
if (((left_edge >= mi_col) && (left_edge < (mi_col + mi_step))) ||
@ -2987,14 +2962,13 @@ int vp10_active_edge_sb(VP10_COMP *cpi,
}
void vp10_rd_pick_inter_mode_sb(VP10_COMP *cpi,
TileDataEnc *tile_data,
MACROBLOCK *x,
int mi_row, int mi_col,
RD_COST *rd_cost, BLOCK_SIZE bsize,
PICK_MODE_CONTEXT *ctx,
int64_t best_rd_so_far) {
TileDataEnc *tile_data,
MACROBLOCK *x,
int mi_row, int mi_col,
RD_COST *rd_cost, BLOCK_SIZE bsize,
PICK_MODE_CONTEXT *ctx,
int64_t best_rd_so_far) {
VP10_COMMON *const cm = &cpi->common;
TileInfo *const tile_info = &tile_data->tile_info;
RD_OPT *const rd_opt = &cpi->rd;
SPEED_FEATURES *const sf = &cpi->sf;
MACROBLOCKD *const xd = &x->e_mbd;
@ -3045,8 +3019,6 @@ void vp10_rd_pick_inter_mode_sb(VP10_COMP *cpi,
vp10_zero(best_mbmode);
x->skip_encode = sf->skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
filter_cache[i] = INT64_MAX;
@ -3088,7 +3060,7 @@ void vp10_rd_pick_inter_mode_sb(VP10_COMP *cpi,
// are masked out.
ref_frame_skip_mask[0] |= (1 << ref_frame);
ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
} else if (sf->reference_masking) {
} else {
for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
// Skip fixed mv modes for poor references
if ((x->pred_mv_sad[ref_frame] >> 2) > x->pred_mv_sad[i]) {
@ -3214,7 +3186,7 @@ void vp10_rd_pick_inter_mode_sb(VP10_COMP *cpi,
}
if ((ref_frame_skip_mask[0] & (1 << ref_frame)) &&
(ref_frame_skip_mask[1] & (1 << MAX(0, second_ref_frame))))
(ref_frame_skip_mask[1] & (1 << VPXMAX(0, second_ref_frame))))
continue;
if (mode_skip_mask[ref_frame] & (1 << this_mode))
@ -3227,55 +3199,6 @@ void vp10_rd_pick_inter_mode_sb(VP10_COMP *cpi,
if (best_rd < mode_threshold[mode_index])
continue;
if (sf->motion_field_mode_search) {
const int mi_width = MIN(num_8x8_blocks_wide_lookup[bsize],
tile_info->mi_col_end - mi_col);
const int mi_height = MIN(num_8x8_blocks_high_lookup[bsize],
tile_info->mi_row_end - mi_row);
const int bsl = mi_width_log2_lookup[bsize];
int cb_partition_search_ctrl = (((mi_row + mi_col) >> bsl)
+ get_chessboard_index(cm->current_video_frame)) & 0x1;
MB_MODE_INFO *ref_mbmi;
int const_motion = 1;
int skip_ref_frame = !cb_partition_search_ctrl;
MV_REFERENCE_FRAME rf = NONE;
int_mv ref_mv;
ref_mv.as_int = INVALID_MV;
if ((mi_row - 1) >= tile_info->mi_row_start) {
ref_mv = xd->mi[-xd->mi_stride]->mbmi.mv[0];
rf = xd->mi[-xd->mi_stride]->mbmi.ref_frame[0];
for (i = 0; i < mi_width; ++i) {
ref_mbmi = &xd->mi[-xd->mi_stride + i]->mbmi;
const_motion &= (ref_mv.as_int == ref_mbmi->mv[0].as_int) &&
(ref_frame == ref_mbmi->ref_frame[0]);
skip_ref_frame &= (rf == ref_mbmi->ref_frame[0]);
}
}
if ((mi_col - 1) >= tile_info->mi_col_start) {
if (ref_mv.as_int == INVALID_MV)
ref_mv = xd->mi[-1]->mbmi.mv[0];
if (rf == NONE)
rf = xd->mi[-1]->mbmi.ref_frame[0];
for (i = 0; i < mi_height; ++i) {
ref_mbmi = &xd->mi[i * xd->mi_stride - 1]->mbmi;
const_motion &= (ref_mv.as_int == ref_mbmi->mv[0].as_int) &&
(ref_frame == ref_mbmi->ref_frame[0]);
skip_ref_frame &= (rf == ref_mbmi->ref_frame[0]);
}
}
if (skip_ref_frame && this_mode != NEARESTMV && this_mode != NEWMV)
if (rf > INTRA_FRAME)
if (ref_frame != rf)
continue;
if (const_motion)
if (this_mode == NEARMV || this_mode == ZEROMV)
continue;
}
comp_pred = second_ref_frame > INTRA_FRAME;
if (comp_pred) {
if (!cpi->allow_comp_inter_inter)
@ -3455,9 +3378,9 @@ void vp10_rd_pick_inter_mode_sb(VP10_COMP *cpi,
if (!disable_skip && ref_frame == INTRA_FRAME) {
for (i = 0; i < REFERENCE_MODES; ++i)
best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
best_pred_rd[i] = VPXMIN(best_pred_rd[i], this_rd);
for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
best_filter_rd[i] = MIN(best_filter_rd[i], this_rd);
best_filter_rd[i] = VPXMIN(best_filter_rd[i], this_rd);
}
// Did this mode help, i.e. is it the new best mode?
@ -3556,7 +3479,7 @@ void vp10_rd_pick_inter_mode_sb(VP10_COMP *cpi,
adj_rd = filter_cache[i] - ref;
adj_rd += this_rd;
best_filter_rd[i] = MIN(best_filter_rd[i], adj_rd);
best_filter_rd[i] = VPXMIN(best_filter_rd[i], adj_rd);
}
}
}
@ -3694,8 +3617,6 @@ void vp10_rd_pick_inter_mode_sb_seg_skip(VP10_COMP *cpi,
int rate2 = 0;
const int64_t distortion2 = 0;
x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp,
&comp_mode_p);
@ -3822,7 +3743,6 @@ void vp10_rd_pick_inter_mode_sub8x8(VP10_COMP *cpi,
mbmi->ext_txfrm = NORM;
#endif
x->skip_encode = sf->skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
memset(x->zcoeff_blk[TX_4X4], 0, 4);
vp10_zero(best_mbmode);
@ -3902,7 +3822,7 @@ void vp10_rd_pick_inter_mode_sub8x8(VP10_COMP *cpi,
}
if ((ref_frame_skip_mask[0] & (1 << ref_frame)) &&
(ref_frame_skip_mask[1] & (1 << MAX(0, second_ref_frame))))
(ref_frame_skip_mask[1] & (1 << VPXMAX(0, second_ref_frame))))
continue;
// Test best rd so far against threshold for trying this mode.
@ -4060,12 +3980,11 @@ void vp10_rd_pick_inter_mode_sub8x8(VP10_COMP *cpi,
rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
filter_cache[switchable_filter_index] = tmp_rd;
filter_cache[SWITCHABLE_FILTERS] =
MIN(filter_cache[SWITCHABLE_FILTERS],
tmp_rd + rs_rd);
VPXMIN(filter_cache[SWITCHABLE_FILTERS], tmp_rd + rs_rd);
if (cm->interp_filter == SWITCHABLE)
tmp_rd += rs_rd;
mask_filter = MAX(mask_filter, tmp_rd);
mask_filter = VPXMAX(mask_filter, tmp_rd);
newbest = (tmp_rd < tmp_best_rd);
if (newbest) {
@ -4143,8 +4062,8 @@ void vp10_rd_pick_inter_mode_sub8x8(VP10_COMP *cpi,
compmode_cost = vp10_cost_bit(comp_mode_p, comp_pred);
tmp_best_rdu = best_rd -
MIN(RDCOST(x->rdmult, x->rddiv, rate2, distortion2),
RDCOST(x->rdmult, x->rddiv, 0, total_sse));
VPXMIN(RDCOST(x->rdmult, x->rddiv, rate2, distortion2),
RDCOST(x->rdmult, x->rddiv, 0, total_sse));
if (tmp_best_rdu > 0) {
// If even the 'Y' rd value of split is higher than best so far
@ -4204,9 +4123,9 @@ void vp10_rd_pick_inter_mode_sub8x8(VP10_COMP *cpi,
if (!disable_skip && ref_frame == INTRA_FRAME) {
for (i = 0; i < REFERENCE_MODES; ++i)
best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
best_pred_rd[i] = VPXMIN(best_pred_rd[i], this_rd);
for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
best_filter_rd[i] = MIN(best_filter_rd[i], this_rd);
best_filter_rd[i] = VPXMIN(best_filter_rd[i], this_rd);
}
// Did this mode help, i.e. is it the new best mode?
@ -4305,7 +4224,7 @@ void vp10_rd_pick_inter_mode_sub8x8(VP10_COMP *cpi,
adj_rd = filter_cache[i] - ref;
adj_rd += this_rd;
best_filter_rd[i] = MIN(best_filter_rd[i], adj_rd);
best_filter_rd[i] = VPXMIN(best_filter_rd[i], adj_rd);
}
}


@ -14,6 +14,10 @@
#include <stdio.h>
#include "vpx/vpx_integer.h"
#ifdef __cplusplus
extern "C" {
#endif
void vp10_resize_plane(const uint8_t *const input,
int height,
int width,
@ -121,4 +125,9 @@ void vp10_highbd_resize_frame444(const uint8_t *const y,
int owidth,
int bd);
#endif // CONFIG_VP9_HIGHBITDEPTH
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP10_ENCODER_RESIZE_H_


@ -14,11 +14,12 @@
#include "vp10/encoder/speed_features.h"
#include "vp10/encoder/rdopt.h"
#include "vpx_dsp/vpx_dsp_common.h"
// Intra only frames, golden frames (except alt ref overlays) and
// alt ref frames tend to be coded at a higher than ambient quality
static int frame_is_boosted(const VP10_COMP *cpi) {
return frame_is_kf_gf_arf(cpi) || vp10_is_upper_layer_key_frame(cpi);
return frame_is_kf_gf_arf(cpi);
}
// Sets a partition size down to which the auto partition code will always
@ -49,7 +50,7 @@ static void set_good_speed_feature_framesize_dependent(VP10_COMP *cpi,
VP10_COMMON *const cm = &cpi->common;
if (speed >= 1) {
if (MIN(cm->width, cm->height) >= 720) {
if (VPXMIN(cm->width, cm->height) >= 720) {
sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT
: DISABLE_ALL_INTER_SPLIT;
sf->partition_search_breakout_dist_thr = (1 << 23);
@ -60,7 +61,7 @@ static void set_good_speed_feature_framesize_dependent(VP10_COMP *cpi,
}
if (speed >= 2) {
if (MIN(cm->width, cm->height) >= 720) {
if (VPXMIN(cm->width, cm->height) >= 720) {
sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT
: DISABLE_ALL_INTER_SPLIT;
sf->adaptive_pred_interp_filter = 0;
@ -75,7 +76,7 @@ static void set_good_speed_feature_framesize_dependent(VP10_COMP *cpi,
}
if (speed >= 3) {
if (MIN(cm->width, cm->height) >= 720) {
if (VPXMIN(cm->width, cm->height) >= 720) {
sf->disable_split_mask = DISABLE_ALL_SPLIT;
sf->schedule_mode_search = cm->base_qindex < 220 ? 1 : 0;
sf->partition_search_breakout_dist_thr = (1 << 25);
@ -99,7 +100,7 @@ static void set_good_speed_feature_framesize_dependent(VP10_COMP *cpi,
}
if (speed >= 4) {
if (MIN(cm->width, cm->height) >= 720) {
if (VPXMIN(cm->width, cm->height) >= 720) {
sf->partition_search_breakout_dist_thr = (1 << 26);
} else {
sf->partition_search_breakout_dist_thr = (1 << 24);
@ -147,9 +148,6 @@ static void set_good_speed_feature(VP10_COMP *cpi, VP10_COMMON *cm,
sf->tx_size_search_method = frame_is_boosted(cpi) ? USE_FULL_RD
: USE_LARGESTALL;
// Reference masking is not supported in dynamic scaling mode.
sf->reference_masking = cpi->oxcf.resize_mode != RESIZE_DYNAMIC ? 1 : 0;
sf->mode_search_skip_flags = (cm->frame_type == KEY_FRAME) ? 0 :
FLAG_SKIP_INTRA_DIRMISMATCH |
FLAG_SKIP_INTRA_BESTINTER |
@ -191,7 +189,6 @@ static void set_good_speed_feature(VP10_COMP *cpi, VP10_COMMON *cm,
sf->use_lp32x32fdct = 1;
sf->use_fast_coef_updates = ONE_LOOP_REDUCED;
sf->use_fast_coef_costing = 1;
sf->motion_field_mode_search = !boosted;
sf->partition_search_breakout_rate_thr = 300;
}
@ -215,7 +212,7 @@ static void set_rt_speed_feature_framesize_dependent(VP10_COMP *cpi,
VP10_COMMON *const cm = &cpi->common;
if (speed >= 1) {
if (MIN(cm->width, cm->height) >= 720) {
if (VPXMIN(cm->width, cm->height) >= 720) {
sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT
: DISABLE_ALL_INTER_SPLIT;
} else {
@ -224,7 +221,7 @@ static void set_rt_speed_feature_framesize_dependent(VP10_COMP *cpi,
}
if (speed >= 2) {
if (MIN(cm->width, cm->height) >= 720) {
if (VPXMIN(cm->width, cm->height) >= 720) {
sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT
: DISABLE_ALL_INTER_SPLIT;
} else {
@ -233,7 +230,7 @@ static void set_rt_speed_feature_framesize_dependent(VP10_COMP *cpi,
}
if (speed >= 5) {
if (MIN(cm->width, cm->height) >= 720) {
if (VPXMIN(cm->width, cm->height) >= 720) {
sf->partition_search_breakout_dist_thr = (1 << 25);
} else {
sf->partition_search_breakout_dist_thr = (1 << 23);
@ -241,7 +238,7 @@ static void set_rt_speed_feature_framesize_dependent(VP10_COMP *cpi,
}
if (speed >= 7) {
sf->encode_breakout_thresh = (MIN(cm->width, cm->height) >= 720) ?
sf->encode_breakout_thresh = (VPXMIN(cm->width, cm->height) >= 720) ?
800 : 300;
}
}
@ -279,14 +276,6 @@ static void set_rt_speed_feature(VP10_COMP *cpi, SPEED_FEATURES *sf,
FLAG_SKIP_COMP_BESTINTRA |
FLAG_SKIP_INTRA_LOWVAR;
sf->adaptive_pred_interp_filter = 2;
// Disable reference masking if using spatial scaling since
// pred_mv_sad will not be set (since vp10_mv_pred will not
// be called).
// TODO(marpan/agrange): Fix this condition.
sf->reference_masking = (cpi->oxcf.resize_mode != RESIZE_DYNAMIC &&
cpi->svc.number_spatial_layers == 1) ? 1 : 0;
sf->disable_filter_search_var_thresh = 50;
sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
@ -302,7 +291,6 @@ static void set_rt_speed_feature(VP10_COMP *cpi, SPEED_FEATURES *sf,
sf->use_square_partition_only = 1;
sf->disable_filter_search_var_thresh = 100;
sf->use_uv_intra_rd_estimate = 1;
sf->skip_encode_sb = 1;
sf->mv.subpel_iters_per_step = 1;
sf->adaptive_rd_thresh = 4;
sf->mode_skip_start = 6;
@ -348,7 +336,6 @@ static void set_rt_speed_feature(VP10_COMP *cpi, SPEED_FEATURES *sf,
(frames_since_key % (sf->last_partitioning_redo_frequency << 1) == 1);
sf->max_delta_qindex = is_keyframe ? 20 : 15;
sf->partition_search_type = REFERENCE_PARTITION;
sf->use_nonrd_pick_mode = 1;
sf->allow_skip_recode = 0;
sf->inter_mode_mask[BLOCK_32X32] = INTER_NEAREST_NEW_ZERO;
sf->inter_mode_mask[BLOCK_32X64] = INTER_NEAREST_NEW_ZERO;
@ -384,10 +371,8 @@ static void set_rt_speed_feature(VP10_COMP *cpi, SPEED_FEATURES *sf,
// Adaptively switch between SOURCE_VAR_BASED_PARTITION and FIXED_PARTITION.
sf->partition_search_type = VAR_BASED_PARTITION;
// Turn on this to use non-RD key frame coding mode.
sf->use_nonrd_pick_mode = 1;
sf->mv.search_method = NSTEP;
sf->mv.reduce_first_step_size = 1;
sf->skip_encode_sb = 0;
}
if (speed >= 7) {
@ -459,10 +444,8 @@ void vp10_set_speed_features_framesize_independent(VP10_COMP *cpi) {
sf->adaptive_mode_search = 0;
sf->cb_pred_filter_search = 0;
sf->cb_partition_search = 0;
sf->motion_field_mode_search = 0;
sf->alt_ref_search_fp = 0;
sf->use_quant_fp = 0;
sf->reference_masking = 0;
sf->partition_search_type = SEARCH_PARTITION;
sf->less_rectangular_check = 0;
sf->use_square_partition_only = 0;
@ -485,7 +468,6 @@ void vp10_set_speed_features_framesize_independent(VP10_COMP *cpi) {
sf->intra_uv_mode_mask[i] = INTRA_ALL;
}
sf->use_rd_breakout = 0;
sf->skip_encode_sb = 0;
sf->use_uv_intra_rd_estimate = 0;
sf->allow_skip_recode = 0;
sf->lpf_pick = LPF_PICK_FROM_FULL_IMAGE;
@ -493,7 +475,6 @@ void vp10_set_speed_features_framesize_independent(VP10_COMP *cpi) {
sf->use_fast_coef_costing = 0;
sf->mode_skip_start = MAX_MODES; // Mode index at which mode skip mask set
sf->schedule_mode_search = 0;
sf->use_nonrd_pick_mode = 0;
for (i = 0; i < BLOCK_SIZES; ++i)
sf->inter_mode_mask[i] = INTER_ALL;
sf->max_intra_bsize = BLOCK_64X64;


@ -141,7 +141,7 @@ typedef enum {
} INTERP_FILTER_MASK;
typedef enum {
// Search partitions using RD/NONRD criterion
// Search partitions using RD criterion
SEARCH_PARTITION,
// Always use a fixed size partition
@ -223,11 +223,6 @@ typedef struct SPEED_FEATURES {
// mode to be evaluated. A high value means we will be faster.
int adaptive_rd_thresh;
// Enables skipping the reconstruction step (idct, recon) in the
// intermediate steps assuming the last frame didn't have too many intra
// blocks and the q is less than a threshold.
int skip_encode_sb;
int skip_encode_frame;
// Speed feature to allow or disallow skipping of recode at block
// level within a frame.
int allow_skip_recode;
@ -253,9 +248,6 @@ typedef struct SPEED_FEATURES {
// of the best so far.
int mode_skip_start;
// TODO(JBB): Remove this.
int reference_masking;
PARTITION_SEARCH_TYPE partition_search_type;
// Used if partition_search_type = FIXED_SIZE_PARTITION
@ -314,8 +306,6 @@ typedef struct SPEED_FEATURES {
int cb_partition_search;
int motion_field_mode_search;
int alt_ref_search_fp;
// Fast quantization process path
@ -363,9 +353,6 @@ typedef struct SPEED_FEATURES {
// by only looking at counts from 1/2 the bands.
FAST_COEFF_UPDATE use_fast_coef_updates;
// This flag controls the use of non-RD mode decision.
int use_nonrd_pick_mode;
// A binary mask indicating if NEARESTMV, NEARMV, ZEROMV, NEWMV
// modes are used in order from LSB to MSB for each BLOCK_SIZE.
int inter_mode_mask[BLOCK_SIZES];


@ -1,646 +0,0 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <math.h>
#include "vp10/encoder/encoder.h"
#include "vp10/encoder/svc_layercontext.h"
#include "vp10/encoder/extend.h"
#define SMALL_FRAME_FB_IDX 7
#define SMALL_FRAME_WIDTH 16
#define SMALL_FRAME_HEIGHT 16
void vp10_init_layer_context(VP10_COMP *const cpi) {
SVC *const svc = &cpi->svc;
const VP10EncoderConfig *const oxcf = &cpi->oxcf;
int sl, tl;
int alt_ref_idx = svc->number_spatial_layers;
svc->spatial_layer_id = 0;
svc->temporal_layer_id = 0;
if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2) {
if (vpx_realloc_frame_buffer(&cpi->svc.empty_frame.img,
SMALL_FRAME_WIDTH, SMALL_FRAME_HEIGHT,
cpi->common.subsampling_x,
cpi->common.subsampling_y,
#if CONFIG_VP9_HIGHBITDEPTH
cpi->common.use_highbitdepth,
#endif
VP9_ENC_BORDER_IN_PIXELS,
cpi->common.byte_alignment,
NULL, NULL, NULL))
vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
"Failed to allocate empty frame for multiple frame "
"contexts");
memset(cpi->svc.empty_frame.img.buffer_alloc, 0x80,
cpi->svc.empty_frame.img.buffer_alloc_sz);
}
for (sl = 0; sl < oxcf->ss_number_layers; ++sl) {
for (tl = 0; tl < oxcf->ts_number_layers; ++tl) {
int layer = LAYER_IDS_TO_IDX(sl, tl, oxcf->ts_number_layers);
LAYER_CONTEXT *const lc = &svc->layer_context[layer];
RATE_CONTROL *const lrc = &lc->rc;
int i;
lc->current_video_frame_in_layer = 0;
lc->layer_size = 0;
lc->frames_from_key_frame = 0;
lc->last_frame_type = FRAME_TYPES;
lrc->ni_av_qi = oxcf->worst_allowed_q;
lrc->total_actual_bits = 0;
lrc->total_target_vs_actual = 0;
lrc->ni_tot_qi = 0;
lrc->tot_q = 0.0;
lrc->avg_q = 0.0;
lrc->ni_frames = 0;
lrc->decimation_count = 0;
lrc->decimation_factor = 0;
for (i = 0; i < RATE_FACTOR_LEVELS; ++i) {
lrc->rate_correction_factors[i] = 1.0;
}
if (cpi->oxcf.rc_mode == VPX_CBR) {
lc->target_bandwidth = oxcf->layer_target_bitrate[layer];
lrc->last_q[INTER_FRAME] = oxcf->worst_allowed_q;
lrc->avg_frame_qindex[INTER_FRAME] = oxcf->worst_allowed_q;
lrc->avg_frame_qindex[KEY_FRAME] = oxcf->worst_allowed_q;
} else {
lc->target_bandwidth = oxcf->layer_target_bitrate[layer];
lrc->last_q[KEY_FRAME] = oxcf->best_allowed_q;
lrc->last_q[INTER_FRAME] = oxcf->best_allowed_q;
lrc->avg_frame_qindex[KEY_FRAME] = (oxcf->worst_allowed_q +
oxcf->best_allowed_q) / 2;
lrc->avg_frame_qindex[INTER_FRAME] = (oxcf->worst_allowed_q +
oxcf->best_allowed_q) / 2;
if (oxcf->ss_enable_auto_arf[sl])
lc->alt_ref_idx = alt_ref_idx++;
else
lc->alt_ref_idx = INVALID_IDX;
lc->gold_ref_idx = INVALID_IDX;
}
lrc->buffer_level = oxcf->starting_buffer_level_ms *
lc->target_bandwidth / 1000;
lrc->bits_off_target = lrc->buffer_level;
}
}
// Still have extra buffer for base layer golden frame
if (!(svc->number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR)
&& alt_ref_idx < REF_FRAMES)
svc->layer_context[0].gold_ref_idx = alt_ref_idx;
}
// Update the layer context from a change_config() call.
void vp10_update_layer_context_change_config(VP10_COMP *const cpi,
const int target_bandwidth) {
SVC *const svc = &cpi->svc;
const VP10EncoderConfig *const oxcf = &cpi->oxcf;
const RATE_CONTROL *const rc = &cpi->rc;
int sl, tl, layer = 0, spatial_layer_target;
float bitrate_alloc = 1.0;
if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING) {
for (sl = 0; sl < oxcf->ss_number_layers; ++sl) {
spatial_layer_target = 0;
for (tl = 0; tl < oxcf->ts_number_layers; ++tl) {
layer = LAYER_IDS_TO_IDX(sl, tl, oxcf->ts_number_layers);
svc->layer_context[layer].target_bandwidth =
oxcf->layer_target_bitrate[layer];
}
layer = LAYER_IDS_TO_IDX(sl, ((oxcf->ts_number_layers - 1) < 0 ?
0 : (oxcf->ts_number_layers - 1)), oxcf->ts_number_layers);
spatial_layer_target =
svc->layer_context[layer].target_bandwidth =
oxcf->layer_target_bitrate[layer];
for (tl = 0; tl < oxcf->ts_number_layers; ++tl) {
LAYER_CONTEXT *const lc =
&svc->layer_context[sl * oxcf->ts_number_layers + tl];
RATE_CONTROL *const lrc = &lc->rc;
lc->spatial_layer_target_bandwidth = spatial_layer_target;
bitrate_alloc = (float)lc->target_bandwidth / spatial_layer_target;
lrc->starting_buffer_level =
(int64_t)(rc->starting_buffer_level * bitrate_alloc);
lrc->optimal_buffer_level =
(int64_t)(rc->optimal_buffer_level * bitrate_alloc);
lrc->maximum_buffer_size =
(int64_t)(rc->maximum_buffer_size * bitrate_alloc);
lrc->bits_off_target =
MIN(lrc->bits_off_target, lrc->maximum_buffer_size);
lrc->buffer_level = MIN(lrc->buffer_level, lrc->maximum_buffer_size);
lc->framerate = cpi->framerate / oxcf->ts_rate_decimator[tl];
lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate);
lrc->max_frame_bandwidth = rc->max_frame_bandwidth;
lrc->worst_quality = rc->worst_quality;
lrc->best_quality = rc->best_quality;
}
}
} else {
int layer_end;
if (svc->number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) {
layer_end = svc->number_temporal_layers;
} else {
layer_end = svc->number_spatial_layers;
}
for (layer = 0; layer < layer_end; ++layer) {
LAYER_CONTEXT *const lc = &svc->layer_context[layer];
RATE_CONTROL *const lrc = &lc->rc;
lc->target_bandwidth = oxcf->layer_target_bitrate[layer];
bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth;
// Update buffer-related quantities.
lrc->starting_buffer_level =
(int64_t)(rc->starting_buffer_level * bitrate_alloc);
lrc->optimal_buffer_level =
(int64_t)(rc->optimal_buffer_level * bitrate_alloc);
lrc->maximum_buffer_size =
(int64_t)(rc->maximum_buffer_size * bitrate_alloc);
lrc->bits_off_target = MIN(lrc->bits_off_target,
lrc->maximum_buffer_size);
lrc->buffer_level = MIN(lrc->buffer_level, lrc->maximum_buffer_size);
// Update framerate-related quantities.
if (svc->number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) {
lc->framerate = cpi->framerate / oxcf->ts_rate_decimator[layer];
} else {
lc->framerate = cpi->framerate;
}
lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate);
lrc->max_frame_bandwidth = rc->max_frame_bandwidth;
// Update qp-related quantities.
lrc->worst_quality = rc->worst_quality;
lrc->best_quality = rc->best_quality;
}
}
}
static LAYER_CONTEXT *get_layer_context(VP10_COMP *const cpi) {
if (is_one_pass_cbr_svc(cpi))
return &cpi->svc.layer_context[cpi->svc.spatial_layer_id *
cpi->svc.number_temporal_layers + cpi->svc.temporal_layer_id];
else
return (cpi->svc.number_temporal_layers > 1 &&
cpi->oxcf.rc_mode == VPX_CBR) ?
&cpi->svc.layer_context[cpi->svc.temporal_layer_id] :
&cpi->svc.layer_context[cpi->svc.spatial_layer_id];
}
void vp10_update_temporal_layer_framerate(VP10_COMP *const cpi) {
SVC *const svc = &cpi->svc;
const VP10EncoderConfig *const oxcf = &cpi->oxcf;
LAYER_CONTEXT *const lc = get_layer_context(cpi);
RATE_CONTROL *const lrc = &lc->rc;
// Index into spatial+temporal arrays.
const int st_idx = svc->spatial_layer_id * svc->number_temporal_layers +
svc->temporal_layer_id;
const int tl = svc->temporal_layer_id;
lc->framerate = cpi->framerate / oxcf->ts_rate_decimator[tl];
lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate);
lrc->max_frame_bandwidth = cpi->rc.max_frame_bandwidth;
// Update the average layer frame size (non-cumulative per-frame-bw).
if (tl == 0) {
lc->avg_frame_size = lrc->avg_frame_bandwidth;
} else {
const double prev_layer_framerate =
cpi->framerate / oxcf->ts_rate_decimator[tl - 1];
const int prev_layer_target_bandwidth =
oxcf->layer_target_bitrate[st_idx - 1];
lc->avg_frame_size =
(int)((lc->target_bandwidth - prev_layer_target_bandwidth) /
(lc->framerate - prev_layer_framerate));
}
}
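A worked example of the per-layer frame size above (rates and decimators hypothetical): with two temporal layers at 30 fps and rate decimators {2, 1}, TL0 runs at 15 fps; if TL0 targets 400 kbps and the cumulative TL1 target is 600 kbps, then:

// TL0: avg_frame_size = 400000 / 15 bits per frame
// TL1: avg_frame_size = (600000 - 400000) / (30 - 15) = 13333 bits,
// i.e. only the bandwidth the layer adds, spread over the frames it adds.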
void vp10_update_spatial_layer_framerate(VP10_COMP *const cpi,
double framerate) {
const VP10EncoderConfig *const oxcf = &cpi->oxcf;
LAYER_CONTEXT *const lc = get_layer_context(cpi);
RATE_CONTROL *const lrc = &lc->rc;
lc->framerate = framerate;
lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate);
lrc->min_frame_bandwidth = (int)(lrc->avg_frame_bandwidth *
oxcf->two_pass_vbrmin_section / 100);
lrc->max_frame_bandwidth = (int)(((int64_t)lrc->avg_frame_bandwidth *
oxcf->two_pass_vbrmax_section) / 100);
vp10_rc_set_gf_interval_range(cpi, lrc);
}
void vp10_restore_layer_context(VP10_COMP *const cpi) {
LAYER_CONTEXT *const lc = get_layer_context(cpi);
const int old_frame_since_key = cpi->rc.frames_since_key;
const int old_frame_to_key = cpi->rc.frames_to_key;
cpi->rc = lc->rc;
cpi->twopass = lc->twopass;
cpi->oxcf.target_bandwidth = lc->target_bandwidth;
cpi->alt_ref_source = lc->alt_ref_source;
// Reset the frames_since_key and frames_to_key counters to their values
// before the layer restore. Keep these defined for the stream (not layer).
if (cpi->svc.number_temporal_layers > 1) {
cpi->rc.frames_since_key = old_frame_since_key;
cpi->rc.frames_to_key = old_frame_to_key;
}
}
void vp10_save_layer_context(VP10_COMP *const cpi) {
const VP10EncoderConfig *const oxcf = &cpi->oxcf;
LAYER_CONTEXT *const lc = get_layer_context(cpi);
lc->rc = cpi->rc;
lc->twopass = cpi->twopass;
lc->target_bandwidth = (int)oxcf->target_bandwidth;
lc->alt_ref_source = cpi->alt_ref_source;
}
void vp10_init_second_pass_spatial_svc(VP10_COMP *cpi) {
SVC *const svc = &cpi->svc;
int i;
for (i = 0; i < svc->number_spatial_layers; ++i) {
TWO_PASS *const twopass = &svc->layer_context[i].twopass;
svc->spatial_layer_id = i;
vp10_init_second_pass(cpi);
twopass->total_stats.spatial_layer_id = i;
twopass->total_left_stats.spatial_layer_id = i;
}
svc->spatial_layer_id = 0;
}
void vp10_inc_frame_in_layer(VP10_COMP *const cpi) {
LAYER_CONTEXT *const lc =
&cpi->svc.layer_context[cpi->svc.spatial_layer_id *
cpi->svc.number_temporal_layers];
++lc->current_video_frame_in_layer;
++lc->frames_from_key_frame;
}
int vp10_is_upper_layer_key_frame(const VP10_COMP *const cpi) {
return is_two_pass_svc(cpi) &&
cpi->svc.spatial_layer_id > 0 &&
cpi->svc.layer_context[cpi->svc.spatial_layer_id *
cpi->svc.number_temporal_layers +
cpi->svc.temporal_layer_id].is_key_frame;
}
static void get_layer_resolution(const int width_org, const int height_org,
const int num, const int den,
int *width_out, int *height_out) {
int w, h;
if (width_out == NULL || height_out == NULL || den == 0)
return;
w = width_org * num / den;
h = height_org * num / den;
// make height and width even to keep the Chrome player happy
w += w % 2;
h += h % 2;
*width_out = w;
*height_out = h;
}
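A usage sketch for the helper above (input sizes hypothetical):

int w, h;
// Scaling 1283x719 by 1/2 gives 641x359; both odd, so the w += w % 2 /
// h += h % 2 rounding bumps the result to 642x360.
get_layer_resolution(1283, 719, 1, 2, &w, &h);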
// The function sets proper ref_frame_flags, buffer indices, and buffer update
// variables for temporal layering mode 3, which uses the 0-2-1-2 temporal
// layering scheme.
static void set_flags_and_fb_idx_for_temporal_mode3(VP10_COMP *const cpi) {
int frame_num_within_temporal_struct = 0;
int spatial_id, temporal_id;
spatial_id = cpi->svc.spatial_layer_id = cpi->svc.spatial_layer_to_encode;
frame_num_within_temporal_struct =
cpi->svc.layer_context[cpi->svc.spatial_layer_id *
cpi->svc.number_temporal_layers].current_video_frame_in_layer % 4;
temporal_id = cpi->svc.temporal_layer_id =
(frame_num_within_temporal_struct & 1) ? 2 :
(frame_num_within_temporal_struct >> 1);
cpi->ext_refresh_last_frame = cpi->ext_refresh_golden_frame =
cpi->ext_refresh_alt_ref_frame = 0;
if (!temporal_id) {
cpi->ext_refresh_frame_flags_pending = 1;
cpi->ext_refresh_last_frame = 1;
if (!spatial_id) {
cpi->ref_frame_flags = VP9_LAST_FLAG;
} else if (cpi->svc.layer_context[temporal_id].is_key_frame) {
// base layer is a key frame.
cpi->ref_frame_flags = VP9_GOLD_FLAG;
} else {
cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG;
}
} else if (temporal_id == 1) {
cpi->ext_refresh_frame_flags_pending = 1;
cpi->ext_refresh_alt_ref_frame = 1;
if (!spatial_id) {
cpi->ref_frame_flags = VP9_LAST_FLAG;
} else {
cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG;
}
} else {
if (frame_num_within_temporal_struct == 1) {
// The first tl2 picture
if (!spatial_id) {
cpi->ext_refresh_frame_flags_pending = 1;
cpi->ext_refresh_alt_ref_frame = 1;
cpi->ref_frame_flags = VP9_LAST_FLAG;
} else if (spatial_id < cpi->svc.number_spatial_layers - 1) {
cpi->ext_refresh_frame_flags_pending = 1;
cpi->ext_refresh_alt_ref_frame = 1;
cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG;
} else { // Top layer
cpi->ext_refresh_frame_flags_pending = 0;
cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG;
}
} else {
// The second tl2 picture
if (!spatial_id) {
cpi->ext_refresh_frame_flags_pending = 1;
cpi->ref_frame_flags = VP9_LAST_FLAG;
cpi->ext_refresh_last_frame = 1;
} else if (spatial_id < cpi->svc.number_spatial_layers - 1) {
cpi->ext_refresh_frame_flags_pending = 1;
cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG;
cpi->ext_refresh_last_frame = 1;
} else { // top layer
cpi->ext_refresh_frame_flags_pending = 0;
cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG;
}
}
}
if (temporal_id == 0) {
cpi->lst_fb_idx = spatial_id;
if (spatial_id)
cpi->gld_fb_idx = spatial_id - 1;
else
cpi->gld_fb_idx = 0;
cpi->alt_fb_idx = 0;
} else if (temporal_id == 1) {
cpi->lst_fb_idx = spatial_id;
cpi->gld_fb_idx = cpi->svc.number_spatial_layers + spatial_id - 1;
cpi->alt_fb_idx = cpi->svc.number_spatial_layers + spatial_id;
} else if (frame_num_within_temporal_struct == 1) {
cpi->lst_fb_idx = spatial_id;
cpi->gld_fb_idx = cpi->svc.number_spatial_layers + spatial_id - 1;
cpi->alt_fb_idx = cpi->svc.number_spatial_layers + spatial_id;
} else {
cpi->lst_fb_idx = cpi->svc.number_spatial_layers + spatial_id;
cpi->gld_fb_idx = cpi->svc.number_spatial_layers + spatial_id - 1;
cpi->alt_fb_idx = 0;
}
}
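The frame-to-layer mapping implied by the code above, within each 4-frame temporal structure:

// frame % 4 : 0 -> TL0, 1 -> TL2, 2 -> TL1, 3 -> TL2
// i.e. temporal_id = (f & 1) ? 2 : (f >> 1), the 0-2-1-2 pattern.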
// The function sets proper ref_frame_flags, buffer indices, and buffer update
// variables for temporal layering mode 2, which uses the 0-1-0-1 temporal
// layering scheme.
static void set_flags_and_fb_idx_for_temporal_mode2(VP10_COMP *const cpi) {
int spatial_id, temporal_id;
spatial_id = cpi->svc.spatial_layer_id = cpi->svc.spatial_layer_to_encode;
temporal_id = cpi->svc.temporal_layer_id =
cpi->svc.layer_context[cpi->svc.spatial_layer_id *
cpi->svc.number_temporal_layers].current_video_frame_in_layer & 1;
cpi->ext_refresh_last_frame = cpi->ext_refresh_golden_frame =
cpi->ext_refresh_alt_ref_frame = 0;
if (!temporal_id) {
cpi->ext_refresh_frame_flags_pending = 1;
cpi->ext_refresh_last_frame = 1;
if (!spatial_id) {
cpi->ref_frame_flags = VP9_LAST_FLAG;
} else if (cpi->svc.layer_context[temporal_id].is_key_frame) {
// base layer is a key frame.
cpi->ref_frame_flags = VP9_GOLD_FLAG;
} else {
cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG;
}
} else if (temporal_id == 1) {
cpi->ext_refresh_frame_flags_pending = 1;
cpi->ext_refresh_alt_ref_frame = 1;
if (!spatial_id) {
cpi->ref_frame_flags = VP9_LAST_FLAG;
} else {
cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG;
}
}
if (temporal_id == 0) {
cpi->lst_fb_idx = spatial_id;
if (spatial_id)
cpi->gld_fb_idx = spatial_id - 1;
else
cpi->gld_fb_idx = 0;
cpi->alt_fb_idx = 0;
} else if (temporal_id == 1) {
cpi->lst_fb_idx = spatial_id;
cpi->gld_fb_idx = cpi->svc.number_spatial_layers + spatial_id - 1;
cpi->alt_fb_idx = cpi->svc.number_spatial_layers + spatial_id;
}
}
// The function sets proper ref_frame_flags, buffer indices, and buffer update
// variables for temporal layering mode 0, which has no temporal layering.
static void set_flags_and_fb_idx_for_temporal_mode_noLayering(
VP10_COMP *const cpi) {
int spatial_id;
spatial_id = cpi->svc.spatial_layer_id = cpi->svc.spatial_layer_to_encode;
cpi->ext_refresh_last_frame =
cpi->ext_refresh_golden_frame = cpi->ext_refresh_alt_ref_frame = 0;
cpi->ext_refresh_frame_flags_pending = 1;
cpi->ext_refresh_last_frame = 1;
if (!spatial_id) {
cpi->ref_frame_flags = VP9_LAST_FLAG;
} else if (cpi->svc.layer_context[0].is_key_frame) {
cpi->ref_frame_flags = VP9_GOLD_FLAG;
} else {
cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG;
}
cpi->lst_fb_idx = spatial_id;
if (spatial_id)
cpi->gld_fb_idx = spatial_id - 1;
else
cpi->gld_fb_idx = 0;
}
int vp10_one_pass_cbr_svc_start_layer(VP10_COMP *const cpi) {
int width = 0, height = 0;
LAYER_CONTEXT *lc = NULL;
if (cpi->svc.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_0212) {
set_flags_and_fb_idx_for_temporal_mode3(cpi);
} else if (cpi->svc.temporal_layering_mode ==
VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING) {
set_flags_and_fb_idx_for_temporal_mode_noLayering(cpi);
} else if (cpi->svc.temporal_layering_mode ==
VP9E_TEMPORAL_LAYERING_MODE_0101) {
set_flags_and_fb_idx_for_temporal_mode2(cpi);
} else if (cpi->svc.temporal_layering_mode ==
VP9E_TEMPORAL_LAYERING_MODE_BYPASS) {
// VP9E_TEMPORAL_LAYERING_MODE_BYPASS:
// If the code reaches here, the encoder is relying on externally supplied
// flags for its layering decisions. However, when spatial+temporal layering
// is used the buffer indices cannot be derived automatically, so bypass mode
// only works when the number of spatial layers equals 1.
assert(cpi->svc.number_spatial_layers == 1);
}
lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id *
cpi->svc.number_temporal_layers +
cpi->svc.temporal_layer_id];
get_layer_resolution(cpi->oxcf.width, cpi->oxcf.height,
lc->scaling_factor_num, lc->scaling_factor_den,
&width, &height);
if (vp10_set_size_literal(cpi, width, height) != 0)
return VPX_CODEC_INVALID_PARAM;
return 0;
}
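// A sketch of the scaling arithmetic get_layer_resolution() performs,
// reconstructed from the call site above; the even-dimension rounding is an
// assumption carried over from the analogous vp9 helper, not a verbatim copy.
static void get_layer_resolution_sketch(int width_org, int height_org, int num,
                                        int den, int *width_out,
                                        int *height_out) {
  int w, h;
  if (width_out == NULL || height_out == NULL || den == 0) return;
  w = width_org * num / den;
  h = height_org * num / den;
  // Round up to even dimensions so 4:2:0 chroma planes stay aligned.
  w += w % 2;
  h += h % 2;
  *width_out = w;
  *height_out = h;
}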
#if CONFIG_SPATIAL_SVC
int vp10_svc_start_frame(VP10_COMP *const cpi) {
int width = 0, height = 0;
LAYER_CONTEXT *lc;
struct lookahead_entry *buf;
int count = 1 << (cpi->svc.number_temporal_layers - 1);
cpi->svc.spatial_layer_id = cpi->svc.spatial_layer_to_encode;
lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id];
cpi->svc.temporal_layer_id = 0;
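// With number_temporal_layers == 3, count starts at 4 and the loop below maps
// successive frames to temporal layers 0, 2, 1, 2, 0, ... (the standard
// hierarchical pattern).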
while ((lc->current_video_frame_in_layer % count) != 0) {
++cpi->svc.temporal_layer_id;
count >>= 1;
}
cpi->ref_frame_flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG;
cpi->lst_fb_idx = cpi->svc.spatial_layer_id;
if (cpi->svc.spatial_layer_id == 0)
cpi->gld_fb_idx = (lc->gold_ref_idx >= 0) ?
lc->gold_ref_idx : cpi->lst_fb_idx;
else
cpi->gld_fb_idx = cpi->svc.spatial_layer_id - 1;
if (lc->current_video_frame_in_layer == 0) {
if (cpi->svc.spatial_layer_id >= 2) {
cpi->alt_fb_idx = cpi->svc.spatial_layer_id - 2;
} else {
cpi->alt_fb_idx = cpi->lst_fb_idx;
cpi->ref_frame_flags &= (~VP9_LAST_FLAG & ~VP9_ALT_FLAG);
}
} else {
if (cpi->oxcf.ss_enable_auto_arf[cpi->svc.spatial_layer_id]) {
cpi->alt_fb_idx = lc->alt_ref_idx;
if (!lc->has_alt_frame)
cpi->ref_frame_flags &= (~VP9_ALT_FLAG);
} else {
// Find a proper alt_fb_idx for layers that don't have an alt-ref frame.
if (cpi->svc.spatial_layer_id == 0) {
cpi->alt_fb_idx = cpi->lst_fb_idx;
} else {
LAYER_CONTEXT *lc_lower =
&cpi->svc.layer_context[cpi->svc.spatial_layer_id - 1];
if (cpi->oxcf.ss_enable_auto_arf[cpi->svc.spatial_layer_id - 1] &&
lc_lower->alt_ref_source != NULL)
cpi->alt_fb_idx = lc_lower->alt_ref_idx;
else if (cpi->svc.spatial_layer_id >= 2)
cpi->alt_fb_idx = cpi->svc.spatial_layer_id - 2;
else
cpi->alt_fb_idx = cpi->lst_fb_idx;
}
}
}
get_layer_resolution(cpi->oxcf.width, cpi->oxcf.height,
lc->scaling_factor_num, lc->scaling_factor_den,
&width, &height);
// Workaround for multiple frame contexts. For some frames we cannot use
// prev_mi, since the previous frame may change during decoding. The idea is
// to put an empty invisible frame in front of them; prev_mi is then not used
// when encoding these frames.
buf = vp10_lookahead_peek(cpi->lookahead, 0);
if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2 &&
cpi->svc.encode_empty_frame_state == NEED_TO_ENCODE &&
lc->rc.frames_to_key != 0 &&
!(buf != NULL && (buf->flags & VPX_EFLAG_FORCE_KF))) {
if ((cpi->svc.number_temporal_layers > 1 &&
cpi->svc.temporal_layer_id < cpi->svc.number_temporal_layers - 1) ||
(cpi->svc.number_spatial_layers > 1 &&
cpi->svc.spatial_layer_id == 0)) {
struct lookahead_entry *buf = vp10_lookahead_peek(cpi->lookahead, 0);
if (buf != NULL) {
cpi->svc.empty_frame.ts_start = buf->ts_start;
cpi->svc.empty_frame.ts_end = buf->ts_end;
cpi->svc.encode_empty_frame_state = ENCODING;
cpi->common.show_frame = 0;
cpi->ref_frame_flags = 0;
cpi->common.frame_type = INTER_FRAME;
cpi->lst_fb_idx =
cpi->gld_fb_idx = cpi->alt_fb_idx = SMALL_FRAME_FB_IDX;
if (cpi->svc.encode_intra_empty_frame != 0)
cpi->common.intra_only = 1;
width = SMALL_FRAME_WIDTH;
height = SMALL_FRAME_HEIGHT;
}
}
}
cpi->oxcf.worst_allowed_q = vp10_quantizer_to_qindex(lc->max_q);
cpi->oxcf.best_allowed_q = vp10_quantizer_to_qindex(lc->min_q);
vp10_change_config(cpi, &cpi->oxcf);
if (vp10_set_size_literal(cpi, width, height) != 0)
return VPX_CODEC_INVALID_PARAM;
vp10_set_high_precision_mv(cpi, 1);
cpi->alt_ref_source = get_layer_context(cpi)->alt_ref_source;
return 0;
}
#endif
struct lookahead_entry *vp10_svc_lookahead_pop(VP10_COMP *const cpi,
struct lookahead_ctx *ctx,
int drain) {
struct lookahead_entry *buf = NULL;
if (ctx->sz && (drain || ctx->sz == ctx->max_sz - MAX_PRE_FRAMES)) {
buf = vp10_lookahead_peek(ctx, 0);
if (buf != NULL) {
// Only remove the buffer when popping the highest spatial layer.
if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) {
vp10_lookahead_pop(ctx, drain);
}
}
}
return buf;
}
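// Hypothetical usage sketch: every spatial layer encodes against the same
// peeked buffer, and the entry is only dequeued once the top spatial layer
// has consumed it. encode_one_spatial_layer() is an illustrative name, not a
// real API:
//
//   for (sid = 0; sid < cpi->svc.number_spatial_layers; ++sid) {
//     cpi->svc.spatial_layer_to_encode = sid;
//     buf = vp10_svc_lookahead_pop(cpi, cpi->lookahead, flush);
//     if (buf != NULL) encode_one_spatial_layer(cpi, buf);
//   }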


@ -1,122 +0,0 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP10_ENCODER_SVC_LAYERCONTEXT_H_
#define VP10_ENCODER_SVC_LAYERCONTEXT_H_
#include "vpx/vpx_encoder.h"
#include "vp10/encoder/ratectrl.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct {
RATE_CONTROL rc;
int target_bandwidth;
int spatial_layer_target_bandwidth; // Target for the spatial layer.
double framerate;
int avg_frame_size;
int max_q;
int min_q;
int scaling_factor_num;
int scaling_factor_den;
TWO_PASS twopass;
vpx_fixed_buf_t rc_twopass_stats_in;
unsigned int current_video_frame_in_layer;
int is_key_frame;
int frames_from_key_frame;
FRAME_TYPE last_frame_type;
struct lookahead_entry *alt_ref_source;
int alt_ref_idx;
int gold_ref_idx;
int has_alt_frame;
size_t layer_size;
struct vpx_psnr_pkt psnr_pkt;
} LAYER_CONTEXT;
typedef struct {
int spatial_layer_id;
int temporal_layer_id;
int number_spatial_layers;
int number_temporal_layers;
int spatial_layer_to_encode;
// Workaround for multiple frame contexts
enum {
ENCODED = 0,
ENCODING,
NEED_TO_ENCODE
} encode_empty_frame_state;
struct lookahead_entry empty_frame;
int encode_intra_empty_frame;
// Store scaled source frames to be used by the temporal filter to generate
// an alt-ref frame.
YV12_BUFFER_CONFIG scaled_frames[MAX_LAG_BUFFERS];
// Layer context used for rate control in one-pass temporal CBR mode or
// two-pass spatial SVC mode.
LAYER_CONTEXT layer_context[VPX_MAX_LAYERS];
// Indicates what sort of temporal layering is used.
// Currently, this only works for CBR mode.
VP9E_TEMPORAL_LAYERING_MODE temporal_layering_mode;
} SVC;
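// The contexts above are stored flat: the entry for (spatial_id, temporal_id)
// lives at layer_context[spatial_id * number_temporal_layers + temporal_id],
// matching the lookups in svc_layercontext.c. A hypothetical accessor:
//
//   static INLINE LAYER_CONTEXT *get_layer_ctx(SVC *svc, int sid, int tid) {
//     return &svc->layer_context[sid * svc->number_temporal_layers + tid];
//   }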
struct VP10_COMP;
// Initialize layer context data from init_config().
void vp10_init_layer_context(struct VP10_COMP *const cpi);
// Update the layer context from a change_config() call.
void vp10_update_layer_context_change_config(struct VP10_COMP *const cpi,
const int target_bandwidth);
// Prior to encoding the frame, update framerate-related quantities
// for the current temporal layer.
void vp10_update_temporal_layer_framerate(struct VP10_COMP *const cpi);
// Update framerate-related quantities for the current spatial layer.
void vp10_update_spatial_layer_framerate(struct VP10_COMP *const cpi,
double framerate);
// Prior to encoding the frame, set the layer context, for the current layer
// to be encoded, to the cpi struct.
void vp10_restore_layer_context(struct VP10_COMP *const cpi);
// Save the layer context after encoding the frame.
void vp10_save_layer_context(struct VP10_COMP *const cpi);
// Initialize second pass rc for spatial svc.
void vp10_init_second_pass_spatial_svc(struct VP10_COMP *cpi);
// Increment the number of video frames in the layer
void vp10_inc_frame_in_layer(struct VP10_COMP *const cpi);
// Check whether the current layer is a key frame in an upper spatial layer
int vp10_is_upper_layer_key_frame(const struct VP10_COMP *const cpi);
// Get the next source buffer to encode
struct lookahead_entry *vp10_svc_lookahead_pop(struct VP10_COMP *const cpi,
struct lookahead_ctx *ctx,
int drain);
// Start a frame and initialize svc parameters
int vp10_svc_start_frame(struct VP10_COMP *const cpi);
int vp10_one_pass_cbr_svc_start_layer(struct VP10_COMP *const cpi);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP10_ENCODER_SVC_LAYERCONTEXT_H_


@ -23,6 +23,7 @@
#include "vp10/encoder/ratectrl.h"
#include "vp10/encoder/segmentation.h"
#include "vp10/encoder/temporal_filter.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/mem.h"
#include "vpx_ports/vpx_timer.h"
@ -242,7 +243,7 @@ static int temporal_filter_find_matching_mb_c(VP10_COMP *cpi,
xd->plane[0].pre[0].stride = stride;
step_param = mv_sf->reduce_first_step_size;
step_param = MIN(step_param, MAX_MVSEARCH_STEPS - 2);
step_param = VPXMIN(step_param, MAX_MVSEARCH_STEPS - 2);
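// VPXMIN is defined in vpx_dsp/vpx_dsp_common.h and replaces the local MIN
// macro; it expands to (((x) < (y)) ? (x) : (y)).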
// Ignore mv costing by sending NULL pointer instead of cost arrays
vp10_hex_search(x, &best_ref_mv1_full, step_param, sadpb, 1,
@ -652,9 +653,7 @@ static void adjust_arnr_filter(VP10_COMP *cpi,
}
void vp10_temporal_filter(VP10_COMP *cpi, int distance) {
VP10_COMMON *const cm = &cpi->common;
RATE_CONTROL *const rc = &cpi->rc;
MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
int frame;
int frames_to_blur;
int start_frame;
@ -681,67 +680,21 @@ void vp10_temporal_filter(VP10_COMP *cpi, int distance) {
if (frames_to_blur > 0) {
// Setup scaling factors. Scaling on each of the arnr frames is not
// supported.
if (cpi->use_svc) {
// In spatial SVC the scaling factors might be less than 1/2,
// so we use non-normative scaling.
int frame_used = 0;
// ARF is produced at the native frame size and resized when coded.
#if CONFIG_VP9_HIGHBITDEPTH
vp10_setup_scale_factors_for_frame(
&sf,
get_frame_new_buffer(cm)->y_crop_width,
get_frame_new_buffer(cm)->y_crop_height,
get_frame_new_buffer(cm)->y_crop_width,
get_frame_new_buffer(cm)->y_crop_height,
cm->use_highbitdepth);
vp10_setup_scale_factors_for_frame(&sf,
frames[0]->y_crop_width,
frames[0]->y_crop_height,
frames[0]->y_crop_width,
frames[0]->y_crop_height,
cpi->common.use_highbitdepth);
#else
vp10_setup_scale_factors_for_frame(
&sf,
get_frame_new_buffer(cm)->y_crop_width,
get_frame_new_buffer(cm)->y_crop_height,
get_frame_new_buffer(cm)->y_crop_width,
get_frame_new_buffer(cm)->y_crop_height);
vp10_setup_scale_factors_for_frame(&sf,
frames[0]->y_crop_width,
frames[0]->y_crop_height,
frames[0]->y_crop_width,
frames[0]->y_crop_height);
#endif // CONFIG_VP9_HIGHBITDEPTH
for (frame = 0; frame < frames_to_blur; ++frame) {
if (cm->mi_cols * MI_SIZE != frames[frame]->y_width ||
cm->mi_rows * MI_SIZE != frames[frame]->y_height) {
if (vpx_realloc_frame_buffer(&cpi->svc.scaled_frames[frame_used],
cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
#if CONFIG_VP9_HIGHBITDEPTH
cm->use_highbitdepth,
#endif
VP9_ENC_BORDER_IN_PIXELS,
cm->byte_alignment,
NULL, NULL, NULL)) {
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to reallocate alt_ref_buffer");
}
frames[frame] = vp10_scale_if_required(
cm, frames[frame], &cpi->svc.scaled_frames[frame_used]);
++frame_used;
}
}
cm->mi = cm->mip + cm->mi_stride + 1;
xd->mi = cm->mi_grid_visible;
xd->mi[0] = cm->mi;
} else {
// ARF is produced at the native frame size and resized when coded.
#if CONFIG_VP9_HIGHBITDEPTH
vp10_setup_scale_factors_for_frame(&sf,
frames[0]->y_crop_width,
frames[0]->y_crop_height,
frames[0]->y_crop_width,
frames[0]->y_crop_height,
cm->use_highbitdepth);
#else
vp10_setup_scale_factors_for_frame(&sf,
frames[0]->y_crop_width,
frames[0]->y_crop_height,
frames[0]->y_crop_width,
frames[0]->y_crop_height);
#endif // CONFIG_VP9_HIGHBITDEPTH
}
}
temporal_filter_iterate_c(cpi, frames, frames_to_blur,


@ -19,7 +19,6 @@ VP10_COMMON_SRCS-yes += common/entropymode.c
VP10_COMMON_SRCS-yes += common/entropymv.c
VP10_COMMON_SRCS-yes += common/frame_buffers.c
VP10_COMMON_SRCS-yes += common/frame_buffers.h
VP10_COMMON_SRCS-yes += common/idct.c
VP10_COMMON_SRCS-yes += common/alloccommon.h
VP10_COMMON_SRCS-yes += common/blockd.h
VP10_COMMON_SRCS-yes += common/common.h
@ -30,6 +29,9 @@ VP10_COMMON_SRCS-yes += common/enums.h
VP10_COMMON_SRCS-yes += common/filter.h
VP10_COMMON_SRCS-yes += common/filter.c
VP10_COMMON_SRCS-yes += common/idct.h
VP10_COMMON_SRCS-yes += common/idct.c
VP10_COMMON_SRCS-yes += common/vp10_inv_txfm.h
VP10_COMMON_SRCS-yes += common/vp10_inv_txfm.c
VP10_COMMON_SRCS-yes += common/loopfilter.h
VP10_COMMON_SRCS-yes += common/thread_common.h
VP10_COMMON_SRCS-yes += common/mv.h
@ -59,6 +61,8 @@ VP10_COMMON_SRCS-$(CONFIG_POSTPROC_VISUALIZER) += common/textblit.c
VP10_COMMON_SRCS-yes += common/common_data.h
VP10_COMMON_SRCS-yes += common/scan.c
VP10_COMMON_SRCS-yes += common/scan.h
VP10_COMMON_SRCS-yes += common/vp10_fwd_txfm.h
VP10_COMMON_SRCS-yes += common/vp10_fwd_txfm.c
VP10_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/postproc.h
VP10_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/postproc.c
@ -85,10 +89,16 @@ VP10_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/mfqe_msa.c
endif
VP10_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idct_intrin_sse2.c
VP10_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp10_fwd_txfm_sse2.c
VP10_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp10_fwd_dct32x32_impl_sse2.h
VP10_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp10_fwd_txfm_impl_sse2.h
ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP10_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/iht4x4_add_neon.c
VP10_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/iht8x8_add_neon.c
endif
VP10_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp10_inv_txfm_sse2.c
VP10_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp10_inv_txfm_sse2.h
$(eval $(call rtcd_h_template,vp10_rtcd,vp10/common/vp10_rtcd_defs.pl))
