Merge branch 'experimental' of review:webm/libvpx

Change-Id: Ib2c2236349c2ae8ee81bd01c5067dddcbac713ca
This commit is contained in:
John Koleszar 2013-01-14 16:25:26 -08:00
commit de5546c372
156 changed files with 14735 additions and 11657 deletions

View File

@ -436,10 +436,10 @@ RTCD_OPTIONS = ${RTCD_OPTIONS}
EOF
if enabled rvct; then cat >> $1 << EOF
fmt_deps = sed -e 's;^__image.axf;\$\${@:.d=.o} \$\$@;' #hide
fmt_deps = sed -e 's;^__image.axf;\${@:.d=.o} \$@;' #hide
EOF
else cat >> $1 << EOF
fmt_deps = sed -e 's;^\([a-zA-Z0-9_]*\)\.o;\$\${@:.d=.o} \$\$@;'
fmt_deps = sed -e 's;^\([a-zA-Z0-9_]*\)\.o;\${@:.d=.o} \$@;'
EOF
fi

10
configure vendored
View File

@ -239,15 +239,17 @@ HAVE_LIST="
"
EXPERIMENT_LIST="
csm
comp_intra_pred
superblocks
pred_filter
lossless
subpelrefmv
new_mvref
implicit_segmentation
newbintramodes
comp_interintra_pred
tx64x64
dwtdcthybrid
cnvcontext
newcoefcontext
enable_6tap
abovesprefmv
"
CONFIG_LIST="
external_build

197
test/dct32x32_test.cc Normal file
View File

@ -0,0 +1,197 @@
/*
* Copyright (c) 2012 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include "third_party/googletest/src/include/gtest/gtest.h"
extern "C" {
#include "vp9/common/vp9_entropy.h"
#include "./vp9_rtcd.h"
void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch);
void vp9_short_idct32x32_c(short *input, short *output, int pitch);
}
#include "test/acm_random.h"
#include "vpx/vpx_integer.h"
using libvpx_test::ACMRandom;
namespace {
#ifdef _MSC_VER
// Older MSVC math headers do not provide C99 round(); supply a
// round-half-away-from-zero replacement so this file builds there too.
static int round(double x) {
  return (int)(x < 0 ? ceil(x - 0.5) : floor(x + 0.5));
}
#endif
#if !CONFIG_DWTDCTHYBRID
static const double kPi = 3.141592653589793238462643383279502884;
static void reference2_32x32_idct_2d(const double *input, double *output) {
  // Reference (slow) floating-point 32x32 inverse DCT.
  // input:  32x32 transform coefficients, row-major.
  // output: reconstructed spatial block, row-major — output[k * 32 + l]
  //         is the sample at row k, column l.
  const double pi = 3.141592653589793238462643383279502884;
  // Precompute the 1-D basis once: basis[u][v] = cos(pi*u*(v+0.5)/32),
  // with the usual sqrt(2) weight on every non-DC (u != 0) row. The old
  // code re-evaluated both cos() factors inside the 4-deep loop, i.e.
  // ~2 million redundant cos() calls per transform.
  double basis[32][32];
  for (int u = 0; u < 32; ++u) {
    const double weight = (u != 0) ? sqrt(2.0) : 1.0;
    for (int v = 0; v < 32; ++v)
      basis[u][v] = weight * cos(pi * u * (v + 0.5) / 32.0);
  }
  for (int l = 0; l < 32; ++l) {
    for (int k = 0; k < 32; ++k) {
      double s = 0;
      for (int i = 0; i < 32; ++i) {
        for (int j = 0; j < 32; ++j)
          s += basis[j][l] * basis[i][k] * input[i * 32 + j] / 1024;
      }
      // 1/4 normalization to match the forward transform's scaling.
      output[k * 32 + l] = s / 4;
    }
  }
}
static void reference_32x32_dct_1d(double in[32], double out[32], int stride) {
  // Reference 32-point forward DCT (type II).
  // Reads in[n * stride] for n = 0..31 and writes 32 coefficients to out,
  // with the DC term scaled by 1/sqrt(2).
  // NOTE: the previous version accepted 'stride' but silently ignored it;
  // it is honored now. All current callers pass 1, so their behavior is
  // unchanged.
  const double kPi = 3.141592653589793238462643383279502884;
  const double kInvSqrt2 = 0.707106781186547524400844362104;
  for (int k = 0; k < 32; k++) {
    out[k] = 0.0;
    for (int n = 0; n < 32; n++)
      out[k] += in[n * stride] * cos(kPi * (2 * n + 1) * k / 64.0);
    if (k == 0)
      out[k] = out[k] * kInvSqrt2;
  }
}
static void reference_32x32_dct_2d(int16_t input[32*32], double output[32*32]) {
  // Separable reference 32x32 forward DCT: apply the 1-D transform down
  // every column, then across every row of the intermediate result.
  double pass_in[32], pass_out[32];
  // Column pass: write coefficients back into 'output' column by column.
  for (int col = 0; col < 32; ++col) {
    for (int row = 0; row < 32; ++row)
      pass_in[row] = input[row * 32 + col];
    reference_32x32_dct_1d(pass_in, pass_out, 1);
    for (int row = 0; row < 32; ++row)
      output[row * 32 + col] = pass_out[row];
  }
  // Row pass over the column-transformed data.
  for (int row = 0; row < 32; ++row) {
    for (int col = 0; col < 32; ++col)
      pass_in[col] = output[row * 32 + col];
    reference_32x32_dct_1d(pass_in, pass_out, 1);
    // Scale down by 4 (the original's "magic number") — presumably to
    // match the fixed-point transform's output scaling; confirm there.
    for (int col = 0; col < 32; ++col)
      output[row * 32 + col] = pass_out[col] / 4;
  }
}
TEST(VP9Idct32x32Test, AccuracyCheck) {
  // Round-trips random blocks through the float reference forward DCT and
  // the fixed-point inverse transform (per-sample error must be <= 1), and
  // compares the fixed-point forward transform against the rounded float
  // reference coefficients.
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  const int count_test_block = 1000;
  for (int i = 0; i < count_test_block; ++i) {
    int16_t in[1024], coeff[1024];
    int16_t out_c[1024];
    double out_r[1024];

    // Initialize a test block with input range [-255, 255].
    for (int j = 0; j < 1024; ++j)
      in[j] = rnd.Rand8() - rnd.Rand8();

    reference_32x32_dct_2d(in, out_r);
    for (int j = 0; j < 1024; j++)
      coeff[j] = round(out_r[j]);
    vp9_short_idct32x32_c(coeff, out_c, 64);
    for (int j = 0; j < 1024; ++j) {
      const int diff = out_c[j] - in[j];
      const int error = diff * diff;
      EXPECT_GE(1, error)
          // Fixed message typo: previously read "3x32 IDCT".
          << "Error: 32x32 IDCT has error " << error
          << " at index " << j;
    }
    vp9_short_fdct32x32_c(in, out_c, 64);
    for (int j = 0; j < 1024; ++j) {
      const double diff = coeff[j] - out_c[j];
      const double error = diff * diff;
      EXPECT_GE(1.0, error)
          << "Error: 32x32 FDCT has error " << error
          << " at index " << j;
    }
  }
}
#else // CONFIG_DWTDCTHYBRID
// TODO(rbultje/debargha): add DWT-specific tests
#endif // CONFIG_DWTDCTHYBRID
TEST(VP9Fdct32x32Test, AccuracyCheck) {
  // Forward-then-inverse round trip on 1000 random blocks; tracks both the
  // worst single-sample squared error and the total error over all blocks.
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  unsigned int max_error = 0;
  int64_t total_error = 0;
  const int count_test_block = 1000;
  for (int block = 0; block < count_test_block; ++block) {
    int16_t src[1024];
    int16_t coeffs[1024];
    int16_t recon[1024];

    // Initialize a test block with input range [-255, 255].
    for (int k = 0; k < 1024; ++k)
      src[k] = rnd.Rand8() - rnd.Rand8();

    const int pitch = 64;
    vp9_short_fdct32x32_c(src, coeffs, pitch);
    vp9_short_idct32x32_c(coeffs, recon, pitch);

    for (int k = 0; k < 1024; ++k) {
      // Unsigned wrap-around is harmless: squaring yields the correct
      // small value modulo 2^32 for any |diff| < 2^16.
      const unsigned diff = src[k] - recon[k];
      const unsigned error = diff * diff;
      if (error > max_error)
        max_error = error;
      total_error += error;
    }
  }

  EXPECT_GE(1u, max_error)
      << "Error: 32x32 FDCT/IDCT has an individual roundtrip error > 1";
  EXPECT_GE(count_test_block/10, total_error)
      << "Error: 32x32 FDCT/IDCT has average roundtrip error > 1/10 per block";
}
TEST(VP9Fdct32x32Test, CoeffSizeCheck) {
  // Checks that forward-transform output magnitudes stay within
  // 4*DCT_MAX_VALUE, for both random input and saturated (+/-255) input.
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  const int count_test_block = 1000;
  for (int block = 0; block < count_test_block; ++block) {
    int16_t input_block[1024], input_extreme_block[1024];
    int16_t output_block[1024], output_extreme_block[1024];

    // Initialize a test block with input range [-255, 255].
    for (int k = 0; k < 1024; ++k) {
      input_block[k] = rnd.Rand8() - rnd.Rand8();
      input_extreme_block[k] = rnd.Rand8() % 2 ? 255 : -255;
    }
    // Force the very first extreme block to all-255.
    if (block == 0) {
      for (int k = 0; k < 1024; ++k)
        input_extreme_block[k] = 255;
    }

    const int pitch = 64;
    vp9_short_fdct32x32_c(input_block, output_block, pitch);
    vp9_short_fdct32x32_c(input_extreme_block, output_extreme_block, pitch);

    // The minimum quant value is 4.
    for (int k = 0; k < 1024; ++k) {
      EXPECT_GE(4*DCT_MAX_VALUE, abs(output_block[k]))
          << "Error: 32x32 FDCT has coefficient larger than 4*DCT_MAX_VALUE";
      EXPECT_GE(4*DCT_MAX_VALUE, abs(output_extreme_block[k]))
          << "Error: 32x32 FDCT extreme has coefficient larger than "
             "4*DCT_MAX_VALUE";
    }
  }
}
} // namespace

View File

@ -69,6 +69,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct4x4_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc
#LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct16x16_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct32x32_test.cc
endif # VP9

View File

@ -15,7 +15,7 @@
#include "vpx_scale/yv12config.h"
#include "postproc.h"
#include "common.h"
#include "vpx_scale/vpxscale.h"
#include "vpx_scale/vpx_scale.h"
#include "systemdependent.h"
#include <limits.h>

View File

@ -21,7 +21,7 @@
#include "vp8/common/alloccommon.h"
#include "vp8/common/entropymode.h"
#include "vp8/common/quant_common.h"
#include "vpx_scale/vpxscale.h"
#include "vpx_scale/vpx_scale.h"
#include "vp8/common/setupintrarecon.h"
#include "decodemv.h"

View File

@ -26,7 +26,7 @@
#include "vp8/common/quant_common.h"
#include "./vpx_scale_rtcd.h"
#include "vpx_scale/vpxscale.h"
#include "vpx_scale/vpx_scale.h"
#include "vp8/common/systemdependent.h"
#include "vpx_ports/vpx_timer.h"
#include "detokenize.h"

View File

@ -21,7 +21,7 @@
#include "vp8/common/systemdependent.h"
#include "mcomp.h"
#include "firstpass.h"
#include "vpx_scale/vpxscale.h"
#include "vpx_scale/vpx_scale.h"
#include "encodemb.h"
#include "vp8/common/extend.h"
#include "vpx_mem/vpx_mem.h"

View File

@ -20,7 +20,7 @@
#include "mcomp.h"
#include "firstpass.h"
#include "psnr.h"
#include "vpx_scale/vpxscale.h"
#include "vpx_scale/vpx_scale.h"
#include "vp8/common/extend.h"
#include "ratectrl.h"
#include "vp8/common/quant_common.h"
@ -2588,7 +2588,7 @@ static void scale_and_extend_source(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
Scale2Ratio(cm->horiz_scale, &hr, &hs);
Scale2Ratio(cm->vert_scale, &vr, &vs);
vp8_scale_frame(sd, &cpi->scaled_source, cm->temp_scale_frame.y_buffer,
vpx_scale_frame(sd, &cpi->scaled_source, cm->temp_scale_frame.y_buffer,
tmp_height, hs, hr, vs, vr, 0);
vp8_yv12_extend_frame_borders(&cpi->scaled_source);
@ -3466,7 +3466,7 @@ static void encode_frame_to_data_rate
/* Note that we should not throw out a key frame (especially when
* spatial resampling is enabled).
*/
if ((cm->frame_type == KEY_FRAME))
if (cm->frame_type == KEY_FRAME)
{
cpi->decimation_count = cpi->decimation_factor;
}

View File

@ -14,7 +14,7 @@
#include "onyx_int.h"
#include "quantize.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_scale/vpxscale.h"
#include "vpx_scale/vpx_scale.h"
#include "vp8/common/alloccommon.h"
#include "vp8/common/loopfilter.h"
#if ARCH_ARM

View File

@ -13,7 +13,7 @@
#include "math.h"
#include "vp8/common/systemdependent.h" /* for vp8_clear_system_state() */
#define MAX_PSNR 60
#define MAX_PSNR 100
double vp8_mse2psnr(double Samples, double Peak, double Mse)
{

View File

@ -17,7 +17,7 @@
#include "mcomp.h"
#include "firstpass.h"
#include "psnr.h"
#include "vpx_scale/vpxscale.h"
#include "vpx_scale/vpx_scale.h"
#include "vp8/common/extend.h"
#include "ratectrl.h"
#include "vp8/common/quant_common.h"

View File

@ -1178,7 +1178,9 @@ static vpx_codec_err_t vp8e_set_scalemode(vpx_codec_alg_priv_t *ctx,
{
int res;
vpx_scaling_mode_t scalemode = *(vpx_scaling_mode_t *)data ;
res = vp8_set_internal_size(ctx->cpi, scalemode.h_scaling_mode, scalemode.v_scaling_mode);
res = vp8_set_internal_size(ctx->cpi,
(VPX_SCALING)scalemode.h_scaling_mode,
(VPX_SCALING)scalemode.v_scaling_mode);
if (!res)
{

View File

@ -220,4 +220,8 @@ void vp9_initialize_common() {
vp9_entropy_mode_init();
vp9_entropy_mv_init();
#if CONFIG_NEWCOEFCONTEXT
vp9_init_neighbors();
#endif
}

View File

@ -23,4 +23,4 @@ void vp9_setup_version(VP9_COMMON *oci);
void vp9_update_mode_info_border(VP9_COMMON *cpi, MODE_INFO *mi_base);
void vp9_update_mode_info_in_image(VP9_COMMON *cpi, MODE_INFO *mi);
#endif
#endif // VP9_COMMON_VP9_ALLOCCOMMON_H_

View File

@ -12,18 +12,15 @@
#include "vp9/common/vp9_blockd.h"
#include "vpx_mem/vpx_mem.h"
const unsigned char vp9_block2left[25] = {
0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
const uint8_t vp9_block2left[TX_SIZE_MAX_SB][25] = {
{0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8},
{0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6, 8},
{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 6, 6, 6, 6, 8},
{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 6, 6, 6, 6, 8}
};
const unsigned char vp9_block2above[25] = {
0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8
const uint8_t vp9_block2above[TX_SIZE_MAX_SB][25] = {
{0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8},
{0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6, 8},
{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 6, 6, 6, 6, 8},
{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 6, 6, 6, 6, 8}
};
const unsigned char vp9_block2left_8x8[25] = {
0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6, 8
};
const unsigned char vp9_block2above_8x8[25] = {
0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6, 8
};

View File

@ -45,6 +45,19 @@ void vpx_log(const char *format, ...);
#define SEGMENT_DELTADATA 0
#define SEGMENT_ABSDATA 1
#define MAX_MV_REFS 9
#define MAX_MV_REF_CANDIDATES 4
#if CONFIG_DWTDCTHYBRID
#define DWT_MAX_LENGTH 64
#define DWT_TYPE 26 // 26/53/97
#define DWT_PRECISION_BITS 2
#define DWT_PRECISION_RND ((1 << DWT_PRECISION_BITS) / 2)
#define DWTDCT16X16 0
#define DWTDCT16X16_LEAN 1
#define DWTDCT8X8 2
#define DWTDCT_TYPE DWTDCT16X16_LEAN
#endif
typedef struct {
int r, c;
@ -65,11 +78,6 @@ typedef struct {
ENTROPY_CONTEXT y2;
} ENTROPY_CONTEXT_PLANES;
extern const unsigned char vp9_block2left[25];
extern const unsigned char vp9_block2above[25];
extern const unsigned char vp9_block2left_8x8[25];
extern const unsigned char vp9_block2above_8x8[25];
#define VP9_COMBINEENTROPYCONTEXTS( Dest, A, B) \
Dest = ((A)!=0) + ((B)!=0);
@ -80,10 +88,13 @@ typedef enum {
typedef enum
{
SIXTAP = 0,
BILINEAR = 1,
EIGHTTAP = 2,
EIGHTTAP_SHARP = 3,
#if CONFIG_ENABLE_6TAP
SIXTAP,
#endif
EIGHTTAP_SMOOTH,
EIGHTTAP,
EIGHTTAP_SHARP,
BILINEAR,
SWITCHABLE /* should be the last one */
} INTERPOLATIONFILTERTYPE;
@ -101,13 +112,11 @@ typedef enum
TM_PRED, /* Truemotion prediction */
I8X8_PRED, /* 8x8 based prediction, each 8x8 has its own prediction mode */
B_PRED, /* block based prediction, each block has its own prediction mode */
NEARESTMV,
NEARMV,
ZEROMV,
NEWMV,
SPLITMV,
MB_MODE_COUNT
} MB_PREDICTION_MODE;
@ -120,15 +129,16 @@ typedef enum {
SEG_LVL_EOB = 4, // EOB end stop marker.
SEG_LVL_TRANSFORM = 5, // Block transform size.
SEG_LVL_MAX = 6 // Number of MB level features supported
} SEG_LVL_FEATURES;
// Segment level features.
typedef enum {
TX_4X4, // 4x4 dct transform
TX_8X8, // 8x8 dct transform
TX_16X16, // 16x16 dct transform
TX_SIZE_MAX // Number of different transforms available
TX_4X4 = 0, // 4x4 dct transform
TX_8X8 = 1, // 8x8 dct transform
TX_16X16 = 2, // 16x16 dct transform
TX_SIZE_MAX_MB = 3, // Number of different transforms available
TX_32X32 = TX_SIZE_MAX_MB, // 32x32 dct transform
TX_SIZE_MAX_SB, // Number of transforms available to SBs
} TX_SIZE;
typedef enum {
@ -205,9 +215,6 @@ union b_mode_info {
struct {
B_PREDICTION_MODE first;
TX_TYPE tx_type;
#if CONFIG_COMP_INTRA_PRED
B_PREDICTION_MODE second;
#endif
#if CONFIG_NEWBINTRAMODES
B_PREDICTION_MODE context;
#endif
@ -227,18 +234,21 @@ typedef enum {
MAX_REF_FRAMES = 4
} MV_REFERENCE_FRAME;
typedef enum {
BLOCK_SIZE_MB16X16 = 0,
BLOCK_SIZE_SB32X32 = 1,
BLOCK_SIZE_SB64X64 = 2,
} BLOCK_SIZE_TYPE;
typedef struct {
MB_PREDICTION_MODE mode, uv_mode;
#if CONFIG_COMP_INTRA_PRED
MB_PREDICTION_MODE second_mode, second_uv_mode;
#endif
#if CONFIG_COMP_INTERINTRA_PRED
MB_PREDICTION_MODE interintra_mode, interintra_uv_mode;
#endif
MV_REFERENCE_FRAME ref_frame, second_ref_frame;
TX_SIZE txfm_size;
int_mv mv[2]; // for each reference frame used
int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REFS];
int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
int_mv best_mv, best_second_mv;
#if CONFIG_NEW_MVREF
int best_index, best_second_index;
@ -261,17 +271,9 @@ typedef struct {
// a valid predictor
unsigned char mb_in_image;
#if CONFIG_PRED_FILTER
// Flag to turn prediction signal filter on(1)/off(0 ) at the MB level
unsigned int pred_filter_enabled;
#endif
INTERPOLATIONFILTERTYPE interp_filter;
INTERPOLATIONFILTERTYPE interp_filter;
#if CONFIG_SUPERBLOCKS
// FIXME need a SB array of 4 MB_MODE_INFOs that
// only needs one encoded_as_sb.
unsigned char encoded_as_sb;
#endif
BLOCK_SIZE_TYPE sb_type;
} MB_MODE_INFO;
typedef struct {
@ -280,19 +282,19 @@ typedef struct {
} MODE_INFO;
typedef struct blockd {
short *qcoeff;
short *dqcoeff;
unsigned char *predictor;
short *diff;
short *dequant;
int16_t *qcoeff;
int16_t *dqcoeff;
uint8_t *predictor;
int16_t *diff;
int16_t *dequant;
/* 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries */
unsigned char **base_pre;
unsigned char **base_second_pre;
uint8_t **base_pre;
uint8_t **base_second_pre;
int pre;
int pre_stride;
unsigned char **base_dst;
uint8_t **base_dst;
int dst;
int dst_stride;
@ -301,12 +303,21 @@ typedef struct blockd {
union b_mode_info bmi;
} BLOCKD;
typedef struct superblockd {
/* 32x32 Y and 16x16 U/V. No 2nd order transform yet. */
DECLARE_ALIGNED(16, int16_t, diff[32*32+16*16*2]);
DECLARE_ALIGNED(16, int16_t, qcoeff[32*32+16*16*2]);
DECLARE_ALIGNED(16, int16_t, dqcoeff[32*32+16*16*2]);
} SUPERBLOCKD;
typedef struct macroblockd {
DECLARE_ALIGNED(16, short, diff[400]); /* from idct diff */
DECLARE_ALIGNED(16, unsigned char, predictor[384]);
DECLARE_ALIGNED(16, short, qcoeff[400]);
DECLARE_ALIGNED(16, short, dqcoeff[400]);
DECLARE_ALIGNED(16, unsigned short, eobs[25]);
DECLARE_ALIGNED(16, int16_t, diff[400]); /* from idct diff */
DECLARE_ALIGNED(16, uint8_t, predictor[384]);
DECLARE_ALIGNED(16, int16_t, qcoeff[400]);
DECLARE_ALIGNED(16, int16_t, dqcoeff[400]);
DECLARE_ALIGNED(16, uint16_t, eobs[25]);
SUPERBLOCKD sb_coeff_data;
/* 16 Y blocks, 4 U, 4 V, 1 DC 2nd order block, each with 16 entries. */
BLOCKD block[25];
@ -350,7 +361,7 @@ typedef struct macroblockd {
vp9_prob mb_segment_tree_probs[MB_FEATURE_TREE_PROBS];
#if CONFIG_NEW_MVREF
vp9_prob mb_mv_ref_id_probs[MAX_REF_FRAMES][3];
vp9_prob mb_mv_ref_probs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES-1];
#endif
// Segment features
@ -377,17 +388,17 @@ typedef struct macroblockd {
unsigned int frames_till_alt_ref_frame;
/* Inverse transform function pointers. */
void (*inv_xform4x4_1_x8)(short *input, short *output, int pitch);
void (*inv_xform4x4_x8)(short *input, short *output, int pitch);
void (*inv_walsh4x4_1)(short *in, short *out);
void (*inv_walsh4x4_lossless)(short *in, short *out);
void (*inv_xform4x4_1_x8)(int16_t *input, int16_t *output, int pitch);
void (*inv_xform4x4_x8)(int16_t *input, int16_t *output, int pitch);
void (*inv_walsh4x4_1)(int16_t *in, int16_t *out);
void (*inv_walsh4x4_lossless)(int16_t *in, int16_t *out);
vp9_subpix_fn_t subpixel_predict;
vp9_subpix_fn_t subpixel_predict4x4;
vp9_subpix_fn_t subpixel_predict8x4;
vp9_subpix_fn_t subpixel_predict8x8;
vp9_subpix_fn_t subpixel_predict16x16;
vp9_subpix_fn_t subpixel_predict_avg;
vp9_subpix_fn_t subpixel_predict_avg4x4;
vp9_subpix_fn_t subpixel_predict_avg8x4;
vp9_subpix_fn_t subpixel_predict_avg8x8;
vp9_subpix_fn_t subpixel_predict_avg16x16;
@ -395,14 +406,7 @@ typedef struct macroblockd {
int corrupted;
#if !CONFIG_SUPERBLOCKS && (ARCH_X86 || ARCH_X86_64)
/* This is an intermediate buffer currently used in sub-pixel motion search
* to keep a copy of the reference area. This buffer can be used for other
* purpose.
*/
DECLARE_ALIGNED(32, unsigned char, y_buf[22 * 32]);
#endif
int sb_index;
int mb_index; // Index of the MB in the SB (0..3)
int q_index;
@ -490,6 +494,9 @@ static TX_TYPE txfm_map(B_PREDICTION_MODE bmode) {
return tx_type;
}
extern const uint8_t vp9_block2left[TX_SIZE_MAX_SB][25];
extern const uint8_t vp9_block2above[TX_SIZE_MAX_SB][25];
#define USE_ADST_FOR_I16X16_8X8 0
#define USE_ADST_FOR_I16X16_4X4 0
#define USE_ADST_FOR_I8X8_4X4 1
@ -502,11 +509,9 @@ static TX_TYPE get_tx_type_4x4(const MACROBLOCKD *xd, const BLOCKD *b) {
int ib = (int)(b - xd->block);
if (ib >= 16)
return tx_type;
#if CONFIG_SUPERBLOCKS
// TODO(rbultje, debargha): Explore ADST usage for superblocks
if (xd->mode_info_context->mbmi.encoded_as_sb)
if (xd->mode_info_context->mbmi.sb_type)
return tx_type;
#endif
if (xd->mode_info_context->mbmi.mode == B_PRED &&
xd->q_index < ACTIVE_HT) {
tx_type = txfm_map(
@ -559,11 +564,9 @@ static TX_TYPE get_tx_type_8x8(const MACROBLOCKD *xd, const BLOCKD *b) {
int ib = (int)(b - xd->block);
if (ib >= 16)
return tx_type;
#if CONFIG_SUPERBLOCKS
// TODO(rbultje, debargha): Explore ADST usage for superblocks
if (xd->mode_info_context->mbmi.encoded_as_sb)
if (xd->mode_info_context->mbmi.sb_type)
return tx_type;
#endif
if (xd->mode_info_context->mbmi.mode == I8X8_PRED &&
xd->q_index < ACTIVE_HT8) {
// TODO(rbultje): MB_PREDICTION_MODE / B_PREDICTION_MODE should be merged
@ -594,11 +597,9 @@ static TX_TYPE get_tx_type_16x16(const MACROBLOCKD *xd, const BLOCKD *b) {
int ib = (int)(b - xd->block);
if (ib >= 16)
return tx_type;
#if CONFIG_SUPERBLOCKS
// TODO(rbultje, debargha): Explore ADST usage for superblocks
if (xd->mode_info_context->mbmi.encoded_as_sb)
if (xd->mode_info_context->mbmi.sb_type)
return tx_type;
#endif
if (xd->mode_info_context->mbmi.mode < I8X8_PRED &&
xd->q_index < ACTIVE_HT16) {
tx_type = txfm_map(pred_mode_conv(xd->mode_info_context->mbmi.mode));
@ -650,4 +651,4 @@ static void update_blockd_bmi(MACROBLOCKD *xd) {
}
}
}
#endif /* __INC_BLOCKD_H */
#endif // VP9_COMMON_VP9_BLOCKD_H_

View File

@ -8,9 +8,13 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP9_COMMON_VP9_COEFUPDATEPROBS_H_
#define VP9_COMMON_VP9_COEFUPDATEPROBS_H__
/* Update probabilities for the nodes in the token entropy tree.
Generated file included by vp9_entropy.c */
#define COEF_UPDATE_PROB 252
#define COEF_UPDATE_PROB_8X8 252
#define COEF_UPDATE_PROB_16X16 252
#endif // VP9_COMMON_VP9_COEFUPDATEPROBS_H__

View File

@ -8,7 +8,6 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP9_COMMON_VP9_COMMON_H_
#define VP9_COMMON_VP9_COMMON_H_
@ -17,25 +16,34 @@
/* Interface header for common constant data structures and lookup tables */
#include "vpx_mem/vpx_mem.h"
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_common_types.h"
#define TRUE 1
#define FALSE 0
#define MIN(x, y) (((x) < (y)) ? (x) : (y))
#define MAX(x, y) (((x) > (y)) ? (x) : (y))
/* Only need this for fixed-size arrays, for structs just assign. */
#define vp9_copy( Dest, Src) { \
assert( sizeof( Dest) == sizeof( Src)); \
vpx_memcpy( Dest, Src, sizeof( Src)); \
#define vp9_copy(Dest, Src) { \
assert(sizeof(Dest) == sizeof(Src)); \
vpx_memcpy(Dest, Src, sizeof(Src)); \
}
/* Use this for variably-sized arrays. */
#define vp9_copy_array( Dest, Src, N) { \
assert( sizeof( *Dest) == sizeof( *Src)); \
vpx_memcpy( Dest, Src, N * sizeof( *Src)); \
#define vp9_copy_array(Dest, Src, N) { \
assert(sizeof(*Dest) == sizeof(*Src)); \
vpx_memcpy(Dest, Src, N * sizeof(*Src)); \
}
#define vp9_zero( Dest) vpx_memset( &Dest, 0, sizeof( Dest));
#define vp9_zero(Dest) vpx_memset(&Dest, 0, sizeof(Dest));
#define vp9_zero_array( Dest, N) vpx_memset( Dest, 0, N * sizeof( *Dest));
#define vp9_zero_array(Dest, N) vpx_memset(Dest, 0, N * sizeof(*Dest));
#endif /* common_h */
static __inline uint8_t clip_pixel(int val) {
return (val > 255) ? 255u : (val < 0) ? 0u : val;
}
#endif // VP9_COMMON_VP9_COMMON_H_

View File

@ -1,18 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP9_COMMON_VP9_COMMON_TYPES_H_
#define VP9_COMMON_VP9_COMMON_TYPES_H_
#define TRUE 1
#define FALSE 0
#endif

View File

@ -87,9 +87,6 @@ void vp9_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols,
if (mi[mb_index].mbmi.mode == B_PRED) {
fprintf(mvs, "%2d ", mi[mb_index].bmi[bindex].as_mode.first);
#if CONFIG_COMP_INTRA_PRED
fprintf(mvs, "%2d ", mi[mb_index].bmi[bindex].as_mode.second);
#endif
} else
fprintf(mvs, "xx ");

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -8,10 +8,10 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP9_COMMON_VP9_ENTROPY_H_
#define VP9_COMMON_VP9_ENTROPY_H_
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_treecoder.h"
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_common.h"
@ -55,24 +55,27 @@ extern vp9_extra_bit_struct vp9_extra_bits[12]; /* indexed by token value */
#define PROB_UPDATE_BASELINE_COST 7
#define MAX_PROB 255
#define DCT_MAX_VALUE 8192
#define DCT_MAX_VALUE 16384
/* Coefficients are predicted via a 3-dimensional probability table. */
/* Outside dimension. 0 = Y no DC, 1 = Y2, 2 = UV, 3 = Y with DC */
#define BLOCK_TYPES 4
#define BLOCK_TYPES_4X4 4
#define BLOCK_TYPES_8X8 4
#define BLOCK_TYPES_16X16 4
#define BLOCK_TYPES_32X32 4
/* Middle dimension is a coarsening of the coefficient's
position within the 4x4 DCT. */
#define COEF_BANDS 8
extern DECLARE_ALIGNED(16, const int, vp9_coef_bands[16]);
extern DECLARE_ALIGNED(16, const int, vp9_coef_bands_4x4[16]);
extern DECLARE_ALIGNED(64, const int, vp9_coef_bands_8x8[64]);
extern DECLARE_ALIGNED(16, const int, vp9_coef_bands_16x16[256]);
extern DECLARE_ALIGNED(16, const int, vp9_coef_bands_32x32[1024]);
/* Inside dimension is 3-valued measure of nearby complexity, that is,
the extent to which nearby coefficients are nonzero. For the first
@ -91,24 +94,61 @@ extern DECLARE_ALIGNED(16, const int, vp9_coef_bands_16x16[256]);
distinct bands). */
/*# define DC_TOKEN_CONTEXTS 3*/ /* 00, 0!0, !0!0 */
#define PREV_COEF_CONTEXTS 4
#define PREV_COEF_CONTEXTS 4
typedef unsigned int vp9_coeff_count[COEF_BANDS][PREV_COEF_CONTEXTS]
[MAX_ENTROPY_TOKENS];
typedef unsigned int vp9_coeff_stats[COEF_BANDS][PREV_COEF_CONTEXTS]
[ENTROPY_NODES][2];
typedef vp9_prob vp9_coeff_probs[COEF_BANDS][PREV_COEF_CONTEXTS]
[ENTROPY_NODES];
#define SUBEXP_PARAM 4 /* Subexponential code parameter */
#define MODULUS_PARAM 13 /* Modulus parameter */
extern DECLARE_ALIGNED(16, const unsigned char, vp9_prev_token_class[MAX_ENTROPY_TOKENS]);
extern DECLARE_ALIGNED(16, const uint8_t,
vp9_prev_token_class[MAX_ENTROPY_TOKENS]);
struct VP9Common;
void vp9_default_coef_probs(struct VP9Common *);
extern DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d[16]);
extern DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_4x4[16]);
extern DECLARE_ALIGNED(16, const int, vp9_col_scan[16]);
extern DECLARE_ALIGNED(16, const int, vp9_row_scan[16]);
extern DECLARE_ALIGNED(16, const int, vp9_col_scan_4x4[16]);
extern DECLARE_ALIGNED(16, const int, vp9_row_scan_4x4[16]);
extern DECLARE_ALIGNED(64, const int, vp9_default_zig_zag1d_8x8[64]);
void vp9_coef_tree_initialize(void);
extern DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_16x16[256]);
extern DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_32x32[1024]);
void vp9_coef_tree_initialize(void);
void vp9_adapt_coef_probs(struct VP9Common *);
#endif
static void vp9_reset_mb_tokens_context(MACROBLOCKD* const xd) {
/* Clear entropy contexts */
vpx_memset(xd->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
}
#if CONFIG_NEWCOEFCONTEXT
#define MAX_NEIGHBORS 5
#define NEWCOEFCONTEXT_BAND_COND(b) ((b) >= 1)
void vp9_init_neighbors(void);
const int *vp9_get_coef_neighbors_handle(const int *scan);
int vp9_get_coef_neighbor_context(const short int *qcoeff_ptr, int nodc,
const int *neigbor_handle, int rc);
extern DECLARE_ALIGNED(16, int, vp9_default_zig_zag1d_4x4_neighbors[
16 * MAX_NEIGHBORS]);
extern DECLARE_ALIGNED(16, int, vp9_row_scan_4x4_neighbors[
16 * MAX_NEIGHBORS]);
extern DECLARE_ALIGNED(16, int, vp9_col_scan_4x4_neighbors[
16 * MAX_NEIGHBORS]);
extern DECLARE_ALIGNED(16, int, vp9_default_zig_zag1d_8x8_neighbors[
64 * MAX_NEIGHBORS]);
extern DECLARE_ALIGNED(16, int, vp9_default_zig_zag1d_16x16_neighbors[
256 * MAX_NEIGHBORS]);
extern DECLARE_ALIGNED(16, int, vp9_default_zig_zag1d_32x32_neighbors[
1024 * MAX_NEIGHBORS]);
#endif // CONFIG_NEWCOEFCONTEXT
#endif // VP9_COMMON_VP9_ENTROPY_H_

View File

@ -272,13 +272,11 @@ const vp9_tree_index vp9_mv_ref_tree[8] = {
-NEWMV, -SPLITMV
};
#if CONFIG_SUPERBLOCKS
const vp9_tree_index vp9_sb_mv_ref_tree[6] = {
-ZEROMV, 2,
-NEARESTMV, 4,
-NEARMV, -NEWMV
};
#endif
const vp9_tree_index vp9_sub_mv_ref_tree[6] = {
-LEFT4X4, 2,
@ -289,19 +287,15 @@ const vp9_tree_index vp9_sub_mv_ref_tree[6] = {
struct vp9_token_struct vp9_bmode_encodings[VP9_NKF_BINTRAMODES];
struct vp9_token_struct vp9_kf_bmode_encodings[VP9_KF_BINTRAMODES];
struct vp9_token_struct vp9_ymode_encodings[VP9_YMODES];
#if CONFIG_SUPERBLOCKS
struct vp9_token_struct vp9_sb_ymode_encodings[VP9_I32X32_MODES];
struct vp9_token_struct vp9_sb_kf_ymode_encodings[VP9_I32X32_MODES];
#endif
struct vp9_token_struct vp9_kf_ymode_encodings[VP9_YMODES];
struct vp9_token_struct vp9_uv_mode_encodings[VP9_UV_MODES];
struct vp9_token_struct vp9_i8x8_mode_encodings[VP9_I8X8_MODES];
struct vp9_token_struct vp9_mbsplit_encodings[VP9_NUMMBSPLITS];
struct vp9_token_struct vp9_mv_ref_encoding_array[VP9_MVREFS];
#if CONFIG_SUPERBLOCKS
struct vp9_token_struct vp9_sb_mv_ref_encoding_array[VP9_MVREFS];
#endif
struct vp9_token_struct vp9_sub_mv_ref_encoding_array[VP9_SUBMVREFS];
void vp9_init_mbmode_probs(VP9_COMMON *x) {
@ -309,25 +303,21 @@ void vp9_init_mbmode_probs(VP9_COMMON *x) {
vp9_tree_probs_from_distribution(VP9_YMODES, vp9_ymode_encodings,
vp9_ymode_tree, x->fc.ymode_prob,
bct, y_mode_cts, 256, 1);
#if CONFIG_SUPERBLOCKS
bct, y_mode_cts);
vp9_tree_probs_from_distribution(VP9_I32X32_MODES, vp9_sb_ymode_encodings,
vp9_sb_ymode_tree, x->fc.sb_ymode_prob,
bct, y_mode_cts, 256, 1);
#endif
bct, y_mode_cts);
{
int i;
for (i = 0; i < 8; i++) {
vp9_tree_probs_from_distribution(VP9_YMODES, vp9_kf_ymode_encodings,
vp9_kf_ymode_tree, x->kf_ymode_prob[i],
bct, kf_y_mode_cts[i], 256, 1);
#if CONFIG_SUPERBLOCKS
bct, kf_y_mode_cts[i]);
vp9_tree_probs_from_distribution(VP9_I32X32_MODES,
vp9_sb_kf_ymode_encodings,
vp9_sb_kf_ymode_tree,
x->sb_kf_ymode_prob[i], bct,
kf_y_mode_cts[i], 256, 1);
#endif
kf_y_mode_cts[i]);
}
}
{
@ -335,16 +325,16 @@ void vp9_init_mbmode_probs(VP9_COMMON *x) {
for (i = 0; i < VP9_YMODES; i++) {
vp9_tree_probs_from_distribution(VP9_UV_MODES, vp9_uv_mode_encodings,
vp9_uv_mode_tree, x->kf_uv_mode_prob[i],
bct, kf_uv_mode_cts[i], 256, 1);
bct, kf_uv_mode_cts[i]);
vp9_tree_probs_from_distribution(VP9_UV_MODES, vp9_uv_mode_encodings,
vp9_uv_mode_tree, x->fc.uv_mode_prob[i],
bct, uv_mode_cts[i], 256, 1);
bct, uv_mode_cts[i]);
}
}
vp9_tree_probs_from_distribution(VP9_I8X8_MODES, vp9_i8x8_mode_encodings,
vp9_i8x8_mode_tree, x->fc.i8x8_mode_prob,
bct, i8x8_mode_cts, 256, 1);
bct, i8x8_mode_cts);
vpx_memcpy(x->fc.sub_mv_ref_prob, vp9_sub_mv_ref_prob2,
sizeof(vp9_sub_mv_ref_prob2));
@ -362,7 +352,7 @@ static void intra_bmode_probs_from_distribution(
unsigned int branch_ct[VP9_NKF_BINTRAMODES - 1][2],
const unsigned int events[VP9_NKF_BINTRAMODES]) {
vp9_tree_probs_from_distribution(VP9_NKF_BINTRAMODES, vp9_bmode_encodings,
vp9_bmode_tree, p, branch_ct, events, 256, 1);
vp9_bmode_tree, p, branch_ct, events);
}
void vp9_default_bmode_probs(vp9_prob p[VP9_NKF_BINTRAMODES - 1]) {
@ -375,7 +365,7 @@ static void intra_kf_bmode_probs_from_distribution(
unsigned int branch_ct[VP9_KF_BINTRAMODES - 1][2],
const unsigned int events[VP9_KF_BINTRAMODES]) {
vp9_tree_probs_from_distribution(VP9_KF_BINTRAMODES, vp9_kf_bmode_encodings,
vp9_kf_bmode_tree, p, branch_ct, events, 256, 1);
vp9_kf_bmode_tree, p, branch_ct, events);
}
void vp9_kf_default_bmode_probs(vp9_prob p[VP9_KF_BINTRAMODES]
@ -398,9 +388,15 @@ const vp9_tree_index vp9_switchable_interp_tree[VP9_SWITCHABLE_FILTERS*2-2] = {
-1, -2
};
struct vp9_token_struct vp9_switchable_interp_encodings[VP9_SWITCHABLE_FILTERS];
#if CONFIG_ENABLE_6TAP
const INTERPOLATIONFILTERTYPE vp9_switchable_interp[VP9_SWITCHABLE_FILTERS] = {
EIGHTTAP, SIXTAP, EIGHTTAP_SHARP};
const int vp9_switchable_interp_map[SWITCHABLE+1] = {1, -1, 0, 2, -1};
SIXTAP, EIGHTTAP, EIGHTTAP_SHARP};
const int vp9_switchable_interp_map[SWITCHABLE+1] = {0, -1, 1, 2, -1, -1};
#else
const INTERPOLATIONFILTERTYPE vp9_switchable_interp[VP9_SWITCHABLE_FILTERS] = {
EIGHTTAP, EIGHTTAP_SMOOTH, EIGHTTAP_SHARP};
const int vp9_switchable_interp_map[SWITCHABLE+1] = {1, 0, 2, -1, -1};
#endif
const vp9_prob vp9_switchable_interp_prob [VP9_SWITCHABLE_FILTERS+1]
[VP9_SWITCHABLE_FILTERS-1] = {
{248, 192}, { 32, 248}, { 32, 32}, {192, 160}
@ -418,7 +414,11 @@ const vp9_prob vp9_switchable_interp_prob [VP9_SWITCHABLE_FILTERS+1]
};
const INTERPOLATIONFILTERTYPE vp9_switchable_interp[VP9_SWITCHABLE_FILTERS] = {
EIGHTTAP, EIGHTTAP_SHARP};
const int vp9_switchable_interp_map[SWITCHABLE+1] = {-1, -1, 0, 1, -1}; //8, 8s
#if CONFIG_ENABLE_6TAP
const int vp9_switchable_interp_map[SWITCHABLE+1] = {-1, -1, 0, 1, -1, -1};
#else
const int vp9_switchable_interp_map[SWITCHABLE+1] = {-1, 0, 1, -1, -1};
#endif
#endif
void vp9_entropy_mode_init() {
@ -426,10 +426,8 @@ void vp9_entropy_mode_init() {
vp9_tokens_from_tree(vp9_bmode_encodings, vp9_bmode_tree);
vp9_tokens_from_tree(vp9_ymode_encodings, vp9_ymode_tree);
vp9_tokens_from_tree(vp9_kf_ymode_encodings, vp9_kf_ymode_tree);
#if CONFIG_SUPERBLOCKS
vp9_tokens_from_tree(vp9_sb_ymode_encodings, vp9_sb_ymode_tree);
vp9_tokens_from_tree(vp9_sb_kf_ymode_encodings, vp9_sb_kf_ymode_tree);
#endif
vp9_tokens_from_tree(vp9_uv_mode_encodings, vp9_uv_mode_tree);
vp9_tokens_from_tree(vp9_i8x8_mode_encodings, vp9_i8x8_mode_tree);
vp9_tokens_from_tree(vp9_mbsplit_encodings, vp9_mbsplit_tree);
@ -438,10 +436,8 @@ void vp9_entropy_mode_init() {
vp9_tokens_from_tree_offset(vp9_mv_ref_encoding_array,
vp9_mv_ref_tree, NEARESTMV);
#if CONFIG_SUPERBLOCKS
vp9_tokens_from_tree_offset(vp9_sb_mv_ref_encoding_array,
vp9_sb_mv_ref_tree, NEARESTMV);
#endif
vp9_tokens_from_tree_offset(vp9_sub_mv_ref_encoding_array,
vp9_sub_mv_ref_tree, LEFT4X4);
}
@ -495,17 +491,14 @@ void vp9_update_mode_context(VP9_COMMON *pc) {
for (j = 0; j < INTER_MODE_CONTEXTS; j++) {
for (i = 0; i < 4; i++) {
int this_prob;
int count = mv_ref_ct[j][i][0] + mv_ref_ct[j][i][1];
int factor;
{
this_prob = count > 0 ? 256 * mv_ref_ct[j][i][0] / count : 128;
count = count > MVREF_COUNT_SAT ? MVREF_COUNT_SAT : count;
factor = (MVREF_MAX_UPDATE_FACTOR * count / MVREF_COUNT_SAT);
this_prob = (pc->fc.vp9_mode_contexts[j][i] * (256 - factor) +
this_prob * factor + 128) >> 8;
mode_context[j][i] = clip_prob(this_prob);
}
int count = mv_ref_ct[j][i][0] + mv_ref_ct[j][i][1], factor;
count = count > MVREF_COUNT_SAT ? MVREF_COUNT_SAT : count;
factor = (MVREF_MAX_UPDATE_FACTOR * count / MVREF_COUNT_SAT);
mode_context[j][i] = weighted_prob(pc->fc.vp9_mode_contexts[j][i],
get_binary_prob(mv_ref_ct[j][i][0],
mv_ref_ct[j][i][1]),
factor);
}
}
}
@ -531,25 +524,33 @@ void print_mode_contexts(VP9_COMMON *pc) {
}
#endif
// #define MODE_COUNT_TESTING
#define MODE_COUNT_SAT 20
#define MODE_MAX_UPDATE_FACTOR 144
static void update_mode_probs(int n_modes, struct vp9_token_struct *encoding,
const vp9_tree_index *tree, unsigned int *cnt,
vp9_prob *pre_probs, vp9_prob *dst_probs) {
#define MAX_PROBS 32
vp9_prob probs[MAX_PROBS];
unsigned int branch_ct[MAX_PROBS][2];
int t, count, factor;
assert(n_modes - 1 < MAX_PROBS);
vp9_tree_probs_from_distribution(n_modes, encoding, tree, probs,
branch_ct, cnt);
for (t = 0; t < n_modes - 1; ++t) {
count = branch_ct[t][0] + branch_ct[t][1];
count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
dst_probs[t] = weighted_prob(pre_probs[t], probs[t], factor);
}
}
// #define MODE_COUNT_TESTING
void vp9_adapt_mode_probs(VP9_COMMON *cm) {
int i, t, count, factor;
unsigned int branch_ct[32][2];
vp9_prob ymode_probs[VP9_YMODES - 1];
#if CONFIG_SUPERBLOCKS
vp9_prob sb_ymode_probs[VP9_I32X32_MODES - 1];
#endif
vp9_prob uvmode_probs[VP9_UV_MODES - 1];
vp9_prob bmode_probs[VP9_NKF_BINTRAMODES - 1];
vp9_prob i8x8_mode_probs[VP9_I8X8_MODES - 1];
vp9_prob sub_mv_ref_probs[VP9_SUBMVREFS - 1];
vp9_prob mbsplit_probs[VP9_NUMMBSPLITS - 1];
#if CONFIG_COMP_INTERINTRA_PRED
vp9_prob interintra_prob;
#endif
int i;
#ifdef MODE_COUNT_TESTING
int t;
printf("static const unsigned int\nymode_counts"
"[VP9_YMODES] = {\n");
for (t = 0; t < VP9_YMODES; ++t) printf("%d, ", cm->fc.ymode_counts[t]);
@ -590,116 +591,43 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) {
printf("};\n");
#endif
#endif
vp9_tree_probs_from_distribution(
VP9_YMODES, vp9_ymode_encodings, vp9_ymode_tree,
ymode_probs, branch_ct, cm->fc.ymode_counts,
256, 1);
for (t = 0; t < VP9_YMODES - 1; ++t) {
int prob;
count = branch_ct[t][0] + branch_ct[t][1];
count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
prob = ((int)cm->fc.pre_ymode_prob[t] * (256 - factor) +
(int)ymode_probs[t] * factor + 128) >> 8;
cm->fc.ymode_prob[t] = clip_prob(prob);
}
#if CONFIG_SUPERBLOCKS
vp9_tree_probs_from_distribution(VP9_I32X32_MODES,
vp9_sb_ymode_encodings, vp9_sb_ymode_tree,
sb_ymode_probs, branch_ct,
cm->fc.sb_ymode_counts,
256, 1);
for (t = 0; t < VP9_I32X32_MODES - 1; ++t) {
int prob;
count = branch_ct[t][0] + branch_ct[t][1];
count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
prob = ((int)cm->fc.pre_sb_ymode_prob[t] * (256 - factor) +
(int)sb_ymode_probs[t] * factor + 128) >> 8;
cm->fc.sb_ymode_prob[t] = clip_prob(prob);
}
#endif
update_mode_probs(VP9_YMODES, vp9_ymode_encodings, vp9_ymode_tree,
cm->fc.ymode_counts, cm->fc.pre_ymode_prob,
cm->fc.ymode_prob);
update_mode_probs(VP9_I32X32_MODES, vp9_sb_ymode_encodings, vp9_sb_ymode_tree,
cm->fc.sb_ymode_counts, cm->fc.pre_sb_ymode_prob,
cm->fc.sb_ymode_prob);
for (i = 0; i < VP9_YMODES; ++i) {
vp9_tree_probs_from_distribution(VP9_UV_MODES, vp9_uv_mode_encodings,
vp9_uv_mode_tree, uvmode_probs, branch_ct,
cm->fc.uv_mode_counts[i], 256, 1);
for (t = 0; t < VP9_UV_MODES - 1; ++t) {
int prob;
count = branch_ct[t][0] + branch_ct[t][1];
count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
prob = ((int)cm->fc.pre_uv_mode_prob[i][t] * (256 - factor) +
(int)uvmode_probs[t] * factor + 128) >> 8;
cm->fc.uv_mode_prob[i][t] = clip_prob(prob);
}
}
vp9_tree_probs_from_distribution(VP9_NKF_BINTRAMODES, vp9_bmode_encodings,
vp9_bmode_tree, bmode_probs, branch_ct,
cm->fc.bmode_counts, 256, 1);
for (t = 0; t < VP9_NKF_BINTRAMODES - 1; ++t) {
int prob;
count = branch_ct[t][0] + branch_ct[t][1];
count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
prob = ((int)cm->fc.pre_bmode_prob[t] * (256 - factor) +
(int)bmode_probs[t] * factor + 128) >> 8;
cm->fc.bmode_prob[t] = clip_prob(prob);
}
vp9_tree_probs_from_distribution(VP9_I8X8_MODES, vp9_i8x8_mode_encodings,
vp9_i8x8_mode_tree, i8x8_mode_probs,
branch_ct, cm->fc.i8x8_mode_counts, 256, 1);
for (t = 0; t < VP9_I8X8_MODES - 1; ++t) {
int prob;
count = branch_ct[t][0] + branch_ct[t][1];
count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
prob = ((int)cm->fc.pre_i8x8_mode_prob[t] * (256 - factor) +
(int)i8x8_mode_probs[t] * factor + 128) >> 8;
cm->fc.i8x8_mode_prob[t] = clip_prob(prob);
update_mode_probs(VP9_UV_MODES, vp9_uv_mode_encodings, vp9_uv_mode_tree,
cm->fc.uv_mode_counts[i], cm->fc.pre_uv_mode_prob[i],
cm->fc.uv_mode_prob[i]);
}
update_mode_probs(VP9_NKF_BINTRAMODES, vp9_bmode_encodings, vp9_bmode_tree,
cm->fc.bmode_counts, cm->fc.pre_bmode_prob,
cm->fc.bmode_prob);
update_mode_probs(VP9_I8X8_MODES, vp9_i8x8_mode_encodings,
vp9_i8x8_mode_tree, cm->fc.i8x8_mode_counts,
cm->fc.pre_i8x8_mode_prob, cm->fc.i8x8_mode_prob);
for (i = 0; i < SUBMVREF_COUNT; ++i) {
vp9_tree_probs_from_distribution(VP9_SUBMVREFS,
vp9_sub_mv_ref_encoding_array,
vp9_sub_mv_ref_tree, sub_mv_ref_probs,
branch_ct, cm->fc.sub_mv_ref_counts[i],
256, 1);
for (t = 0; t < VP9_SUBMVREFS - 1; ++t) {
int prob;
count = branch_ct[t][0] + branch_ct[t][1];
count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
prob = ((int)cm->fc.pre_sub_mv_ref_prob[i][t] * (256 - factor) +
(int)sub_mv_ref_probs[t] * factor + 128) >> 8;
cm->fc.sub_mv_ref_prob[i][t] = clip_prob(prob);
}
}
vp9_tree_probs_from_distribution(VP9_NUMMBSPLITS, vp9_mbsplit_encodings,
vp9_mbsplit_tree, mbsplit_probs, branch_ct,
cm->fc.mbsplit_counts, 256, 1);
for (t = 0; t < VP9_NUMMBSPLITS - 1; ++t) {
int prob;
count = branch_ct[t][0] + branch_ct[t][1];
count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
prob = ((int)cm->fc.pre_mbsplit_prob[t] * (256 - factor) +
(int)mbsplit_probs[t] * factor + 128) >> 8;
cm->fc.mbsplit_prob[t] = clip_prob(prob);
update_mode_probs(VP9_SUBMVREFS, vp9_sub_mv_ref_encoding_array,
vp9_sub_mv_ref_tree, cm->fc.sub_mv_ref_counts[i],
cm->fc.pre_sub_mv_ref_prob[i], cm->fc.sub_mv_ref_prob[i]);
}
update_mode_probs(VP9_NUMMBSPLITS, vp9_mbsplit_encodings, vp9_mbsplit_tree,
cm->fc.mbsplit_counts, cm->fc.pre_mbsplit_prob,
cm->fc.mbsplit_prob);
#if CONFIG_COMP_INTERINTRA_PRED
if (cm->use_interintra) {
int prob;
interintra_prob = vp9_bin_prob_from_distribution(cm->fc.interintra_counts);
int factor, interintra_prob, count;
interintra_prob = get_binary_prob(cm->fc.interintra_counts[0],
cm->fc.interintra_counts[1]);
count = cm->fc.interintra_counts[0] + cm->fc.interintra_counts[1];
count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
prob = ((int)cm->fc.pre_interintra_prob * (256 - factor) +
(int)interintra_prob * factor + 128) >> 8;
if (prob <= 0)
cm->fc.interintra_prob = 1;
else if (prob > 255)
cm->fc.interintra_prob = 255;
else
cm->fc.interintra_prob = prob;
cm->fc.interintra_prob = weighted_prob(cm->fc.pre_interintra_prob,
interintra_prob, factor);
}
#endif
}

View File

@ -8,7 +8,6 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP9_COMMON_VP9_ENTROPYMODE_H_
#define VP9_COMMON_VP9_ENTROPYMODE_H_
@ -17,9 +16,6 @@
#define SUBMVREF_COUNT 5
#define VP9_NUMMBSPLITS 4
#if CONFIG_COMP_INTRA_PRED
#define DEFAULT_COMP_INTRA_PROB 32
#endif
#if CONFIG_COMP_INTERINTRA_PRED
#define VP9_DEF_INTERINTRA_PROB 248
@ -98,7 +94,7 @@ void vp9_kf_default_bmode_probs(vp9_prob dest[VP9_KF_BINTRAMODES]
void vp9_adapt_mode_probs(struct VP9Common *);
#define VP9_SWITCHABLE_FILTERS 2 /* number of switchable filters */
#define VP9_SWITCHABLE_FILTERS 3 /* number of switchable filters */
extern const INTERPOLATIONFILTERTYPE vp9_switchable_interp
[VP9_SWITCHABLE_FILTERS];
@ -114,4 +110,4 @@ extern struct vp9_token_struct vp9_switchable_interp_encodings
extern const vp9_prob vp9_switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1]
[VP9_SWITCHABLE_FILTERS - 1];
#endif
#endif // VP9_COMMON_VP9_ENTROPYMODE_H_

View File

@ -213,16 +213,12 @@ void vp9_increment_nmv(const MV *mv, const MV *ref, nmv_context_counts *mvctx,
static void adapt_prob(vp9_prob *dest, vp9_prob prep, vp9_prob newp,
unsigned int ct[2]) {
int factor;
int prob;
int count = ct[0] + ct[1];
if (count) {
count = count > MV_COUNT_SAT ? MV_COUNT_SAT : count;
factor = (MV_MAX_UPDATE_FACTOR * count / MV_COUNT_SAT);
prob = ((int)prep * (256 - factor) + (int)(newp) * factor + 128) >> 8;
prob += !prob;
prob = (prob > 255 ? 255 : prob);
*dest = prob;
*dest = weighted_prob(prep, newp,
MV_MAX_UPDATE_FACTOR * count / MV_COUNT_SAT);
}
}
@ -251,11 +247,10 @@ void vp9_counts_to_nmv_context(
vp9_mv_joint_tree,
prob->joints,
branch_ct_joint,
NMVcount->joints,
256, 1);
NMVcount->joints);
for (i = 0; i < 2; ++i) {
prob->comps[i].sign =
vp9_bin_prob_from_distribution(NMVcount->comps[i].sign);
prob->comps[i].sign = get_binary_prob(NMVcount->comps[i].sign[0],
NMVcount->comps[i].sign[1]);
branch_ct_sign[i][0] = NMVcount->comps[i].sign[0];
branch_ct_sign[i][1] = NMVcount->comps[i].sign[1];
vp9_tree_probs_from_distribution(MV_CLASSES,
@ -263,18 +258,16 @@ void vp9_counts_to_nmv_context(
vp9_mv_class_tree,
prob->comps[i].classes,
branch_ct_classes[i],
NMVcount->comps[i].classes,
256, 1);
NMVcount->comps[i].classes);
vp9_tree_probs_from_distribution(CLASS0_SIZE,
vp9_mv_class0_encodings,
vp9_mv_class0_tree,
prob->comps[i].class0,
branch_ct_class0[i],
NMVcount->comps[i].class0,
256, 1);
NMVcount->comps[i].class0);
for (j = 0; j < MV_OFFSET_BITS; ++j) {
prob->comps[i].bits[j] = vp9_bin_prob_from_distribution(
NMVcount->comps[i].bits[j]);
prob->comps[i].bits[j] = get_binary_prob(NMVcount->comps[i].bits[j][0],
NMVcount->comps[i].bits[j][1]);
branch_ct_bits[i][j][0] = NMVcount->comps[i].bits[j][0];
branch_ct_bits[i][j][1] = NMVcount->comps[i].bits[j][1];
}
@ -286,26 +279,25 @@ void vp9_counts_to_nmv_context(
vp9_mv_fp_tree,
prob->comps[i].class0_fp[k],
branch_ct_class0_fp[i][k],
NMVcount->comps[i].class0_fp[k],
256, 1);
NMVcount->comps[i].class0_fp[k]);
}
vp9_tree_probs_from_distribution(4,
vp9_mv_fp_encodings,
vp9_mv_fp_tree,
prob->comps[i].fp,
branch_ct_fp[i],
NMVcount->comps[i].fp,
256, 1);
NMVcount->comps[i].fp);
}
if (usehp) {
for (i = 0; i < 2; ++i) {
prob->comps[i].class0_hp = vp9_bin_prob_from_distribution(
NMVcount->comps[i].class0_hp);
prob->comps[i].class0_hp =
get_binary_prob(NMVcount->comps[i].class0_hp[0],
NMVcount->comps[i].class0_hp[1]);
branch_ct_class0_hp[i][0] = NMVcount->comps[i].class0_hp[0];
branch_ct_class0_hp[i][1] = NMVcount->comps[i].class0_hp[1];
prob->comps[i].hp =
vp9_bin_prob_from_distribution(NMVcount->comps[i].hp);
prob->comps[i].hp = get_binary_prob(NMVcount->comps[i].hp[0],
NMVcount->comps[i].hp[1]);
branch_ct_hp[i][0] = NMVcount->comps[i].hp[0];
branch_ct_hp[i][1] = NMVcount->comps[i].hp[1];
}

View File

@ -25,6 +25,13 @@ void vp9_adapt_nmv_probs(struct VP9Common *cm, int usehp);
int vp9_use_nmv_hp(const MV *ref);
#define VP9_NMV_UPDATE_PROB 255
#if CONFIG_NEW_MVREF
#define VP9_MVREF_UPDATE_PROB 252
#define VP9_DEFAULT_MV_REF_PROB 192
#define VP9_MV_REF_UPDATE_COST (14 << 8)
#endif
//#define MV_GROUP_UPDATE
#define LOW_PRECISION_MV_UPDATE /* Use 7 bit forward update */
@ -126,4 +133,5 @@ void vp9_counts_to_nmv_context(
unsigned int (*branch_ct_class0_hp)[2],
unsigned int (*branch_ct_hp)[2]);
void vp9_counts_process(nmv_context_counts *NMVcount, int usehp);
#endif
#endif // VP9_COMMON_VP9_ENTROPYMV_H_

View File

@ -11,9 +11,9 @@
#include "vp9/common/vp9_extend.h"
#include "vpx_mem/vpx_mem.h"
static void copy_and_extend_plane(unsigned char *s, /* source */
static void copy_and_extend_plane(uint8_t *s, /* source */
int sp, /* source pitch */
unsigned char *d, /* destination */
uint8_t *d, /* destination */
int dp, /* destination pitch */
int h, /* height */
int w, /* width */
@ -22,8 +22,8 @@ static void copy_and_extend_plane(unsigned char *s, /* source */
int eb, /* extend bottom border */
int er) { /* extend right border */
int i;
unsigned char *src_ptr1, *src_ptr2;
unsigned char *dest_ptr1, *dest_ptr2;
uint8_t *src_ptr1, *src_ptr2;
uint8_t *dest_ptr1, *dest_ptr2;
int linesize;
/* copy the left and right most columns out */
@ -143,8 +143,8 @@ void vp9_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src,
}
/* note the extension is only for the last row, for intra prediction purpose */
void vp9_extend_mb_row(YV12_BUFFER_CONFIG *ybf, unsigned char *YPtr,
unsigned char *UPtr, unsigned char *VPtr) {
void vp9_extend_mb_row(YV12_BUFFER_CONFIG *ybf, uint8_t *YPtr,
uint8_t *UPtr, uint8_t *VPtr) {
int i;
YPtr += ybf->y_stride * 14;

View File

@ -12,9 +12,10 @@
#define VP9_COMMON_VP9_EXTEND_H_
#include "vpx_scale/yv12config.h"
#include "vpx/vpx_integer.h"
void vp9_extend_mb_row(YV12_BUFFER_CONFIG *ybf, unsigned char *YPtr,
unsigned char *UPtr, unsigned char *VPtr);
void vp9_extend_mb_row(YV12_BUFFER_CONFIG *ybf, uint8_t *YPtr,
uint8_t *UPtr, uint8_t *VPtr);
void vp9_copy_and_extend_frame(YV12_BUFFER_CONFIG *src,
YV12_BUFFER_CONFIG *dst);
@ -24,4 +25,4 @@ void vp9_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src,
int srcy, int srcx,
int srch, int srcw);
#endif // __INC_EXTEND_H
#endif // VP9_COMMON_VP9_EXTEND_H_

File diff suppressed because it is too large Load Diff

View File

@ -13,6 +13,7 @@
#include "vpx_config.h"
#include "vpx_scale/yv12config.h"
#include "vpx/vpx_integer.h"
#define BLOCK_HEIGHT_WIDTH 4
#define VP9_FILTER_WEIGHT 128
@ -20,9 +21,10 @@
#define SUBPEL_SHIFTS 16
extern const short vp9_bilinear_filters[SUBPEL_SHIFTS][2];
extern const short vp9_sub_pel_filters_6[SUBPEL_SHIFTS][6];
extern const short vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8];
extern const short vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8];
extern const int16_t vp9_bilinear_filters[SUBPEL_SHIFTS][2];
extern const int16_t vp9_sub_pel_filters_6[SUBPEL_SHIFTS][6];
extern const int16_t vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8];
extern const int16_t vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8];
extern const int16_t vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS][8];
#endif // FILTER_H
#endif // VP9_COMMON_VP9_FILTER_H_

View File

@ -14,7 +14,7 @@
#include "vp9/common/vp9_subpelvar.h"
#include <limits.h>
const unsigned char vp9_mbsplit_offset[4][16] = {
const uint8_t vp9_mbsplit_offset[4][16] = {
{ 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{ 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{ 0, 2, 8, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
@ -42,23 +42,23 @@ vp9_prob *vp9_mv_ref_probs(VP9_COMMON *pc,
}
#define SP(x) (((x) & 7) << 1)
unsigned int vp9_sad3x16_c(const unsigned char *src_ptr,
unsigned int vp9_sad3x16_c(const uint8_t *src_ptr,
int src_stride,
const unsigned char *ref_ptr,
const uint8_t *ref_ptr,
int ref_stride) {
return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 3, 16);
}
unsigned int vp9_sad16x3_c(const unsigned char *src_ptr,
unsigned int vp9_sad16x3_c(const uint8_t *src_ptr,
int src_stride,
const unsigned char *ref_ptr,
const uint8_t *ref_ptr,
int ref_stride) {
return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 3);
}
#if CONFIG_SUBPELREFMV
unsigned int vp9_variance2x16_c(const unsigned char *src_ptr,
unsigned int vp9_variance2x16_c(const uint8_t *src_ptr,
const int source_stride,
const unsigned char *ref_ptr,
const uint8_t *ref_ptr,
const int recon_stride,
unsigned int *sse) {
int sum;
@ -66,9 +66,9 @@ unsigned int vp9_variance2x16_c(const unsigned char *src_ptr,
return (*sse - (((unsigned int)sum * sum) >> 5));
}
unsigned int vp9_variance16x2_c(const unsigned char *src_ptr,
unsigned int vp9_variance16x2_c(const uint8_t *src_ptr,
const int source_stride,
const unsigned char *ref_ptr,
const uint8_t *ref_ptr,
const int recon_stride,
unsigned int *sse) {
int sum;
@ -76,16 +76,16 @@ unsigned int vp9_variance16x2_c(const unsigned char *src_ptr,
return (*sse - (((unsigned int)sum * sum) >> 5));
}
unsigned int vp9_sub_pixel_variance16x2_c(const unsigned char *src_ptr,
unsigned int vp9_sub_pixel_variance16x2_c(const uint8_t *src_ptr,
const int src_pixels_per_line,
const int xoffset,
const int yoffset,
const unsigned char *dst_ptr,
const uint8_t *dst_ptr,
const int dst_pixels_per_line,
unsigned int *sse) {
unsigned short FData3[16 * 3]; // Temp data buffer used in filtering
unsigned char temp2[2 * 16];
const short *HFilter, *VFilter;
uint16_t FData3[16 * 3]; // Temp data buffer used in filtering
uint8_t temp2[2 * 16];
const int16_t *HFilter, *VFilter;
HFilter = vp9_bilinear_filters[xoffset];
VFilter = vp9_bilinear_filters[yoffset];
@ -97,16 +97,16 @@ unsigned int vp9_sub_pixel_variance16x2_c(const unsigned char *src_ptr,
return vp9_variance16x2_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
}
unsigned int vp9_sub_pixel_variance2x16_c(const unsigned char *src_ptr,
unsigned int vp9_sub_pixel_variance2x16_c(const uint8_t *src_ptr,
const int src_pixels_per_line,
const int xoffset,
const int yoffset,
const unsigned char *dst_ptr,
const uint8_t *dst_ptr,
const int dst_pixels_per_line,
unsigned int *sse) {
unsigned short FData3[2 * 17]; // Temp data buffer used in filtering
unsigned char temp2[2 * 16];
const short *HFilter, *VFilter;
uint16_t FData3[2 * 17]; // Temp data buffer used in filtering
uint8_t temp2[2 * 16];
const int16_t *HFilter, *VFilter;
HFilter = vp9_bilinear_filters[xoffset];
VFilter = vp9_bilinear_filters[yoffset];
@ -117,51 +117,46 @@ unsigned int vp9_sub_pixel_variance2x16_c(const unsigned char *src_ptr,
return vp9_variance2x16_c(temp2, 2, dst_ptr, dst_pixels_per_line, sse);
}
#endif
/* check a list of motion vectors by sad score using a number rows of pixels
* above and a number cols of pixels in the left to select the one with best
* score to use as ref motion vector
*/
void vp9_find_best_ref_mvs(MACROBLOCKD *xd,
unsigned char *ref_y_buffer,
uint8_t *ref_y_buffer,
int ref_y_stride,
int_mv *mvlist,
int_mv *best_mv,
int_mv *nearest,
int_mv *near) {
int i, j;
unsigned char *above_src;
unsigned char *left_src;
unsigned char *above_ref;
unsigned char *left_ref;
unsigned int score;
#if CONFIG_SUBPELREFMV
unsigned int sse;
uint8_t *above_src;
uint8_t *above_ref;
#if !CONFIG_ABOVESPREFMV
uint8_t *left_src;
uint8_t *left_ref;
#endif
unsigned int ref_scores[MAX_MV_REFS] = {0};
int_mv sorted_mvs[MAX_MV_REFS];
unsigned int score;
unsigned int sse;
unsigned int ref_scores[MAX_MV_REF_CANDIDATES] = {0};
int_mv sorted_mvs[MAX_MV_REF_CANDIDATES];
int zero_seen = FALSE;
// Default all to 0,0 if nothing else available
best_mv->as_int = nearest->as_int = near->as_int = 0;
nearest->as_int = near->as_int = 0;
vpx_memset(sorted_mvs, 0, sizeof(sorted_mvs));
#if CONFIG_SUBPELREFMV
above_src = xd->dst.y_buffer - xd->dst.y_stride * 2;
left_src = xd->dst.y_buffer - 2;
above_ref = ref_y_buffer - ref_y_stride * 2;
left_ref = ref_y_buffer - 2;
#if CONFIG_ABOVESPREFMV
above_src -= 4;
above_ref -= 4;
#else
above_src = xd->dst.y_buffer - xd->dst.y_stride * 3;
left_src = xd->dst.y_buffer - 3;
above_ref = ref_y_buffer - ref_y_stride * 3;
left_ref = ref_y_buffer - 3;
left_src = xd->dst.y_buffer - 2;
left_ref = ref_y_buffer - 2;
#endif
//for(i = 0; i < MAX_MV_REFS; ++i) {
// Limit search to the predicted best 4
for(i = 0; i < 4; ++i) {
// Limit search to the predicted best few candidates
for(i = 0; i < MAX_MV_REF_CANDIDATES; ++i) {
int_mv this_mv;
int offset = 0;
int row_offset, col_offset;
@ -175,42 +170,61 @@ void vp9_find_best_ref_mvs(MACROBLOCKD *xd,
zero_seen = zero_seen || !this_mv.as_int;
#if !CONFIG_ABOVESPREFMV
clamp_mv(&this_mv,
xd->mb_to_left_edge - LEFT_TOP_MARGIN + 24,
xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN,
xd->mb_to_top_edge - LEFT_TOP_MARGIN + 24,
xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
#else
clamp_mv(&this_mv,
xd->mb_to_left_edge - LEFT_TOP_MARGIN + 32,
xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN,
xd->mb_to_top_edge - LEFT_TOP_MARGIN + 24,
xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
#endif
#if CONFIG_SUBPELREFMV
row_offset = this_mv.as_mv.row >> 3;
col_offset = this_mv.as_mv.col >> 3;
offset = ref_y_stride * row_offset + col_offset;
score = 0;
if (xd->up_available) {
vp9_sub_pixel_variance16x2_c(above_ref + offset, ref_y_stride,
vp9_sub_pixel_variance16x2(above_ref + offset, ref_y_stride,
SP(this_mv.as_mv.col),
SP(this_mv.as_mv.row),
above_src, xd->dst.y_stride, &sse);
score += sse;
if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB32X32) {
vp9_sub_pixel_variance16x2(above_ref + offset + 16,
ref_y_stride,
SP(this_mv.as_mv.col),
SP(this_mv.as_mv.row),
above_src, xd->dst.y_stride, &sse);
score += sse;
#if CONFIG_SUPERBLOCKS
if (xd->mode_info_context->mbmi.encoded_as_sb) {
vp9_sub_pixel_variance16x2_c(above_ref + offset + 16,
ref_y_stride,
SP(this_mv.as_mv.col),
SP(this_mv.as_mv.row),
above_src + 16, xd->dst.y_stride, &sse);
above_src + 16, xd->dst.y_stride, &sse);
score += sse;
}
if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB64X64) {
vp9_sub_pixel_variance16x2(above_ref + offset + 32,
ref_y_stride,
SP(this_mv.as_mv.col),
SP(this_mv.as_mv.row),
above_src + 32, xd->dst.y_stride, &sse);
score += sse;
vp9_sub_pixel_variance16x2(above_ref + offset + 48,
ref_y_stride,
SP(this_mv.as_mv.col),
SP(this_mv.as_mv.row),
above_src + 48, xd->dst.y_stride, &sse);
score += sse;
}
#endif
}
#if !CONFIG_ABOVESPREFMV
if (xd->left_available) {
vp9_sub_pixel_variance2x16_c(left_ref + offset, ref_y_stride,
SP(this_mv.as_mv.col),
SP(this_mv.as_mv.row),
left_src, xd->dst.y_stride, &sse);
score += sse;
#if CONFIG_SUPERBLOCKS
if (xd->mode_info_context->mbmi.encoded_as_sb) {
if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB32X32) {
vp9_sub_pixel_variance2x16_c(left_ref + offset + ref_y_stride * 16,
ref_y_stride,
SP(this_mv.as_mv.col),
@ -219,36 +233,22 @@ void vp9_find_best_ref_mvs(MACROBLOCKD *xd,
xd->dst.y_stride, &sse);
score += sse;
}
#endif
}
#else
row_offset = (this_mv.as_mv.row > 0) ?
((this_mv.as_mv.row + 3) >> 3):((this_mv.as_mv.row + 4) >> 3);
col_offset = (this_mv.as_mv.col > 0) ?
((this_mv.as_mv.col + 3) >> 3):((this_mv.as_mv.col + 4) >> 3);
offset = ref_y_stride * row_offset + col_offset;
score = 0;
if (xd->up_available) {
score += vp9_sad16x3(above_src, xd->dst.y_stride,
above_ref + offset, ref_y_stride);
#if CONFIG_SUPERBLOCKS
if (xd->mode_info_context->mbmi.encoded_as_sb) {
score += vp9_sad16x3(above_src + 16, xd->dst.y_stride,
above_ref + offset + 16, ref_y_stride);
if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB64X64) {
vp9_sub_pixel_variance2x16_c(left_ref + offset + ref_y_stride * 32,
ref_y_stride,
SP(this_mv.as_mv.col),
SP(this_mv.as_mv.row),
left_src + xd->dst.y_stride * 32,
xd->dst.y_stride, &sse);
score += sse;
vp9_sub_pixel_variance2x16_c(left_ref + offset + ref_y_stride * 48,
ref_y_stride,
SP(this_mv.as_mv.col),
SP(this_mv.as_mv.row),
left_src + xd->dst.y_stride * 48,
xd->dst.y_stride, &sse);
score += sse;
}
#endif
}
if (xd->left_available) {
score += vp9_sad3x16(left_src, xd->dst.y_stride,
left_ref + offset, ref_y_stride);
#if CONFIG_SUPERBLOCKS
if (xd->mode_info_context->mbmi.encoded_as_sb) {
score += vp9_sad3x16(left_src + xd->dst.y_stride * 16,
xd->dst.y_stride,
left_ref + offset + ref_y_stride * 16,
ref_y_stride);
}
#endif
}
#endif
// Add the entry to our list and then resort the list on score.
@ -268,14 +268,11 @@ void vp9_find_best_ref_mvs(MACROBLOCKD *xd,
}
// Make sure all the candidates are properly clamped etc
for (i = 0; i < 4; ++i) {
for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) {
lower_mv_precision(&sorted_mvs[i], xd->allow_high_precision_mv);
clamp_mv2(&sorted_mvs[i], xd);
}
// Set the best mv to the first entry in the sorted list
best_mv->as_int = sorted_mvs[0].as_int;
// Provided that there are non zero vectors available there will not
// be more than one 0,0 entry in the sorted list.
// The best ref mv is always set to the first entry (which gave the best

View File

@ -22,10 +22,9 @@
* score to use as ref motion vector
*/
void vp9_find_best_ref_mvs(MACROBLOCKD *xd,
unsigned char *ref_y_buffer,
uint8_t *ref_y_buffer,
int ref_y_stride,
int_mv *mvlist,
int_mv *best_mv,
int_mv *nearest,
int_mv *near);
@ -82,7 +81,7 @@ vp9_prob *vp9_mv_ref_probs(VP9_COMMON *pc,
vp9_prob p[VP9_MVREFS - 1],
const int context);
extern const unsigned char vp9_mbsplit_offset[4][16];
extern const uint8_t vp9_mbsplit_offset[4][16];
static int left_block_mv(const MODE_INFO *cur_mb, int b) {
if (!(b & 3)) {
@ -182,4 +181,4 @@ static B_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb,
return (cur_mb->bmi + b - 4)->as_mode.first;
}
#endif
#endif // VP9_COMMON_VP9_FINDNEARMV_H_

View File

@ -8,7 +8,6 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP9_COMMON_VP9_HEADER_H_
#define VP9_COMMON_VP9_HEADER_H_
@ -38,5 +37,4 @@ typedef struct {
#define VP9_HEADER_SIZE 3
#endif
#endif
#endif // VP9_COMMON_VP9_HEADER_H_

File diff suppressed because it is too large Load Diff

View File

@ -33,8 +33,8 @@ typedef struct {
int min_y;
int max_x;
int max_y;
long long sum_x;
long long sum_y;
int64_t sum_x;
int64_t sum_y;
int pixels;
int seg_value;
int label;

View File

@ -72,7 +72,7 @@ void vp9_inverse_transform_mb_4x4(MACROBLOCKD *xd) {
vp9_inverse_transform_mbuv_4x4(xd);
}
void vp9_inverse_transform_b_8x8(short *input_dqcoeff, short *output_coeff,
void vp9_inverse_transform_b_8x8(int16_t *input_dqcoeff, int16_t *output_coeff,
int pitch) {
vp9_short_idct8x8(input_dqcoeff, output_coeff, pitch);
}
@ -125,8 +125,8 @@ void vp9_inverse_transform_mb_8x8(MACROBLOCKD *xd) {
vp9_inverse_transform_mbuv_8x8(xd);
}
void vp9_inverse_transform_b_16x16(short *input_dqcoeff,
short *output_coeff, int pitch) {
void vp9_inverse_transform_b_16x16(int16_t *input_dqcoeff,
int16_t *output_coeff, int pitch) {
vp9_short_idct16x16(input_dqcoeff, output_coeff, pitch);
}
@ -145,3 +145,14 @@ void vp9_inverse_transform_mb_16x16(MACROBLOCKD *xd) {
vp9_inverse_transform_mby_16x16(xd);
vp9_inverse_transform_mbuv_8x8(xd);
}
void vp9_inverse_transform_sby_32x32(SUPERBLOCKD *xd_sb) {
vp9_short_idct32x32(xd_sb->dqcoeff, xd_sb->diff, 64);
}
void vp9_inverse_transform_sbuv_16x16(SUPERBLOCKD *xd_sb) {
vp9_inverse_transform_b_16x16(xd_sb->dqcoeff + 1024,
xd_sb->diff + 1024, 32);
vp9_inverse_transform_b_16x16(xd_sb->dqcoeff + 1280,
xd_sb->diff + 1280, 32);
}

View File

@ -12,6 +12,7 @@
#define VP9_COMMON_VP9_INVTRANS_H_
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_blockd.h"
extern void vp9_inverse_transform_b_4x4(MACROBLOCKD *xd, int block, int pitch);
@ -22,8 +23,8 @@ extern void vp9_inverse_transform_mby_4x4(MACROBLOCKD *xd);
extern void vp9_inverse_transform_mbuv_4x4(MACROBLOCKD *xd);
extern void vp9_inverse_transform_b_8x8(short *input_dqcoeff,
short *output_coeff, int pitch);
extern void vp9_inverse_transform_b_8x8(int16_t *input_dqcoeff,
int16_t *output_coeff, int pitch);
extern void vp9_inverse_transform_mb_8x8(MACROBLOCKD *xd);
@ -31,11 +32,14 @@ extern void vp9_inverse_transform_mby_8x8(MACROBLOCKD *xd);
extern void vp9_inverse_transform_mbuv_8x8(MACROBLOCKD *xd);
extern void vp9_inverse_transform_b_16x16(short *input_dqcoeff,
short *output_coeff, int pitch);
extern void vp9_inverse_transform_b_16x16(int16_t *input_dqcoeff,
int16_t *output_coeff, int pitch);
extern void vp9_inverse_transform_mb_16x16(MACROBLOCKD *xd);
extern void vp9_inverse_transform_mby_16x16(MACROBLOCKD *xd);
#endif // __INC_INVTRANS_H
extern void vp9_inverse_transform_sby_32x32(SUPERBLOCKD *xd_sb);
extern void vp9_inverse_transform_sbuv_16x16(SUPERBLOCKD *xd_sb);
#endif // VP9_COMMON_VP9_INVTRANS_H_

View File

@ -176,46 +176,70 @@ void vp9_loop_filter_frame_init(VP9_COMMON *cm,
}
}
void vp9_loop_filter_frame(VP9_COMMON *cm, MACROBLOCKD *xd) {
// Determine if we should skip inner-MB loop filtering within a MB
// The current condition is that the loop filtering is skipped only
// the MB uses a prediction size of 16x16 and either 16x16 transform
// is used or there is no residue at all.
static int mb_lf_skip(const MB_MODE_INFO *const mbmi) {
const MB_PREDICTION_MODE mode = mbmi->mode;
const int skip_coef = mbmi->mb_skip_coeff;
const int tx_size = mbmi->txfm_size;
return mode != B_PRED && mode != I8X8_PRED && mode != SPLITMV &&
(tx_size >= TX_16X16 || skip_coef);
}
// Determine if we should skip MB loop filtering on a MB edge within
// a superblock, the current condition is that MB loop filtering is
// skipped only when both MBs do not use inner MB loop filtering, and
// same motion vector with same reference frame
static int sb_mb_lf_skip(const MODE_INFO *const mip0,
const MODE_INFO *const mip1) {
const MB_MODE_INFO *mbmi0 = &mip0->mbmi;
const MB_MODE_INFO *mbmi1 = &mip0->mbmi;
return mb_lf_skip(mbmi0) && mb_lf_skip(mbmi1) &&
(mbmi0->ref_frame == mbmi1->ref_frame) &&
(mbmi0->mv[mbmi0->ref_frame].as_int ==
mbmi1->mv[mbmi1->ref_frame].as_int) &&
mbmi0->ref_frame != INTRA_FRAME;
}
void vp9_loop_filter_frame(VP9_COMMON *cm,
MACROBLOCKD *xd,
int frame_filter_level,
int y_only) {
YV12_BUFFER_CONFIG *post = cm->frame_to_show;
loop_filter_info_n *lfi_n = &cm->lf_info;
struct loop_filter_info lfi;
FRAME_TYPE frame_type = cm->frame_type;
int mb_row;
int mb_col;
int filter_level;
unsigned char *y_ptr, *u_ptr, *v_ptr;
const FRAME_TYPE frame_type = cm->frame_type;
int mb_row, mb_col;
uint8_t *y_ptr, *u_ptr, *v_ptr;
/* Point at base of Mb MODE_INFO list */
const MODE_INFO *mode_info_context = cm->mi;
const int mis = cm->mode_info_stride;
/* Initialize the loop filter for this frame. */
vp9_loop_filter_frame_init(cm, xd, cm->filter_level);
vp9_loop_filter_frame_init(cm, xd, frame_filter_level);
/* Set up the buffer pointers */
y_ptr = post->y_buffer;
u_ptr = post->u_buffer;
v_ptr = post->v_buffer;
if (y_only) {
u_ptr = 0;
v_ptr = 0;
} else {
u_ptr = post->u_buffer;
v_ptr = post->v_buffer;
}
/* vp9_filter each macro block */
for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
mode_info_context->mbmi.mode != I8X8_PRED &&
mode_info_context->mbmi.mode != SPLITMV &&
mode_info_context->mbmi.mb_skip_coeff);
const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
const MB_PREDICTION_MODE mode = mode_info_context->mbmi.mode;
const int mode_index = lfi_n->mode_lf_lut[mode];
const int seg = mode_info_context->mbmi.segment_id;
const int ref_frame = mode_info_context->mbmi.ref_frame;
int tx_type = mode_info_context->mbmi.txfm_size;
filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
const int filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
if (filter_level) {
const int skip_lf = mb_lf_skip(&mode_info_context->mbmi);
const int tx_size = mode_info_context->mbmi.txfm_size;
if (cm->filter_type == NORMAL_LOOPFILTER) {
const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
lfi.mblim = lfi_n->mblim[filter_level];
@ -223,198 +247,102 @@ void vp9_loop_filter_frame(VP9_COMMON *cm, MACROBLOCKD *xd) {
lfi.lim = lfi_n->lim[filter_level];
lfi.hev_thr = lfi_n->hev_thr[hev_index];
if (mb_col > 0
#if CONFIG_SUPERBLOCKS
&& !((mb_col & 1) && mode_info_context->mbmi.encoded_as_sb &&
mode_info_context[0].mbmi.mb_skip_coeff &&
mode_info_context[-1].mbmi.mb_skip_coeff)
#endif
)
vp9_loop_filter_mbv(y_ptr, u_ptr, v_ptr, post->y_stride,
post->uv_stride, &lfi);
if (!skip_lf && tx_type != TX_16X16) {
if (tx_type == TX_8X8)
vp9_loop_filter_bv8x8(y_ptr, u_ptr, v_ptr, post->y_stride,
post->uv_stride, &lfi);
if (mb_col > 0 &&
!((mb_col & 1) && mode_info_context->mbmi.sb_type &&
(sb_mb_lf_skip(mode_info_context - 1, mode_info_context) ||
tx_size >= TX_32X32))
) {
if (tx_size >= TX_16X16)
vp9_lpf_mbv_w(y_ptr, u_ptr, v_ptr, post->y_stride,
post->uv_stride, &lfi);
else
vp9_loop_filter_mbv(y_ptr, u_ptr, v_ptr, post->y_stride,
post->uv_stride, &lfi);
}
if (!skip_lf) {
if (tx_size >= TX_8X8) {
if (tx_size == TX_8X8 && (mode == I8X8_PRED || mode == SPLITMV))
vp9_loop_filter_bv8x8(y_ptr, u_ptr, v_ptr, post->y_stride,
post->uv_stride, &lfi);
else
vp9_loop_filter_bv8x8(y_ptr, NULL, NULL, post->y_stride,
post->uv_stride, &lfi);
} else {
vp9_loop_filter_bv(y_ptr, u_ptr, v_ptr, post->y_stride,
post->uv_stride, &lfi);
}
}
/* don't apply across umv border */
if (mb_row > 0
#if CONFIG_SUPERBLOCKS
&& !((mb_row & 1) && mode_info_context->mbmi.encoded_as_sb &&
mode_info_context[0].mbmi.mb_skip_coeff &&
mode_info_context[-cm->mode_info_stride].mbmi.mb_skip_coeff)
#endif
)
vp9_loop_filter_mbh(y_ptr, u_ptr, v_ptr, post->y_stride,
post->uv_stride, &lfi);
if (!skip_lf && tx_type != TX_16X16) {
if (tx_type == TX_8X8)
vp9_loop_filter_bh8x8(y_ptr, u_ptr, v_ptr, post->y_stride,
post->uv_stride, &lfi);
if (mb_row > 0 &&
!((mb_row & 1) && mode_info_context->mbmi.sb_type &&
(sb_mb_lf_skip(mode_info_context - mis, mode_info_context) ||
tx_size >= TX_32X32))
) {
if (tx_size >= TX_16X16)
vp9_lpf_mbh_w(y_ptr, u_ptr, v_ptr, post->y_stride,
post->uv_stride, &lfi);
else
vp9_loop_filter_mbh(y_ptr, u_ptr, v_ptr, post->y_stride,
post->uv_stride, &lfi);
}
if (!skip_lf) {
if (tx_size >= TX_8X8) {
if (tx_size == TX_8X8 && (mode == I8X8_PRED || mode == SPLITMV))
vp9_loop_filter_bh8x8(y_ptr, u_ptr, v_ptr, post->y_stride,
post->uv_stride, &lfi);
else
vp9_loop_filter_bh8x8(y_ptr, NULL, NULL, post->y_stride,
post->uv_stride, &lfi);
} else {
vp9_loop_filter_bh(y_ptr, u_ptr, v_ptr, post->y_stride,
post->uv_stride, &lfi);
}
}
} else {
// FIXME: Not 8x8 aware
if (mb_col > 0
#if CONFIG_SUPERBLOCKS
&& !((mb_col & 1) && mode_info_context->mbmi.encoded_as_sb &&
mode_info_context[0].mbmi.mb_skip_coeff &&
mode_info_context[-1].mbmi.mb_skip_coeff)
#endif
)
if (mb_col > 0 &&
!(skip_lf && mb_lf_skip(&mode_info_context[-1].mbmi)) &&
!((mb_col & 1) && mode_info_context->mbmi.sb_type))
vp9_loop_filter_simple_mbv(y_ptr, post->y_stride,
lfi_n->mblim[filter_level]);
if (!skip_lf)
vp9_loop_filter_simple_bv(y_ptr, post->y_stride,
lfi_n->blim[filter_level]);
/* don't apply across umv border */
if (mb_row > 0
#if CONFIG_SUPERBLOCKS
&& !((mb_row & 1) && mode_info_context->mbmi.encoded_as_sb &&
mode_info_context[0].mbmi.mb_skip_coeff &&
mode_info_context[-cm->mode_info_stride].mbmi.mb_skip_coeff)
#endif
)
if (mb_row > 0 &&
!(skip_lf && mb_lf_skip(&mode_info_context[-mis].mbmi)) &&
!((mb_row & 1) && mode_info_context->mbmi.sb_type))
vp9_loop_filter_simple_mbh(y_ptr, post->y_stride,
lfi_n->mblim[filter_level]);
if (!skip_lf)
vp9_loop_filter_simple_bh(y_ptr, post->y_stride,
lfi_n->blim[filter_level]);
}
}
y_ptr += 16;
u_ptr += 8;
v_ptr += 8;
if (!y_only) {
u_ptr += 8;
v_ptr += 8;
}
mode_info_context++; /* step to next MB */
}
y_ptr += post->y_stride * 16 - post->y_width;
u_ptr += post->uv_stride * 8 - post->uv_width;
v_ptr += post->uv_stride * 8 - post->uv_width;
if (!y_only) {
u_ptr += post->uv_stride * 8 - post->uv_width;
v_ptr += post->uv_stride * 8 - post->uv_width;
}
mode_info_context++; /* Skip border mb */
}
}
void vp9_loop_filter_frame_yonly(VP9_COMMON *cm, MACROBLOCKD *xd,
int default_filt_lvl) {
YV12_BUFFER_CONFIG *post = cm->frame_to_show;
unsigned char *y_ptr;
int mb_row;
int mb_col;
loop_filter_info_n *lfi_n = &cm->lf_info;
struct loop_filter_info lfi;
int filter_level;
FRAME_TYPE frame_type = cm->frame_type;
/* Point at base of Mb MODE_INFO list */
const MODE_INFO *mode_info_context = cm->mi;
#if 0
if (default_filt_lvl == 0) /* no filter applied */
return;
#endif
/* Initialize the loop filter for this frame. */
vp9_loop_filter_frame_init(cm, xd, default_filt_lvl);
/* Set up the buffer pointers */
y_ptr = post->y_buffer;
/* vp9_filter each macro block */
for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
mode_info_context->mbmi.mode != I8X8_PRED &&
mode_info_context->mbmi.mode != SPLITMV &&
mode_info_context->mbmi.mb_skip_coeff);
const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
const int seg = mode_info_context->mbmi.segment_id;
const int ref_frame = mode_info_context->mbmi.ref_frame;
int tx_type = mode_info_context->mbmi.txfm_size;
filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
if (filter_level) {
if (cm->filter_type == NORMAL_LOOPFILTER) {
const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
lfi.mblim = lfi_n->mblim[filter_level];
lfi.blim = lfi_n->blim[filter_level];
lfi.lim = lfi_n->lim[filter_level];
lfi.hev_thr = lfi_n->hev_thr[hev_index];
if (mb_col > 0)
vp9_loop_filter_mbv(y_ptr, 0, 0, post->y_stride, 0, &lfi);
if (!skip_lf && tx_type != TX_16X16) {
if (tx_type == TX_8X8)
vp9_loop_filter_bv8x8(y_ptr, 0, 0, post->y_stride, 0, &lfi);
else
vp9_loop_filter_bv(y_ptr, 0, 0, post->y_stride, 0, &lfi);
}
/* don't apply across umv border */
if (mb_row > 0)
vp9_loop_filter_mbh(y_ptr, 0, 0, post->y_stride, 0, &lfi);
if (!skip_lf && tx_type != TX_16X16) {
if (tx_type == TX_8X8)
vp9_loop_filter_bh8x8(y_ptr, 0, 0, post->y_stride, 0, &lfi);
else
vp9_loop_filter_bh(y_ptr, 0, 0, post->y_stride, 0, &lfi);
}
} else {
// FIXME: Not 8x8 aware
if (mb_col > 0)
vp9_loop_filter_simple_mbv(y_ptr, post->y_stride,
lfi_n->mblim[filter_level]);
if (!skip_lf)
vp9_loop_filter_simple_bv(y_ptr, post->y_stride,
lfi_n->blim[filter_level]);
/* don't apply across umv border */
if (mb_row > 0)
vp9_loop_filter_simple_mbh(y_ptr, post->y_stride,
lfi_n->mblim[filter_level]);
if (!skip_lf)
vp9_loop_filter_simple_bh(y_ptr, post->y_stride,
lfi_n->blim[filter_level]);
}
}
y_ptr += 16;
mode_info_context++; /* step to next MB */
}
y_ptr += post->y_stride * 16 - post->y_width;
mode_info_context++; /* Skip border mb */
}
}
void vp9_loop_filter_partial_frame(VP9_COMMON *cm, MACROBLOCKD *xd,
int default_filt_lvl) {
YV12_BUFFER_CONFIG *post = cm->frame_to_show;
unsigned char *y_ptr;
uint8_t *y_ptr;
int mb_row;
int mb_col;
int mb_cols = post->y_width >> 4;

View File

@ -49,26 +49,26 @@ struct loop_filter_info {
};
#define prototype_loopfilter(sym) \
void sym(unsigned char *src, int pitch, const unsigned char *blimit,\
void sym(uint8_t *src, int pitch, const unsigned char *blimit, \
const unsigned char *limit, const unsigned char *thresh, int count)
#define prototype_loopfilter_block(sym) \
void sym(unsigned char *y, unsigned char *u, unsigned char *v, \
void sym(uint8_t *y, uint8_t *u, uint8_t *v, \
int ystride, int uv_stride, struct loop_filter_info *lfi)
#define prototype_simple_loopfilter(sym) \
void sym(unsigned char *y, int ystride, const unsigned char *blimit)
void sym(uint8_t *y, int ystride, const unsigned char *blimit)
#if ARCH_X86 || ARCH_X86_64
#include "x86/vp9_loopfilter_x86.h"
#endif
typedef void loop_filter_uvfunction(unsigned char *u, /* source pointer */
typedef void loop_filter_uvfunction(uint8_t *u, /* source pointer */
int p, /* pitch */
const unsigned char *blimit,
const unsigned char *limit,
const unsigned char *thresh,
unsigned char *v);
uint8_t *v);
/* assorted loopfilter functions which get used elsewhere */
struct VP9Common;
@ -80,17 +80,27 @@ void vp9_loop_filter_frame_init(struct VP9Common *cm,
struct macroblockd *mbd,
int default_filt_lvl);
void vp9_loop_filter_frame(struct VP9Common *cm, struct macroblockd *mbd);
void vp9_loop_filter_frame(struct VP9Common *cm,
struct macroblockd *mbd,
int filter_level,
int y_only);
void vp9_loop_filter_partial_frame(struct VP9Common *cm,
struct macroblockd *mbd,
int default_filt_lvl);
void vp9_loop_filter_frame_yonly(struct VP9Common *cm,
struct macroblockd *mbd,
int default_filt_lvl);
void vp9_loop_filter_update_sharpness(loop_filter_info_n *lfi,
int sharpness_lvl);
#endif // loopfilter_h
void vp9_mb_lpf_horizontal_edge_w(unsigned char *s, int p,
const unsigned char *blimit,
const unsigned char *limit,
const unsigned char *thresh,
int count);
void vp9_mb_lpf_vertical_edge_w(unsigned char *s, int p,
const unsigned char *blimit,
const unsigned char *limit,
const unsigned char *thresh,
int count);
#endif // VP9_COMMON_VP9_LOOPFILTER_H_

View File

@ -13,20 +13,20 @@
#include "vp9/common/vp9_loopfilter.h"
#include "vp9/common/vp9_onyxc_int.h"
typedef unsigned char uc;
static __inline signed char signed_char_clamp(int t) {
static __inline int8_t signed_char_clamp(int t) {
t = (t < -128 ? -128 : t);
t = (t > 127 ? 127 : t);
return (signed char) t;
return (int8_t) t;
}
/* should we apply any filter at all ( 11111111 yes, 00000000 no) */
static __inline signed char filter_mask(uc limit, uc blimit,
uc p3, uc p2, uc p1, uc p0,
uc q0, uc q1, uc q2, uc q3) {
signed char mask = 0;
static __inline int8_t filter_mask(uint8_t limit, uint8_t blimit,
uint8_t p3, uint8_t p2,
uint8_t p1, uint8_t p0,
uint8_t q0, uint8_t q1,
uint8_t q2, uint8_t q3) {
int8_t mask = 0;
mask |= (abs(p3 - p2) > limit) * -1;
mask |= (abs(p2 - p1) > limit) * -1;
mask |= (abs(p1 - p0) > limit) * -1;
@ -39,26 +39,25 @@ static __inline signed char filter_mask(uc limit, uc blimit,
}
/* is there high variance internal edge ( 11111111 yes, 00000000 no) */
static __inline signed char hevmask(uc thresh, uc p1, uc p0, uc q0, uc q1) {
signed char hev = 0;
static __inline int8_t hevmask(uint8_t thresh, uint8_t p1, uint8_t p0,
uint8_t q0, uint8_t q1) {
int8_t hev = 0;
hev |= (abs(p1 - p0) > thresh) * -1;
hev |= (abs(q1 - q0) > thresh) * -1;
return hev;
}
static __inline void filter(signed char mask, uc hev, uc *op1,
uc *op0, uc *oq0, uc *oq1)
static __inline void filter(int8_t mask, uint8_t hev, uint8_t *op1,
uint8_t *op0, uint8_t *oq0, uint8_t *oq1) {
int8_t ps0, qs0;
int8_t ps1, qs1;
int8_t filter, Filter1, Filter2;
int8_t u;
{
signed char ps0, qs0;
signed char ps1, qs1;
signed char filter, Filter1, Filter2;
signed char u;
ps1 = (signed char) * op1 ^ 0x80;
ps0 = (signed char) * op0 ^ 0x80;
qs0 = (signed char) * oq0 ^ 0x80;
qs1 = (signed char) * oq1 ^ 0x80;
ps1 = (int8_t) *op1 ^ 0x80;
ps0 = (int8_t) *op0 ^ 0x80;
qs0 = (int8_t) *oq0 ^ 0x80;
qs1 = (int8_t) *oq1 ^ 0x80;
/* add outer taps if we have high edge variance */
filter = signed_char_clamp(ps1 - qs1);
@ -91,20 +90,16 @@ static __inline void filter(signed char mask, uc hev, uc *op1,
*oq1 = u ^ 0x80;
u = signed_char_clamp(ps1 + filter);
*op1 = u ^ 0x80;
}
void vp9_loop_filter_horizontal_edge_c
(
unsigned char *s,
int p, /* pitch */
const unsigned char *blimit,
const unsigned char *limit,
const unsigned char *thresh,
int count
) {
int hev = 0; /* high edge variance */
signed char mask = 0;
void vp9_loop_filter_horizontal_edge_c(uint8_t *s,
int p, /* pitch */
const unsigned char *blimit,
const unsigned char *limit,
const unsigned char *thresh,
int count) {
int hev = 0; /* high edge variance */
int8_t mask = 0;
int i = 0;
/* loop filter designed to work using chars so that we can make maximum use
@ -123,14 +118,14 @@ void vp9_loop_filter_horizontal_edge_c
} while (++i < count * 8);
}
void vp9_loop_filter_vertical_edge_c(unsigned char *s,
void vp9_loop_filter_vertical_edge_c(uint8_t *s,
int p,
const unsigned char *blimit,
const unsigned char *limit,
const unsigned char *thresh,
int count) {
int hev = 0; /* high edge variance */
signed char mask = 0;
int8_t mask = 0;
int i = 0;
/* loop filter designed to work using chars so that we can make maximum use
@ -148,27 +143,409 @@ void vp9_loop_filter_vertical_edge_c(unsigned char *s,
s += p;
} while (++i < count * 8);
}
static __inline signed char flatmask(uc thresh,
uc p4, uc p3, uc p2, uc p1, uc p0,
uc q0, uc q1, uc q2, uc q3, uc q4) {
signed char flat = 0;
flat |= (abs(p1 - p0) > 1) * -1;
flat |= (abs(q1 - q0) > 1) * -1;
flat |= (abs(p0 - p2) > 1) * -1;
flat |= (abs(q0 - q2) > 1) * -1;
flat |= (abs(p3 - p0) > 1) * -1;
flat |= (abs(q3 - q0) > 1) * -1;
flat |= (abs(p4 - p0) > 1) * -1;
flat |= (abs(q4 - q0) > 1) * -1;
static __inline signed char flatmask(uint8_t thresh,
uint8_t p4, uint8_t p3, uint8_t p2,
uint8_t p1, uint8_t p0,
uint8_t q0, uint8_t q1, uint8_t q2,
uint8_t q3, uint8_t q4) {
int8_t flat = 0;
flat |= (abs(p1 - p0) > thresh) * -1;
flat |= (abs(q1 - q0) > thresh) * -1;
flat |= (abs(p0 - p2) > thresh) * -1;
flat |= (abs(q0 - q2) > thresh) * -1;
flat |= (abs(p3 - p0) > thresh) * -1;
flat |= (abs(q3 - q0) > thresh) * -1;
flat |= (abs(p4 - p0) > thresh) * -1;
flat |= (abs(q4 - q0) > thresh) * -1;
flat = ~flat;
return flat;
}
static __inline void mbfilter(signed char mask, uc hev, uc flat,
uc *op4, uc *op3, uc *op2, uc *op1, uc *op0,
uc *oq0, uc *oq1, uc *oq2, uc *oq3, uc *oq4) {
static __inline void mbfilter(int8_t mask, uint8_t hev, uint8_t flat,
uint8_t *op4, uint8_t *op3, uint8_t *op2,
uint8_t *op1, uint8_t *op0,
uint8_t *oq0, uint8_t *oq1, uint8_t *oq2,
uint8_t *oq3, uint8_t *oq4) {
/* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */
if (flat && mask) {
uint8_t p0, q0;
uint8_t p1, q1;
uint8_t p2, q2;
uint8_t p3, q3;
uint8_t p4, q4;
p4 = *op4;
p3 = *op3;
p2 = *op2;
p1 = *op1;
p0 = *op0;
q0 = *oq0;
q1 = *oq1;
q2 = *oq2;
q3 = *oq3;
q4 = *oq4;
*op2 = (p4 + p4 + p3 + p2 + p2 + p1 + p0 + q0 + 4) >> 3;
*op1 = (p4 + p3 + p2 + p1 + p1 + p0 + q0 + q1 + 4) >> 3;
*op0 = (p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2 + 4) >> 3;
*oq0 = (p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3 + 4) >> 3;
*oq1 = (p1 + p0 + q0 + q1 + q1 + q2 + q3 + q4 + 4) >> 3;
*oq2 = (p0 + q0 + q1 + q2 + q2 + q3 + q4 + q4 + 4) >> 3;
} else {
int8_t ps0, qs0;
int8_t ps1, qs1;
int8_t filter, Filter1, Filter2;
int8_t u;
ps1 = (int8_t) *op1 ^ 0x80;
ps0 = (int8_t) *op0 ^ 0x80;
qs0 = (int8_t) *oq0 ^ 0x80;
qs1 = (int8_t) *oq1 ^ 0x80;
/* add outer taps if we have high edge variance */
filter = signed_char_clamp(ps1 - qs1);
filter &= hev;
/* inner taps */
filter = signed_char_clamp(filter + 3 * (qs0 - ps0));
filter &= mask;
Filter1 = signed_char_clamp(filter + 4);
Filter2 = signed_char_clamp(filter + 3);
Filter1 >>= 3;
Filter2 >>= 3;
u = signed_char_clamp(qs0 - Filter1);
*oq0 = u ^ 0x80;
u = signed_char_clamp(ps0 + Filter2);
*op0 = u ^ 0x80;
filter = Filter1;
/* outer tap adjustments */
filter += 1;
filter >>= 1;
filter &= ~hev;
u = signed_char_clamp(qs1 - filter);
*oq1 = u ^ 0x80;
u = signed_char_clamp(ps1 + filter);
*op1 = u ^ 0x80;
}
}
void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s,
int p,
const unsigned char *blimit,
const unsigned char *limit,
const unsigned char *thresh,
int count) {
int8_t hev = 0; /* high edge variance */
int8_t mask = 0;
int8_t flat = 0;
int i = 0;
/* loop filter designed to work using chars so that we can make maximum use
* of 8 bit simd instructions.
*/
do {
mask = filter_mask(limit[0], blimit[0],
s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],
s[ 0 * p], s[ 1 * p], s[ 2 * p], s[ 3 * p]);
hev = hevmask(thresh[0], s[-2 * p], s[-1 * p], s[0 * p], s[1 * p]);
flat = flatmask(1,
s[-5 * p], s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],
s[ 0 * p], s[ 1 * p], s[ 2 * p], s[ 3 * p], s[ 4 * p]);
mbfilter(mask, hev, flat,
s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
s, s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p);
++s;
} while (++i < count * 8);
}
void vp9_mbloop_filter_vertical_edge_c(uint8_t *s,
int p,
const unsigned char *blimit,
const unsigned char *limit,
const unsigned char *thresh,
int count) {
int8_t hev = 0; /* high edge variance */
int8_t mask = 0;
int8_t flat = 0;
int i = 0;
do {
mask = filter_mask(limit[0], blimit[0],
s[-4], s[-3], s[-2], s[-1],
s[0], s[1], s[2], s[3]);
hev = hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);
flat = flatmask(1,
s[-5], s[-4], s[-3], s[-2], s[-1],
s[ 0], s[ 1], s[ 2], s[ 3], s[ 4]);
mbfilter(mask, hev, flat,
s - 5, s - 4, s - 3, s - 2, s - 1,
s, s + 1, s + 2, s + 3, s + 4);
s += p;
} while (++i < count * 8);
}
/* should we apply any filter at all ( 11111111 yes, 00000000 no) */
static __inline int8_t simple_filter_mask(uint8_t blimit,
uint8_t p1, uint8_t p0,
uint8_t q0, uint8_t q1) {
/* Why does this cause problems for win32?
* error C2143: syntax error : missing ';' before 'type'
* (void) limit;
*/
int8_t mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit) * -1;
return mask;
}
static __inline void simple_filter(int8_t mask,
uint8_t *op1, uint8_t *op0,
uint8_t *oq0, uint8_t *oq1) {
int8_t filter, Filter1, Filter2;
int8_t p1 = (int8_t) *op1 ^ 0x80;
int8_t p0 = (int8_t) *op0 ^ 0x80;
int8_t q0 = (int8_t) *oq0 ^ 0x80;
int8_t q1 = (int8_t) *oq1 ^ 0x80;
int8_t u;
filter = signed_char_clamp(p1 - q1);
filter = signed_char_clamp(filter + 3 * (q0 - p0));
filter &= mask;
/* save bottom 3 bits so that we round one side +4 and the other +3 */
Filter1 = signed_char_clamp(filter + 4);
Filter1 >>= 3;
u = signed_char_clamp(q0 - Filter1);
*oq0 = u ^ 0x80;
Filter2 = signed_char_clamp(filter + 3);
Filter2 >>= 3;
u = signed_char_clamp(p0 + Filter2);
*op0 = u ^ 0x80;
}
void vp9_loop_filter_simple_horizontal_edge_c(uint8_t *s,
int p,
const unsigned char *blimit) {
int8_t mask = 0;
int i = 0;
do {
mask = simple_filter_mask(blimit[0],
s[-2 * p], s[-1 * p],
s[0 * p], s[1 * p]);
simple_filter(mask,
s - 2 * p, s - 1 * p,
s, s + 1 * p);
++s;
} while (++i < 16);
}
void vp9_loop_filter_simple_vertical_edge_c(uint8_t *s,
int p,
const unsigned char *blimit) {
int8_t mask = 0;
int i = 0;
do {
mask = simple_filter_mask(blimit[0], s[-2], s[-1], s[0], s[1]);
simple_filter(mask, s - 2, s - 1, s, s + 1);
s += p;
} while (++i < 16);
}
/* Vertical MB Filtering */
void vp9_loop_filter_mbv_c(uint8_t *y_ptr, uint8_t *u_ptr,
uint8_t *v_ptr, int y_stride, int uv_stride,
struct loop_filter_info *lfi) {
vp9_mbloop_filter_vertical_edge_c(y_ptr, y_stride,
lfi->mblim, lfi->lim, lfi->hev_thr, 2);
if (u_ptr)
vp9_mbloop_filter_vertical_edge_c(u_ptr, uv_stride,
lfi->mblim, lfi->lim, lfi->hev_thr, 1);
if (v_ptr)
vp9_mbloop_filter_vertical_edge_c(v_ptr, uv_stride,
lfi->mblim, lfi->lim, lfi->hev_thr, 1);
}
/* Vertical B Filtering */
void vp9_loop_filter_bv_c(uint8_t*y_ptr, uint8_t *u_ptr,
uint8_t *v_ptr, int y_stride, int uv_stride,
struct loop_filter_info *lfi) {
vp9_loop_filter_vertical_edge_c(y_ptr + 4, y_stride,
lfi->blim, lfi->lim, lfi->hev_thr, 2);
vp9_loop_filter_vertical_edge_c(y_ptr + 8, y_stride,
lfi->blim, lfi->lim, lfi->hev_thr, 2);
vp9_loop_filter_vertical_edge_c(y_ptr + 12, y_stride,
lfi->blim, lfi->lim, lfi->hev_thr, 2);
if (u_ptr)
vp9_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride,
lfi->blim, lfi->lim, lfi->hev_thr, 1);
if (v_ptr)
vp9_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride,
lfi->blim, lfi->lim, lfi->hev_thr, 1);
}
/* Horizontal MB filtering */
void vp9_loop_filter_mbh_c(uint8_t *y_ptr, uint8_t *u_ptr,
uint8_t *v_ptr, int y_stride, int uv_stride,
struct loop_filter_info *lfi) {
vp9_mbloop_filter_horizontal_edge_c(y_ptr, y_stride,
lfi->mblim, lfi->lim, lfi->hev_thr, 2);
if (u_ptr)
vp9_mbloop_filter_horizontal_edge_c(u_ptr, uv_stride,
lfi->mblim, lfi->lim, lfi->hev_thr, 1);
if (v_ptr)
vp9_mbloop_filter_horizontal_edge_c(v_ptr, uv_stride,
lfi->mblim, lfi->lim, lfi->hev_thr, 1);
}
/* Horizontal B Filtering */
void vp9_loop_filter_bh_c(uint8_t *y_ptr, uint8_t *u_ptr,
uint8_t *v_ptr, int y_stride, int uv_stride,
struct loop_filter_info *lfi) {
vp9_loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride,
lfi->blim, lfi->lim, lfi->hev_thr, 2);
vp9_loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride,
lfi->blim, lfi->lim, lfi->hev_thr, 2);
vp9_loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride,
lfi->blim, lfi->lim, lfi->hev_thr, 2);
if (u_ptr)
vp9_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride,
lfi->blim, lfi->lim, lfi->hev_thr, 1);
if (v_ptr)
vp9_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride,
lfi->blim, lfi->lim, lfi->hev_thr, 1);
}
void vp9_loop_filter_bh8x8_c(uint8_t *y_ptr, uint8_t *u_ptr,
uint8_t *v_ptr, int y_stride, int uv_stride,
struct loop_filter_info *lfi) {
vp9_mbloop_filter_horizontal_edge_c(
y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
if (u_ptr)
vp9_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride,
lfi->blim, lfi->lim, lfi->hev_thr, 1);
if (v_ptr)
vp9_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride,
lfi->blim, lfi->lim, lfi->hev_thr, 1);
}
void vp9_loop_filter_bhs_c(uint8_t *y_ptr, int y_stride,
const unsigned char *blimit) {
vp9_loop_filter_simple_horizontal_edge_c(y_ptr + 4 * y_stride,
y_stride, blimit);
vp9_loop_filter_simple_horizontal_edge_c(y_ptr + 8 * y_stride,
y_stride, blimit);
vp9_loop_filter_simple_horizontal_edge_c(y_ptr + 12 * y_stride,
y_stride, blimit);
}
void vp9_loop_filter_bv8x8_c(uint8_t *y_ptr, uint8_t *u_ptr,
uint8_t *v_ptr, int y_stride, int uv_stride,
struct loop_filter_info *lfi) {
vp9_mbloop_filter_vertical_edge_c(
y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
if (u_ptr)
vp9_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride,
lfi->blim, lfi->lim, lfi->hev_thr, 1);
if (v_ptr)
vp9_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride,
lfi->blim, lfi->lim, lfi->hev_thr, 1);
}
void vp9_loop_filter_bvs_c(uint8_t *y_ptr, int y_stride,
const unsigned char *blimit) {
vp9_loop_filter_simple_vertical_edge_c(y_ptr + 4, y_stride, blimit);
vp9_loop_filter_simple_vertical_edge_c(y_ptr + 8, y_stride, blimit);
vp9_loop_filter_simple_vertical_edge_c(y_ptr + 12, y_stride, blimit);
}
static __inline void wide_mbfilter(int8_t mask, uint8_t hev,
uint8_t flat, uint8_t flat2,
uint8_t *op7, uint8_t *op6, uint8_t *op5,
uint8_t *op4, uint8_t *op3, uint8_t *op2,
uint8_t *op1, uint8_t *op0, uint8_t *oq0,
uint8_t *oq1, uint8_t *oq2, uint8_t *oq3,
uint8_t *oq4, uint8_t *oq5, uint8_t *oq6,
uint8_t *oq7) {
/* use a 15 tap filter [1,1,1,1,1,1,1,2,1,1,1,1,1,1,1] for flat line */
if (flat2 && flat && mask) {
uint8_t p0, q0;
uint8_t p1, q1;
uint8_t p2, q2;
uint8_t p3, q3;
uint8_t p4, q4;
uint8_t p5, q5;
uint8_t p6, q6;
uint8_t p7, q7;
p7 = *op7;
p6 = *op6;
p5 = *op5;
p4 = *op4;
p3 = *op3;
p2 = *op2;
p1 = *op1;
p0 = *op0;
q0 = *oq0;
q1 = *oq1;
q2 = *oq2;
q3 = *oq3;
q4 = *oq4;
q5 = *oq5;
q6 = *oq6;
q7 = *oq7;
*op6 = (p7 * 7 + p6 * 2 +
p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4;
*op5 = (p7 * 6 + p6 + p5 * 2 +
p4 + p3 + p2 + p1 + p0 + q0 + q1 + 8) >> 4;
*op4 = (p7 * 5 + p6 + p5 + p4 * 2 +
p3 + p2 + p1 + p0 + q0 + q1 + q2 + 8) >> 4;
*op3 = (p7 * 4 + p6 + p5 + p4 + p3 * 2 +
p2 + p1 + p0 + q0 + q1 + q2 + q3 + 8) >> 4;
*op2 = (p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 +
p1 + p0 + q0 + q1 + q2 + q3 + q4 + 8) >> 4;
*op1 = (p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 +
p0 + q0 + q1 + q2 + q3 + q4 + q5 + 8) >> 4;
*op0 = (p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 +
q0 + q1 + q2 + q3 + q4 + q5 + q6 + 8) >> 4;
*oq0 = (p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 +
q1 + q2 + q3 + q4 + q5 + q6 + q7 + 8) >> 4;
*oq1 = (p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 +
q2 + q3 + q4 + q5 + q6 + q7 * 2 + 8) >> 4;
*oq2 = (p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 +
q3 + q4 + q5 + q6 + q7 * 3 + 8) >> 4;
*oq3 = (p3 + p2 + p1 + p0 + q0 + q1 + q2 + q3 * 2 +
q4 + q5 + q6 + q7 * 4 + 8) >> 4;
*oq4 = (p2 + p1 + p0 + q0 + q1 + q2 + q3 + q4 * 2 +
q5 + q6 + q7 * 5 + 8) >> 4;
*oq5 = (p1 + p0 + q0 + q1 + q2 + q3 + q4 + q5 * 2 +
q6 + q7 * 6 + 8) >> 4;
*oq6 = (p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 +
q7 * 7 + 8) >> 4;
} else if (flat && mask) {
unsigned char p0, q0;
unsigned char p1, q1;
unsigned char p2, q2;
@ -233,7 +610,8 @@ static __inline void mbfilter(signed char mask, uc hev, uc flat,
*op1 = u ^ 0x80;
}
}
void vp9_mbloop_filter_horizontal_edge_c
void vp9_mb_lpf_horizontal_edge_w
(
unsigned char *s,
int p,
@ -245,31 +623,37 @@ void vp9_mbloop_filter_horizontal_edge_c
signed char hev = 0; /* high edge variance */
signed char mask = 0;
signed char flat = 0;
signed char flat2 = 0;
int i = 0;
/* loop filter designed to work using chars so that we can make maximum use
* of 8 bit simd instructions.
*/
do {
mask = filter_mask(limit[0], blimit[0],
s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],
s[ 0 * p], s[ 1 * p], s[ 2 * p], s[ 3 * p]);
hev = hevmask(thresh[0], s[-2 * p], s[-1 * p], s[0 * p], s[1 * p]);
flat = flatmask(thresh[0],
flat = flatmask(1,
s[-5 * p], s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],
s[ 0 * p], s[ 1 * p], s[ 2 * p], s[ 3 * p], s[ 4 * p]);
mbfilter(mask, hev, flat,
s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
s, s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p);
flat2 = flatmask(1,
s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], s[-1 * p],
s[ 0 * p], s[ 4 * p], s[ 5 * p], s[ 6 * p], s[ 7 * p]);
wide_mbfilter(mask, hev, flat, flat2,
s - 8 * p, s - 7 * p, s - 6 * p, s - 5 * p,
s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
s, s + 1 * p, s + 2 * p, s + 3 * p,
s + 4 * p, s + 5 * p, s + 6 * p, s + 7 * p);
++s;
} while (++i < count * 8);
}
void vp9_mbloop_filter_vertical_edge_c
void vp9_mb_lpf_vertical_edge_w
(
unsigned char *s,
int p,
@ -281,106 +665,35 @@ void vp9_mbloop_filter_vertical_edge_c
signed char hev = 0; /* high edge variance */
signed char mask = 0;
signed char flat = 0;
signed char flat2 = 0;
int i = 0;
do {
mask = filter_mask(limit[0], blimit[0],
s[-4], s[-3], s[-2], s[-1],
s[0], s[1], s[2], s[3]);
hev = hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);
flat = flatmask(thresh[0],
flat = flatmask(1,
s[-5], s[-4], s[-3], s[-2], s[-1],
s[ 0], s[ 1], s[ 2], s[ 3], s[ 4]);
mbfilter(mask, hev, flat,
s - 5, s - 4, s - 3, s - 2, s - 1,
s, s + 1, s + 2, s + 3, s + 4);
flat2 = flatmask(1,
s[-8], s[-7], s[-6], s[-5], s[-1],
s[ 0], s[ 4], s[ 5], s[ 6], s[ 7]);
wide_mbfilter(mask, hev, flat, flat2,
s - 8, s - 7, s - 6, s - 5,
s - 4, s - 3, s - 2, s - 1,
s, s + 1, s + 2, s + 3,
s + 4, s + 5, s + 6, s + 7);
s += p;
} while (++i < count * 8);
}
/* should we apply any filter at all ( 11111111 yes, 00000000 no) */
static __inline signed char simple_filter_mask(uc blimit,
uc p1, uc p0,
uc q0, uc q1) {
/* Why does this cause problems for win32?
* error C2143: syntax error : missing ';' before 'type'
* (void) limit;
*/
signed char mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit) * -1;
return mask;
}
static __inline void simple_filter(signed char mask,
uc *op1, uc *op0,
uc *oq0, uc *oq1) {
signed char filter, Filter1, Filter2;
signed char p1 = (signed char) * op1 ^ 0x80;
signed char p0 = (signed char) * op0 ^ 0x80;
signed char q0 = (signed char) * oq0 ^ 0x80;
signed char q1 = (signed char) * oq1 ^ 0x80;
signed char u;
filter = signed_char_clamp(p1 - q1);
filter = signed_char_clamp(filter + 3 * (q0 - p0));
filter &= mask;
/* save bottom 3 bits so that we round one side +4 and the other +3 */
Filter1 = signed_char_clamp(filter + 4);
Filter1 >>= 3;
u = signed_char_clamp(q0 - Filter1);
*oq0 = u ^ 0x80;
Filter2 = signed_char_clamp(filter + 3);
Filter2 >>= 3;
u = signed_char_clamp(p0 + Filter2);
*op0 = u ^ 0x80;
}
void vp9_loop_filter_simple_horizontal_edge_c
(
unsigned char *s,
int p,
const unsigned char *blimit
) {
signed char mask = 0;
int i = 0;
do {
mask = simple_filter_mask(blimit[0],
s[-2 * p], s[-1 * p],
s[0 * p], s[1 * p]);
simple_filter(mask,
s - 2 * p, s - 1 * p,
s, s + 1 * p);
++s;
} while (++i < 16);
}
void vp9_loop_filter_simple_vertical_edge_c
(
unsigned char *s,
int p,
const unsigned char *blimit
) {
signed char mask = 0;
int i = 0;
do {
mask = simple_filter_mask(blimit[0], s[-2], s[-1], s[0], s[1]);
simple_filter(mask, s - 2, s - 1, s, s + 1);
s += p;
} while (++i < 16);
}
/* Vertical MB Filtering */
void vp9_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr,
unsigned char *v_ptr, int y_stride, int uv_stride,
struct loop_filter_info *lfi) {
vp9_mbloop_filter_vertical_edge_c(y_ptr, y_stride,
void vp9_lpf_mbv_w_c(unsigned char *y_ptr, unsigned char *u_ptr,
unsigned char *v_ptr, int y_stride, int uv_stride,
struct loop_filter_info *lfi) {
vp9_mb_lpf_vertical_edge_w(y_ptr, y_stride,
lfi->mblim, lfi->lim, lfi->hev_thr, 2);
if (u_ptr)
@ -391,32 +704,10 @@ void vp9_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr,
vp9_mbloop_filter_vertical_edge_c(v_ptr, uv_stride,
lfi->mblim, lfi->lim, lfi->hev_thr, 1);
}
/* Vertical B Filtering */
void vp9_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr,
unsigned char *v_ptr, int y_stride, int uv_stride,
struct loop_filter_info *lfi) {
vp9_loop_filter_vertical_edge_c(y_ptr + 4, y_stride,
lfi->blim, lfi->lim, lfi->hev_thr, 2);
vp9_loop_filter_vertical_edge_c(y_ptr + 8, y_stride,
lfi->blim, lfi->lim, lfi->hev_thr, 2);
vp9_loop_filter_vertical_edge_c(y_ptr + 12, y_stride,
lfi->blim, lfi->lim, lfi->hev_thr, 2);
if (u_ptr)
vp9_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride,
lfi->blim, lfi->lim, lfi->hev_thr, 1);
if (v_ptr)
vp9_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride,
lfi->blim, lfi->lim, lfi->hev_thr, 1);
}
/* Horizontal MB filtering */
void vp9_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr,
void vp9_lpf_mbh_w_c(unsigned char *y_ptr, unsigned char *u_ptr,
unsigned char *v_ptr, int y_stride, int uv_stride,
struct loop_filter_info *lfi) {
vp9_mbloop_filter_horizontal_edge_c(y_ptr, y_stride,
vp9_mb_lpf_horizontal_edge_w(y_ptr, y_stride,
lfi->mblim, lfi->lim, lfi->hev_thr, 2);
if (u_ptr)
@ -428,53 +719,3 @@ void vp9_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr,
lfi->mblim, lfi->lim, lfi->hev_thr, 1);
}
/* Horizontal B Filtering */
void vp9_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr,
unsigned char *v_ptr, int y_stride, int uv_stride,
struct loop_filter_info *lfi) {
vp9_loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride,
lfi->blim, lfi->lim, lfi->hev_thr, 2);
vp9_loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride,
lfi->blim, lfi->lim, lfi->hev_thr, 2);
vp9_loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride,
lfi->blim, lfi->lim, lfi->hev_thr, 2);
if (u_ptr)
vp9_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride,
lfi->blim, lfi->lim, lfi->hev_thr, 1);
if (v_ptr)
vp9_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride,
lfi->blim, lfi->lim, lfi->hev_thr, 1);
}
void vp9_loop_filter_bh8x8_c(unsigned char *y_ptr, unsigned char *u_ptr,
unsigned char *v_ptr, int y_stride, int uv_stride,
struct loop_filter_info *lfi) {
vp9_mbloop_filter_horizontal_edge_c(
y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
}
void vp9_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride,
const unsigned char *blimit) {
vp9_loop_filter_simple_horizontal_edge_c(y_ptr + 4 * y_stride,
y_stride, blimit);
vp9_loop_filter_simple_horizontal_edge_c(y_ptr + 8 * y_stride,
y_stride, blimit);
vp9_loop_filter_simple_horizontal_edge_c(y_ptr + 12 * y_stride,
y_stride, blimit);
}
void vp9_loop_filter_bv8x8_c(unsigned char *y_ptr, unsigned char *u_ptr,
unsigned char *v_ptr, int y_stride, int uv_stride,
struct loop_filter_info *lfi) {
vp9_mbloop_filter_vertical_edge_c(
y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
}
void vp9_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride,
const unsigned char *blimit) {
vp9_loop_filter_simple_vertical_edge_c(y_ptr + 4, y_stride, blimit);
vp9_loop_filter_simple_vertical_edge_c(y_ptr + 8, y_stride, blimit);
vp9_loop_filter_simple_vertical_edge_c(y_ptr + 12, y_stride, blimit);
}

View File

@ -16,17 +16,13 @@ typedef enum {
DEST = 1
} BLOCKSET;
static void setup_block
(
BLOCKD *b,
int mv_stride,
unsigned char **base,
unsigned char **base2,
int Stride,
int offset,
BLOCKSET bs
) {
static void setup_block(BLOCKD *b,
int mv_stride,
uint8_t **base,
uint8_t **base2,
int Stride,
int offset,
BLOCKSET bs) {
if (bs == DEST) {
b->dst_stride = Stride;
b->dst = offset;
@ -37,15 +33,13 @@ static void setup_block
b->base_pre = base;
b->base_second_pre = base2;
}
}
static void setup_macroblock(MACROBLOCKD *xd, BLOCKSET bs) {
int block;
unsigned char **y, **u, **v;
unsigned char **y2 = NULL, **u2 = NULL, **v2 = NULL;
uint8_t **y, **u, **v;
uint8_t **y2 = NULL, **u2 = NULL, **v2 = NULL;
BLOCKD *blockd = xd->block;
int stride;
@ -117,7 +111,6 @@ void vp9_setup_block_dptrs(MACROBLOCKD *xd) {
}
void vp9_build_block_doffsets(MACROBLOCKD *xd) {
/* handle the destination pitch features */
setup_macroblock(xd, DEST);
setup_macroblock(xd, PRED);

View File

@ -8,9 +8,9 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP9_COMMON_VP9_MODECONT_H_
#define VP9_COMMON_VP9_MODECONT_H_
extern const int vp9_default_mode_contexts[INTER_MODE_CONTEXTS][4];
#endif
#endif // VP9_COMMON_VP9_MODECONT_H_

View File

@ -8,14 +8,14 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP9_COMMON_VP9_MV_H_
#define VP9_COMMON_VP9_MV_H_
#include "vpx/vpx_integer.h"
typedef struct {
short row;
short col;
int16_t row;
int16_t col;
} MV;
typedef union int_mv {
@ -23,4 +23,4 @@ typedef union int_mv {
MV as_mv;
} int_mv; /* facilitates faster equality tests and copies */
#endif
#endif // VP9_COMMON_VP9_MV_H_

View File

@ -17,14 +17,13 @@ static int mb_mv_ref_search[MVREF_NEIGHBOURS][2] = {
};
static int mb_ref_distance_weight[MVREF_NEIGHBOURS] =
{ 3, 3, 2, 1, 1, 1, 1, 1 };
#if CONFIG_SUPERBLOCKS
static int sb_mv_ref_search[MVREF_NEIGHBOURS][2] = {
{0, -1}, {-1, 0}, {1, -1}, {-1, 1},
{-1, -1}, {0, -2}, {-2, 0}, {-1, -2}
};
static int sb_ref_distance_weight[MVREF_NEIGHBOURS] =
{ 3, 3, 2, 2, 2, 1, 1, 1 };
#endif
// clamp_mv
#define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units
static void clamp_mv(const MACROBLOCKD *xd, int_mv *mv) {
@ -40,10 +39,29 @@ static void clamp_mv(const MACROBLOCKD *xd, int_mv *mv) {
mv->as_mv.row = xd->mb_to_bottom_edge + MV_BORDER;
}
// Gets a candidate refenence motion vector from the given mode info
// structure if one exists that matches the given reference frame.
static int get_matching_candidate(
const MODE_INFO *candidate_mi,
MV_REFERENCE_FRAME ref_frame,
int_mv *c_mv
) {
int ret_val = TRUE;
// Gets a best matching candidate refenence motion vector
// from the given mode info structure (if available)
static int get_candidate_mvref(
if (ref_frame == candidate_mi->mbmi.ref_frame) {
c_mv->as_int = candidate_mi->mbmi.mv[0].as_int;
} else if (ref_frame == candidate_mi->mbmi.second_ref_frame) {
c_mv->as_int = candidate_mi->mbmi.mv[1].as_int;
} else {
ret_val = FALSE;
}
return ret_val;
}
// Gets candidate refenence motion vector(s) from the given mode info
// structure if they exists and do NOT match the given reference frame.
static void get_non_matching_candidates(
const MODE_INFO *candidate_mi,
MV_REFERENCE_FRAME ref_frame,
MV_REFERENCE_FRAME *c_ref_frame,
@ -52,61 +70,29 @@ static int get_candidate_mvref(
int_mv *c2_mv
) {
int ret_val = FALSE;
c_mv->as_int = 0;
c2_mv->as_int = 0;
*c_ref_frame = INTRA_FRAME;
*c2_ref_frame = INTRA_FRAME;
// Target ref frame matches candidate first ref frame
if (ref_frame == candidate_mi->mbmi.ref_frame) {
c_mv->as_int = candidate_mi->mbmi.mv[0].as_int;
*c_ref_frame = ref_frame;
ret_val = TRUE;
// If first candidate not valid neither will be.
if (candidate_mi->mbmi.ref_frame > INTRA_FRAME) {
// First candidate
if (candidate_mi->mbmi.ref_frame != ref_frame) {
*c_ref_frame = candidate_mi->mbmi.ref_frame;
c_mv->as_int = candidate_mi->mbmi.mv[0].as_int;
}
// Is there a second non zero vector we can use.
// Second candidate
if ((candidate_mi->mbmi.second_ref_frame > INTRA_FRAME) &&
(candidate_mi->mbmi.mv[1].as_int != 0) &&
(candidate_mi->mbmi.mv[1].as_int != c_mv->as_int)) {
c2_mv->as_int = candidate_mi->mbmi.mv[1].as_int;
(candidate_mi->mbmi.second_ref_frame != ref_frame)) { // &&
// (candidate_mi->mbmi.mv[1].as_int != 0) &&
// (candidate_mi->mbmi.mv[1].as_int !=
// candidate_mi->mbmi.mv[0].as_int)) {
*c2_ref_frame = candidate_mi->mbmi.second_ref_frame;
}
// Target ref frame matches candidate second ref frame
} else if (ref_frame == candidate_mi->mbmi.second_ref_frame) {
c_mv->as_int = candidate_mi->mbmi.mv[1].as_int;
*c_ref_frame = ref_frame;
ret_val = TRUE;
// Is there a second non zero vector we can use.
if ((candidate_mi->mbmi.ref_frame > INTRA_FRAME) &&
(candidate_mi->mbmi.mv[0].as_int != 0) &&
(candidate_mi->mbmi.mv[0].as_int != c_mv->as_int)) {
c2_mv->as_int = candidate_mi->mbmi.mv[0].as_int;
*c2_ref_frame = candidate_mi->mbmi.ref_frame;
}
// No ref frame matches so use first ref mv as first choice
} else if (candidate_mi->mbmi.ref_frame > INTRA_FRAME) {
c_mv->as_int = candidate_mi->mbmi.mv[0].as_int;
*c_ref_frame = candidate_mi->mbmi.ref_frame;
ret_val = TRUE;
// Is there a second non zero vector we can use.
if ((candidate_mi->mbmi.second_ref_frame > INTRA_FRAME) &&
(candidate_mi->mbmi.mv[1].as_int != 0) &&
(candidate_mi->mbmi.mv[1].as_int != c_mv->as_int)) {
c2_mv->as_int = candidate_mi->mbmi.mv[1].as_int;
*c2_ref_frame = candidate_mi->mbmi.second_ref_frame;
}
// If only the second ref mv is valid:- (Should not trigger in current code
// base given current possible compound prediction options).
} else if (candidate_mi->mbmi.second_ref_frame > INTRA_FRAME) {
c_mv->as_int = candidate_mi->mbmi.mv[1].as_int;
*c_ref_frame = candidate_mi->mbmi.second_ref_frame;
ret_val = TRUE;
}
return ret_val;
}
// Performs mv adjustment based on reference frame and clamps the MV
@ -170,14 +156,20 @@ static void addmv_and_shuffle(
int weight
) {
int i = *index;
int i;
int insert_point;
int duplicate_found = FALSE;
// Check for duplicates. If there is one increment its score.
// Duplicate defined as being the same full pel vector with rounding.
// Check for duplicates. If there is one increase its score.
// We only compare vs the current top candidates.
insert_point = (*index < (MAX_MV_REF_CANDIDATES - 1))
? *index : (MAX_MV_REF_CANDIDATES - 1);
i = insert_point;
if (*index > i)
i++;
while (i > 0) {
i--;
if (candidate_mv.as_int == mv_list[i].as_int) {
duplicate_found = TRUE;
mv_scores[i] += weight;
@ -185,11 +177,13 @@ static void addmv_and_shuffle(
}
}
// If no duplicate was found add the new vector and give it a weight
if (!duplicate_found) {
mv_list[*index].as_int = candidate_mv.as_int;
mv_scores[*index] = weight;
i = *index;
// If no duplicate and the new candidate is good enough then add it.
if (!duplicate_found ) {
if (weight > mv_scores[insert_point]) {
mv_list[insert_point].as_int = candidate_mv.as_int;
mv_scores[insert_point] = weight;
i = insert_point;
}
(*index)++;
}
@ -224,38 +218,32 @@ void vp9_find_mv_refs(
int i;
MODE_INFO *candidate_mi;
MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
int_mv candidate_mvs[MAX_MV_REFS];
int_mv candidate_mvs[MAX_MV_REF_CANDIDATES];
int_mv c_refmv;
MV_REFERENCE_FRAME c_ref_frame;
int_mv c2_refmv;
MV_REFERENCE_FRAME c_ref_frame;
MV_REFERENCE_FRAME c2_ref_frame;
int candidate_scores[MAX_MV_REFS];
int candidate_scores[MAX_MV_REF_CANDIDATES];
int index = 0;
int split_count = 0;
int ref_weight = 0;
int valid_mv_ref;
int (*mv_ref_search)[2];
int *ref_distance_weight;
// Blank the reference vector lists and other local structures.
vpx_memset(mv_ref_list, 0, sizeof(int_mv) * MAX_MV_REFS);
vpx_memset(candidate_mvs, 0, sizeof(int_mv) * MAX_MV_REFS);
vpx_memset(mv_ref_list, 0, sizeof(int_mv) * MAX_MV_REF_CANDIDATES);
vpx_memset(candidate_mvs, 0, sizeof(int_mv) * MAX_MV_REF_CANDIDATES);
vpx_memset(candidate_scores, 0, sizeof(candidate_scores));
#if CONFIG_SUPERBLOCKS
if (mbmi->encoded_as_sb) {
if (mbmi->sb_type) {
mv_ref_search = sb_mv_ref_search;
ref_distance_weight = sb_ref_distance_weight;
} else {
mv_ref_search = mb_mv_ref_search;
ref_distance_weight = mb_ref_distance_weight;
}
#else
mv_ref_search = mb_mv_ref_search;
ref_distance_weight = mb_ref_distance_weight;
#endif
// Populate a list with candidate reference vectors from the
// spatial neighbours.
// We first scan for candidate vectors that match the current reference frame
// Look at nearest neigbours
for (i = 0; i < 2; ++i) {
if (((mv_ref_search[i][0] << 7) >= xd->mb_to_left_edge) &&
((mv_ref_search[i][1] << 7) >= xd->mb_to_top_edge)) {
@ -263,95 +251,89 @@ void vp9_find_mv_refs(
candidate_mi = here + mv_ref_search[i][0] +
(mv_ref_search[i][1] * xd->mode_info_stride);
valid_mv_ref = get_candidate_mvref(candidate_mi, ref_frame,
&c_ref_frame, &c_refmv,
&c2_ref_frame, &c2_refmv);
// If there is a valid MV candidate then add it to the list
if (valid_mv_ref) {
scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias );
ref_weight = ref_distance_weight[i] +
((c_ref_frame == ref_frame) << 4);
split_count += (candidate_mi->mbmi.mode == SPLITMV);
if (get_matching_candidate(candidate_mi, ref_frame, &c_refmv)) {
clamp_mv(xd, &c_refmv);
addmv_and_shuffle(candidate_mvs, candidate_scores,
&index, c_refmv, ref_weight);
// If there is a second valid mv then add it as well.
if (c2_ref_frame > INTRA_FRAME) {
scale_mv(xd, ref_frame, c2_ref_frame, &c2_refmv, ref_sign_bias );
ref_weight = ref_distance_weight[i] +
((c2_ref_frame == ref_frame) << 4);
addmv_and_shuffle(candidate_mvs, candidate_scores,
&index, c2_refmv, ref_weight);
}
&index, c_refmv, ref_distance_weight[i] + 16);
}
split_count += (candidate_mi->mbmi.mode == SPLITMV);
}
}
// Look at the corresponding vector in the last frame
// Look in the last frame
candidate_mi = lf_here;
valid_mv_ref = get_candidate_mvref(candidate_mi, ref_frame,
&c_ref_frame, &c_refmv,
&c2_ref_frame, &c2_refmv);
// If there is a valid MV candidate then add it to the list
if (valid_mv_ref) {
scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias );
ref_weight = 2 + ((c_ref_frame == ref_frame) << 4);
if (get_matching_candidate(candidate_mi, ref_frame, &c_refmv)) {
clamp_mv(xd, &c_refmv);
addmv_and_shuffle(candidate_mvs, candidate_scores,
&index, c_refmv, ref_weight);
// If there is a second valid mv then add it as well.
if (c2_ref_frame > INTRA_FRAME) {
scale_mv(xd, ref_frame, c2_ref_frame, &c2_refmv, ref_sign_bias );
ref_weight = ref_distance_weight[i] +
((c2_ref_frame == ref_frame) << 4);
addmv_and_shuffle(candidate_mvs, candidate_scores,
&index, c2_refmv, ref_weight);
}
&index, c_refmv, 18);
}
// Populate a list with candidate reference vectors from the
// spatial neighbours.
for (i = 2; (i < MVREF_NEIGHBOURS) && (index < (MAX_MV_REFS - 2)); ++i) {
// More distant neigbours
for (i = 2; (i < MVREF_NEIGHBOURS) &&
(index < (MAX_MV_REF_CANDIDATES - 1)); ++i) {
if (((mv_ref_search[i][0] << 7) >= xd->mb_to_left_edge) &&
((mv_ref_search[i][1] << 7) >= xd->mb_to_top_edge)) {
candidate_mi = here + mv_ref_search[i][0] +
(mv_ref_search[i][1] * xd->mode_info_stride);
valid_mv_ref = get_candidate_mvref(candidate_mi, ref_frame,
&c_ref_frame, &c_refmv,
&c2_ref_frame, &c2_refmv);
// If there is a valid MV candidate then add it to the list
if (valid_mv_ref) {
scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias );
ref_weight = ref_distance_weight[i] +
((c_ref_frame == ref_frame) << 4);
if (get_matching_candidate(candidate_mi, ref_frame, &c_refmv)) {
clamp_mv(xd, &c_refmv);
addmv_and_shuffle(candidate_mvs, candidate_scores,
&index, c_refmv, ref_weight);
// If there is a second valid mv then add it as well.
if (c2_ref_frame > INTRA_FRAME) {
scale_mv(xd, ref_frame, c2_ref_frame, &c2_refmv, ref_sign_bias );
ref_weight = ref_distance_weight[i] +
((c2_ref_frame == ref_frame) << 4);
addmv_and_shuffle(candidate_mvs, candidate_scores,
&index, c2_refmv, ref_weight);
}
&index, c_refmv, ref_distance_weight[i] + 16);
}
}
}
// Make sure we are able to add 0,0
if (index > (MAX_MV_REFS - 1)) {
index = (MAX_MV_REFS - 1);
// If we have not found enough candidates consider ones where the
// reference frame does not match. Break out when we have
// MAX_MV_REF_CANDIDATES candidates.
// Look first at spatial neighbours
if (index < (MAX_MV_REF_CANDIDATES - 1)) {
for (i = 0; i < MVREF_NEIGHBOURS; ++i) {
if (((mv_ref_search[i][0] << 7) >= xd->mb_to_left_edge) &&
((mv_ref_search[i][1] << 7) >= xd->mb_to_top_edge)) {
candidate_mi = here + mv_ref_search[i][0] +
(mv_ref_search[i][1] * xd->mode_info_stride);
get_non_matching_candidates(candidate_mi, ref_frame,
&c_ref_frame, &c_refmv,
&c2_ref_frame, &c2_refmv);
if (c_ref_frame != INTRA_FRAME) {
scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias);
addmv_and_shuffle(candidate_mvs, candidate_scores,
&index, c_refmv, ref_distance_weight[i]);
}
if (c2_ref_frame != INTRA_FRAME) {
scale_mv(xd, ref_frame, c2_ref_frame, &c2_refmv, ref_sign_bias);
addmv_and_shuffle(candidate_mvs, candidate_scores,
&index, c2_refmv, ref_distance_weight[i]);
}
}
if (index >= (MAX_MV_REF_CANDIDATES - 1)) {
break;
}
}
}
// Look at the last frame
if (index < (MAX_MV_REF_CANDIDATES - 1)) {
candidate_mi = lf_here;
get_non_matching_candidates(candidate_mi, ref_frame,
&c_ref_frame, &c_refmv,
&c2_ref_frame, &c2_refmv);
if (c_ref_frame != INTRA_FRAME) {
scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias);
addmv_and_shuffle(candidate_mvs, candidate_scores,
&index, c_refmv, 2);
}
if (c2_ref_frame != INTRA_FRAME) {
scale_mv(xd, ref_frame, c2_ref_frame, &c2_refmv, ref_sign_bias);
addmv_and_shuffle(candidate_mvs, candidate_scores,
&index, c2_refmv, 2);
}
}
// Define inter mode coding context.
@ -383,14 +365,12 @@ void vp9_find_mv_refs(
}
// 0,0 is always a valid reference.
for (i = 0; i < index; ++i) {
for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) {
if (candidate_mvs[i].as_int == 0)
break;
}
if (i == index) {
c_refmv.as_int = 0;
addmv_and_shuffle(candidate_mvs, candidate_scores,
&index, c_refmv, candidate_scores[3]+1 );
if (i == MAX_MV_REF_CANDIDATES) {
candidate_mvs[MAX_MV_REF_CANDIDATES-1].as_int = 0;
}
// Copy over the candidate list.

View File

@ -11,18 +11,14 @@
#include "vp9/common/vp9_onyxc_int.h"
#include "vp9/common/vp9_blockd.h"
#ifndef VP9_COMMON_VP9_MVREF_COMMON_H_
#define VP9_COMMON_VP9_MVREF_COMMON_H_
void vp9_find_mv_refs(
MACROBLOCKD *xd,
MODE_INFO *here,
MODE_INFO *lf_here,
MV_REFERENCE_FRAME ref_frame,
int_mv * mv_ref_list,
int *ref_sign_bias
);
#endif
void vp9_find_mv_refs(MACROBLOCKD *xd,
MODE_INFO *here,
MODE_INFO *lf_here,
MV_REFERENCE_FRAME ref_frame,
int_mv *mv_ref_list,
int *ref_sign_bias);
#endif // VP9_COMMON_VP9_MVREF_COMMON_H_

View File

@ -8,7 +8,6 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP9_COMMON_VP9_ONYX_H_
#define VP9_COMMON_VP9_ONYX_H_
@ -20,7 +19,6 @@ extern "C"
#include "vpx/internal/vpx_codec_internal.h"
#include "vpx/vp8cx.h"
#include "vpx_scale/yv12config.h"
#include "vp9/common/vp9_type_aliases.h"
#include "vp9/common/vp9_ppflags.h"
typedef int *VP9_PTR;
@ -222,4 +220,4 @@ extern "C"
}
#endif
#endif // __INC_ONYX_H
#endif // VP9_COMMON_VP9_ONYX_H_

View File

@ -8,7 +8,6 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP9_COMMON_VP9_ONYXC_INT_H_
#define VP9_COMMON_VP9_ONYXC_INT_H_
@ -45,70 +44,51 @@ void vp9_initialize_common(void);
typedef struct frame_contexts {
vp9_prob bmode_prob[VP9_NKF_BINTRAMODES - 1];
vp9_prob ymode_prob[VP9_YMODES - 1]; /* interframe intra mode probs */
#if CONFIG_SUPERBLOCKS
vp9_prob sb_ymode_prob[VP9_I32X32_MODES - 1];
#endif
vp9_prob uv_mode_prob[VP9_YMODES][VP9_UV_MODES - 1];
vp9_prob i8x8_mode_prob[VP9_I8X8_MODES - 1];
vp9_prob sub_mv_ref_prob[SUBMVREF_COUNT][VP9_SUBMVREFS - 1];
vp9_prob mbsplit_prob[VP9_NUMMBSPLITS - 1];
vp9_prob coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
vp9_prob hybrid_coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
vp9_prob coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
vp9_prob hybrid_coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
vp9_prob coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
vp9_prob hybrid_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
vp9_coeff_probs coef_probs_4x4[BLOCK_TYPES_4X4];
vp9_coeff_probs hybrid_coef_probs_4x4[BLOCK_TYPES_4X4];
vp9_coeff_probs coef_probs_8x8[BLOCK_TYPES_8X8];
vp9_coeff_probs hybrid_coef_probs_8x8[BLOCK_TYPES_8X8];
vp9_coeff_probs coef_probs_16x16[BLOCK_TYPES_16X16];
vp9_coeff_probs hybrid_coef_probs_16x16[BLOCK_TYPES_16X16];
vp9_coeff_probs coef_probs_32x32[BLOCK_TYPES_32X32];
nmv_context nmvc;
nmv_context pre_nmvc;
vp9_prob pre_bmode_prob[VP9_NKF_BINTRAMODES - 1];
vp9_prob pre_ymode_prob[VP9_YMODES - 1]; /* interframe intra mode probs */
#if CONFIG_SUPERBLOCKS
vp9_prob pre_sb_ymode_prob[VP9_I32X32_MODES - 1];
#endif
vp9_prob pre_uv_mode_prob[VP9_YMODES][VP9_UV_MODES - 1];
vp9_prob pre_i8x8_mode_prob[VP9_I8X8_MODES - 1];
vp9_prob pre_sub_mv_ref_prob[SUBMVREF_COUNT][VP9_SUBMVREFS - 1];
vp9_prob pre_mbsplit_prob[VP9_NUMMBSPLITS - 1];
unsigned int bmode_counts[VP9_NKF_BINTRAMODES];
unsigned int ymode_counts[VP9_YMODES]; /* interframe intra mode probs */
#if CONFIG_SUPERBLOCKS
unsigned int sb_ymode_counts[VP9_I32X32_MODES];
#endif
unsigned int uv_mode_counts[VP9_YMODES][VP9_UV_MODES];
unsigned int i8x8_mode_counts[VP9_I8X8_MODES]; /* interframe intra probs */
unsigned int sub_mv_ref_counts[SUBMVREF_COUNT][VP9_SUBMVREFS];
unsigned int mbsplit_counts[VP9_NUMMBSPLITS];
vp9_prob pre_coef_probs [BLOCK_TYPES] [COEF_BANDS]
[PREV_COEF_CONTEXTS] [ENTROPY_NODES];
vp9_prob pre_hybrid_coef_probs [BLOCK_TYPES] [COEF_BANDS]
[PREV_COEF_CONTEXTS] [ENTROPY_NODES];
vp9_coeff_probs pre_coef_probs_4x4[BLOCK_TYPES_4X4];
vp9_coeff_probs pre_hybrid_coef_probs_4x4[BLOCK_TYPES_4X4];
vp9_coeff_probs pre_coef_probs_8x8[BLOCK_TYPES_8X8];
vp9_coeff_probs pre_hybrid_coef_probs_8x8[BLOCK_TYPES_8X8];
vp9_coeff_probs pre_coef_probs_16x16[BLOCK_TYPES_16X16];
vp9_coeff_probs pre_hybrid_coef_probs_16x16[BLOCK_TYPES_16X16];
vp9_coeff_probs pre_coef_probs_32x32[BLOCK_TYPES_32X32];
vp9_prob pre_coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS]
[PREV_COEF_CONTEXTS] [ENTROPY_NODES];
vp9_prob pre_hybrid_coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS]
[PREV_COEF_CONTEXTS] [ENTROPY_NODES];
vp9_prob pre_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS]
[PREV_COEF_CONTEXTS] [ENTROPY_NODES];
vp9_prob pre_hybrid_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS]
[PREV_COEF_CONTEXTS] [ENTROPY_NODES];
unsigned int coef_counts [BLOCK_TYPES] [COEF_BANDS]
[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
unsigned int hybrid_coef_counts [BLOCK_TYPES] [COEF_BANDS]
[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
unsigned int coef_counts_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS]
[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
unsigned int hybrid_coef_counts_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS]
[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
unsigned int coef_counts_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS]
[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
unsigned int hybrid_coef_counts_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS]
[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
vp9_coeff_count coef_counts_4x4[BLOCK_TYPES_4X4];
vp9_coeff_count hybrid_coef_counts_4x4[BLOCK_TYPES_4X4];
vp9_coeff_count coef_counts_8x8[BLOCK_TYPES_8X8];
vp9_coeff_count hybrid_coef_counts_8x8[BLOCK_TYPES_8X8];
vp9_coeff_count coef_counts_16x16[BLOCK_TYPES_16X16];
vp9_coeff_count hybrid_coef_counts_16x16[BLOCK_TYPES_16X16];
vp9_coeff_count coef_counts_32x32[BLOCK_TYPES_32X32];
nmv_context_counts NMVcount;
vp9_prob switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1]
@ -139,16 +119,17 @@ typedef enum {
ONLY_4X4 = 0,
ALLOW_8X8 = 1,
ALLOW_16X16 = 2,
TX_MODE_SELECT = 3,
NB_TXFM_MODES = 4,
ALLOW_32X32 = 3,
TX_MODE_SELECT = 4,
NB_TXFM_MODES = 5,
} TXFM_MODE;
typedef struct VP9Common {
struct vpx_internal_error_info error;
DECLARE_ALIGNED(16, short, Y1dequant[QINDEX_RANGE][16]);
DECLARE_ALIGNED(16, short, Y2dequant[QINDEX_RANGE][16]);
DECLARE_ALIGNED(16, short, UVdequant[QINDEX_RANGE][16]);
DECLARE_ALIGNED(16, int16_t, Y1dequant[QINDEX_RANGE][16]);
DECLARE_ALIGNED(16, int16_t, Y2dequant[QINDEX_RANGE][16]);
DECLARE_ALIGNED(16, int16_t, UVdequant[QINDEX_RANGE][16]);
int Width;
int Height;
@ -234,7 +215,7 @@ typedef struct VP9Common {
/* Y,U,V,Y2 */
ENTROPY_CONTEXT_PLANES *above_context; /* row of context for each plane */
ENTROPY_CONTEXT_PLANES left_context[2]; /* (up to) 4 contexts "" */
ENTROPY_CONTEXT_PLANES left_context[4]; /* (up to) 4 contexts "" */
/* keyframe block modes are predicted by their above, left neighbors */
@ -242,9 +223,7 @@ typedef struct VP9Common {
[VP9_KF_BINTRAMODES]
[VP9_KF_BINTRAMODES - 1];
vp9_prob kf_ymode_prob[8][VP9_YMODES - 1]; /* keyframe "" */
#if CONFIG_SUPERBLOCKS
vp9_prob sb_kf_ymode_prob[8][VP9_I32X32_MODES - 1];
#endif
int kf_ymode_probs_index;
int kf_ymode_probs_update;
vp9_prob kf_uv_mode_prob[VP9_YMODES] [VP9_UV_MODES - 1];
@ -252,9 +231,8 @@ typedef struct VP9Common {
vp9_prob prob_intra_coded;
vp9_prob prob_last_coded;
vp9_prob prob_gf_coded;
#if CONFIG_SUPERBLOCKS
vp9_prob sb_coded;
#endif
vp9_prob sb32_coded;
vp9_prob sb64_coded;
// Context probabilities when using predictive coding of segment id
vp9_prob segment_pred_probs[PREDICTION_PROBS];
@ -268,7 +246,7 @@ typedef struct VP9Common {
vp9_prob prob_comppred[COMP_PRED_CONTEXTS];
// FIXME contextualize
vp9_prob prob_tx[TX_SIZE_MAX - 1];
vp9_prob prob_tx[TX_SIZE_MAX_SB - 1];
vp9_prob mbskip_pred_probs[MBSKIP_CONTEXTS];
@ -290,17 +268,10 @@ typedef struct VP9Common {
struct postproc_state postproc_state;
#endif
#if CONFIG_PRED_FILTER
/* Prediction filter variables */
int pred_filter_mode; // 0=disabled at the frame level (no MB filtered)
// 1=enabled at the frame level (all MB filtered)
// 2=specified per MB (1=filtered, 0=non-filtered)
vp9_prob prob_pred_filter_off;
#endif
#if CONFIG_COMP_INTERINTRA_PRED
int use_interintra;
#endif
} VP9_COMMON;
#endif // __INC_ONYX_INT_H
#endif // VP9_COMMON_VP9_ONYXC_INT_H_

View File

@ -13,7 +13,7 @@
#include "vpx_scale/yv12config.h"
#include "vp9/common/vp9_postproc.h"
#include "vp9/common/vp9_textblit.h"
#include "vpx_scale/vpxscale.h"
#include "vpx_scale/vpx_scale.h"
#include "vp9/common/vp9_systemdependent.h"
#include "./vp9_rtcd.h"
#include "./vpx_scale_rtcd.h"
@ -132,20 +132,20 @@ const short vp9_rv[] = {
/****************************************************************************
*/
void vp9_post_proc_down_and_across_c(unsigned char *src_ptr,
unsigned char *dst_ptr,
void vp9_post_proc_down_and_across_c(uint8_t *src_ptr,
uint8_t *dst_ptr,
int src_pixels_per_line,
int dst_pixels_per_line,
int rows,
int cols,
int flimit) {
unsigned char *p_src, *p_dst;
uint8_t *p_src, *p_dst;
int row;
int col;
int i;
int v;
int pitch = src_pixels_per_line;
unsigned char d[8];
uint8_t d[8];
(void)dst_pixels_per_line;
for (row = 0; row < rows; row++) {
@ -215,12 +215,12 @@ static int q2mbl(int x) {
return x * x / 3;
}
void vp9_mbpost_proc_across_ip_c(unsigned char *src, int pitch,
void vp9_mbpost_proc_across_ip_c(uint8_t *src, int pitch,
int rows, int cols, int flimit) {
int r, c, i;
unsigned char *s = src;
unsigned char d[16];
uint8_t *s = src;
uint8_t d[16];
for (r = 0; r < rows; r++) {
@ -253,16 +253,16 @@ void vp9_mbpost_proc_across_ip_c(unsigned char *src, int pitch,
}
}
void vp9_mbpost_proc_down_c(unsigned char *dst, int pitch,
void vp9_mbpost_proc_down_c(uint8_t *dst, int pitch,
int rows, int cols, int flimit) {
int r, c, i;
const short *rv3 = &vp9_rv[63 & rand()];
for (c = 0; c < cols; c++) {
unsigned char *s = &dst[c];
uint8_t *s = &dst[c];
int sumsq = 0;
int sum = 0;
unsigned char d[16];
uint8_t d[16];
const short *rv2 = rv3 + ((c * 17) & 127);
for (i = -8; i <= 6; i++) {
@ -439,7 +439,7 @@ static void fillrd(struct postproc_state *state, int q, int a) {
* SPECIAL NOTES : None.
*
****************************************************************************/
void vp9_plane_add_noise_c(unsigned char *Start, char *noise,
void vp9_plane_add_noise_c(uint8_t *Start, char *noise,
char blackclamp[16],
char whiteclamp[16],
char bothclamp[16],
@ -447,7 +447,7 @@ void vp9_plane_add_noise_c(unsigned char *Start, char *noise,
unsigned int i, j;
for (i = 0; i < Height; i++) {
unsigned char *Pos = Start + i * Pitch;
uint8_t *Pos = Start + i * Pitch;
char *Ref = (char *)(noise + (rand() & 0xff));
for (j = 0; j < Width; j++) {
@ -466,7 +466,7 @@ void vp9_plane_add_noise_c(unsigned char *Start, char *noise,
* edges unblended to give distinction to macro blocks in areas
* filled with the same color block.
*/
void vp9_blend_mb_inner_c(unsigned char *y, unsigned char *u, unsigned char *v,
void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v,
int y1, int u1, int v1, int alpha, int stride) {
int i, j;
int y1_const = y1 * ((1 << 16) - alpha);
@ -499,7 +499,7 @@ void vp9_blend_mb_inner_c(unsigned char *y, unsigned char *u, unsigned char *v,
/* Blend only the edge of the macro block. Leave center
* unblended to allow for other visualizations to be layered.
*/
void vp9_blend_mb_outer_c(unsigned char *y, unsigned char *u, unsigned char *v,
void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v,
int y1, int u1, int v1, int alpha, int stride) {
int i, j;
int y1_const = y1 * ((1 << 16) - alpha);
@ -554,7 +554,7 @@ void vp9_blend_mb_outer_c(unsigned char *y, unsigned char *u, unsigned char *v,
}
}
void vp9_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v,
void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v,
int y1, int u1, int v1, int alpha, int stride) {
int i, j;
int y1_const = y1 * ((1 << 16) - alpha);
@ -688,7 +688,7 @@ int vp9_post_proc_frame(VP9_COMMON *oci, YV12_BUFFER_CONFIG *dest,
if (flags & VP9D_DEBUG_TXT_MBLK_MODES) {
int i, j;
unsigned char *y_ptr;
uint8_t *y_ptr;
YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
int mb_rows = post->y_height >> 4;
int mb_cols = post->y_width >> 4;
@ -717,7 +717,7 @@ int vp9_post_proc_frame(VP9_COMMON *oci, YV12_BUFFER_CONFIG *dest,
if (flags & VP9D_DEBUG_TXT_DC_DIFF) {
int i, j;
unsigned char *y_ptr;
uint8_t *y_ptr;
YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
int mb_rows = post->y_height >> 4;
int mb_cols = post->y_width >> 4;
@ -764,7 +764,7 @@ int vp9_post_proc_frame(VP9_COMMON *oci, YV12_BUFFER_CONFIG *dest,
YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
int width = post->y_width;
int height = post->y_height;
unsigned char *y_buffer = oci->post_proc_buffer.y_buffer;
uint8_t *y_buffer = oci->post_proc_buffer.y_buffer;
int y_stride = oci->post_proc_buffer.y_stride;
MODE_INFO *mi = oci->mi;
int x0, y0;
@ -906,9 +906,9 @@ int vp9_post_proc_frame(VP9_COMMON *oci, YV12_BUFFER_CONFIG *dest,
YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
int width = post->y_width;
int height = post->y_height;
unsigned char *y_ptr = oci->post_proc_buffer.y_buffer;
unsigned char *u_ptr = oci->post_proc_buffer.u_buffer;
unsigned char *v_ptr = oci->post_proc_buffer.v_buffer;
uint8_t *y_ptr = oci->post_proc_buffer.y_buffer;
uint8_t *u_ptr = oci->post_proc_buffer.u_buffer;
uint8_t *v_ptr = oci->post_proc_buffer.v_buffer;
int y_stride = oci->post_proc_buffer.y_stride;
MODE_INFO *mi = oci->mi;
@ -920,7 +920,7 @@ int vp9_post_proc_frame(VP9_COMMON *oci, YV12_BUFFER_CONFIG *dest,
((ppflags->display_mb_modes_flag & B_PRED) ||
ppflags->display_b_modes_flag)) {
int by, bx;
unsigned char *yl, *ul, *vl;
uint8_t *yl, *ul, *vl;
union b_mode_info *bmi = mi->bmi;
yl = y_ptr + x;
@ -971,9 +971,9 @@ int vp9_post_proc_frame(VP9_COMMON *oci, YV12_BUFFER_CONFIG *dest,
YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
int width = post->y_width;
int height = post->y_height;
unsigned char *y_ptr = oci->post_proc_buffer.y_buffer;
unsigned char *u_ptr = oci->post_proc_buffer.u_buffer;
unsigned char *v_ptr = oci->post_proc_buffer.v_buffer;
uint8_t *y_ptr = oci->post_proc_buffer.y_buffer;
uint8_t *u_ptr = oci->post_proc_buffer.u_buffer;
uint8_t *v_ptr = oci->post_proc_buffer.v_buffer;
int y_stride = oci->post_proc_buffer.y_stride;
MODE_INFO *mi = oci->mi;

View File

@ -38,4 +38,5 @@ void vp9_deblock(YV12_BUFFER_CONFIG *source,
int q,
int low_var_thresh,
int flag);
#endif
#endif // VP9_COMMON_VP9_POSTPROC_H_

View File

@ -8,9 +8,9 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP9_COMMON_VP9_PPFLAGS_H_
#define VP9_COMMON_VP9_PPFLAGS_H_
enum {
VP9D_NOFILTERING = 0,
VP9D_DEBLOCK = 1 << 0,
@ -35,4 +35,4 @@ typedef struct {
int display_mv_flag;
} vp9_ppflags_t;
#endif
#endif // VP9_COMMON_VP9_PPFLAGS_H_

View File

@ -8,8 +8,8 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP9_COMMON_VP9_PRAGMAS_H_
#define VP9_COMMON_VP9_PRAGMAS_H_
#ifdef __INTEL_COMPILER
#pragma warning(disable:997 1011 170)
@ -17,3 +17,5 @@
#ifdef _MSC_VER
#pragma warning(disable:4799)
#endif
#endif // VP9_COMMON_VP9_PRAGMAS_H_

View File

@ -9,8 +9,10 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_pred_common.h"
#include "vp9/common/vp9_seg_common.h"
#include "vp9/common/vp9_treecoder.h"
// TBD prediction functions for various bitstream signals
@ -221,54 +223,57 @@ unsigned char vp9_get_pred_flag(const MACROBLOCKD *const xd,
void vp9_set_pred_flag(MACROBLOCKD *const xd,
PRED_ID pred_id,
unsigned char pred_flag) {
#if CONFIG_SUPERBLOCKS
const int mis = xd->mode_info_stride;
#endif
switch (pred_id) {
case PRED_SEG_ID:
xd->mode_info_context->mbmi.seg_id_predicted = pred_flag;
#if CONFIG_SUPERBLOCKS
if (xd->mode_info_context->mbmi.encoded_as_sb) {
if (xd->mb_to_right_edge >= 0)
xd->mode_info_context[1].mbmi.seg_id_predicted = pred_flag;
if (xd->mb_to_bottom_edge >= 0) {
xd->mode_info_context[mis].mbmi.seg_id_predicted = pred_flag;
if (xd->mb_to_right_edge >= 0)
xd->mode_info_context[mis + 1].mbmi.seg_id_predicted = pred_flag;
if (xd->mode_info_context->mbmi.sb_type) {
#define sub(a, b) (b) < 0 ? (a) + (b) : (a)
const int n_mbs = 1 << xd->mode_info_context->mbmi.sb_type;
const int x_mbs = sub(n_mbs, xd->mb_to_right_edge >> 7);
const int y_mbs = sub(n_mbs, xd->mb_to_bottom_edge >> 7);
int x, y;
for (y = 0; y < y_mbs; y++) {
for (x = !y; x < x_mbs; x++) {
xd->mode_info_context[y * mis + x].mbmi.seg_id_predicted =
pred_flag;
}
}
}
#endif
break;
case PRED_REF:
xd->mode_info_context->mbmi.ref_predicted = pred_flag;
#if CONFIG_SUPERBLOCKS
if (xd->mode_info_context->mbmi.encoded_as_sb) {
if (xd->mb_to_right_edge >= 0)
xd->mode_info_context[1].mbmi.ref_predicted = pred_flag;
if (xd->mb_to_bottom_edge >= 0) {
xd->mode_info_context[mis].mbmi.ref_predicted = pred_flag;
if (xd->mb_to_right_edge >= 0)
xd->mode_info_context[mis + 1].mbmi.ref_predicted = pred_flag;
if (xd->mode_info_context->mbmi.sb_type) {
const int n_mbs = 1 << xd->mode_info_context->mbmi.sb_type;
const int x_mbs = sub(n_mbs, xd->mb_to_right_edge >> 7);
const int y_mbs = sub(n_mbs, xd->mb_to_bottom_edge >> 7);
int x, y;
for (y = 0; y < y_mbs; y++) {
for (x = !y; x < x_mbs; x++) {
xd->mode_info_context[y * mis + x].mbmi.ref_predicted = pred_flag;
}
}
}
#endif
break;
case PRED_MBSKIP:
xd->mode_info_context->mbmi.mb_skip_coeff = pred_flag;
#if CONFIG_SUPERBLOCKS
if (xd->mode_info_context->mbmi.encoded_as_sb) {
if (xd->mb_to_right_edge >= 0)
xd->mode_info_context[1].mbmi.mb_skip_coeff = pred_flag;
if (xd->mb_to_bottom_edge >= 0) {
xd->mode_info_context[mis].mbmi.mb_skip_coeff = pred_flag;
if (xd->mb_to_right_edge >= 0)
xd->mode_info_context[mis + 1].mbmi.mb_skip_coeff = pred_flag;
if (xd->mode_info_context->mbmi.sb_type) {
const int n_mbs = 1 << xd->mode_info_context->mbmi.sb_type;
const int x_mbs = sub(n_mbs, xd->mb_to_right_edge >> 7);
const int y_mbs = sub(n_mbs, xd->mb_to_bottom_edge >> 7);
int x, y;
for (y = 0; y < y_mbs; y++) {
for (x = !y; x < x_mbs; x++) {
xd->mode_info_context[y * mis + x].mbmi.mb_skip_coeff = pred_flag;
}
}
}
#endif
break;
default:
@ -286,25 +291,25 @@ unsigned char vp9_get_pred_mb_segid(const VP9_COMMON *const cm,
const MACROBLOCKD *const xd, int MbIndex) {
// Currently the prediction for the macroblock segment ID is
// the value stored for this macroblock in the previous frame.
#if CONFIG_SUPERBLOCKS
if (!xd->mode_info_context->mbmi.encoded_as_sb) {
#endif
if (!xd->mode_info_context->mbmi.sb_type) {
return cm->last_frame_seg_map[MbIndex];
#if CONFIG_SUPERBLOCKS
} else {
int seg_id = cm->last_frame_seg_map[MbIndex];
int mb_col = MbIndex % cm->mb_cols;
int mb_row = MbIndex / cm->mb_cols;
if (mb_col + 1 < cm->mb_cols)
seg_id = seg_id && cm->last_frame_seg_map[MbIndex + 1];
if (mb_row + 1 < cm->mb_rows) {
seg_id = seg_id && cm->last_frame_seg_map[MbIndex + cm->mb_cols];
if (mb_col + 1 < cm->mb_cols)
seg_id = seg_id && cm->last_frame_seg_map[MbIndex + cm->mb_cols + 1];
const int n_mbs = 1 << xd->mode_info_context->mbmi.sb_type;
const int mb_col = MbIndex % cm->mb_cols;
const int mb_row = MbIndex / cm->mb_cols;
const int x_mbs = MIN(n_mbs, cm->mb_cols - mb_col);
const int y_mbs = MIN(n_mbs, cm->mb_rows - mb_row);
int x, y;
unsigned seg_id = -1;
for (y = mb_row; y < mb_row + y_mbs; y++) {
for (x = mb_col; x < mb_col + x_mbs; x++) {
seg_id = MIN(seg_id, cm->last_frame_seg_map[cm->mb_cols * y + x]);
}
}
return seg_id;
}
#endif
}
MV_REFERENCE_FRAME vp9_get_pred_ref(const VP9_COMMON *const cm,
@ -383,26 +388,13 @@ void vp9_calc_ref_probs(int *count, vp9_prob *probs) {
int tot_count;
tot_count = count[0] + count[1] + count[2] + count[3];
if (tot_count) {
probs[0] = (vp9_prob)((count[0] * 255 + (tot_count >> 1)) / tot_count);
probs[0] += !probs[0];
} else
probs[0] = 128;
probs[0] = get_prob(count[0], tot_count);
tot_count -= count[0];
if (tot_count) {
probs[1] = (vp9_prob)((count[1] * 255 + (tot_count >> 1)) / tot_count);
probs[1] += !probs[1];
} else
probs[1] = 128;
probs[1] = get_prob(count[1], tot_count);
tot_count -= count[1];
if (tot_count) {
probs[2] = (vp9_prob)((count[2] * 255 + (tot_count >> 1)) / tot_count);
probs[2] += !probs[2];
} else
probs[2] = 128;
probs[2] = get_prob(count[2], tot_count);
}
// Computes a set of modified conditional probabilities for the reference frame

View File

@ -8,7 +8,6 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vp9/common/vp9_type_aliases.h"
#include "vp9/common/vp9_onyxc_int.h"
#include "vp9/common/vp9_blockd.h"
@ -53,4 +52,4 @@ extern MV_REFERENCE_FRAME vp9_get_pred_ref(const VP9_COMMON *const cm,
const MACROBLOCKD *const xd);
extern void vp9_compute_mod_refprobs(VP9_COMMON *const cm);
#endif /* __INC_PRED_COMMON_H__ */
#endif // VP9_COMMON_VP9_PRED_COMMON_H_

View File

@ -8,15 +8,19 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP9_COMMON_VP9_QUANT_COMMON_H_
#define VP9_COMMON_VP9_QUANT_COMMON_H_
#include "string.h"
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_onyxc_int.h"
extern void vp9_init_quant_tables();
extern void vp9_init_quant_tables(void);
extern int vp9_ac_yquant(int QIndex);
extern int vp9_dc_quant(int QIndex, int Delta);
extern int vp9_dc2quant(int QIndex, int Delta);
extern int vp9_ac2quant(int QIndex, int Delta);
extern int vp9_dc_uv_quant(int QIndex, int Delta);
extern int vp9_ac_uv_quant(int QIndex, int Delta);
#endif // VP9_COMMON_VP9_QUANT_COMMON_H_

View File

@ -13,26 +13,15 @@
#include "vp9_rtcd.h"
#include "vp9/common/vp9_blockd.h"
void vp9_recon_b_c
(
unsigned char *pred_ptr,
short *diff_ptr,
unsigned char *dst_ptr,
int stride
) {
void vp9_recon_b_c(uint8_t *pred_ptr,
int16_t *diff_ptr,
uint8_t *dst_ptr,
int stride) {
int r, c;
for (r = 0; r < 4; r++) {
for (c = 0; c < 4; c++) {
int a = diff_ptr[c] + pred_ptr[c];
if (a < 0)
a = 0;
if (a > 255)
a = 255;
dst_ptr[c] = (unsigned char) a;
dst_ptr[c] = clip_pixel(diff_ptr[c] + pred_ptr[c]);
}
dst_ptr += stride;
@ -41,26 +30,15 @@ void vp9_recon_b_c
}
}
void vp9_recon_uv_b_c
(
unsigned char *pred_ptr,
short *diff_ptr,
unsigned char *dst_ptr,
int stride
) {
void vp9_recon_uv_b_c(uint8_t *pred_ptr,
int16_t *diff_ptr,
uint8_t *dst_ptr,
int stride) {
int r, c;
for (r = 0; r < 4; r++) {
for (c = 0; c < 4; c++) {
int a = diff_ptr[c] + pred_ptr[c];
if (a < 0)
a = 0;
if (a > 255)
a = 255;
dst_ptr[c] = (unsigned char) a;
dst_ptr[c] = clip_pixel(diff_ptr[c] + pred_ptr[c]);
}
dst_ptr += stride;
@ -68,26 +46,16 @@ void vp9_recon_uv_b_c
pred_ptr += 8;
}
}
void vp9_recon4b_c
(
unsigned char *pred_ptr,
short *diff_ptr,
unsigned char *dst_ptr,
int stride
) {
void vp9_recon4b_c(uint8_t *pred_ptr,
int16_t *diff_ptr,
uint8_t *dst_ptr,
int stride) {
int r, c;
for (r = 0; r < 4; r++) {
for (c = 0; c < 16; c++) {
int a = diff_ptr[c] + pred_ptr[c];
if (a < 0)
a = 0;
if (a > 255)
a = 255;
dst_ptr[c] = (unsigned char) a;
dst_ptr[c] = clip_pixel(diff_ptr[c] + pred_ptr[c]);
}
dst_ptr += stride;
@ -96,26 +64,15 @@ void vp9_recon4b_c
}
}
void vp9_recon2b_c
(
unsigned char *pred_ptr,
short *diff_ptr,
unsigned char *dst_ptr,
int stride
) {
void vp9_recon2b_c(uint8_t *pred_ptr,
int16_t *diff_ptr,
uint8_t *dst_ptr,
int stride) {
int r, c;
for (r = 0; r < 4; r++) {
for (c = 0; c < 8; c++) {
int a = diff_ptr[c] + pred_ptr[c];
if (a < 0)
a = 0;
if (a > 255)
a = 255;
dst_ptr[c] = (unsigned char) a;
dst_ptr[c] = clip_pixel(diff_ptr[c] + pred_ptr[c]);
}
dst_ptr += stride;
@ -124,21 +81,15 @@ void vp9_recon2b_c
}
}
#if CONFIG_SUPERBLOCKS
void vp9_recon_mby_s_c(MACROBLOCKD *xd, uint8_t *dst) {
int x, y;
BLOCKD *b = &xd->block[0];
int stride = b->dst_stride;
short *diff = b->diff;
int16_t *diff = b->diff;
for (y = 0; y < 16; y++) {
for (x = 0; x < 16; x++) {
int a = dst[x] + diff[x];
if (a < 0)
a = 0;
else if (a > 255)
a = 255;
dst[x] = a;
dst[x] = clip_pixel(dst[x] + diff[x]);
}
dst += stride;
diff += 16;
@ -152,23 +103,47 @@ void vp9_recon_mbuv_s_c(MACROBLOCKD *xd, uint8_t *udst, uint8_t *vdst) {
for (i = 0; i < 2; i++, dst = vdst) {
BLOCKD *b = &xd->block[16 + 4 * i];
int stride = b->dst_stride;
short *diff = b->diff;
int16_t *diff = b->diff;
for (y = 0; y < 8; y++) {
for (x = 0; x < 8; x++) {
int a = dst[x] + diff[x];
if (a < 0)
a = 0;
else if (a > 255)
a = 255;
dst[x] = a;
dst[x] = clip_pixel(dst[x] + diff[x]);
}
dst += stride;
diff += 8;
}
}
}
#endif
/* Reconstructs a 32x32 luma superblock in place: each sample of the
 * residual stored in xd->sb_coeff_data.diff is added to the prediction
 * already present in dst, and the sum is clamped to the valid 8-bit
 * pixel range with clip_pixel(). */
void vp9_recon_sby_s_c(MACROBLOCKD *xd, uint8_t *dst) {
  int r, c;
  const int stride = xd->block[0].dst_stride;   /* destination row pitch */
  const int16_t *residual = xd->sb_coeff_data.diff;

  for (r = 0; r < 32; ++r) {
    for (c = 0; c < 32; ++c)
      dst[c] = clip_pixel(dst[c] + residual[c]);
    dst += stride;        /* advance to next destination row */
    residual += 32;       /* residual rows are packed at width 32 */
  }
}
/* Reconstructs the two 16x16 chroma planes of a superblock in place.
 * The U and V residuals live at fixed int16 offsets (1024 and 1280)
 * inside xd->sb_coeff_data.diff; each residual sample is added to the
 * existing prediction and clamped to 8 bits via clip_pixel(). */
void vp9_recon_sbuv_s_c(MACROBLOCKD *xd, uint8_t *udst, uint8_t *vdst) {
  int r, c;
  const int stride = xd->block[16].dst_stride;  /* chroma row pitch */
  const int16_t *ures = xd->sb_coeff_data.diff + 1024;
  const int16_t *vres = xd->sb_coeff_data.diff + 1280;

  for (r = 0; r < 16; ++r) {
    for (c = 0; c < 16; ++c) {
      udst[c] = clip_pixel(udst[c] + ures[c]);
      vdst[c] = clip_pixel(vdst[c] + vres[c]);
    }
    udst += stride;
    vdst += stride;
    ures += 16;           /* chroma residual rows are packed at width 16 */
    vres += 16;
  }
}
void vp9_recon_mby_c(MACROBLOCKD *xd) {
int i;

View File

@ -18,45 +18,58 @@
void vp9_setup_interp_filters(MACROBLOCKD *xd,
INTERPOLATIONFILTERTYPE mcomp_filter_type,
VP9_COMMON *cm) {
#if CONFIG_ENABLE_6TAP
if (mcomp_filter_type == SIXTAP) {
xd->subpixel_predict = vp9_sixtap_predict;
xd->subpixel_predict4x4 = vp9_sixtap_predict4x4;
xd->subpixel_predict8x4 = vp9_sixtap_predict8x4;
xd->subpixel_predict8x8 = vp9_sixtap_predict8x8;
xd->subpixel_predict16x16 = vp9_sixtap_predict16x16;
xd->subpixel_predict_avg = vp9_sixtap_predict_avg;
xd->subpixel_predict_avg4x4 = vp9_sixtap_predict_avg4x4;
xd->subpixel_predict_avg8x8 = vp9_sixtap_predict_avg8x8;
xd->subpixel_predict_avg16x16 = vp9_sixtap_predict_avg16x16;
} else if (mcomp_filter_type == EIGHTTAP || mcomp_filter_type == SWITCHABLE) {
xd->subpixel_predict = vp9_eighttap_predict;
} else {
#endif
if (mcomp_filter_type == EIGHTTAP || mcomp_filter_type == SWITCHABLE) {
xd->subpixel_predict4x4 = vp9_eighttap_predict4x4;
xd->subpixel_predict8x4 = vp9_eighttap_predict8x4;
xd->subpixel_predict8x8 = vp9_eighttap_predict8x8;
xd->subpixel_predict16x16 = vp9_eighttap_predict16x16;
xd->subpixel_predict_avg = vp9_eighttap_predict_avg4x4;
xd->subpixel_predict_avg4x4 = vp9_eighttap_predict_avg4x4;
xd->subpixel_predict_avg8x8 = vp9_eighttap_predict_avg8x8;
xd->subpixel_predict_avg16x16 = vp9_eighttap_predict_avg16x16;
} else if (mcomp_filter_type == EIGHTTAP_SMOOTH) {
xd->subpixel_predict4x4 = vp9_eighttap_predict4x4_smooth;
xd->subpixel_predict8x4 = vp9_eighttap_predict8x4_smooth;
xd->subpixel_predict8x8 = vp9_eighttap_predict8x8_smooth;
xd->subpixel_predict16x16 = vp9_eighttap_predict16x16_smooth;
xd->subpixel_predict_avg4x4 = vp9_eighttap_predict_avg4x4_smooth;
xd->subpixel_predict_avg8x8 = vp9_eighttap_predict_avg8x8_smooth;
xd->subpixel_predict_avg16x16 = vp9_eighttap_predict_avg16x16_smooth;
} else if (mcomp_filter_type == EIGHTTAP_SHARP) {
xd->subpixel_predict = vp9_eighttap_predict_sharp;
xd->subpixel_predict4x4 = vp9_eighttap_predict4x4_sharp;
xd->subpixel_predict8x4 = vp9_eighttap_predict8x4_sharp;
xd->subpixel_predict8x8 = vp9_eighttap_predict8x8_sharp;
xd->subpixel_predict16x16 = vp9_eighttap_predict16x16_sharp;
xd->subpixel_predict_avg = vp9_eighttap_predict_avg4x4_sharp;
xd->subpixel_predict_avg4x4 = vp9_eighttap_predict_avg4x4_sharp;
xd->subpixel_predict_avg8x8 = vp9_eighttap_predict_avg8x8_sharp;
xd->subpixel_predict_avg16x16 = vp9_eighttap_predict_avg16x16_sharp_c;
}
else {
xd->subpixel_predict = vp9_bilinear_predict4x4;
} else {
xd->subpixel_predict4x4 = vp9_bilinear_predict4x4;
xd->subpixel_predict8x4 = vp9_bilinear_predict8x4;
xd->subpixel_predict8x8 = vp9_bilinear_predict8x8;
xd->subpixel_predict16x16 = vp9_bilinear_predict16x16;
xd->subpixel_predict_avg = vp9_bilinear_predict_avg4x4;
xd->subpixel_predict_avg4x4 = vp9_bilinear_predict_avg4x4;
xd->subpixel_predict_avg8x8 = vp9_bilinear_predict_avg8x8;
xd->subpixel_predict_avg16x16 = vp9_bilinear_predict_avg16x16;
}
#if CONFIG_ENABLE_6TAP
}
#endif
}
void vp9_copy_mem16x16_c(unsigned char *src,
void vp9_copy_mem16x16_c(uint8_t *src,
int src_stride,
unsigned char *dst,
uint8_t *dst,
int dst_stride) {
int r;
@ -91,9 +104,9 @@ void vp9_copy_mem16x16_c(unsigned char *src,
}
}
void vp9_avg_mem16x16_c(unsigned char *src,
void vp9_avg_mem16x16_c(uint8_t *src,
int src_stride,
unsigned char *dst,
uint8_t *dst,
int dst_stride) {
int r;
@ -109,9 +122,9 @@ void vp9_avg_mem16x16_c(unsigned char *src,
}
}
void vp9_copy_mem8x8_c(unsigned char *src,
void vp9_copy_mem8x8_c(uint8_t *src,
int src_stride,
unsigned char *dst,
uint8_t *dst,
int dst_stride) {
int r;
@ -134,9 +147,9 @@ void vp9_copy_mem8x8_c(unsigned char *src,
}
}
void vp9_avg_mem8x8_c(unsigned char *src,
void vp9_avg_mem8x8_c(uint8_t *src,
int src_stride,
unsigned char *dst,
uint8_t *dst,
int dst_stride) {
int r;
@ -152,9 +165,9 @@ void vp9_avg_mem8x8_c(unsigned char *src,
}
}
void vp9_copy_mem8x4_c(unsigned char *src,
void vp9_copy_mem8x4_c(uint8_t *src,
int src_stride,
unsigned char *dst,
uint8_t *dst,
int dst_stride) {
int r;
@ -179,9 +192,9 @@ void vp9_copy_mem8x4_c(unsigned char *src,
void vp9_build_inter_predictors_b(BLOCKD *d, int pitch, vp9_subpix_fn_t sppf) {
int r;
unsigned char *ptr_base;
unsigned char *ptr;
unsigned char *pred_ptr = d->predictor;
uint8_t *ptr_base;
uint8_t *ptr;
uint8_t *pred_ptr = d->predictor;
int_mv mv;
ptr_base = *(d->base_pre);
@ -221,9 +234,9 @@ void vp9_build_inter_predictors_b(BLOCKD *d, int pitch, vp9_subpix_fn_t sppf) {
void vp9_build_2nd_inter_predictors_b(BLOCKD *d, int pitch,
vp9_subpix_fn_t sppf) {
int r;
unsigned char *ptr_base;
unsigned char *ptr;
unsigned char *pred_ptr = d->predictor;
uint8_t *ptr_base;
uint8_t *ptr;
uint8_t *pred_ptr = d->predictor;
int_mv mv;
ptr_base = *(d->base_second_pre);
@ -251,9 +264,9 @@ void vp9_build_2nd_inter_predictors_b(BLOCKD *d, int pitch,
}
void vp9_build_inter_predictors4b(MACROBLOCKD *xd, BLOCKD *d, int pitch) {
unsigned char *ptr_base;
unsigned char *ptr;
unsigned char *pred_ptr = d->predictor;
uint8_t *ptr_base;
uint8_t *ptr;
uint8_t *pred_ptr = d->predictor;
int_mv mv;
ptr_base = *(d->base_pre);
@ -277,9 +290,9 @@ void vp9_build_inter_predictors4b(MACROBLOCKD *xd, BLOCKD *d, int pitch) {
*/
void vp9_build_2nd_inter_predictors4b(MACROBLOCKD *xd,
BLOCKD *d, int pitch) {
unsigned char *ptr_base;
unsigned char *ptr;
unsigned char *pred_ptr = d->predictor;
uint8_t *ptr_base;
uint8_t *ptr;
uint8_t *pred_ptr = d->predictor;
int_mv mv;
ptr_base = *(d->base_second_pre);
@ -296,9 +309,9 @@ void vp9_build_2nd_inter_predictors4b(MACROBLOCKD *xd,
}
static void build_inter_predictors2b(MACROBLOCKD *xd, BLOCKD *d, int pitch) {
unsigned char *ptr_base;
unsigned char *ptr;
unsigned char *pred_ptr = d->predictor;
uint8_t *ptr_base;
uint8_t *ptr;
uint8_t *pred_ptr = d->predictor;
int_mv mv;
ptr_base = *(d->base_pre);
@ -314,132 +327,6 @@ static void build_inter_predictors2b(MACROBLOCKD *xd, BLOCKD *d, int pitch) {
}
}
/*encoder only*/
#if CONFIG_PRED_FILTER
// Select the thresholded or non-thresholded filter
#define USE_THRESH_FILTER 0
#define PRED_FILT_LEN 5
static const int filt_shift = 4;
static const int pred_filter[PRED_FILT_LEN] = {1, 2, 10, 2, 1};
// Alternative filter {1, 1, 4, 1, 1}
#if !USE_THRESH_FILTER
// Spatial prediction filter (encoder side): applies the separable
// 5-tap kernel pred_filter[] first horizontally, then vertically, over
// a width x height block, writing the rounded and clamped result to
// dst. Reads PRED_FILT_LEN/2 pixels of context beyond every edge of
// src, so the caller must supply a source with an extended border.
// NOTE(review): Temp holds 32*32 accumulators while the horizontal
// pass writes (height + PRED_FILT_LEN - 1) rows of `width` entries —
// assumes callers keep the padded block within 1024 entries; confirm.
void filter_mb(unsigned char *src, int src_stride,
unsigned char *dst, int dst_stride,
int width, int height) {
int i, j, k;
unsigned int Temp[32 * 32];
unsigned int *pTmp = Temp;
// Back pSrc up by half the kernel length in both dimensions so the
// first output pixel sees a centered window.
unsigned char *pSrc = src - (1 + src_stride) * (PRED_FILT_LEN / 2);
// Horizontal
for (i = 0; i < height + PRED_FILT_LEN - 1; i++) {
for (j = 0; j < width; j++) {
int sum = 0;
for (k = 0; k < PRED_FILT_LEN; k++)
sum += pSrc[j + k] * pred_filter[k];
pTmp[j] = sum;  // intermediate row kept at full precision
}
pSrc += src_stride;
pTmp += width;
}
// Vertical
pTmp = Temp;
for (i = 0; i < width; i++) {
unsigned char *pDst = dst + i;
for (j = 0; j < height; j++) {
int sum = 0;
for (k = 0; k < PRED_FILT_LEN; k++)
sum += pTmp[(j + k) * width] * pred_filter[k];
// Round
// Two passes were applied, so shift by 2*filt_shift with rounding.
sum = (sum + ((1 << (filt_shift << 1)) >> 1)) >> (filt_shift << 1);
pDst[j * dst_stride] = (sum < 0 ? 0 : sum > 255 ? 255 : sum);
}
++pTmp;
}
}
#else
// Based on vp9_post_proc_down_and_across_c (vp9_postproc.c)
void filter_mb(unsigned char *src, int src_stride,
unsigned char *dst, int dst_stride,
int width, int height) {
unsigned char *pSrc, *pDst;
int row;
int col;
int i;
int v;
unsigned char d[8];
/* TODO flimit should be linked to the quantizer value */
int flimit = 7;
for (row = 0; row < height; row++) {
/* post_proc_down for one row */
pSrc = src;
pDst = dst;
for (col = 0; col < width; col++) {
int kernel = (1 << (filt_shift - 1));
int v = pSrc[col];
for (i = -2; i <= 2; i++) {
if (abs(v - pSrc[col + i * src_stride]) > flimit)
goto down_skip_convolve;
kernel += pred_filter[2 + i] * pSrc[col + i * src_stride];
}
v = (kernel >> filt_shift);
down_skip_convolve:
pDst[col] = v;
}
/* now post_proc_across */
pSrc = dst;
pDst = dst;
for (i = 0; i < 8; i++)
d[i] = pSrc[i];
for (col = 0; col < width; col++) {
int kernel = (1 << (filt_shift - 1));
v = pSrc[col];
d[col & 7] = v;
for (i = -2; i <= 2; i++) {
if (abs(v - pSrc[col + i]) > flimit)
goto across_skip_convolve;
kernel += pred_filter[2 + i] * pSrc[col + i];
}
d[col & 7] = (kernel >> filt_shift);
across_skip_convolve:
if (col >= 2)
pDst[col - 2] = d[(col - 2) & 7];
}
/* handle the last two pixels */
pDst[col - 2] = d[(col - 2) & 7];
pDst[col - 1] = d[(col - 1) & 7];
/* next row */
src += src_stride;
dst += dst_stride;
}
}
#endif // !USE_THRESH_FILTER
#endif // CONFIG_PRED_FILTER
/*encoder only*/
void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd) {
int i, j;
@ -524,13 +411,13 @@ void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd) {
if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int)
build_inter_predictors2b(xd, d0, 8);
else {
vp9_build_inter_predictors_b(d0, 8, xd->subpixel_predict);
vp9_build_inter_predictors_b(d1, 8, xd->subpixel_predict);
vp9_build_inter_predictors_b(d0, 8, xd->subpixel_predict4x4);
vp9_build_inter_predictors_b(d1, 8, xd->subpixel_predict4x4);
}
if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
vp9_build_2nd_inter_predictors_b(d0, 8, xd->subpixel_predict_avg);
vp9_build_2nd_inter_predictors_b(d1, 8, xd->subpixel_predict_avg);
vp9_build_2nd_inter_predictors_b(d0, 8, xd->subpixel_predict_avg4x4);
vp9_build_2nd_inter_predictors_b(d1, 8, xd->subpixel_predict_avg4x4);
}
}
}
@ -573,11 +460,11 @@ static void clamp_uvmv_to_umv_border(MV *mv, const MACROBLOCKD *xd) {
/*encoder only*/
void vp9_build_1st_inter16x16_predictors_mby(MACROBLOCKD *xd,
unsigned char *dst_y,
uint8_t *dst_y,
int dst_ystride,
int clamp_mvs) {
unsigned char *ptr_base = xd->pre.y_buffer;
unsigned char *ptr;
uint8_t *ptr_base = xd->pre.y_buffer;
uint8_t *ptr;
int pre_stride = xd->block[0].pre_stride;
int_mv ymv;
@ -588,29 +475,6 @@ void vp9_build_1st_inter16x16_predictors_mby(MACROBLOCKD *xd,
ptr = ptr_base + (ymv.as_mv.row >> 3) * pre_stride + (ymv.as_mv.col >> 3);
#if CONFIG_PRED_FILTER
if (xd->mode_info_context->mbmi.pred_filter_enabled) {
if ((ymv.as_mv.row | ymv.as_mv.col) & 7) {
// Sub-pel filter needs extended input
int len = 15 + (VP9_INTERP_EXTEND << 1);
unsigned char Temp[32 * 32]; // Data required by sub-pel filter
unsigned char *pTemp = Temp + (VP9_INTERP_EXTEND - 1) * (len + 1);
// Copy extended MB into Temp array, applying the spatial filter
filter_mb(ptr - (VP9_INTERP_EXTEND - 1) * (pre_stride + 1), pre_stride,
Temp, len, len, len);
// Sub-pel interpolation
xd->subpixel_predict16x16(pTemp, len,
(ymv.as_mv.col & 7) << 1,
(ymv.as_mv.row & 7) << 1,
dst_y, dst_ystride);
} else {
// Apply spatial filter to create the prediction directly
filter_mb(ptr, pre_stride, dst_y, dst_ystride, 16, 16);
}
} else
#endif
if ((ymv.as_mv.row | ymv.as_mv.col) & 7) {
xd->subpixel_predict16x16(ptr, pre_stride,
(ymv.as_mv.col & 7) << 1,
@ -622,11 +486,11 @@ void vp9_build_1st_inter16x16_predictors_mby(MACROBLOCKD *xd,
}
void vp9_build_1st_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
unsigned char *dst_u,
unsigned char *dst_v,
uint8_t *dst_u,
uint8_t *dst_v,
int dst_uvstride) {
int offset;
unsigned char *uptr, *vptr;
uint8_t *uptr, *vptr;
int pre_stride = xd->block[0].pre_stride;
int_mv _o16x16mv;
int_mv _16x16mv;
@ -659,37 +523,6 @@ void vp9_build_1st_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
uptr = xd->pre.u_buffer + offset;
vptr = xd->pre.v_buffer + offset;
#if CONFIG_PRED_FILTER
if (xd->mode_info_context->mbmi.pred_filter_enabled) {
int i;
unsigned char *pSrc = uptr;
unsigned char *pDst = dst_u;
int len = 7 + (VP9_INTERP_EXTEND << 1);
unsigned char Temp[32 * 32]; // Data required by the sub-pel filter
unsigned char *pTemp = Temp + (VP9_INTERP_EXTEND - 1) * (len + 1);
// U & V
for (i = 0; i < 2; i++) {
if (_o16x16mv.as_int & 0x000f000f) {
// Copy extended MB into Temp array, applying the spatial filter
filter_mb(pSrc - (VP9_INTERP_EXTEND - 1) * (pre_stride + 1), pre_stride,
Temp, len, len, len);
// Sub-pel filter
xd->subpixel_predict8x8(pTemp, len,
_o16x16mv.as_mv.col & 15,
_o16x16mv.as_mv.row & 15,
pDst, dst_uvstride);
} else {
filter_mb(pSrc, pre_stride, pDst, dst_uvstride, 8, 8);
}
// V
pSrc = vptr;
pDst = dst_v;
}
} else
#endif
if (_o16x16mv.as_int & 0x000f000f) {
xd->subpixel_predict8x8(uptr, pre_stride, _o16x16mv.as_mv.col & 15,
_o16x16mv.as_mv.row & 15, dst_u, dst_uvstride);
@ -703,20 +536,19 @@ void vp9_build_1st_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
void vp9_build_1st_inter16x16_predictors_mb(MACROBLOCKD *xd,
unsigned char *dst_y,
unsigned char *dst_u,
unsigned char *dst_v,
uint8_t *dst_y,
uint8_t *dst_u,
uint8_t *dst_v,
int dst_ystride, int dst_uvstride) {
vp9_build_1st_inter16x16_predictors_mby(xd, dst_y, dst_ystride,
xd->mode_info_context->mbmi.need_to_clamp_mvs);
vp9_build_1st_inter16x16_predictors_mbuv(xd, dst_u, dst_v, dst_uvstride);
}
#if CONFIG_SUPERBLOCKS
void vp9_build_inter32x32_predictors_sb(MACROBLOCKD *x,
unsigned char *dst_y,
unsigned char *dst_u,
unsigned char *dst_v,
uint8_t *dst_y,
uint8_t *dst_u,
uint8_t *dst_v,
int dst_ystride,
int dst_uvstride) {
uint8_t *y1 = x->pre.y_buffer, *u1 = x->pre.u_buffer, *v1 = x->pre.v_buffer;
@ -781,14 +613,77 @@ void vp9_build_inter32x32_predictors_sb(MACROBLOCKD *x,
}
#endif
}
// Builds the inter prediction for a 64x64 superblock by invoking the
// 32x32 predictor on each of its four quadrants. For every quadrant
// the macroblock edge limits and the reference-buffer pointers in *x
// are temporarily rewritten to describe that quadrant, then fully
// restored afterwards, so the MACROBLOCKD is left unchanged on return.
void vp9_build_inter64x64_predictors_sb(MACROBLOCKD *x,
uint8_t *dst_y,
uint8_t *dst_u,
uint8_t *dst_v,
int dst_ystride,
int dst_uvstride) {
// Save the original first (and second) reference plane pointers.
uint8_t *y1 = x->pre.y_buffer, *u1 = x->pre.u_buffer, *v1 = x->pre.v_buffer;
uint8_t *y2 = x->second_pre.y_buffer, *u2 = x->second_pre.u_buffer,
*v2 = x->second_pre.v_buffer;
// Save the clamping edges (top/bottom/left/right, in 1/8-pel units).
int edge[4], n;
edge[0] = x->mb_to_top_edge;
edge[1] = x->mb_to_bottom_edge;
edge[2] = x->mb_to_left_edge;
edge[3] = x->mb_to_right_edge;
for (n = 0; n < 4; n++) {
// Quadrant n: x_idx selects left/right half, y_idx top/bottom half.
const int x_idx = n & 1, y_idx = n >> 1;
// Shift the edges by 32 luma pixels (<< 3 converts to 1/8 pel).
x->mb_to_top_edge = edge[0] - ((y_idx * 32) << 3);
x->mb_to_bottom_edge = edge[1] + (((1 - y_idx) * 32) << 3);
x->mb_to_left_edge = edge[2] - ((x_idx * 32) << 3);
x->mb_to_right_edge = edge[3] + (((1 - x_idx) * 32) << 3);
// Point the reference buffers at this quadrant (32 luma / 16 chroma).
x->pre.y_buffer = y1 + y_idx * 32 * x->pre.y_stride + x_idx * 32;
x->pre.u_buffer = u1 + y_idx * 16 * x->pre.uv_stride + x_idx * 16;
x->pre.v_buffer = v1 + y_idx * 16 * x->pre.uv_stride + x_idx * 16;
if (x->mode_info_context->mbmi.second_ref_frame > 0) {
// Compound prediction: offset the second reference the same way.
x->second_pre.y_buffer = y2 + y_idx * 32 * x->pre.y_stride + x_idx * 32;
x->second_pre.u_buffer = u2 + y_idx * 16 * x->pre.uv_stride + x_idx * 16;
x->second_pre.v_buffer = v2 + y_idx * 16 * x->pre.uv_stride + x_idx * 16;
}
// Predict this 32x32 quadrant directly into the output planes.
vp9_build_inter32x32_predictors_sb(x,
dst_y + y_idx * 32 * dst_ystride + x_idx * 32,
dst_u + y_idx * 16 * dst_uvstride + x_idx * 16,
dst_v + y_idx * 16 * dst_uvstride + x_idx * 16,
dst_ystride, dst_uvstride);
}
// Restore the saved edges and reference pointers.
x->mb_to_top_edge = edge[0];
x->mb_to_bottom_edge = edge[1];
x->mb_to_left_edge = edge[2];
x->mb_to_right_edge = edge[3];
x->pre.y_buffer = y1;
x->pre.u_buffer = u1;
x->pre.v_buffer = v1;
if (x->mode_info_context->mbmi.second_ref_frame > 0) {
x->second_pre.y_buffer = y2;
x->second_pre.u_buffer = u2;
x->second_pre.v_buffer = v2;
}
#if CONFIG_COMP_INTERINTRA_PRED
// second_ref_frame == INTRA_FRAME signals inter-intra combination.
if (x->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME) {
vp9_build_interintra_64x64_predictors_sb(x, dst_y, dst_u, dst_v,
dst_ystride, dst_uvstride);
}
#endif
}
/*
* The following functions should be called after an initial
* call to vp9_build_1st_inter16x16_predictors_mb() or _mby()/_mbuv().
* It will run a second sixtap filter on a (different) ref
* It will run a second filter on a (different) ref
* frame and average the result with the output of the
* first sixtap filter. The second reference frame is stored
* first filter. The second reference frame is stored
* in x->second_pre (the reference frame index is in
* x->mode_info_context->mbmi.second_ref_frame). The second
* motion vector is x->mode_info_context->mbmi.second_mv.
@ -798,15 +693,15 @@ void vp9_build_inter32x32_predictors_sb(MACROBLOCKD *x,
* single reference framer.
*/
void vp9_build_2nd_inter16x16_predictors_mby(MACROBLOCKD *xd,
unsigned char *dst_y,
uint8_t *dst_y,
int dst_ystride) {
unsigned char *ptr;
uint8_t *ptr;
int_mv _16x16mv;
int mv_row;
int mv_col;
unsigned char *ptr_base = xd->second_pre.y_buffer;
uint8_t *ptr_base = xd->second_pre.y_buffer;
int pre_stride = xd->block[0].pre_stride;
_16x16mv.as_int = xd->mode_info_context->mbmi.mv[1].as_int;
@ -819,44 +714,20 @@ void vp9_build_2nd_inter16x16_predictors_mby(MACROBLOCKD *xd,
ptr = ptr_base + (mv_row >> 3) * pre_stride + (mv_col >> 3);
#if CONFIG_PRED_FILTER
if (xd->mode_info_context->mbmi.pred_filter_enabled) {
if ((mv_row | mv_col) & 7) {
// Sub-pel filter needs extended input
int len = 15 + (VP9_INTERP_EXTEND << 1);
unsigned char Temp[32 * 32]; // Data required by sub-pel filter
unsigned char *pTemp = Temp + (VP9_INTERP_EXTEND - 1) * (len + 1);
// Copy extended MB into Temp array, applying the spatial filter
filter_mb(ptr - (VP9_INTERP_EXTEND - 1) * (pre_stride + 1), pre_stride,
Temp, len, len, len);
// Sub-pel filter
xd->subpixel_predict_avg16x16(pTemp, len, (mv_col & 7) << 1,
(mv_row & 7) << 1, dst_y, dst_ystride);
} else {
// TODO Needs to AVERAGE with the dst_y
// For now, do not apply the prediction filter in these cases!
vp9_avg_mem16x16(ptr, pre_stride, dst_y, dst_ystride);
}
} else
#endif // CONFIG_PRED_FILTER
{
if ((mv_row | mv_col) & 7) {
xd->subpixel_predict_avg16x16(ptr, pre_stride, (mv_col & 7) << 1,
(mv_row & 7) << 1, dst_y, dst_ystride);
} else {
vp9_avg_mem16x16(ptr, pre_stride, dst_y, dst_ystride);
}
if ((mv_row | mv_col) & 7) {
xd->subpixel_predict_avg16x16(ptr, pre_stride, (mv_col & 7) << 1,
(mv_row & 7) << 1, dst_y, dst_ystride);
} else {
vp9_avg_mem16x16(ptr, pre_stride, dst_y, dst_ystride);
}
}
void vp9_build_2nd_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
unsigned char *dst_u,
unsigned char *dst_v,
uint8_t *dst_u,
uint8_t *dst_v,
int dst_uvstride) {
int offset;
unsigned char *uptr, *vptr;
uint8_t *uptr, *vptr;
int_mv _16x16mv;
int mv_row;
@ -887,37 +758,6 @@ void vp9_build_2nd_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
uptr = xd->second_pre.u_buffer + offset;
vptr = xd->second_pre.v_buffer + offset;
#if CONFIG_PRED_FILTER
if (xd->mode_info_context->mbmi.pred_filter_enabled) {
int i;
int len = 7 + (VP9_INTERP_EXTEND << 1);
unsigned char Temp[32 * 32]; // Data required by sub-pel filter
unsigned char *pTemp = Temp + (VP9_INTERP_EXTEND - 1) * (len + 1);
unsigned char *pSrc = uptr;
unsigned char *pDst = dst_u;
// U & V
for (i = 0; i < 2; i++) {
if ((omv_row | omv_col) & 15) {
// Copy extended MB into Temp array, applying the spatial filter
filter_mb(pSrc - (VP9_INTERP_EXTEND - 1) * (pre_stride + 1), pre_stride,
Temp, len, len, len);
// Sub-pel filter
xd->subpixel_predict_avg8x8(pTemp, len, omv_col & 15,
omv_row & 15, pDst, dst_uvstride);
} else {
// TODO Needs to AVERAGE with the dst_[u|v]
// For now, do not apply the prediction filter here!
vp9_avg_mem8x8(pSrc, pre_stride, pDst, dst_uvstride);
}
// V
pSrc = vptr;
pDst = dst_v;
}
} else
#endif // CONFIG_PRED_FILTER
if ((omv_row | omv_col) & 15) {
xd->subpixel_predict_avg8x8(uptr, pre_stride, omv_col & 15,
omv_row & 15, dst_u, dst_uvstride);
@ -930,9 +770,9 @@ void vp9_build_2nd_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
}
void vp9_build_2nd_inter16x16_predictors_mb(MACROBLOCKD *xd,
unsigned char *dst_y,
unsigned char *dst_u,
unsigned char *dst_v,
uint8_t *dst_y,
uint8_t *dst_u,
uint8_t *dst_v,
int dst_ystride,
int dst_uvstride) {
vp9_build_2nd_inter16x16_predictors_mby(xd, dst_y, dst_ystride);
@ -995,13 +835,13 @@ static void build_inter4x4_predictors_mb(MACROBLOCKD *xd) {
if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int)
build_inter_predictors2b(xd, d0, 16);
else {
vp9_build_inter_predictors_b(d0, 16, xd->subpixel_predict);
vp9_build_inter_predictors_b(d1, 16, xd->subpixel_predict);
vp9_build_inter_predictors_b(d0, 16, xd->subpixel_predict4x4);
vp9_build_inter_predictors_b(d1, 16, xd->subpixel_predict4x4);
}
if (mbmi->second_ref_frame > 0) {
vp9_build_2nd_inter_predictors_b(d0, 16, xd->subpixel_predict_avg);
vp9_build_2nd_inter_predictors_b(d1, 16, xd->subpixel_predict_avg);
vp9_build_2nd_inter_predictors_b(d0, 16, xd->subpixel_predict_avg4x4);
vp9_build_2nd_inter_predictors_b(d1, 16, xd->subpixel_predict_avg4x4);
}
}
}
@ -1013,13 +853,13 @@ static void build_inter4x4_predictors_mb(MACROBLOCKD *xd) {
if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int)
build_inter_predictors2b(xd, d0, 8);
else {
vp9_build_inter_predictors_b(d0, 8, xd->subpixel_predict);
vp9_build_inter_predictors_b(d1, 8, xd->subpixel_predict);
vp9_build_inter_predictors_b(d0, 8, xd->subpixel_predict4x4);
vp9_build_inter_predictors_b(d1, 8, xd->subpixel_predict4x4);
}
if (mbmi->second_ref_frame > 0) {
vp9_build_2nd_inter_predictors_b(d0, 8, xd->subpixel_predict_avg);
vp9_build_2nd_inter_predictors_b(d1, 8, xd->subpixel_predict_avg);
vp9_build_2nd_inter_predictors_b(d0, 8, xd->subpixel_predict_avg4x4);
vp9_build_2nd_inter_predictors_b(d1, 8, xd->subpixel_predict_avg4x4);
}
}
}

View File

@ -11,49 +11,55 @@
#ifndef VP9_COMMON_VP9_RECONINTER_H_
#define VP9_COMMON_VP9_RECONINTER_H_
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_onyxc_int.h"
extern void vp9_build_1st_inter16x16_predictors_mby(MACROBLOCKD *xd,
unsigned char *dst_y,
uint8_t *dst_y,
int dst_ystride,
int clamp_mvs);
extern void vp9_build_1st_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
unsigned char *dst_u,
unsigned char *dst_v,
uint8_t *dst_u,
uint8_t *dst_v,
int dst_uvstride);
extern void vp9_build_1st_inter16x16_predictors_mb(MACROBLOCKD *xd,
unsigned char *dst_y,
unsigned char *dst_u,
unsigned char *dst_v,
uint8_t *dst_y,
uint8_t *dst_u,
uint8_t *dst_v,
int dst_ystride,
int dst_uvstride);
extern void vp9_build_2nd_inter16x16_predictors_mby(MACROBLOCKD *xd,
unsigned char *dst_y,
uint8_t *dst_y,
int dst_ystride);
extern void vp9_build_2nd_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
unsigned char *dst_u,
unsigned char *dst_v,
uint8_t *dst_u,
uint8_t *dst_v,
int dst_uvstride);
extern void vp9_build_2nd_inter16x16_predictors_mb(MACROBLOCKD *xd,
unsigned char *dst_y,
unsigned char *dst_u,
unsigned char *dst_v,
uint8_t *dst_y,
uint8_t *dst_u,
uint8_t *dst_v,
int dst_ystride,
int dst_uvstride);
#if CONFIG_SUPERBLOCKS
extern void vp9_build_inter32x32_predictors_sb(MACROBLOCKD *x,
unsigned char *dst_y,
unsigned char *dst_u,
unsigned char *dst_v,
uint8_t *dst_y,
uint8_t *dst_u,
uint8_t *dst_v,
int dst_ystride,
int dst_uvstride);
extern void vp9_build_inter64x64_predictors_sb(MACROBLOCKD *x,
uint8_t *dst_y,
uint8_t *dst_u,
uint8_t *dst_v,
int dst_ystride,
int dst_uvstride);
#endif
extern void vp9_build_inter_predictors_mb(MACROBLOCKD *xd);
@ -75,4 +81,4 @@ extern void vp9_setup_interp_filters(MACROBLOCKD *xd,
INTERPOLATIONFILTERTYPE filter,
VP9_COMMON *cm);
#endif // __INC_RECONINTER_H
#endif // VP9_COMMON_VP9_RECONINTER_H_

View File

@ -124,18 +124,20 @@ static void d45_predictor(uint8_t *ypred_ptr, int y_stride, int n,
}
}
for (c = 0; c <= r; ++c) {
int yabove_ext = yabove_row[r]; // 2*yabove_row[r] - yabove_row[r-1];
int yleft_ext = yleft_col[r]; // 2*yleft_col[r] - yleft_col[r-1];
yabove_ext = (yabove_ext > 255 ? 255 : (yabove_ext < 0 ? 0 : yabove_ext));
yleft_ext = (yleft_ext > 255 ? 255 : (yleft_ext < 0 ? 0 : yleft_ext));
int yabove_ext = yabove_row[r]; // clip_pixel(2 * yabove_row[r] -
// yabove_row[r - 1]);
int yleft_ext = yleft_col[r]; // clip_pixel(2 * yleft_col[r] -
// yleft_col[r-1]);
ypred_ptr[(r - c) * y_stride + c] =
(yabove_ext * (c + 1) +
yleft_ext * (r - c + 1) + r / 2 + 1) / (r + 2);
}
for (r = 1; r < n; ++r) {
for (c = n - r; c < n; ++c)
ypred_ptr[r * y_stride + c] = (ypred_ptr[(r - 1) * y_stride + c] +
ypred_ptr[r * y_stride + c - 1] + 1) >> 1;
for (c = n - r; c < n; ++c) {
const int yabove_ext = ypred_ptr[(r - 1) * y_stride + c];
const int yleft_ext = ypred_ptr[r * y_stride + c - 1];
ypred_ptr[r * y_stride + c] = (yabove_ext + yleft_ext + 1) >> 1;
}
}
}
@ -196,9 +198,9 @@ static void d153_predictor(uint8_t *ypred_ptr, int y_stride, int n,
}
}
static void corner_predictor(unsigned char *ypred_ptr, int y_stride, int n,
unsigned char *yabove_row,
unsigned char *yleft_col) {
static void corner_predictor(uint8_t *ypred_ptr, int y_stride, int n,
uint8_t *yabove_row,
uint8_t *yleft_col) {
int mh, mv, maxgradh, maxgradv, x, y, nx, ny;
int i, j;
int top_left = yabove_row[-1];
@ -246,14 +248,14 @@ void vp9_recon_intra_mbuv(MACROBLOCKD *xd) {
}
}
void vp9_build_intra_predictors_internal(unsigned char *src, int src_stride,
unsigned char *ypred_ptr,
void vp9_build_intra_predictors_internal(uint8_t *src, int src_stride,
uint8_t *ypred_ptr,
int y_stride, int mode, int bsize,
int up_available, int left_available) {
unsigned char *yabove_row = src - src_stride;
unsigned char yleft_col[32];
unsigned char ytop_left = yabove_row[-1];
uint8_t *yabove_row = src - src_stride;
uint8_t yleft_col[64];
uint8_t ytop_left = yabove_row[-1];
int r, c, i;
for (i = 0; i < bsize; i++) {
@ -269,15 +271,19 @@ void vp9_build_intra_predictors_internal(unsigned char *src, int src_stride,
int average = 0;
int log2_bsize_minus_1;
assert(bsize == 4 || bsize == 8 || bsize == 16 || bsize == 32);
assert(bsize == 4 || bsize == 8 || bsize == 16 || bsize == 32 ||
bsize == 64);
if (bsize == 4) {
log2_bsize_minus_1 = 1;
} else if (bsize == 8) {
log2_bsize_minus_1 = 2;
} else if (bsize == 16) {
log2_bsize_minus_1 = 3;
} else /* bsize == 32 */ {
} else if (bsize == 32) {
log2_bsize_minus_1 = 4;
} else {
assert(bsize == 64);
log2_bsize_minus_1 = 5;
}
if (up_available || left_available) {
@ -321,15 +327,7 @@ void vp9_build_intra_predictors_internal(unsigned char *src, int src_stride,
case TM_PRED: {
for (r = 0; r < bsize; r++) {
for (c = 0; c < bsize; c++) {
int pred = yleft_col[r] + yabove_row[ c] - ytop_left;
if (pred < 0)
pred = 0;
if (pred > 255)
pred = 255;
ypred_ptr[c] = pred;
ypred_ptr[c] = clip_pixel(yleft_col[r] + yabove_row[c] - ytop_left);
}
ypred_ptr += y_stride;
@ -374,9 +372,9 @@ void vp9_build_intra_predictors_internal(unsigned char *src, int src_stride,
#if CONFIG_COMP_INTERINTRA_PRED
static void combine_interintra(MB_PREDICTION_MODE mode,
unsigned char *interpred,
uint8_t *interpred,
int interstride,
unsigned char *intrapred,
uint8_t *intrapred,
int intrastride,
int size) {
// TODO(debargha): Explore different ways of combining predictors
@ -523,16 +521,17 @@ static void combine_interintra(MB_PREDICTION_MODE mode,
71, 70, 70, 70, 69, 69, 69, 68,
68, 68, 68, 68, 67, 67, 67, 67,
};
int size_scale = (size == 32 ? 1 :
int size_scale = (size >= 32 ? 1 :
size == 16 ? 2 :
size == 8 ? 4 : 8);
int size_shift = size == 64 ? 1 : 0;
int i, j;
switch (mode) {
case V_PRED:
for (i = 0; i < size; ++i) {
for (j = 0; j < size; ++j) {
int k = i * interstride + j;
int scale = weights1d[i * size_scale];
int scale = weights1d[i * size_scale >> size_shift];
interpred[k] =
((scale_max - scale) * interpred[k] +
scale * intrapred[i * intrastride + j] + scale_round)
@ -545,7 +544,7 @@ static void combine_interintra(MB_PREDICTION_MODE mode,
for (i = 0; i < size; ++i) {
for (j = 0; j < size; ++j) {
int k = i * interstride + j;
int scale = weights1d[j * size_scale];
int scale = weights1d[j * size_scale >> size_shift];
interpred[k] =
((scale_max - scale) * interpred[k] +
scale * intrapred[i * intrastride + j] + scale_round)
@ -559,8 +558,9 @@ static void combine_interintra(MB_PREDICTION_MODE mode,
for (i = 0; i < size; ++i) {
for (j = 0; j < size; ++j) {
int k = i * interstride + j;
int scale = (weights2d[i * size_scale * 32 + j * size_scale] +
weights1d[i * size_scale]) >> 1;
int scale = (weights2d[(i * size_scale * 32 +
j * size_scale) >> size_shift] +
weights1d[i * size_scale >> size_shift]) >> 1;
interpred[k] =
((scale_max - scale) * interpred[k] +
scale * intrapred[i * intrastride + j] + scale_round)
@ -574,8 +574,9 @@ static void combine_interintra(MB_PREDICTION_MODE mode,
for (i = 0; i < size; ++i) {
for (j = 0; j < size; ++j) {
int k = i * interstride + j;
int scale = (weights2d[i * size_scale * 32 + j * size_scale] +
weights1d[j * size_scale]) >> 1;
int scale = (weights2d[(i * size_scale * 32 +
j * size_scale) >> size_shift] +
weights1d[j * size_scale >> size_shift]) >> 1;
interpred[k] =
((scale_max - scale) * interpred[k] +
scale * intrapred[i * intrastride + j] + scale_round)
@ -588,7 +589,8 @@ static void combine_interintra(MB_PREDICTION_MODE mode,
for (i = 0; i < size; ++i) {
for (j = 0; j < size; ++j) {
int k = i * interstride + j;
int scale = weights2d[i * size_scale * 32 + j * size_scale];
int scale = weights2d[(i * size_scale * 32 +
j * size_scale) >> size_shift];
interpred[k] =
((scale_max - scale) * interpred[k] +
scale * intrapred[i * intrastride + j] + scale_round)
@ -613,18 +615,18 @@ static void combine_interintra(MB_PREDICTION_MODE mode,
}
void vp9_build_interintra_16x16_predictors_mb(MACROBLOCKD *xd,
unsigned char *ypred,
unsigned char *upred,
unsigned char *vpred,
uint8_t *ypred,
uint8_t *upred,
uint8_t *vpred,
int ystride, int uvstride) {
vp9_build_interintra_16x16_predictors_mby(xd, ypred, ystride);
vp9_build_interintra_16x16_predictors_mbuv(xd, upred, vpred, uvstride);
}
void vp9_build_interintra_16x16_predictors_mby(MACROBLOCKD *xd,
unsigned char *ypred,
uint8_t *ypred,
int ystride) {
unsigned char intrapredictor[256];
uint8_t intrapredictor[256];
vp9_build_intra_predictors_internal(
xd->dst.y_buffer, xd->dst.y_stride,
intrapredictor, 16,
@ -635,11 +637,11 @@ void vp9_build_interintra_16x16_predictors_mby(MACROBLOCKD *xd,
}
void vp9_build_interintra_16x16_predictors_mbuv(MACROBLOCKD *xd,
unsigned char *upred,
unsigned char *vpred,
uint8_t *upred,
uint8_t *vpred,
int uvstride) {
unsigned char uintrapredictor[64];
unsigned char vintrapredictor[64];
uint8_t uintrapredictor[64];
uint8_t vintrapredictor[64];
vp9_build_intra_predictors_internal(
xd->dst.u_buffer, xd->dst.uv_stride,
uintrapredictor, 8,
@ -656,11 +658,10 @@ void vp9_build_interintra_16x16_predictors_mbuv(MACROBLOCKD *xd,
vpred, uvstride, vintrapredictor, 8, 8);
}
#if CONFIG_SUPERBLOCKS
void vp9_build_interintra_32x32_predictors_sby(MACROBLOCKD *xd,
unsigned char *ypred,
uint8_t *ypred,
int ystride) {
unsigned char intrapredictor[1024];
uint8_t intrapredictor[1024];
vp9_build_intra_predictors_internal(
xd->dst.y_buffer, xd->dst.y_stride,
intrapredictor, 32,
@ -671,11 +672,11 @@ void vp9_build_interintra_32x32_predictors_sby(MACROBLOCKD *xd,
}
void vp9_build_interintra_32x32_predictors_sbuv(MACROBLOCKD *xd,
unsigned char *upred,
unsigned char *vpred,
uint8_t *upred,
uint8_t *vpred,
int uvstride) {
unsigned char uintrapredictor[256];
unsigned char vintrapredictor[256];
uint8_t uintrapredictor[256];
uint8_t vintrapredictor[256];
vp9_build_intra_predictors_internal(
xd->dst.u_buffer, xd->dst.uv_stride,
uintrapredictor, 16,
@ -693,16 +694,56 @@ void vp9_build_interintra_32x32_predictors_sbuv(MACROBLOCKD *xd,
}
void vp9_build_interintra_32x32_predictors_sb(MACROBLOCKD *xd,
unsigned char *ypred,
unsigned char *upred,
unsigned char *vpred,
uint8_t *ypred,
uint8_t *upred,
uint8_t *vpred,
int ystride,
int uvstride) {
vp9_build_interintra_32x32_predictors_sby(xd, ypred, ystride);
vp9_build_interintra_32x32_predictors_sbuv(xd, upred, vpred, uvstride);
}
#endif
#endif
void vp9_build_interintra_64x64_predictors_sby(MACROBLOCKD *xd,
uint8_t *ypred,
int ystride) {
uint8_t intrapredictor[4096];
const int mode = xd->mode_info_context->mbmi.interintra_mode;
vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
intrapredictor, 64, mode, 64,
xd->up_available, xd->left_available);
combine_interintra(xd->mode_info_context->mbmi.interintra_mode,
ypred, ystride, intrapredictor, 64, 64);
}
void vp9_build_interintra_64x64_predictors_sbuv(MACROBLOCKD *xd,
uint8_t *upred,
uint8_t *vpred,
int uvstride) {
uint8_t uintrapredictor[1024];
uint8_t vintrapredictor[1024];
const int mode = xd->mode_info_context->mbmi.interintra_uv_mode;
vp9_build_intra_predictors_internal(xd->dst.u_buffer, xd->dst.uv_stride,
uintrapredictor, 32, mode, 32,
xd->up_available, xd->left_available);
vp9_build_intra_predictors_internal(xd->dst.v_buffer, xd->dst.uv_stride,
vintrapredictor, 32, mode, 32,
xd->up_available, xd->left_available);
combine_interintra(xd->mode_info_context->mbmi.interintra_uv_mode,
upred, uvstride, uintrapredictor, 32, 32);
combine_interintra(xd->mode_info_context->mbmi.interintra_uv_mode,
vpred, uvstride, vintrapredictor, 32, 32);
}
void vp9_build_interintra_64x64_predictors_sb(MACROBLOCKD *xd,
uint8_t *ypred,
uint8_t *upred,
uint8_t *vpred,
int ystride,
int uvstride) {
vp9_build_interintra_64x64_predictors_sby(xd, ypred, ystride);
vp9_build_interintra_64x64_predictors_sbuv(xd, upred, vpred, uvstride);
}
#endif // CONFIG_COMP_INTERINTRA_PRED
void vp9_build_intra_predictors_mby(MACROBLOCKD *xd) {
vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
@ -718,40 +759,23 @@ void vp9_build_intra_predictors_mby_s(MACROBLOCKD *xd) {
xd->up_available, xd->left_available);
}
#if CONFIG_SUPERBLOCKS
void vp9_build_intra_predictors_sby_s(MACROBLOCKD *xd) {
vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
xd->dst.y_buffer, xd->dst.y_stride,
xd->mode_info_context->mbmi.mode, 32,
xd->up_available, xd->left_available);
}
#endif
#if CONFIG_COMP_INTRA_PRED
void vp9_build_comp_intra_predictors_mby(MACROBLOCKD *xd) {
unsigned char predictor[2][256];
int i;
void vp9_build_intra_predictors_sb64y_s(MACROBLOCKD *xd) {
vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
predictor[0], 16,
xd->mode_info_context->mbmi.mode,
16, xd->up_available,
xd->left_available);
vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
predictor[1], 16,
xd->mode_info_context->mbmi.second_mode,
16, xd->up_available,
xd->left_available);
for (i = 0; i < 256; i++) {
xd->predictor[i] = (predictor[0][i] + predictor[1][i] + 1) >> 1;
}
xd->dst.y_buffer, xd->dst.y_stride,
xd->mode_info_context->mbmi.mode, 64,
xd->up_available, xd->left_available);
}
#endif
void vp9_build_intra_predictors_mbuv_internal(MACROBLOCKD *xd,
unsigned char *upred_ptr,
unsigned char *vpred_ptr,
uint8_t *upred_ptr,
uint8_t *vpred_ptr,
int uv_stride,
int mode, int bsize) {
vp9_build_intra_predictors_internal(xd->dst.u_buffer, xd->dst.uv_stride,
@ -777,86 +801,36 @@ void vp9_build_intra_predictors_mbuv_s(MACROBLOCKD *xd) {
8);
}
#if CONFIG_SUPERBLOCKS
void vp9_build_intra_predictors_sbuv_s(MACROBLOCKD *xd) {
vp9_build_intra_predictors_mbuv_internal(xd, xd->dst.u_buffer,
xd->dst.v_buffer, xd->dst.uv_stride,
xd->mode_info_context->mbmi.uv_mode,
16);
}
#endif
#if CONFIG_COMP_INTRA_PRED
void vp9_build_comp_intra_predictors_mbuv(MACROBLOCKD *xd) {
unsigned char predictor[2][2][64];
int i;
vp9_build_intra_predictors_mbuv_internal(
xd, predictor[0][0], predictor[1][0], 8,
xd->mode_info_context->mbmi.uv_mode, 8);
vp9_build_intra_predictors_mbuv_internal(
xd, predictor[0][1], predictor[1][1], 8,
xd->mode_info_context->mbmi.second_uv_mode, 8);
for (i = 0; i < 64; i++) {
xd->predictor[256 + i] = (predictor[0][0][i] + predictor[0][1][i] + 1) >> 1;
xd->predictor[256 + 64 + i] = (predictor[1][0][i] +
predictor[1][1][i] + 1) >> 1;
}
void vp9_build_intra_predictors_sb64uv_s(MACROBLOCKD *xd) {
vp9_build_intra_predictors_mbuv_internal(xd, xd->dst.u_buffer,
xd->dst.v_buffer, xd->dst.uv_stride,
xd->mode_info_context->mbmi.uv_mode,
32);
}
#endif
void vp9_intra8x8_predict(BLOCKD *xd,
int mode,
unsigned char *predictor) {
uint8_t *predictor) {
vp9_build_intra_predictors_internal(*(xd->base_dst) + xd->dst,
xd->dst_stride, predictor, 16,
mode, 8, 1, 1);
}
#if CONFIG_COMP_INTRA_PRED
void vp9_comp_intra8x8_predict(BLOCKD *xd,
int mode, int second_mode,
unsigned char *out_predictor) {
unsigned char predictor[2][8 * 16];
int i, j;
vp9_intra8x8_predict(xd, mode, predictor[0]);
vp9_intra8x8_predict(xd, second_mode, predictor[1]);
for (i = 0; i < 8 * 16; i += 16) {
for (j = i; j < i + 8; j++) {
out_predictor[j] = (predictor[0][j] + predictor[1][j] + 1) >> 1;
}
}
}
#endif
void vp9_intra_uv4x4_predict(BLOCKD *xd,
int mode,
unsigned char *predictor) {
uint8_t *predictor) {
vp9_build_intra_predictors_internal(*(xd->base_dst) + xd->dst,
xd->dst_stride, predictor, 8,
mode, 4, 1, 1);
}
#if CONFIG_COMP_INTRA_PRED
void vp9_comp_intra_uv4x4_predict(BLOCKD *xd,
int mode, int mode2,
unsigned char *out_predictor) {
unsigned char predictor[2][8 * 4];
int i, j;
vp9_intra_uv4x4_predict(xd, mode, predictor[0]);
vp9_intra_uv4x4_predict(xd, mode2, predictor[1]);
for (i = 0; i < 4 * 8; i += 8) {
for (j = i; j < i + 4; j++) {
out_predictor[j] = (predictor[0][j] + predictor[1][j] + 1) >> 1;
}
}
}
#endif
/* TODO: try different ways of use Y-UV mode correlation
Current code assumes that a uv 4x4 block use same mode
as corresponding Y 8x8 area

View File

@ -11,34 +11,40 @@
#ifndef VP9_COMMON_VP9_RECONINTRA_H_
#define VP9_COMMON_VP9_RECONINTRA_H_
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_blockd.h"
extern void vp9_recon_intra_mbuv(MACROBLOCKD *xd);
extern B_PREDICTION_MODE vp9_find_dominant_direction(unsigned char *ptr,
extern B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr,
int stride, int n);
extern B_PREDICTION_MODE vp9_find_bpred_context(BLOCKD *x);
#if CONFIG_COMP_INTERINTRA_PRED
extern void vp9_build_interintra_16x16_predictors_mb(MACROBLOCKD *xd,
unsigned char *ypred,
unsigned char *upred,
unsigned char *vpred,
uint8_t *ypred,
uint8_t *upred,
uint8_t *vpred,
int ystride,
int uvstride);
extern void vp9_build_interintra_16x16_predictors_mby(MACROBLOCKD *xd,
unsigned char *ypred,
uint8_t *ypred,
int ystride);
extern void vp9_build_interintra_16x16_predictors_mbuv(MACROBLOCKD *xd,
unsigned char *upred,
unsigned char *vpred,
uint8_t *upred,
uint8_t *vpred,
int uvstride);
#if CONFIG_SUPERBLOCKS
#endif // CONFIG_COMP_INTERINTRA_PRED
extern void vp9_build_interintra_32x32_predictors_sb(MACROBLOCKD *xd,
unsigned char *ypred,
unsigned char *upred,
unsigned char *vpred,
uint8_t *ypred,
uint8_t *upred,
uint8_t *vpred,
int ystride,
int uvstride);
extern void vp9_build_interintra_64x64_predictors_sb(MACROBLOCKD *xd,
uint8_t *ypred,
uint8_t *upred,
uint8_t *vpred,
int ystride,
int uvstride);
#endif
#endif
#endif // __INC_RECONINTRA_H
#endif // VP9_COMMON_VP9_RECONINTRA_H_

View File

@ -15,7 +15,7 @@
#include "vp9_rtcd.h"
#if CONFIG_NEWBINTRAMODES
static int find_grad_measure(unsigned char *x, int stride, int n, int t,
static int find_grad_measure(uint8_t *x, int stride, int n, int t,
int dx, int dy) {
int i, j;
int count = 0, gsum = 0, gdiv;
@ -35,8 +35,8 @@ static int find_grad_measure(unsigned char *x, int stride, int n, int t,
}
#if CONTEXT_PRED_REPLACEMENTS == 6
B_PREDICTION_MODE vp9_find_dominant_direction(
unsigned char *ptr, int stride, int n) {
B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr,
int stride, int n) {
int g[8], i, imin, imax;
g[1] = find_grad_measure(ptr, stride, n, 4, 2, 1);
g[2] = find_grad_measure(ptr, stride, n, 4, 1, 1);
@ -72,8 +72,8 @@ B_PREDICTION_MODE vp9_find_dominant_direction(
}
}
#elif CONTEXT_PRED_REPLACEMENTS == 4
B_PREDICTION_MODE vp9_find_dominant_direction(
unsigned char *ptr, int stride, int n) {
B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr,
int stride, int n) {
int g[8], i, imin, imax;
g[1] = find_grad_measure(ptr, stride, n, 4, 2, 1);
g[3] = find_grad_measure(ptr, stride, n, 4, 1, 2);
@ -103,8 +103,8 @@ B_PREDICTION_MODE vp9_find_dominant_direction(
}
}
#elif CONTEXT_PRED_REPLACEMENTS == 0
B_PREDICTION_MODE vp9_find_dominant_direction(
unsigned char *ptr, int stride, int n) {
B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr,
int stride, int n) {
int g[8], i, imin, imax;
g[0] = find_grad_measure(ptr, stride, n, 4, 1, 0);
g[1] = find_grad_measure(ptr, stride, n, 4, 2, 1);
@ -145,7 +145,7 @@ B_PREDICTION_MODE vp9_find_dominant_direction(
#endif
B_PREDICTION_MODE vp9_find_bpred_context(BLOCKD *x) {
unsigned char *ptr = *(x->base_dst) + x->dst;
uint8_t *ptr = *(x->base_dst) + x->dst;
int stride = x->dst_stride;
return vp9_find_dominant_direction(ptr, stride, 4);
}
@ -153,17 +153,17 @@ B_PREDICTION_MODE vp9_find_bpred_context(BLOCKD *x) {
void vp9_intra4x4_predict(BLOCKD *x,
int b_mode,
unsigned char *predictor) {
uint8_t *predictor) {
int i, r, c;
unsigned char *Above = *(x->base_dst) + x->dst - x->dst_stride;
unsigned char Left[4];
unsigned char top_left = Above[-1];
uint8_t *above = *(x->base_dst) + x->dst - x->dst_stride;
uint8_t left[4];
uint8_t top_left = above[-1];
Left[0] = (*(x->base_dst))[x->dst - 1];
Left[1] = (*(x->base_dst))[x->dst - 1 + x->dst_stride];
Left[2] = (*(x->base_dst))[x->dst - 1 + 2 * x->dst_stride];
Left[3] = (*(x->base_dst))[x->dst - 1 + 3 * x->dst_stride];
left[0] = (*(x->base_dst))[x->dst - 1];
left[1] = (*(x->base_dst))[x->dst - 1 + x->dst_stride];
left[2] = (*(x->base_dst))[x->dst - 1 + 2 * x->dst_stride];
left[3] = (*(x->base_dst))[x->dst - 1 + 3 * x->dst_stride];
#if CONFIG_NEWBINTRAMODES
if (b_mode == B_CONTEXT_PRED)
@ -175,8 +175,8 @@ void vp9_intra4x4_predict(BLOCKD *x,
int expected_dc = 0;
for (i = 0; i < 4; i++) {
expected_dc += Above[i];
expected_dc += Left[i];
expected_dc += above[i];
expected_dc += left[i];
}
expected_dc = (expected_dc + 4) >> 3;
@ -194,15 +194,7 @@ void vp9_intra4x4_predict(BLOCKD *x,
/* prediction similar to true_motion prediction */
for (r = 0; r < 4; r++) {
for (c = 0; c < 4; c++) {
int pred = Above[c] - top_left + Left[r];
if (pred < 0)
pred = 0;
if (pred > 255)
pred = 255;
predictor[c] = pred;
predictor[c] = clip_pixel(above[c] - top_left + left[r]);
}
predictor += 16;
@ -211,33 +203,30 @@ void vp9_intra4x4_predict(BLOCKD *x,
break;
case B_VE_PRED: {
unsigned int ap[4];
ap[0] = Above[0];
ap[1] = Above[1];
ap[2] = Above[2];
ap[3] = Above[3];
ap[0] = above[0];
ap[1] = above[1];
ap[2] = above[2];
ap[3] = above[3];
for (r = 0; r < 4; r++) {
for (c = 0; c < 4; c++) {
predictor[c] = ap[c];
}
predictor += 16;
}
}
break;
case B_HE_PRED: {
unsigned int lp[4];
lp[0] = Left[0];
lp[1] = Left[1];
lp[2] = Left[2];
lp[3] = Left[3];
lp[0] = left[0];
lp[1] = left[1];
lp[2] = left[2];
lp[3] = left[3];
for (r = 0; r < 4; r++) {
for (c = 0; c < 4; c++) {
@ -249,7 +238,8 @@ void vp9_intra4x4_predict(BLOCKD *x,
}
break;
case B_LD_PRED: {
unsigned char *ptr = Above;
uint8_t *ptr = above;
predictor[0 * 16 + 0] = (ptr[0] + ptr[1] * 2 + ptr[2] + 2) >> 2;
predictor[0 * 16 + 1] =
predictor[1 * 16 + 0] = (ptr[1] + ptr[2] * 2 + ptr[3] + 2) >> 2;
@ -270,18 +260,17 @@ void vp9_intra4x4_predict(BLOCKD *x,
}
break;
case B_RD_PRED: {
uint8_t pp[9];
unsigned char pp[9];
pp[0] = Left[3];
pp[1] = Left[2];
pp[2] = Left[1];
pp[3] = Left[0];
pp[0] = left[3];
pp[1] = left[2];
pp[2] = left[1];
pp[3] = left[0];
pp[4] = top_left;
pp[5] = Above[0];
pp[6] = Above[1];
pp[7] = Above[2];
pp[8] = Above[3];
pp[5] = above[0];
pp[6] = above[1];
pp[7] = above[2];
pp[8] = above[3];
predictor[3 * 16 + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
predictor[3 * 16 + 1] =
@ -303,19 +292,17 @@ void vp9_intra4x4_predict(BLOCKD *x,
}
break;
case B_VR_PRED: {
uint8_t pp[9];
unsigned char pp[9];
pp[0] = Left[3];
pp[1] = Left[2];
pp[2] = Left[1];
pp[3] = Left[0];
pp[0] = left[3];
pp[1] = left[2];
pp[2] = left[1];
pp[3] = left[0];
pp[4] = top_left;
pp[5] = Above[0];
pp[6] = Above[1];
pp[7] = Above[2];
pp[8] = Above[3];
pp[5] = above[0];
pp[6] = above[1];
pp[7] = above[2];
pp[8] = above[3];
predictor[3 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
predictor[2 * 16 + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
@ -337,8 +324,7 @@ void vp9_intra4x4_predict(BLOCKD *x,
}
break;
case B_VL_PRED: {
unsigned char *pp = Above;
uint8_t *pp = above;
predictor[0 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
predictor[1 * 16 + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
@ -360,16 +346,17 @@ void vp9_intra4x4_predict(BLOCKD *x,
break;
case B_HD_PRED: {
unsigned char pp[9];
pp[0] = Left[3];
pp[1] = Left[2];
pp[2] = Left[1];
pp[3] = Left[0];
uint8_t pp[9];
pp[0] = left[3];
pp[1] = left[2];
pp[2] = left[1];
pp[3] = left[0];
pp[4] = top_left;
pp[5] = Above[0];
pp[6] = Above[1];
pp[7] = Above[2];
pp[8] = Above[3];
pp[5] = above[0];
pp[6] = above[1];
pp[7] = above[2];
pp[8] = above[3];
predictor[3 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
@ -393,7 +380,7 @@ void vp9_intra4x4_predict(BLOCKD *x,
case B_HU_PRED: {
unsigned char *pp = Left;
uint8_t *pp = left;
predictor[0 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
predictor[0 * 16 + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
predictor[0 * 16 + 2] =
@ -418,48 +405,38 @@ void vp9_intra4x4_predict(BLOCKD *x,
break;
/*
case B_CORNER_PRED:
corner_predictor(predictor, 16, 4, Above, Left);
corner_predictor(predictor, 16, 4, above, left);
break;
*/
#endif
}
}
#if CONFIG_COMP_INTRA_PRED
void vp9_comp_intra4x4_predict_c(BLOCKD *x,
int b_mode, int b_mode2,
unsigned char *out_predictor) {
unsigned char predictor[2][4 * 16];
int i, j;
vp9_intra4x4_predict(x, b_mode, predictor[0]);
vp9_intra4x4_predict(x, b_mode2, predictor[1]);
for (i = 0; i < 16 * 4; i += 16) {
for (j = i; j < i + 4; j++) {
out_predictor[j] = (predictor[0][j] + predictor[1][j] + 1) >> 1;
}
}
}
#endif
/* copy 4 bytes from the above right down so that the 4x4 prediction modes using pixels above and
* to the right prediction have filled in pixels to use.
*/
void vp9_intra_prediction_down_copy(MACROBLOCKD *xd) {
int extend_edge = (xd->mb_to_right_edge == 0 && xd->mb_index < 2);
unsigned char *above_right = *(xd->block[0].base_dst) + xd->block[0].dst -
int extend_edge = xd->mb_to_right_edge == 0 && xd->mb_index < 2;
uint8_t *above_right = *(xd->block[0].base_dst) + xd->block[0].dst -
xd->block[0].dst_stride + 16;
unsigned int *src_ptr = (unsigned int *)
(above_right - (xd->mb_index == 3 ? 16 * xd->block[0].dst_stride : 0));
uint32_t *dst_ptr0 = (uint32_t *)above_right;
uint32_t *dst_ptr1 =
(uint32_t *)(above_right + 4 * xd->block[0].dst_stride);
uint32_t *dst_ptr2 =
(uint32_t *)(above_right + 8 * xd->block[0].dst_stride);
uint32_t *dst_ptr3 =
(uint32_t *)(above_right + 12 * xd->block[0].dst_stride);
unsigned int *dst_ptr0 = (unsigned int *)above_right;
unsigned int *dst_ptr1 =
(unsigned int *)(above_right + 4 * xd->block[0].dst_stride);
unsigned int *dst_ptr2 =
(unsigned int *)(above_right + 8 * xd->block[0].dst_stride);
unsigned int *dst_ptr3 =
(unsigned int *)(above_right + 12 * xd->block[0].dst_stride);
uint32_t *src_ptr = (uint32_t *) above_right;
if ((xd->sb_index >= 2 && xd->mb_to_right_edge == 0) ||
(xd->sb_index == 3 && xd->mb_index & 1))
src_ptr = (uint32_t *) (((uint8_t *) src_ptr) - 32 *
xd->block[0].dst_stride);
if (xd->mb_index == 3 ||
(xd->mb_to_right_edge == 0 && xd->mb_index == 2))
src_ptr = (uint32_t *) (((uint8_t *) src_ptr) - 16 *
xd->block[0].dst_stride);
if (extend_edge) {
*src_ptr = ((uint8_t *) src_ptr)[-1] * 0x01010101U;

View File

@ -14,4 +14,4 @@
extern void vp9_intra_prediction_down_copy(MACROBLOCKD *xd);
#endif
#endif // VP9_COMMON_VP9_RECONINTRA4X4_H_

View File

@ -4,6 +4,8 @@ cat <<EOF
* VP9
*/
#include "vpx/vpx_integer.h"
struct loop_filter_info;
struct blockd;
struct macroblockd;
@ -21,10 +23,10 @@ EOF
}
forward_decls vp9_common_forward_decls
prototype void vp9_filter_block2d_4x4_8 "const unsigned char *src_ptr, const unsigned int src_stride, const short *HFilter_aligned16, const short *VFilter_aligned16, unsigned char *dst_ptr, unsigned int dst_stride"
prototype void vp9_filter_block2d_8x4_8 "const unsigned char *src_ptr, const unsigned int src_stride, const short *HFilter_aligned16, const short *VFilter_aligned16, unsigned char *dst_ptr, unsigned int dst_stride"
prototype void vp9_filter_block2d_8x8_8 "const unsigned char *src_ptr, const unsigned int src_stride, const short *HFilter_aligned16, const short *VFilter_aligned16, unsigned char *dst_ptr, unsigned int dst_stride"
prototype void vp9_filter_block2d_16x16_8 "const unsigned char *src_ptr, const unsigned int src_stride, const short *HFilter_aligned16, const short *VFilter_aligned16, unsigned char *dst_ptr, unsigned int dst_stride"
prototype void vp9_filter_block2d_4x4_8 "const uint8_t *src_ptr, const unsigned int src_stride, const int16_t *HFilter_aligned16, const int16_t *VFilter_aligned16, uint8_t *dst_ptr, unsigned int dst_stride"
prototype void vp9_filter_block2d_8x4_8 "const uint8_t *src_ptr, const unsigned int src_stride, const int16_t *HFilter_aligned16, const int16_t *VFilter_aligned16, uint8_t *dst_ptr, unsigned int dst_stride"
prototype void vp9_filter_block2d_8x8_8 "const uint8_t *src_ptr, const unsigned int src_stride, const int16_t *HFilter_aligned16, const int16_t *VFilter_aligned16, uint8_t *dst_ptr, unsigned int dst_stride"
prototype void vp9_filter_block2d_16x16_8 "const uint8_t *src_ptr, const unsigned int src_stride, const int16_t *HFilter_aligned16, const int16_t *VFilter_aligned16, uint8_t *dst_ptr, unsigned int dst_stride"
# At the very least, MSVC 2008 has compiler bug exhibited by this code; code
# compiles warning free but a dissassembly of generated code show bugs. To be
@ -45,70 +47,76 @@ specialize vp9_dequantize_b
prototype void vp9_dequantize_b_2x2 "struct blockd *x"
specialize vp9_dequantize_b_2x2
prototype void vp9_dequant_dc_idct_add_y_block_8x8 "short *q, const short *dq, unsigned char *pre, unsigned char *dst, int stride, unsigned short *eobs, const short *dc, struct macroblockd *xd"
prototype void vp9_dequant_dc_idct_add_y_block_8x8 "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dst, int stride, uint16_t *eobs, const int16_t *dc, struct macroblockd *xd"
specialize vp9_dequant_dc_idct_add_y_block_8x8
prototype void vp9_dequant_idct_add_y_block_8x8 "short *q, const short *dq, unsigned char *pre, unsigned char *dst, int stride, unsigned short *eobs, struct macroblockd *xd"
prototype void vp9_dequant_idct_add_y_block_8x8 "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dst, int stride, uint16_t *eobs, struct macroblockd *xd"
specialize vp9_dequant_idct_add_y_block_8x8
prototype void vp9_dequant_idct_add_uv_block_8x8 "short *q, const short *dq, unsigned char *pre, unsigned char *dstu, unsigned char *dstv, int stride, unsigned short *eobs, struct macroblockd *xd"
prototype void vp9_dequant_idct_add_uv_block_8x8 "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dstu, uint8_t *dstv, int stride, uint16_t *eobs, struct macroblockd *xd"
specialize vp9_dequant_idct_add_uv_block_8x8
prototype void vp9_dequant_idct_add_16x16 "short *input, const short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, unsigned short eobs"
prototype void vp9_dequant_idct_add_16x16 "int16_t *input, const int16_t *dq, uint8_t *pred, uint8_t *dest, int pitch, int stride, int eob"
specialize vp9_dequant_idct_add_16x16
prototype void vp9_dequant_idct_add_8x8 "short *input, const short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, int dc, unsigned short eobs"
prototype void vp9_dequant_idct_add_8x8 "int16_t *input, const int16_t *dq, uint8_t *pred, uint8_t *dest, int pitch, int stride, int dc, int eob"
specialize vp9_dequant_idct_add_8x8
prototype void vp9_dequant_idct_add "short *input, const short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride"
prototype void vp9_dequant_idct_add "int16_t *input, const int16_t *dq, uint8_t *pred, uint8_t *dest, int pitch, int stride"
specialize vp9_dequant_idct_add
prototype void vp9_dequant_dc_idct_add "short *input, const short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, int Dc"
prototype void vp9_dequant_dc_idct_add "int16_t *input, const int16_t *dq, uint8_t *pred, uint8_t *dest, int pitch, int stride, int dc"
specialize vp9_dequant_dc_idct_add
prototype void vp9_dequant_dc_idct_add_y_block "short *q, const short *dq, unsigned char *pre, unsigned char *dst, int stride, unsigned short *eobs, const short *dc"
prototype void vp9_dequant_dc_idct_add_y_block "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dst, int stride, uint16_t *eobs, const int16_t *dcs"
specialize vp9_dequant_dc_idct_add_y_block
prototype void vp9_dequant_idct_add_y_block "short *q, const short *dq, unsigned char *pre, unsigned char *dst, int stride, unsigned short *eobs"
prototype void vp9_dequant_idct_add_y_block "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dst, int stride, uint16_t *eobs"
specialize vp9_dequant_idct_add_y_block
prototype void vp9_dequant_idct_add_uv_block "short *q, const short *dq, unsigned char *pre, unsigned char *dstu, unsigned char *dstv, int stride, unsigned short *eobs"
prototype void vp9_dequant_idct_add_uv_block "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dstu, uint8_t *dstv, int stride, uint16_t *eobs"
specialize vp9_dequant_idct_add_uv_block
prototype void vp9_dequant_idct_add_32x32 "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dst, int pitch, int stride, int eob"
specialize vp9_dequant_idct_add_32x32
prototype void vp9_dequant_idct_add_uv_block_16x16 "int16_t *q, const int16_t *dq, uint8_t *dstu, uint8_t *dstv, int stride, uint16_t *eobs"
specialize vp9_dequant_idct_add_uv_block_16x16
#
# RECON
#
prototype void vp9_copy_mem16x16 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
prototype void vp9_copy_mem16x16 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
specialize vp9_copy_mem16x16 mmx sse2 dspr2
vp9_copy_mem16x16_dspr2=vp9_copy_mem16x16_dspr2
prototype void vp9_copy_mem8x8 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
prototype void vp9_copy_mem8x8 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
specialize vp9_copy_mem8x8 mmx dspr2
vp9_copy_mem8x8_dspr2=vp9_copy_mem8x8_dspr2
prototype void vp9_copy_mem8x4 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
prototype void vp9_copy_mem8x4 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
specialize vp9_copy_mem8x4 mmx
prototype void vp9_avg_mem16x16 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
prototype void vp9_avg_mem16x16 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
specialize vp9_avg_mem16x16
prototype void vp9_avg_mem8x8 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
prototype void vp9_avg_mem8x8 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
specialize vp9_avg_mem8x8
prototype void vp9_copy_mem8x4 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
prototype void vp9_copy_mem8x4 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
specialize vp9_copy_mem8x4 mmx dspr2
vp9_copy_mem8x4_dspr2=vp9_copy_mem8x4_dspr2
prototype void vp9_recon_b "unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int stride"
prototype void vp9_recon_b "uint8_t *pred_ptr, int16_t *diff_ptr, uint8_t *dst_ptr, int stride"
specialize vp9_recon_b
prototype void vp9_recon_uv_b "unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int stride"
prototype void vp9_recon_uv_b "uint8_t *pred_ptr, int16_t *diff_ptr, uint8_t *dst_ptr, int stride"
specialize vp9_recon_uv_b
prototype void vp9_recon2b "unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int stride"
prototype void vp9_recon2b "uint8_t *pred_ptr, int16_t *diff_ptr, uint8_t *dst_ptr, int stride"
specialize vp9_recon2b sse2
prototype void vp9_recon4b "unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int stride"
prototype void vp9_recon4b "uint8_t *pred_ptr, int16_t *diff_ptr, uint8_t *dst_ptr, int stride"
specialize vp9_recon4b sse2
prototype void vp9_recon_mb "struct macroblockd *x"
@ -117,12 +125,18 @@ specialize vp9_recon_mb
prototype void vp9_recon_mby "struct macroblockd *x"
specialize vp9_recon_mby
prototype void vp9_recon_mby_s "struct macroblockd *x, unsigned char *dst"
prototype void vp9_recon_mby_s "struct macroblockd *x, uint8_t *dst"
specialize vp9_recon_mby_s
prototype void vp9_recon_mbuv_s "struct macroblockd *x, unsigned char *udst, unsigned char *vdst"
prototype void vp9_recon_mbuv_s "struct macroblockd *x, uint8_t *udst, uint8_t *vdst"
specialize void vp9_recon_mbuv_s
prototype void vp9_recon_sby_s "struct macroblockd *x, uint8_t *dst"
specialize vp9_recon_sby_s
prototype void vp9_recon_sbuv_s "struct macroblockd *x, uint8_t *udst, uint8_t *vdst"
specialize void vp9_recon_sbuv_s
prototype void vp9_build_intra_predictors_mby_s "struct macroblockd *x"
specialize vp9_build_intra_predictors_mby_s
@ -135,9 +149,6 @@ specialize vp9_build_intra_predictors_sbuv_s;
prototype void vp9_build_intra_predictors_mby "struct macroblockd *x"
specialize vp9_build_intra_predictors_mby;
prototype void vp9_build_comp_intra_predictors_mby "struct macroblockd *x"
specialize vp9_build_comp_intra_predictors_mby;
prototype void vp9_build_intra_predictors_mby_s "struct macroblockd *x"
specialize vp9_build_intra_predictors_mby_s;
@ -147,262 +158,283 @@ specialize vp9_build_intra_predictors_mbuv;
prototype void vp9_build_intra_predictors_mbuv_s "struct macroblockd *x"
specialize vp9_build_intra_predictors_mbuv_s;
prototype void vp9_build_comp_intra_predictors_mbuv "struct macroblockd *x"
specialize vp9_build_comp_intra_predictors_mbuv;
prototype void vp9_build_intra_predictors_sb64y_s "struct macroblockd *x"
specialize vp9_build_intra_predictors_sb64y_s;
prototype void vp9_intra4x4_predict "struct blockd *x, int b_mode, unsigned char *predictor"
prototype void vp9_build_intra_predictors_sb64uv_s "struct macroblockd *x"
specialize vp9_build_intra_predictors_sb64uv_s;
prototype void vp9_intra4x4_predict "struct blockd *x, int b_mode, uint8_t *predictor"
specialize vp9_intra4x4_predict;
prototype void vp9_comp_intra4x4_predict "struct blockd *x, int b_mode, int second_mode, unsigned char *predictor"
specialize vp9_comp_intra4x4_predict;
prototype void vp9_intra8x8_predict "struct blockd *x, int b_mode, unsigned char *predictor"
prototype void vp9_intra8x8_predict "struct blockd *x, int b_mode, uint8_t *predictor"
specialize vp9_intra8x8_predict;
prototype void vp9_comp_intra8x8_predict "struct blockd *x, int b_mode, int second_mode, unsigned char *predictor"
specialize vp9_comp_intra8x8_predict;
prototype void vp9_intra_uv4x4_predict "struct blockd *x, int b_mode, unsigned char *predictor"
prototype void vp9_intra_uv4x4_predict "struct blockd *x, int b_mode, uint8_t *predictor"
specialize vp9_intra_uv4x4_predict;
prototype void vp9_comp_intra_uv4x4_predict "struct blockd *x, int b_mode, int second_mode, unsigned char *predictor"
specialize vp9_comp_intra_uv4x4_predict;
#
# Loopfilter
#
prototype void vp9_loop_filter_mbv "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
prototype void vp9_loop_filter_mbv "uint8_t *y, uint8_t *u, uint8_t *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
specialize vp9_loop_filter_mbv sse2
prototype void vp9_loop_filter_bv "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
prototype void vp9_loop_filter_bv "uint8_t *y, uint8_t *u, uint8_t *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
specialize vp9_loop_filter_bv sse2
prototype void vp9_loop_filter_bv8x8 "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
prototype void vp9_loop_filter_bv8x8 "uint8_t *y, uint8_t *u, uint8_t *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
specialize vp9_loop_filter_bv8x8 sse2
prototype void vp9_loop_filter_mbh "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
prototype void vp9_loop_filter_mbh "uint8_t *y, uint8_t *u, uint8_t *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
specialize vp9_loop_filter_mbh sse2
prototype void vp9_loop_filter_bh "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
prototype void vp9_loop_filter_bh "uint8_t *y, uint8_t *u, uint8_t *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
specialize vp9_loop_filter_bh sse2
prototype void vp9_loop_filter_bh8x8 "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
prototype void vp9_loop_filter_bh8x8 "uint8_t *y, uint8_t *u, uint8_t *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
specialize vp9_loop_filter_bh8x8 sse2
prototype void vp9_loop_filter_simple_mbv "unsigned char *y, int ystride, const unsigned char *blimit"
prototype void vp9_loop_filter_simple_mbv "uint8_t *y, int ystride, const uint8_t *blimit"
specialize vp9_loop_filter_simple_mbv mmx sse2
vp9_loop_filter_simple_mbv_c=vp9_loop_filter_simple_vertical_edge_c
vp9_loop_filter_simple_mbv_mmx=vp9_loop_filter_simple_vertical_edge_mmx
vp9_loop_filter_simple_mbv_sse2=vp9_loop_filter_simple_vertical_edge_sse2
prototype void vp9_loop_filter_simple_mbh "unsigned char *y, int ystride, const unsigned char *blimit"
prototype void vp9_loop_filter_simple_mbh "uint8_t *y, int ystride, const uint8_t *blimit"
specialize vp9_loop_filter_simple_mbh mmx sse2
vp9_loop_filter_simple_mbh_c=vp9_loop_filter_simple_horizontal_edge_c
vp9_loop_filter_simple_mbh_mmx=vp9_loop_filter_simple_horizontal_edge_mmx
vp9_loop_filter_simple_mbh_sse2=vp9_loop_filter_simple_horizontal_edge_sse2
prototype void vp9_loop_filter_simple_bv "unsigned char *y, int ystride, const unsigned char *blimit"
prototype void vp9_loop_filter_simple_bv "uint8_t *y, int ystride, const uint8_t *blimit"
specialize vp9_loop_filter_simple_bv mmx sse2
vp9_loop_filter_simple_bv_c=vp9_loop_filter_bvs_c
vp9_loop_filter_simple_bv_mmx=vp9_loop_filter_bvs_mmx
vp9_loop_filter_simple_bv_sse2=vp9_loop_filter_bvs_sse2
prototype void vp9_loop_filter_simple_bh "unsigned char *y, int ystride, const unsigned char *blimit"
prototype void vp9_loop_filter_simple_bh "uint8_t *y, int ystride, const uint8_t *blimit"
specialize vp9_loop_filter_simple_bh mmx sse2
vp9_loop_filter_simple_bh_c=vp9_loop_filter_bhs_c
vp9_loop_filter_simple_bh_mmx=vp9_loop_filter_bhs_mmx
vp9_loop_filter_simple_bh_sse2=vp9_loop_filter_bhs_sse2
prototype void vp9_lpf_mbh_w "unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi"
specialize vp9_lpf_mbh_w sse2
prototype void vp9_lpf_mbv_w "unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi"
specialize vp9_lpf_mbv_w sse2
#
# post proc
#
if [ "$CONFIG_POSTPROC" = "yes" ]; then
prototype void vp9_mbpost_proc_down "unsigned char *dst, int pitch, int rows, int cols, int flimit"
prototype void vp9_mbpost_proc_down "uint8_t *dst, int pitch, int rows, int cols, int flimit"
specialize vp9_mbpost_proc_down mmx sse2
vp9_mbpost_proc_down_sse2=vp9_mbpost_proc_down_xmm
prototype void vp9_mbpost_proc_across_ip "unsigned char *src, int pitch, int rows, int cols, int flimit"
prototype void vp9_mbpost_proc_across_ip "uint8_t *src, int pitch, int rows, int cols, int flimit"
specialize vp9_mbpost_proc_across_ip sse2
vp9_mbpost_proc_across_ip_sse2=vp9_mbpost_proc_across_ip_xmm
prototype void vp9_post_proc_down_and_across "unsigned char *src_ptr, unsigned char *dst_ptr, int src_pixels_per_line, int dst_pixels_per_line, int rows, int cols, int flimit"
prototype void vp9_post_proc_down_and_across "uint8_t *src_ptr, uint8_t *dst_ptr, int src_pixels_per_line, int dst_pixels_per_line, int rows, int cols, int flimit"
specialize vp9_post_proc_down_and_across mmx sse2
vp9_post_proc_down_and_across_sse2=vp9_post_proc_down_and_across_xmm
prototype void vp9_plane_add_noise "unsigned char *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch"
prototype void vp9_plane_add_noise "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch"
specialize vp9_plane_add_noise mmx sse2
vp9_plane_add_noise_sse2=vp9_plane_add_noise_wmt
fi
prototype void vp9_blend_mb_inner "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride"
prototype void vp9_blend_mb_inner "uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride"
specialize vp9_blend_mb_inner
prototype void vp9_blend_mb_outer "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride"
prototype void vp9_blend_mb_outer "uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride"
specialize vp9_blend_mb_outer
prototype void vp9_blend_b "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride"
prototype void vp9_blend_b "uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride"
specialize vp9_blend_b
#
# sad 16x3, 3x16
#
prototype unsigned int vp9_sad16x3 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride"
prototype unsigned int vp9_sad16x3 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"
specialize vp9_sad16x3 sse2
prototype unsigned int vp9_sad3x16 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride"
prototype unsigned int vp9_sad3x16 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"
specialize vp9_sad3x16 sse2
prototype unsigned int vp9_sub_pixel_variance16x2 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int Refstride, unsigned int *sse"
specialize vp9_sub_pixel_variance16x2 sse2
#
# Sub Pixel Filters
#
prototype void vp9_eighttap_predict16x16 "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"
prototype void vp9_eighttap_predict16x16 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_eighttap_predict16x16
prototype void vp9_eighttap_predict8x8 "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"
prototype void vp9_eighttap_predict8x8 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_eighttap_predict8x8
prototype void vp9_eighttap_predict_avg16x16 "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"
prototype void vp9_eighttap_predict_avg16x16 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_eighttap_predict_avg16x16
prototype void vp9_eighttap_predict_avg8x8 "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"
prototype void vp9_eighttap_predict_avg8x8 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_eighttap_predict_avg8x8
prototype void vp9_eighttap_predict_avg4x4 "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"
prototype void vp9_eighttap_predict_avg4x4 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_eighttap_predict_avg4x4
prototype void vp9_eighttap_predict8x4 "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"
prototype void vp9_eighttap_predict8x4 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_eighttap_predict8x4
prototype void vp9_eighttap_predict "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"
specialize vp9_eighttap_predict
prototype void vp9_eighttap_predict4x4 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_eighttap_predict4x4
prototype void vp9_eighttap_predict16x16_sharp "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"
prototype void vp9_eighttap_predict16x16_sharp "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_eighttap_predict16x16_sharp
prototype void vp9_eighttap_predict8x8_sharp "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"
prototype void vp9_eighttap_predict8x8_sharp "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_eighttap_predict8x8_sharp
prototype void vp9_eighttap_predict_avg16x16_sharp "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"
prototype void vp9_eighttap_predict_avg16x16_sharp "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_eighttap_predict_avg16x16_sharp
prototype void vp9_eighttap_predict_avg8x8_sharp "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"
prototype void vp9_eighttap_predict_avg8x8_sharp "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_eighttap_predict_avg8x8_sharp
prototype void vp9_eighttap_predict_avg4x4_sharp "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"
prototype void vp9_eighttap_predict_avg4x4_sharp "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_eighttap_predict_avg4x4_sharp
prototype void vp9_eighttap_predict8x4_sharp "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"
prototype void vp9_eighttap_predict8x4_sharp "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_eighttap_predict8x4_sharp
prototype void vp9_eighttap_predict_sharp "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"
specialize vp9_eighttap_predict_sharp
prototype void vp9_eighttap_predict4x4_sharp "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_eighttap_predict4x4_sharp
prototype void vp9_sixtap_predict16x16 "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"
prototype void vp9_eighttap_predict16x16_smooth "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_eighttap_predict16x16_smooth
prototype void vp9_eighttap_predict8x8_smooth "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_eighttap_predict8x8_smooth
prototype void vp9_eighttap_predict_avg16x16_smooth "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_eighttap_predict_avg16x16_smooth
prototype void vp9_eighttap_predict_avg8x8_smooth "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_eighttap_predict_avg8x8_smooth
prototype void vp9_eighttap_predict_avg4x4_smooth "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_eighttap_predict_avg4x4_smooth
prototype void vp9_eighttap_predict8x4_smooth "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_eighttap_predict8x4_smooth
prototype void vp9_eighttap_predict4x4_smooth "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_eighttap_predict4x4_smooth
prototype void vp9_sixtap_predict16x16 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_sixtap_predict16x16
prototype void vp9_sixtap_predict8x8 "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"
prototype void vp9_sixtap_predict8x8 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_sixtap_predict8x8
prototype void vp9_sixtap_predict_avg16x16 "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"
prototype void vp9_sixtap_predict_avg16x16 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_sixtap_predict_avg16x16
prototype void vp9_sixtap_predict_avg8x8 "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"
prototype void vp9_sixtap_predict_avg8x8 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_sixtap_predict_avg8x8
prototype void vp9_sixtap_predict8x4 "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"
prototype void vp9_sixtap_predict8x4 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_sixtap_predict8x4
prototype void vp9_sixtap_predict "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"
specialize vp9_sixtap_predict
prototype void vp9_sixtap_predict4x4 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_sixtap_predict4x4
prototype void vp9_sixtap_predict_avg "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"
specialize vp9_sixtap_predict_avg
prototype void vp9_sixtap_predict_avg4x4 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_sixtap_predict_avg4x4
prototype void vp9_bilinear_predict16x16 "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"
specialize vp9_bilinear_predict16x16 mmx sse2
prototype void vp9_bilinear_predict16x16 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_bilinear_predict16x16 sse2
prototype void vp9_bilinear_predict8x8 "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"
specialize vp9_bilinear_predict8x8 mmx sse2
prototype void vp9_bilinear_predict8x8 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_bilinear_predict8x8 sse2
prototype void vp9_bilinear_predict_avg16x16 "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"
prototype void vp9_bilinear_predict_avg16x16 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_bilinear_predict_avg16x16
prototype void vp9_bilinear_predict_avg8x8 "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"
prototype void vp9_bilinear_predict_avg8x8 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_bilinear_predict_avg8x8
prototype void vp9_bilinear_predict8x4 "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"
specialize vp9_bilinear_predict8x4 mmx
prototype void vp9_bilinear_predict8x4 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_bilinear_predict8x4
prototype void vp9_bilinear_predict4x4 "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"
specialize vp9_bilinear_predict4x4 mmx
prototype void vp9_bilinear_predict4x4 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_bilinear_predict4x4
prototype void vp9_bilinear_predict_avg4x4 "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"
prototype void vp9_bilinear_predict_avg4x4 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_bilinear_predict_avg4x4
#
# dct
#
prototype void vp9_short_idct4x4llm_1 "short *input, short *output, int pitch"
prototype void vp9_short_idct4x4llm_1 "int16_t *input, int16_t *output, int pitch"
specialize vp9_short_idct4x4llm_1
prototype void vp9_short_idct4x4llm "short *input, short *output, int pitch"
prototype void vp9_short_idct4x4llm "int16_t *input, int16_t *output, int pitch"
specialize vp9_short_idct4x4llm
prototype void vp9_short_idct8x8 "short *input, short *output, int pitch"
prototype void vp9_short_idct8x8 "int16_t *input, int16_t *output, int pitch"
specialize vp9_short_idct8x8
prototype void vp9_short_idct10_8x8 "short *input, short *output, int pitch"
prototype void vp9_short_idct10_8x8 "int16_t *input, int16_t *output, int pitch"
specialize vp9_short_idct10_8x8
prototype void vp9_short_ihaar2x2 "short *input, short *output, int pitch"
prototype void vp9_short_ihaar2x2 "int16_t *input, int16_t *output, int pitch"
specialize vp9_short_ihaar2x2
prototype void vp9_short_idct16x16 "short *input, short *output, int pitch"
prototype void vp9_short_idct16x16 "int16_t *input, int16_t *output, int pitch"
specialize vp9_short_idct16x16
prototype void vp9_short_idct10_16x16 "short *input, short *output, int pitch"
prototype void vp9_short_idct10_16x16 "int16_t *input, int16_t *output, int pitch"
specialize vp9_short_idct10_16x16
prototype void vp9_ihtllm "const short *input, short *output, int pitch, int tx_type, int tx_dim, short eobs"
prototype void vp9_short_idct32x32 "int16_t *input, int16_t *output, int pitch"
specialize vp9_short_idct32x32
prototype void vp9_ihtllm "const int16_t *input, int16_t *output, int pitch, int tx_type, int tx_dim, int16_t eobs"
specialize vp9_ihtllm
#
# 2nd order
#
prototype void vp9_short_inv_walsh4x4_1 "short *in, short *out"
prototype void vp9_short_inv_walsh4x4_1 "int16_t *in, int16_t *out"
specialize vp9_short_inv_walsh4x4_1
prototype void vp9_short_inv_walsh4x4 "short *in, short *out"
prototype void vp9_short_inv_walsh4x4 "int16_t *in, int16_t *out"
specialize vp9_short_inv_walsh4x4_
# dct and add
prototype void vp9_dc_only_idct_add_8x8 "short input_dc, unsigned char *pred_ptr, unsigned char *dst_ptr, int pitch, int stride"
prototype void vp9_dc_only_idct_add_8x8 "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride"
specialize vp9_dc_only_idct_add_8x8
prototype void vp9_dc_only_idct_add "short input_dc, unsigned char *pred_ptr, unsigned char *dst_ptr, int pitch, int stride"
prototype void vp9_dc_only_idct_add "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride"
specialize vp9_dc_only_idct_add
if [ "$CONFIG_LOSSLESS" = "yes" ]; then
prototype void vp9_short_inv_walsh4x4_1_x8 "short *input, short *output, int pitch"
prototype void vp9_short_inv_walsh4x4_x8 "short *input, short *output, int pitch"
prototype void vp9_dc_only_inv_walsh_add "short input_dc, unsigned char *pred_ptr, unsigned char *dst_ptr, int pitch, int stride"
prototype void vp9_short_inv_walsh4x4_1_lossless "short *in, short *out"
prototype void vp9_short_inv_walsh4x4_lossless "short *in, short *out"
prototype void vp9_short_inv_walsh4x4_1_x8 "int16_t *input, int16_t *output, int pitch"
prototype void vp9_short_inv_walsh4x4_x8 "int16_t *input, int16_t *output, int pitch"
prototype void vp9_dc_only_inv_walsh_add "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride"
prototype void vp9_short_inv_walsh4x4_1_lossless "int16_t *in, int16_t *out"
prototype void vp9_short_inv_walsh4x4_lossless "int16_t *in, int16_t *out"
fi
if [ "$CONFIG_SUPERBLOCKS" = "yes" ]; then
prototype unsigned int vp9_sad32x3 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, int max_sad"
prototype unsigned int vp9_sad32x3 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int max_sad"
specialize vp9_sad32x3
prototype unsigned int vp9_sad3x32 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, int max_sad"
prototype unsigned int vp9_sad3x32 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int max_sad"
specialize vp9_sad3x32
fi
#
# Encoder functions below this point.
#
@ -412,154 +444,181 @@ if [ "$CONFIG_VP9_ENCODER" = "yes" ]; then
# variance
[ $arch = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2
prototype unsigned int vp9_variance32x32 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
prototype unsigned int vp9_variance32x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_variance32x32
prototype unsigned int vp9_variance16x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
prototype unsigned int vp9_variance64x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_variance64x64
prototype unsigned int vp9_variance16x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_variance16x16 mmx sse2
vp9_variance16x16_sse2=vp9_variance16x16_wmt
vp9_variance16x16_mmx=vp9_variance16x16_mmx
prototype unsigned int vp9_variance16x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
prototype unsigned int vp9_variance16x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_variance16x8 mmx sse2
vp9_variance16x8_sse2=vp9_variance16x8_wmt
vp9_variance16x8_mmx=vp9_variance16x8_mmx
prototype unsigned int vp9_variance8x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
prototype unsigned int vp9_variance8x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_variance8x16 mmx sse2
vp9_variance8x16_sse2=vp9_variance8x16_wmt
vp9_variance8x16_mmx=vp9_variance8x16_mmx
prototype unsigned int vp9_variance8x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
prototype unsigned int vp9_variance8x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_variance8x8 mmx sse2
vp9_variance8x8_sse2=vp9_variance8x8_wmt
vp9_variance8x8_mmx=vp9_variance8x8_mmx
prototype unsigned int vp9_variance4x4 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
prototype unsigned int vp9_variance4x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_variance4x4 mmx sse2
vp9_variance4x4_sse2=vp9_variance4x4_wmt
vp9_variance4x4_mmx=vp9_variance4x4_mmx
prototype unsigned int vp9_sub_pixel_variance32x32 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
prototype unsigned int vp9_sub_pixel_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int Refstride, unsigned int *sse"
specialize vp9_sub_pixel_variance64x64
prototype unsigned int vp9_sub_pixel_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int Refstride, unsigned int *sse"
specialize vp9_sub_pixel_variance32x32
prototype unsigned int vp9_sub_pixel_variance16x16 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
prototype unsigned int vp9_sub_pixel_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int Refstride, unsigned int *sse"
specialize vp9_sub_pixel_variance16x16 sse2 mmx ssse3
vp9_sub_pixel_variance16x16_sse2=vp9_sub_pixel_variance16x16_wmt
prototype unsigned int vp9_sub_pixel_variance8x16 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
prototype unsigned int vp9_sub_pixel_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int Refstride, unsigned int *sse"
specialize vp9_sub_pixel_variance8x16 sse2 mmx
vp9_sub_pixel_variance8x16_sse2=vp9_sub_pixel_variance8x16_wmt
prototype unsigned int vp9_sub_pixel_variance16x8 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
prototype unsigned int vp9_sub_pixel_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int Refstride, unsigned int *sse"
specialize vp9_sub_pixel_variance16x8 sse2 mmx ssse3
vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_ssse3;
vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_wmt
prototype unsigned int vp9_sub_pixel_variance8x8 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
prototype unsigned int vp9_sub_pixel_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int Refstride, unsigned int *sse"
specialize vp9_sub_pixel_variance8x8 sse2 mmx
vp9_sub_pixel_variance8x8_sse2=vp9_sub_pixel_variance8x8_wmt
prototype unsigned int vp9_sub_pixel_variance4x4 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
prototype unsigned int vp9_sub_pixel_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int Refstride, unsigned int *sse"
specialize vp9_sub_pixel_variance4x4 sse2 mmx
vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt
prototype unsigned int vp9_sad32x32 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad"
prototype unsigned int vp9_sad64x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
specialize vp9_sad64x64
prototype unsigned int vp9_sad32x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
specialize vp9_sad32x32
prototype unsigned int vp9_sad16x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad"
prototype unsigned int vp9_sad16x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
specialize vp9_sad16x16 mmx sse2 sse3
vp9_sad16x16_sse2=vp9_sad16x16_wmt
prototype unsigned int vp9_sad16x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad"
prototype unsigned int vp9_sad16x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
specialize vp9_sad16x8 mmx sse2
vp9_sad16x8_sse2=vp9_sad16x8_wmt
prototype unsigned int vp9_sad8x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad"
prototype unsigned int vp9_sad8x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
specialize vp9_sad8x16 mmx sse2
vp9_sad8x16_sse2=vp9_sad8x16_wmt
prototype unsigned int vp9_sad8x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad"
prototype unsigned int vp9_sad8x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
specialize vp9_sad8x8 mmx sse2
vp9_sad8x8_sse2=vp9_sad8x8_wmt
prototype unsigned int vp9_sad4x4 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad"
prototype unsigned int vp9_sad4x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
specialize vp9_sad4x4 mmx sse2
vp9_sad4x4_sse2=vp9_sad4x4_wmt
prototype unsigned int vp9_variance_halfpixvar16x16_h "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
prototype unsigned int vp9_variance_halfpixvar16x16_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_variance_halfpixvar16x16_h mmx sse2
vp9_variance_halfpixvar16x16_h_sse2=vp9_variance_halfpixvar16x16_h_wmt
prototype unsigned int vp9_variance_halfpixvar16x16_v "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
prototype unsigned int vp9_variance_halfpixvar16x16_v "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_variance_halfpixvar16x16_v mmx sse2
vp9_variance_halfpixvar16x16_v_sse2=vp9_variance_halfpixvar16x16_v_wmt
prototype unsigned int vp9_variance_halfpixvar16x16_hv "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
prototype unsigned int vp9_variance_halfpixvar16x16_hv "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_variance_halfpixvar16x16_hv mmx sse2
vp9_variance_halfpixvar16x16_hv_sse2=vp9_variance_halfpixvar16x16_hv_wmt
prototype unsigned int vp9_variance_halfpixvar32x32_h "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
prototype unsigned int vp9_variance_halfpixvar64x64_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_variance_halfpixvar64x64_h
prototype unsigned int vp9_variance_halfpixvar64x64_v "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_variance_halfpixvar64x64_v
prototype unsigned int vp9_variance_halfpixvar64x64_hv "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_variance_halfpixvar64x64_hv
prototype unsigned int vp9_variance_halfpixvar32x32_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_variance_halfpixvar32x32_h
prototype unsigned int vp9_variance_halfpixvar32x32_v "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
prototype unsigned int vp9_variance_halfpixvar32x32_v "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_variance_halfpixvar32x32_v
prototype unsigned int vp9_variance_halfpixvar32x32_hv "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
prototype unsigned int vp9_variance_halfpixvar32x32_hv "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_variance_halfpixvar32x32_hv
prototype void vp9_sad32x32x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array"
prototype void vp9_sad64x64x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array"
specialize vp9_sad64x64x3
prototype void vp9_sad32x32x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array"
specialize vp9_sad32x32x3
prototype void vp9_sad16x16x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array"
prototype void vp9_sad16x16x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array"
specialize vp9_sad16x16x3 sse3 ssse3
prototype void vp9_sad16x8x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array"
prototype void vp9_sad16x8x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array"
specialize vp9_sad16x8x3 sse3 ssse3
prototype void vp9_sad8x16x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array"
prototype void vp9_sad8x16x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array"
specialize vp9_sad8x16x3 sse3
prototype void vp9_sad8x8x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array"
prototype void vp9_sad8x8x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array"
specialize vp9_sad8x8x3 sse3
prototype void vp9_sad4x4x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array"
prototype void vp9_sad4x4x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array"
specialize vp9_sad4x4x3 sse3
prototype void vp9_sad32x32x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array"
prototype void vp9_sad64x64x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint16_t *sad_array"
specialize vp9_sad64x64x8
prototype void vp9_sad32x32x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint16_t *sad_array"
specialize vp9_sad32x32x8
prototype void vp9_sad16x16x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array"
prototype void vp9_sad16x16x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint16_t *sad_array"
specialize vp9_sad16x16x8 sse4
prototype void vp9_sad16x8x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array"
prototype void vp9_sad16x8x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint16_t *sad_array"
specialize vp9_sad16x8x8 sse4
prototype void vp9_sad8x16x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array"
prototype void vp9_sad8x16x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint16_t *sad_array"
specialize vp9_sad8x16x8 sse4
prototype void vp9_sad8x8x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array"
prototype void vp9_sad8x8x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint16_t *sad_array"
specialize vp9_sad8x8x8 sse4
prototype void vp9_sad4x4x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array"
prototype void vp9_sad4x4x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint16_t *sad_array"
specialize vp9_sad4x4x8 sse4
prototype void vp9_sad32x32x4d "const unsigned char *src_ptr, int src_stride, const unsigned char **ref_ptr, int ref_stride, unsigned int *sad_array"
prototype void vp9_sad64x64x4d "const uint8_t *src_ptr, int src_stride, const uint8_t **ref_ptr, int ref_stride, unsigned int *sad_array"
specialize vp9_sad64x64x4d
prototype void vp9_sad32x32x4d "const uint8_t *src_ptr, int src_stride, const uint8_t **ref_ptr, int ref_stride, unsigned int *sad_array"
specialize vp9_sad32x32x4d
prototype void vp9_sad16x16x4d "const unsigned char *src_ptr, int src_stride, const unsigned char **ref_ptr, int ref_stride, unsigned int *sad_array"
prototype void vp9_sad16x16x4d "const uint8_t *src_ptr, int src_stride, const uint8_t **ref_ptr, int ref_stride, unsigned int *sad_array"
specialize vp9_sad16x16x4d sse3
prototype void vp9_sad16x8x4d "const unsigned char *src_ptr, int src_stride, const unsigned char **ref_ptr, int ref_stride, unsigned int *sad_array"
prototype void vp9_sad16x8x4d "const uint8_t *src_ptr, int src_stride, const uint8_t **ref_ptr, int ref_stride, unsigned int *sad_array"
specialize vp9_sad16x8x4d sse3
prototype void vp9_sad8x16x4d "const unsigned char *src_ptr, int src_stride, const unsigned char **ref_ptr, int ref_stride, unsigned int *sad_array"
prototype void vp9_sad8x16x4d "const uint8_t *src_ptr, int src_stride, const uint8_t **ref_ptr, int ref_stride, unsigned int *sad_array"
specialize vp9_sad8x16x4d sse3
prototype void vp9_sad8x8x4d "const unsigned char *src_ptr, int src_stride, const unsigned char **ref_ptr, int ref_stride, unsigned int *sad_array"
prototype void vp9_sad8x8x4d "const uint8_t *src_ptr, int src_stride, const uint8_t **ref_ptr, int ref_stride, unsigned int *sad_array"
specialize vp9_sad8x8x4d sse3
prototype void vp9_sad4x4x4d "const unsigned char *src_ptr, int src_stride, const unsigned char **ref_ptr, int ref_stride, unsigned int *sad_array"
prototype void vp9_sad4x4x4d "const uint8_t *src_ptr, int src_stride, const uint8_t **ref_ptr, int ref_stride, unsigned int *sad_array"
specialize vp9_sad4x4x4d sse3
#
@ -567,30 +626,33 @@ specialize vp9_sad4x4x4d sse3
#
case $arch in
x86*)
prototype void vp9_copy32xn "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n"
prototype void vp9_copy32xn "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, int n"
specialize vp9_copy32xn sse2 sse3
;;
esac
prototype unsigned int vp9_sub_pixel_mse16x16 "const unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, const unsigned char *dst_ptr, int dst_pixels_per_line, unsigned int *sse"
prototype unsigned int vp9_sub_pixel_mse16x16 "const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse"
specialize vp9_sub_pixel_mse16x16 sse2 mmx
vp9_sub_pixel_mse16x16_sse2=vp9_sub_pixel_mse16x16_wmt
prototype unsigned int vp9_mse16x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int recon_stride, unsigned int *sse"
prototype unsigned int vp9_mse16x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"
specialize vp9_mse16x16 mmx sse2
vp9_mse16x16_sse2=vp9_mse16x16_wmt
prototype unsigned int vp9_sub_pixel_mse32x32 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
prototype unsigned int vp9_sub_pixel_mse64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int Refstride, unsigned int *sse"
specialize vp9_sub_pixel_mse64x64
prototype unsigned int vp9_sub_pixel_mse32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int Refstride, unsigned int *sse"
specialize vp9_sub_pixel_mse32x32
prototype unsigned int vp9_get_mb_ss "const short *"
prototype unsigned int vp9_get_mb_ss "const int16_t *"
specialize vp9_get_mb_ss mmx sse2
# ENCODEMB INVOKE
prototype int vp9_mbblock_error "struct macroblock *mb, int dc"
specialize vp9_mbblock_error mmx sse2
vp9_mbblock_error_sse2=vp9_mbblock_error_xmm
prototype int vp9_block_error "short *coeff, short *dqcoeff, int block_size"
prototype int vp9_block_error "int16_t *coeff, int16_t *dqcoeff, int block_size"
specialize vp9_block_error mmx sse2
vp9_block_error_sse2=vp9_block_error_xmm
@ -604,10 +666,10 @@ vp9_mbuverror_sse2=vp9_mbuverror_xmm
prototype void vp9_subtract_b "struct block *be, struct blockd *bd, int pitch"
specialize vp9_subtract_b mmx sse2
prototype void vp9_subtract_mby "short *diff, unsigned char *src, unsigned char *pred, int stride"
prototype void vp9_subtract_mby "int16_t *diff, uint8_t *src, uint8_t *pred, int stride"
specialize vp9_subtract_mby mmx sse2
prototype void vp9_subtract_mbuv "short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride"
prototype void vp9_subtract_mbuv "int16_t *diff, uint8_t *usrc, uint8_t *vsrc, uint8_t *pred, int stride"
specialize vp9_subtract_mbuv mmx sse2
#
@ -616,42 +678,45 @@ specialize vp9_subtract_mbuv mmx sse2
if [ "$CONFIG_INTERNAL_STATS" = "yes" ]; then
[ $arch = "x86_64" ] && sse2_on_x86_64=sse2
prototype void vp9_ssim_parms_8x8 "unsigned char *s, int sp, unsigned char *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr"
prototype void vp9_ssim_parms_8x8 "uint8_t *s, int sp, uint8_t *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr"
specialize vp9_ssim_parms_8x8 $sse2_on_x86_64
prototype void vp9_ssim_parms_16x16 "unsigned char *s, int sp, unsigned char *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr"
prototype void vp9_ssim_parms_16x16 "uint8_t *s, int sp, uint8_t *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr"
specialize vp9_ssim_parms_16x16 $sse2_on_x86_64
fi
# fdct functions
prototype void vp9_fht "const short *input, int pitch, short *output, int tx_type, int tx_dim"
prototype void vp9_fht "const int16_t *input, int pitch, int16_t *output, int tx_type, int tx_dim"
specialize vp9_fht
prototype void vp9_short_fdct8x8 "short *InputData, short *OutputData, int pitch"
prototype void vp9_short_fdct8x8 "int16_t *InputData, int16_t *OutputData, int pitch"
specialize vp9_short_fdct8x8
prototype void vp9_short_fhaar2x2 "short *InputData, short *OutputData, int pitch"
prototype void vp9_short_fhaar2x2 "int16_t *InputData, int16_t *OutputData, int pitch"
specialize vp9_short_fhaar2x2
prototype void vp9_short_fdct4x4 "short *InputData, short *OutputData, int pitch"
prototype void vp9_short_fdct4x4 "int16_t *InputData, int16_t *OutputData, int pitch"
specialize vp9_short_fdct4x4
prototype void vp9_short_fdct8x4 "short *InputData, short *OutputData, int pitch"
prototype void vp9_short_fdct8x4 "int16_t *InputData, int16_t *OutputData, int pitch"
specialize vp9_short_fdct8x4
prototype void vp9_short_walsh4x4 "short *InputData, short *OutputData, int pitch"
prototype void vp9_short_walsh4x4 "int16_t *InputData, int16_t *OutputData, int pitch"
specialize vp9_short_walsh4x4
prototype void vp9_short_fdct16x16 "short *InputData, short *OutputData, int pitch"
prototype void vp9_short_fdct32x32 "int16_t *InputData, int16_t *OutputData, int pitch"
specialize vp9_short_fdct32x32
prototype void vp9_short_fdct16x16 "int16_t *InputData, int16_t *OutputData, int pitch"
specialize vp9_short_fdct16x16
prototype void vp9_short_walsh4x4_lossless "short *InputData, short *OutputData, int pitch"
prototype void vp9_short_walsh4x4_lossless "int16_t *InputData, int16_t *OutputData, int pitch"
specialize vp9_short_walsh4x4_lossless
prototype void vp9_short_walsh4x4_x8 "short *InputData, short *OutputData, int pitch"
prototype void vp9_short_walsh4x4_x8 "int16_t *InputData, int16_t *OutputData, int pitch"
specialize vp9_short_walsh4x4_x8
prototype void vp9_short_walsh8x4_x8 "short *InputData, short *OutputData, int pitch"
prototype void vp9_short_walsh8x4_x8 "int16_t *InputData, int16_t *OutputData, int pitch"
specialize vp9_short_walsh8x4_x8
#
@ -670,7 +735,7 @@ prototype int vp9_diamond_search_sad "struct macroblock *x, struct block *b, str
specialize vp9_diamond_search_sad sse3
vp9_diamond_search_sad_sse3=vp9_diamond_search_sadx4
prototype void vp9_temporal_filter_apply "unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count"
prototype void vp9_temporal_filter_apply "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count"
specialize vp9_temporal_filter_apply sse2
prototype void vp9_yv12_copy_partial_frame "struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int fraction"

View File

@ -11,14 +11,14 @@
#ifndef VP9_COMMON_VP9_SADMXN_H_
#define VP9_COMMON_VP9_SADMXN_H_
static __inline
unsigned int sad_mx_n_c(
const unsigned char *src_ptr,
int src_stride,
const unsigned char *ref_ptr,
int ref_stride,
int m,
int n) {
#include "vpx/vpx_integer.h"
static __inline unsigned int sad_mx_n_c(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
int ref_stride,
int m,
int n) {
int r, c;
unsigned int sad = 0;
@ -34,4 +34,4 @@ unsigned int sad_mx_n_c(
return sad;
}
#endif
#endif // VP9_COMMON_VP9_SADMXN_H_

View File

@ -14,7 +14,7 @@
static const int segfeaturedata_signed[SEG_LVL_MAX] = { 1, 1, 0, 0, 0, 0 };
static const int seg_feature_data_max[SEG_LVL_MAX] =
{ MAXQ, 63, 0xf, MB_MODE_COUNT - 1, 255, TX_SIZE_MAX - 1};
{ MAXQ, 63, 0xf, MB_MODE_COUNT - 1, 255, TX_SIZE_MAX_SB - 1};
// These functions provide access to new segment level features.
// Eventually these function may be "optimized out" but for the moment,

View File

@ -8,7 +8,6 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vp9/common/vp9_type_aliases.h"
#include "vp9/common/vp9_onyxc_int.h"
#include "vp9/common/vp9_blockd.h"
@ -60,5 +59,5 @@ int vp9_check_segref_inter(MACROBLOCKD *xd, int segment_id);
int vp9_get_seg_tx_type(MACROBLOCKD *xd, int segment_id);
#endif /* __INC_SEG_COMMON_H__ */
#endif // VP9_COMMON_VP9_SEG_COMMON_H_

View File

@ -18,14 +18,14 @@ void vp9_setup_intra_recon(YV12_BUFFER_CONFIG *ybf) {
/* set up frame new frame for intra coded blocks */
vpx_memset(ybf->y_buffer - 1 - ybf->y_stride, 127, ybf->y_width + 5);
for (i = 0; i < ybf->y_height; i++)
ybf->y_buffer[ybf->y_stride * i - 1] = (unsigned char) 129;
ybf->y_buffer[ybf->y_stride * i - 1] = (uint8_t) 129;
vpx_memset(ybf->u_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
for (i = 0; i < ybf->uv_height; i++)
ybf->u_buffer[ybf->uv_stride * i - 1] = (unsigned char) 129;
ybf->u_buffer[ybf->uv_stride * i - 1] = (uint8_t) 129;
vpx_memset(ybf->v_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
for (i = 0; i < ybf->uv_height; i++)
ybf->v_buffer[ybf->uv_stride * i - 1] = (unsigned char) 129;
ybf->v_buffer[ybf->uv_stride * i - 1] = (uint8_t) 129;
}

View File

@ -8,6 +8,11 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP9_COMMON_VP9_SETUPINTRARECON_H_
#define VP9_COMMON_VP9_SETUPINTRARECON_H_
#include "vpx_scale/yv12config.h"
extern void vp9_setup_intra_recon(YV12_BUFFER_CONFIG *ybf);
#endif // VP9_COMMON_VP9_SETUPINTRARECON_H_

View File

@ -8,14 +8,14 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP9_COMMON_VP9_SUBPELVAR_H_
#define VP9_COMMON_VP9_SUBPELVAR_H_
#include "vp9/common/vp9_filter.h"
static void variance(const unsigned char *src_ptr,
static void variance(const uint8_t *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
const uint8_t *ref_ptr,
int recon_stride,
int w,
int h,
@ -43,14 +43,14 @@ static void variance(const unsigned char *src_ptr,
*
* ROUTINE : filter_block2d_bil_first_pass
*
* INPUTS : UINT8 *src_ptr : Pointer to source block.
* UINT32 src_pixels_per_line : Stride of input block.
* UINT32 pixel_step : Offset between filter input samples (see notes).
* UINT32 output_height : Input block height.
* UINT32 output_width : Input block width.
* INT32 *vp9_filter : Array of 2 bi-linear filter taps.
* INPUTS : uint8_t *src_ptr : Pointer to source block.
* uint32_t src_pixels_per_line : Stride of input block.
* uint32_t pixel_step : Offset between filter input samples (see notes).
* uint32_t output_height : Input block height.
* uint32_t output_width : Input block width.
* int32_t *vp9_filter : Array of 2 bi-linear filter taps.
*
* OUTPUTS : INT32 *output_ptr : Pointer to filtered block.
* OUTPUTS : int32_t *output_ptr : Pointer to filtered block.
*
* RETURNS : void
*
@ -59,7 +59,7 @@ static void variance(const unsigned char *src_ptr,
* filtered output block. Used to implement first-pass
* of 2-D separable filter.
*
* SPECIAL NOTES : Produces INT32 output to retain precision for next pass.
* SPECIAL NOTES : Produces int32_t output to retain precision for next pass.
* Two filter taps should sum to VP9_FILTER_WEIGHT.
* pixel_step defines whether the filter is applied
* horizontally (pixel_step=1) or vertically (pixel_step=stride).
@ -67,13 +67,13 @@ static void variance(const unsigned char *src_ptr,
* to the next.
*
****************************************************************************/
static void var_filter_block2d_bil_first_pass(const unsigned char *src_ptr,
unsigned short *output_ptr,
static void var_filter_block2d_bil_first_pass(const uint8_t *src_ptr,
uint16_t *output_ptr,
unsigned int src_pixels_per_line,
int pixel_step,
unsigned int output_height,
unsigned int output_width,
const short *vp9_filter) {
const int16_t *vp9_filter) {
unsigned int i, j;
for (i = 0; i < output_height; i++) {
@ -95,14 +95,14 @@ static void var_filter_block2d_bil_first_pass(const unsigned char *src_ptr,
*
* ROUTINE : filter_block2d_bil_second_pass
*
* INPUTS : INT32 *src_ptr : Pointer to source block.
* UINT32 src_pixels_per_line : Stride of input block.
* UINT32 pixel_step : Offset between filter input samples (see notes).
* UINT32 output_height : Input block height.
* UINT32 output_width : Input block width.
* INT32 *vp9_filter : Array of 2 bi-linear filter taps.
* INPUTS : int32_t *src_ptr : Pointer to source block.
* uint32_t src_pixels_per_line : Stride of input block.
* uint32_t pixel_step : Offset between filter input samples (see notes).
* uint32_t output_height : Input block height.
* uint32_t output_width : Input block width.
* int32_t *vp9_filter : Array of 2 bi-linear filter taps.
*
* OUTPUTS : UINT16 *output_ptr : Pointer to filtered block.
* OUTPUTS : uint16_t *output_ptr : Pointer to filtered block.
*
* RETURNS : void
*
@ -119,13 +119,13 @@ static void var_filter_block2d_bil_first_pass(const unsigned char *src_ptr,
* to the next.
*
****************************************************************************/
static void var_filter_block2d_bil_second_pass(const unsigned short *src_ptr,
unsigned char *output_ptr,
static void var_filter_block2d_bil_second_pass(const uint16_t *src_ptr,
uint8_t *output_ptr,
unsigned int src_pixels_per_line,
unsigned int pixel_step,
unsigned int output_height,
unsigned int output_width,
const short *vp9_filter) {
const int16_t *vp9_filter) {
unsigned int i, j;
int Temp;
@ -145,3 +145,4 @@ static void var_filter_block2d_bil_second_pass(const unsigned short *src_ptr,
}
}
#endif // VP9_COMMON_VP9_SUBPELVAR_H_

View File

@ -8,14 +8,13 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP9_COMMON_VP9_SUBPIXEL_H_
#define VP9_COMMON_VP9_SUBPIXEL_H_
#define prototype_subpixel_predict(sym) \
void sym(unsigned char *src, int src_pitch, int xofst, int yofst, \
unsigned char *dst, int dst_pitch)
void sym(uint8_t *src, int src_pitch, int xofst, int yofst, \
uint8_t *dst, int dst_pitch)
typedef prototype_subpixel_predict((*vp9_subpix_fn_t));
#endif
#endif // VP9_COMMON_VP9_SUBPIXEL_H_

View File

@ -12,7 +12,7 @@
void vp9_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame,
YV12_BUFFER_CONFIG *last_frame) {
unsigned char *temp;
uint8_t *temp;
temp = last_frame->buffer_alloc;
last_frame->buffer_alloc = new_frame->buffer_alloc;

View File

@ -16,4 +16,4 @@
void vp9_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame,
YV12_BUFFER_CONFIG *last_frame);
#endif // __SWAPYV12_BUFFER_H
#endif // VP9_COMMON_VP9_SWAPYV12BUFFER_H_

View File

@ -7,9 +7,14 @@
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP9_COMMON_VP9_SYSTEMDEPENDENT_H_
#define VP9_COMMON_VP9_SYSTEMDEPENDENT_H_
#ifdef _MSC_VER
#include <math.h>
#endif
#include "./vpx_config.h"
#if ARCH_X86 || ARCH_X86_64
void vpx_reset_mmx_state(void);
@ -18,6 +23,17 @@ void vpx_reset_mmx_state(void);
#define vp9_clear_system_state()
#endif
#ifdef _MSC_VER
// round is not defined in MSVC
// MSVC's pre-C99 <math.h> does not provide round(); emulate it here.
// Rounds half away from zero, then truncates to int
// (e.g. 0.5 -> 1, -0.5 -> -1). Callers assume the result fits in an int.
// NOTE(review): this shadows/conflicts with the C99 library round() on
// conforming toolchains, which is why it is guarded for _MSC_VER only.
static int round(double x) {
  if (x < 0)
    return (int)ceil(x - 0.5);
  else
    return (int)floor(x + 0.5);
}
#endif
struct VP9Common;
void vp9_machine_specific_config(struct VP9Common *);
#endif
#endif // VP9_COMMON_VP9_SYSTEMDEPENDENT_H_

View File

@ -16,4 +16,4 @@ extern void vp9_blit_text(const char *msg, unsigned char *address,
extern void vp9_blit_line(int x0, int x1, int y0, int y1,
unsigned char *image, const int pitch);
#endif // __INC_TEXTBLIT_H
#endif // VP9_COMMON_VP9_TEXTBLIT_H_

View File

@ -100,9 +100,7 @@ void vp9_tree_probs_from_distribution(
vp9_tree tree,
vp9_prob probs [ /* n-1 */ ],
unsigned int branch_ct [ /* n-1 */ ] [2],
const unsigned int num_events[ /* n */ ],
unsigned int Pfac,
int rd
const unsigned int num_events[ /* n */ ]
) {
const int tree_len = n - 1;
int t = 0;
@ -110,29 +108,6 @@ void vp9_tree_probs_from_distribution(
branch_counts(n, tok, tree, branch_ct, num_events);
do {
const unsigned int *const c = branch_ct[t];
const unsigned int tot = c[0] + c[1];
#if CONFIG_DEBUG
assert(tot < (1 << 24)); /* no overflow below */
#endif
if (tot) {
const unsigned int p = ((c[0] * Pfac) + (rd ? tot >> 1 : 0)) / tot;
probs[t] = p < 256 ? (p ? p : 1) : 255; /* agree w/old version for now */
} else
probs[t] = vp9_prob_half;
probs[t] = get_binary_prob(branch_ct[t][0], branch_ct[t][1]);
} while (++t < tree_len);
}
/* Convert a two-bin event-count distribution into the probability of the
 * first bin: 255 * counts[0] / total, rounded to nearest and clamped to the
 * legal [1, 255] probability range.  An empty distribution (no events at
 * all) yields the neutral probability 128. */
vp9_prob vp9_bin_prob_from_distribution(const unsigned int counts[2]) {
  const unsigned int total = counts[0] + counts[1];
  vp9_prob p;

  if (!total)
    return 128;  /* no evidence either way */

  /* Round-to-nearest scaling, then lift a zero result up to the minimum
     representable probability. */
  p = (counts[0] * 255 + (total >> 1)) / total;
  return p ? p : 1;
}

View File

@ -8,29 +8,18 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP9_COMMON_VP9_TREECODER_H_
#define VP9_COMMON_VP9_TREECODER_H_
typedef unsigned char vp9_prob;
#include "vpx/vpx_integer.h"
#define vp9_prob_half ( (vp9_prob) 128)
typedef uint8_t vp9_prob;
typedef signed char vp9_tree_index;
struct bool_coder_spec;
#define vp9_prob_half ((vp9_prob) 128)
typedef struct bool_coder_spec bool_coder_spec;
typedef struct bool_writer bool_writer;
typedef struct bool_reader bool_reader;
typedef const bool_coder_spec c_bool_coder_spec;
typedef const bool_writer c_bool_writer;
typedef const bool_reader c_bool_reader;
# define vp9_complement( x) (255 - x)
typedef int8_t vp9_tree_index;
#define vp9_complement(x) (255 - x)
/* We build coding trees compactly in arrays.
Each node of the tree is a pair of vp9_tree_indices.
@ -41,7 +30,6 @@ typedef const bool_reader c_bool_reader;
typedef const vp9_tree_index vp9_tree[], *vp9_tree_p;
typedef const struct vp9_token_struct {
int value;
int Len;
@ -53,31 +41,33 @@ void vp9_tokens_from_tree(struct vp9_token_struct *, vp9_tree);
void vp9_tokens_from_tree_offset(struct vp9_token_struct *, vp9_tree,
int offset);
/* Convert array of token occurrence counts into a table of probabilities
for the associated binary encoding tree. Also writes count of branches
taken for each node on the tree; this facilitiates decisions as to
probability updates. */
void vp9_tree_probs_from_distribution(
int n, /* n = size of alphabet */
vp9_token tok [ /* n */ ],
vp9_tree tree,
vp9_prob probs [ /* n-1 */ ],
unsigned int branch_ct [ /* n-1 */ ] [2],
const unsigned int num_events[ /* n */ ],
unsigned int Pfactor,
int Round
);
void vp9_tree_probs_from_distribution(int n, /* n = size of alphabet */
vp9_token tok[ /* n */ ],
vp9_tree tree,
vp9_prob probs[ /* n - 1 */ ],
unsigned int branch_ct[ /* n - 1 */ ][2],
const unsigned int num_events[ /* n */ ]);
static __inline int clip_prob(int p) {
if (p > 255)
return 255;
else if (p < 1)
return 1;
return p;
static __inline vp9_prob clip_prob(int p) {
return (p > 255) ? 255u : (p < 1) ? 1u : p;
}
vp9_prob vp9_bin_prob_from_distribution(const unsigned int counts[2]);
/* Probability of the num event out of den trials, scaled to [1, 255] with
 * round-to-nearest; a zero denominator yields the neutral value 128. */
static __inline vp9_prob get_prob(int num, int den) {
  if (den == 0)
    return 128u;
  return clip_prob((num * 256 + (den >> 1)) / den);
}
#endif
/* Probability of outcome 0 given n0 and n1 observed counts of the two
 * outcomes of a binary event. */
static __inline vp9_prob get_binary_prob(int n0, int n1) {
  const int den = n0 + n1;
  return get_prob(n0, den);
}
/* Blend two probabilities: factor/256 parts of prob2 and the remainder of
 * prob1, rounded to nearest.  Both inputs are assumed to already lie in the
 * legal [1, 255] range, so no clamping is performed here. */
static __inline vp9_prob weighted_prob(int prob1, int prob2, int factor) {
  const int blended = prob1 * (256 - factor) + prob2 * factor + 128;
  return blended >> 8;
}
#endif // VP9_COMMON_VP9_TREECODER_H_

View File

@ -1,122 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
/****************************************************************************
*
* Module Title : vp9_type_aliases.h
*
* Description : Standard type aliases
*
****************************************************************************/
#ifndef VP9_COMMON_VP9_TYPE_ALIASES_H_
#define VP9_COMMON_VP9_TYPE_ALIASES_H_
/****************************************************************************
* Macros
****************************************************************************/
#define EXPORT
#define IMPORT extern /* Used to declare imported data & routines */
#define PRIVATE static /* Used to declare & define module-local data */
#define LOCAL static /* Used to define all persistent routine-local data */
#define STD_IN_PATH 0 /* Standard input path */
#define STD_OUT_PATH 1 /* Standard output path */
#define STD_ERR_PATH 2 /* Standard error path */
#define STD_IN_FILE stdin /* Standard input file pointer */
#define STD_OUT_FILE stdout /* Standard output file pointer */
#define STD_ERR_FILE stderr /* Standard error file pointer */
#define max_int 0x7FFFFFFF
#define __export
#define _export
#define CCONV
#ifndef NULL
#ifdef __cplusplus
#define NULL 0
#else
#define NULL ((void *)0)
#endif
#endif
#ifndef FALSE
#define FALSE 0
#endif
#ifndef TRUE
#define TRUE 1
#endif
/****************************************************************************
* Typedefs
****************************************************************************/
#ifndef TYPE_INT8
#define TYPE_INT8
typedef signed char INT8;
#endif
#ifndef TYPE_INT16
/*#define TYPE_INT16*/
typedef signed short INT16;
#endif
#ifndef TYPE_INT32
/*#define TYPE_INT32*/
typedef signed int INT32;
#endif
#ifndef TYPE_UINT8
/*#define TYPE_UINT8*/
typedef unsigned char UINT8;
#endif
#ifndef TYPE_UINT32
/*#define TYPE_UINT32*/
typedef unsigned int UINT32;
#endif
#ifndef TYPE_UINT16
/*#define TYPE_UINT16*/
typedef unsigned short UINT16;
#endif
#ifndef TYPE_BOOL
/*#define TYPE_BOOL*/
typedef int BOOL;
#endif
typedef unsigned char BOOLEAN;
#ifdef _MSC_VER
typedef __int64 INT64;
#if _MSC_VER < 1600
#ifndef INT64_MAX
#define INT64_MAX LLONG_MAX
#endif
#endif
#else
#ifndef TYPE_INT64
#ifdef _TMS320C6X
/* for now we only have 40bits */
typedef long INT64;
#else
typedef long long INT64;
#endif
#endif
#endif
/* Floating point */
typedef double FLOAT64;
typedef float FLOAT32;
#endif

View File

@ -15,8 +15,6 @@
extern const short vp9_six_tap_mmx[8][6 * 8];
extern const short vp9_bilinear_filters_8x_mmx[8][2 * 8];
extern void vp9_filter_block1d_h6_mmx(unsigned char *src_ptr,
unsigned short *output_ptr,
unsigned int src_pixels_per_line,
@ -95,8 +93,6 @@ extern void vp9_filter_block1d8_v6_only_sse2(unsigned char *src_ptr,
unsigned int output_height,
const short *vp9_filter);
extern prototype_subpixel_predict(vp9_bilinear_predict8x8_mmx);
///////////////////////////////////////////////////////////////////////////
// the mmx function that does the bilinear filtering and var calculation //
// int one pass //
@ -232,26 +228,6 @@ void vp9_sixtap_predict8x4_mmx(unsigned char *src_ptr,
vp9_filter_block1dc_v6_mmx(fdata2 + 20, dst_ptr + 4, dst_pitch,
16, 8, 4, 8, vfilter);
}
/* 16x16 bilinear prediction, built from the 8x8 MMX kernel.
 *
 * The 16x16 block is processed as four independent 8x8 quadrants, each
 * forwarded to vp9_bilinear_predict8x8_mmx with the source and destination
 * pointers offset to the quadrant's top-left corner.  Quadrants are visited
 * in raster order (top-left, top-right, bottom-left, bottom-right), exactly
 * as the unrolled original did. */
void vp9_bilinear_predict16x16_mmx(unsigned char *src_ptr,
                                   int src_pixels_per_line,
                                   int xoffset,
                                   int yoffset,
                                   unsigned char *dst_ptr,
                                   int dst_pitch) {
  int row, col;

  for (row = 0; row < 16; row += 8) {
    for (col = 0; col < 16; col += 8) {
      vp9_bilinear_predict8x8_mmx(src_ptr + row * src_pixels_per_line + col,
                                  src_pixels_per_line, xoffset, yoffset,
                                  dst_ptr + row * dst_pitch + col,
                                  dst_pitch);
    }
  }
}
#endif
#if HAVE_SSE2

View File

@ -85,6 +85,480 @@ void vp9_loop_filter_bvs_mmx(unsigned char *y_ptr, int y_stride,
#endif
#if HAVE_SSE2
/* 16-pixel-wide horizontal loop filter across a macroblock edge (SSE2).
 *
 * s                points at the first row below the edge (q0); rows above
 *                  it are addressed with negative multiples of the pitch.
 * p                pitch (stride) in bytes between rows.
 * _blimit/_limit/_thresh  one-byte filter limits; each is splatted across
 *                  all 16 lanes before use.
 *
 * Three per-lane decisions drive the output (mirroring the scalar
 * reference filter):
 *   mask  -- is the edge active enough to filter at all?
 *   flat  -- is the near neighbourhood flat (use the flat-pixel smoother)?
 *   flat2 -- is the wide neighbourhood flat (use the widest smoother)?
 * Candidate outputs for the two smoothers are precomputed into scratch
 * rows; the final stores mux per lane between unfiltered pixels, the
 * standard 4-tap result, and the two smoothed variants.
 */
void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s,
                                       int p,
                                       const unsigned char *_blimit,
                                       const unsigned char *_limit,
                                       const unsigned char *_thresh) {
  /* Scratch rows holding candidate smoothed outputs. */
  DECLARE_ALIGNED(16, unsigned char, flat2_op[7][16]);
  DECLARE_ALIGNED(16, unsigned char, flat2_oq[7][16]);

  DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
  __m128i mask, hev, flat, flat2;
  const __m128i zero = _mm_set1_epi16(0);
  __m128i p7, p6, p5;
  __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
  __m128i q5, q6, q7;
  int i = 0;
  /* Replicate each one-byte limit into all four bytes of a 32-bit word,
     then broadcast that word across the whole register. */
  const unsigned int extended_thresh = _thresh[0] * 0x01010101u;
  const unsigned int extended_limit = _limit[0] * 0x01010101u;
  const unsigned int extended_blimit = _blimit[0] * 0x01010101u;
  const __m128i thresh =
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_thresh), 0);
  const __m128i limit =
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_limit), 0);
  const __m128i blimit =
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0);

  /* Load the inner ten rows around the edge (p4..q4). */
  p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
  q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
  {
    /* _mm_subs_epu8 saturates at 0, so or-ing both directions gives the
       absolute difference without needing signed arithmetic. */
    const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
                                          _mm_subs_epu8(p0, p1));
    const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
                                          _mm_subs_epu8(q0, q1));
    const __m128i one = _mm_set1_epi8(1);
    const __m128i fe = _mm_set1_epi8(0xfe);
    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);  /* all ones */
    __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
                                    _mm_subs_epu8(q0, p0));
    __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
                                    _mm_subs_epu8(q1, p1));
    __m128i work;
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    /* hev: "high edge variance" lanes, where abs(p1-p0) or abs(q1-q0)
       exceeds thresh. */
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
    mask = _mm_max_epu8(flat, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;
    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
                                     _mm_subs_epu8(p1, p2)),
                        _mm_or_si128(_mm_subs_epu8(p3, p2),
                                     _mm_subs_epu8(p2, p3)));
    mask = _mm_max_epu8(work, mask);
    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
                                     _mm_subs_epu8(q1, q2)),
                        _mm_or_si128(_mm_subs_epu8(q3, q2),
                                     _mm_subs_epu8(q2, q3)));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);

    /* flat: all of p2..p4/q2..q4 within 1 of p0/q0 (and mask set). */
    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
                                     _mm_subs_epu8(p0, p2)),
                        _mm_or_si128(_mm_subs_epu8(q2, q0),
                                     _mm_subs_epu8(q0, q2)));
    flat = _mm_max_epu8(work, flat);
    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
                                     _mm_subs_epu8(p0, p3)),
                        _mm_or_si128(_mm_subs_epu8(q3, q0),
                                     _mm_subs_epu8(q0, q3)));
    flat = _mm_max_epu8(work, flat);
    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0),
                                     _mm_subs_epu8(p0, p4)),
                        _mm_or_si128(_mm_subs_epu8(q4, q0),
                                     _mm_subs_epu8(q0, q4)));
    flat = _mm_max_epu8(work, flat);
    flat = _mm_subs_epu8(flat, one);
    flat = _mm_cmpeq_epi8(flat, zero);
    flat = _mm_and_si128(flat, mask);
  }
  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  // calculate flat2
  /* Reuse p4..q4 registers for the outer ring (rows +-5..8); p0/q0 still
     hold the edge rows from above. */
  p4 = _mm_loadu_si128((__m128i *)(s - 8 * p));
  p3 = _mm_loadu_si128((__m128i *)(s - 7 * p));
  p2 = _mm_loadu_si128((__m128i *)(s - 6 * p));
  p1 = _mm_loadu_si128((__m128i *)(s - 5 * p));
  //  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
  //  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
  q1 = _mm_loadu_si128((__m128i *)(s + 4 * p));
  q2 = _mm_loadu_si128((__m128i *)(s + 5 * p));
  q3 = _mm_loadu_si128((__m128i *)(s + 6 * p));
  q4 = _mm_loadu_si128((__m128i *)(s + 7 * p));
  {
    const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
                                          _mm_subs_epu8(p0, p1));
    const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
                                          _mm_subs_epu8(q0, q1));
    const __m128i one = _mm_set1_epi8(1);
    __m128i work;
    flat2 = _mm_max_epu8(abs_p1p0, abs_q1q0);
    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
                                     _mm_subs_epu8(p0, p2)),
                        _mm_or_si128(_mm_subs_epu8(q2, q0),
                                     _mm_subs_epu8(q0, q2)));
    flat2 = _mm_max_epu8(work, flat2);
    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
                                     _mm_subs_epu8(p0, p3)),
                        _mm_or_si128(_mm_subs_epu8(q3, q0),
                                     _mm_subs_epu8(q0, q3)));
    flat2 = _mm_max_epu8(work, flat2);
    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0),
                                     _mm_subs_epu8(p0, p4)),
                        _mm_or_si128(_mm_subs_epu8(q4, q0),
                                     _mm_subs_epu8(q0, q4)));
    flat2 = _mm_max_epu8(work, flat2);
    flat2 = _mm_subs_epu8(flat2, one);
    flat2 = _mm_cmpeq_epi8(flat2, zero);
    flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
  }
  // calculate flat2
  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  /* Flat-pixel smoother: 7-tap averages (>> 3) over p4..q4, computed in
     16-bit lanes 8 pixels at a time, written to the flat_* scratch rows.
     The running workp_a/workp_b sums are updated incrementally as the
     filter window slides. */
  {
    const __m128i four = _mm_set1_epi16(4);  /* rounding term for >> 3 */
    unsigned char *src = s;
    i = 0;
    do {
      __m128i workp_a, workp_b, workp_shft;
      p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 5 * p)), zero);
      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
      q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 4 * p)), zero);

      workp_a = _mm_add_epi16(_mm_add_epi16(p4, p3), _mm_add_epi16(p2, p1));
      workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
      workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p4);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_op2[i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_op1[i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p4), q2);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_op0[i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_oq0[i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q4);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_oq1[i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q4);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_oq2[i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      src += 8;
    } while (++i < 2);
  }
  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  // wide flat
  // TODO(slavarnway): interleave with the flat pixel calculations (see above)
  /* Widest smoother: 15-tap averages (>> 4) over p7..q7, again 8 pixels at
     a time in 16-bit lanes, written to flat2_op/flat2_oq. */
  {
    const __m128i eight = _mm_set1_epi16(8);  /* rounding term for >> 4 */
    unsigned char *src = s;
    int i = 0;
    do {
      __m128i workp_a, workp_b, workp_shft;
      p7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 8 * p)), zero);
      p6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 7 * p)), zero);
      p5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 6 * p)), zero);
      p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 5 * p)), zero);
      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
      q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 4 * p)), zero);
      q5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 5 * p)), zero);
      q6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 6 * p)), zero);
      q7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 7 * p)), zero);

      workp_a = _mm_sub_epi16(_mm_slli_epi16(p7, 3), p7);  // p7 * 7
      workp_a = _mm_add_epi16(_mm_slli_epi16(p6, 1), workp_a);
      workp_b = _mm_add_epi16(_mm_add_epi16(p5, p4), _mm_add_epi16(p3, p2));
      workp_a = _mm_add_epi16(_mm_add_epi16(p1, p0), workp_a);
      workp_b = _mm_add_epi16(_mm_add_epi16(q0, eight), workp_b);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
      _mm_storel_epi64((__m128i *)&flat2_op[6][i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      /* Slide the 15-tap window one output pixel at a time: drop one tap,
         add the next. */
      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p5);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p6), q1);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
      _mm_storel_epi64((__m128i *)&flat2_op[5][i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p4);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p5), q2);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
      _mm_storel_epi64((__m128i *)&flat2_op[4][i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p3);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p4), q3);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
      _mm_storel_epi64((__m128i *)&flat2_op[3][i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p2);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p3), q4);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
      _mm_storel_epi64((__m128i *)&flat2_op[2][i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p1);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p2), q5);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
      _mm_storel_epi64((__m128i *)&flat2_op[1][i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p0);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), q6);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
      _mm_storel_epi64((__m128i *)&flat2_op[0][i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), q0);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q7);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
      _mm_storel_epi64((__m128i *)&flat2_oq[0][i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p6), q1);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q7);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
      _mm_storel_epi64((__m128i *)&flat2_oq[1][i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p5), q2);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q7);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
      _mm_storel_epi64((__m128i *)&flat2_oq[2][i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p4), q3);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q2), q7);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
      _mm_storel_epi64((__m128i *)&flat2_oq[3][i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q4);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q3), q7);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
      _mm_storel_epi64((__m128i *)&flat2_oq[4][i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q5);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q4), q7);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
      _mm_storel_epi64((__m128i *)&flat2_oq[5][i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q6);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q5), q7);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
      _mm_storel_epi64((__m128i *)&flat2_oq[6][i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      src += 8;
    } while (++i < 2);
  }
  // wide flat
  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

  // lp filter
  /* Standard 4-tap filter on p1..q1 in the signed domain (pixels are
     xor'ed with 0x80 so unsigned 0..255 maps to signed -128..127), then
     the per-lane mux between unfiltered/filtered/flat/wide-flat outputs. */
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i te0 = _mm_set1_epi8(0xe0);
    const __m128i t1f = _mm_set1_epi8(0x1f);
    const __m128i t1 = _mm_set1_epi8(0x1);
    const __m128i t7f = _mm_set1_epi8(0x7f);

    __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)),
                                t80);
    __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)),
                                t80);
    __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)),
                                t80);
    __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)),
                                t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;

    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    /* Filter1 >> 3 (arithmetic shift emulated: shift as 16-bit, then patch
       the sign bits back in with te0/t1f masks). */
    work_a = _mm_cmpgt_epi8(zero, filter1);
    filter1 = _mm_srli_epi16(filter1, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter1 = _mm_and_si128(filter1, t1f);
    filter1 = _mm_or_si128(filter1, work_a);

    /* Filter2 >> 3 */
    work_a = _mm_cmpgt_epi8(zero, filter2);
    filter2 = _mm_srli_epi16(filter2, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter2 = _mm_and_si128(filter2, t1f);
    filter2 = _mm_or_si128(filter2, work_a);

    /* filt >> 1 (same emulated arithmetic shift, one bit) */
    filt = _mm_adds_epi8(filter1, t1);
    work_a = _mm_cmpgt_epi8(zero, filt);
    filt = _mm_srli_epi16(filt, 1);
    work_a = _mm_and_si128(work_a, t80);
    filt = _mm_and_si128(filt, t7f);
    filt = _mm_or_si128(filt, work_a);

    filt = _mm_andnot_si128(hev, filt);

    ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
    ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
    qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
    qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);

    // write out op6 - op3
    {
      unsigned char *dst = (s - 7 * p);
      for (i = 6; i > 2; i--) {
        __m128i flat2_output;
        work_a = _mm_loadu_si128((__m128i *)dst);
        flat2_output = _mm_load_si128((__m128i *)flat2_op[i]);
        work_a = _mm_andnot_si128(flat2, work_a);
        flat2_output = _mm_and_si128(flat2, flat2_output);
        work_a = _mm_or_si128(work_a, flat2_output);
        _mm_storeu_si128((__m128i *)dst, work_a);
        dst += p;
      }
    }

    /* For each of p2..q2: select flat result where `flat`, then override
       with the wide result where `flat2`, else keep the 4-tap/unfiltered
       value. */
    work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
    p2 = _mm_load_si128((__m128i *)flat_op2);
    work_a = _mm_andnot_si128(flat, work_a);
    p2 = _mm_and_si128(flat, p2);
    work_a = _mm_or_si128(work_a, p2);
    p2 = _mm_load_si128((__m128i *)flat2_op[2]);
    work_a = _mm_andnot_si128(flat2, work_a);
    p2 = _mm_and_si128(flat2, p2);
    p2 = _mm_or_si128(work_a, p2);
    _mm_storeu_si128((__m128i *)(s - 3 * p), p2);

    p1 = _mm_load_si128((__m128i *)flat_op1);
    work_a = _mm_andnot_si128(flat, ps1);
    p1 = _mm_and_si128(flat, p1);
    work_a = _mm_or_si128(work_a, p1);
    p1 = _mm_load_si128((__m128i *)flat2_op[1]);
    work_a = _mm_andnot_si128(flat2, work_a);
    p1 = _mm_and_si128(flat2, p1);
    p1 = _mm_or_si128(work_a, p1);
    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);

    p0 = _mm_load_si128((__m128i *)flat_op0);
    work_a = _mm_andnot_si128(flat, ps0);
    p0 = _mm_and_si128(flat, p0);
    work_a = _mm_or_si128(work_a, p0);
    p0 = _mm_load_si128((__m128i *)flat2_op[0]);
    work_a = _mm_andnot_si128(flat2, work_a);
    p0 = _mm_and_si128(flat2, p0);
    p0 = _mm_or_si128(work_a, p0);
    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);

    q0 = _mm_load_si128((__m128i *)flat_oq0);
    work_a = _mm_andnot_si128(flat, qs0);
    q0 = _mm_and_si128(flat, q0);
    work_a = _mm_or_si128(work_a, q0);
    q0 = _mm_load_si128((__m128i *)flat2_oq[0]);
    work_a = _mm_andnot_si128(flat2, work_a);
    q0 = _mm_and_si128(flat2, q0);
    q0 = _mm_or_si128(work_a, q0);
    _mm_storeu_si128((__m128i *)(s - 0 * p), q0);

    q1 = _mm_load_si128((__m128i *)flat_oq1);
    work_a = _mm_andnot_si128(flat, qs1);
    q1 = _mm_and_si128(flat, q1);
    work_a = _mm_or_si128(work_a, q1);
    q1 = _mm_load_si128((__m128i *)flat2_oq[1]);
    work_a = _mm_andnot_si128(flat2, work_a);
    q1 = _mm_and_si128(flat2, q1);
    q1 = _mm_or_si128(work_a, q1);
    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);

    work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
    q2 = _mm_load_si128((__m128i *)flat_oq2);
    work_a = _mm_andnot_si128(flat, work_a);
    q2 = _mm_and_si128(flat, q2);
    work_a = _mm_or_si128(work_a, q2);
    q2 = _mm_load_si128((__m128i *)flat2_oq[2]);
    work_a = _mm_andnot_si128(flat2, work_a);
    q2 = _mm_and_si128(flat2, q2);
    q2 = _mm_or_si128(work_a, q2);
    _mm_storeu_si128((__m128i *)(s + 2 * p), q2);

    // write out oq3 - oq7
    {
      unsigned char *dst = (s + 3 * p);
      for (i = 3; i < 7; i++) {
        __m128i flat2_output;
        work_a = _mm_loadu_si128((__m128i *)dst);
        flat2_output = _mm_load_si128((__m128i *)flat2_oq[i]);
        work_a = _mm_andnot_si128(flat2, work_a);
        flat2_output = _mm_and_si128(flat2, flat2_output);
        work_a = _mm_or_si128(work_a, flat2_output);
        _mm_storeu_si128((__m128i *)dst, work_a);
        dst += p;
      }
    }
  }
}
void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
int p,
const unsigned char *_blimit,
@ -562,6 +1036,38 @@ void vp9_mbloop_filter_vertical_edge_sse2(unsigned char *s,
transpose(src, 16, dst, p, 2);
}
/* 16-wide vertical loop filter, implemented by transposing the 16x16
 * neighbourhood around the vertical edge, running the horizontal wide
 * filter on the transposed buffer, and transposing the result back. */
void vp9_mb_lpf_vertical_edge_w_sse2(unsigned char *s,
                                     int p,
                                     const unsigned char *blimit,
                                     const unsigned char *limit,
                                     const unsigned char *thresh) {
  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);
  unsigned char *src[4];
  unsigned char *dst[4];
  int k;

  /* Transpose the 16 columns straddling the edge into a 16x16 buffer
     (left half, then right half). */
  transpose8x16(s - 8, s - 8 + p * 8, p, t_dst, 16);
  transpose8x16(s, s + p * 8, p, t_dst + 16 * 8, 16);

  /* The vertical edge is now a horizontal one at row 8 of t_dst. */
  vp9_mb_lpf_horizontal_edge_w_sse2(t_dst + 8 * 16, 16, blimit, limit,
                                    thresh);

  /* Transpose back, one 8x8 quadrant per src/dst pair.  Bit 0 of k selects
     the right half of t_dst (rows 8..15 of the transposed image) and the
     +8 column offset in the frame; bit 1 selects the +8 byte offset in
     t_dst and the +8 row offset in the frame. */
  for (k = 0; k < 4; ++k) {
    src[k] = t_dst + (k & 1) * 8 * 16 + (k >> 1) * 8;
    dst[k] = s - 8 + (k & 1) * 8 + (k >> 1) * p * 8;
  }
  transpose(src, 16, dst, p, 4);
}
void vp9_mbloop_filter_vertical_edge_uv_sse2(unsigned char *u,
int p,
const unsigned char *blimit,
@ -604,11 +1110,30 @@ void vp9_loop_filter_mbh_sse2(unsigned char *y_ptr,
lfi->lim, lfi->hev_thr, v_ptr);
}
void vp9_lpf_mbh_w_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
unsigned char *v_ptr, int y_stride, int uv_stride,
struct loop_filter_info *lfi) {
vp9_mb_lpf_horizontal_edge_w_sse2(y_ptr, y_stride,
lfi->mblim, lfi->lim, lfi->hev_thr);
/* u,v */
if (u_ptr)
vp9_mbloop_filter_horizontal_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim,
lfi->lim, lfi->hev_thr, v_ptr);
}
void vp9_loop_filter_bh8x8_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
unsigned char *v_ptr, int y_stride, int uv_stride,
struct loop_filter_info *lfi) {
vp9_mbloop_filter_horizontal_edge_sse2(
y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);
if (u_ptr)
vp9_loop_filter_horizontal_edge_uv_sse2(u_ptr + 4 * uv_stride, uv_stride,
lfi->blim, lfi->lim, lfi->hev_thr,
v_ptr + 4 * uv_stride);
}
/* Vertical MB Filtering */
@ -624,11 +1149,30 @@ void vp9_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
lfi->lim, lfi->hev_thr, v_ptr);
}
void vp9_lpf_mbv_w_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
unsigned char *v_ptr, int y_stride, int uv_stride,
struct loop_filter_info *lfi) {
vp9_mb_lpf_vertical_edge_w_sse2(y_ptr, y_stride,
lfi->mblim, lfi->lim, lfi->hev_thr);
/* u,v */
if (u_ptr)
vp9_mbloop_filter_vertical_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim,
lfi->lim, lfi->hev_thr, v_ptr);
}
void vp9_loop_filter_bv8x8_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
unsigned char *v_ptr, int y_stride, int uv_stride,
struct loop_filter_info *lfi) {
vp9_mbloop_filter_vertical_edge_sse2(
y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);
if (u_ptr)
vp9_loop_filter_vertical_edge_uv_sse2(u_ptr + 4, uv_stride,
lfi->blim, lfi->lim, lfi->hev_thr,
v_ptr + 4);
}
/* Horizontal B Filtering */

View File

@ -0,0 +1,645 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
%define xmm_filter_shift 7
;void vp9_filter_block2d_bil_var_sse2
;(
; unsigned char *ref_ptr,
; int ref_pixels_per_line,
; unsigned char *src_ptr,
; int src_pixels_per_line,
; unsigned int Height,
; int xoffset,
; int yoffset,
; int *sum,
; unsigned int *sumsquared
;)
; Bilinearly filters an 8-pixel-wide column of ref (first pass horizontal,
; second pass vertical, taps chosen by xoffset/yoffset), subtracts src, and
; stores the sum and sum-of-squares of the differences.  Zero offsets branch
; to cheaper loops: vertical-only, horizontal-only, or full-pixel diffing.
global sym(vp9_filter_block2d_bil_var_sse2) PRIVATE
sym(vp9_filter_block2d_bil_var_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 9
SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
push rbx
; end prolog
pxor xmm6, xmm6 ; sum accumulator (words)
pxor xmm7, xmm7 ; sum-of-squares accumulator (dwords)
lea rsi, [GLOBAL(xmm_bi_rd)] ; rounding
movdqa xmm4, XMMWORD PTR [rsi]
lea rcx, [GLOBAL(bilinear_filters_sse2)]
movsxd rax, dword ptr arg(5) ; xoffset
cmp rax, 0 ; skip first_pass filter if xoffset=0
je filter_block2d_bil_var_sse2_sp_only
shl rax, 5 ; point to filter coeff with xoffset
lea rax, [rax + rcx] ; HFilter
movsxd rdx, dword ptr arg(6) ; yoffset
cmp rdx, 0 ; skip second_pass filter if yoffset=0
je filter_block2d_bil_var_sse2_fp_only
shl rdx, 5
lea rdx, [rdx + rcx] ; VFilter
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
pxor xmm0, xmm0 ; constant zero for byte->word unpack
; prime the pipeline: horizontally filter row 0 into xmm5
movq xmm1, QWORD PTR [rsi] ;
movq xmm3, QWORD PTR [rsi+1] ;
punpcklbw xmm1, xmm0 ;
pmullw xmm1, [rax] ;
punpcklbw xmm3, xmm0
pmullw xmm3, [rax+16] ;
paddw xmm1, xmm3 ;
paddw xmm1, xmm4 ; add rounding constant
psraw xmm1, xmm_filter_shift ;
movdqa xmm5, xmm1
movsxd rbx, dword ptr arg(1) ;ref_pixels_per_line
lea rsi, [rsi + rbx]
%if ABI_IS_32BIT=0
movsxd r9, dword ptr arg(3) ;src_pixels_per_line
%endif
; general case: horizontal then vertical filtering per row
filter_block2d_bil_var_sse2_loop:
movq xmm1, QWORD PTR [rsi] ;
movq xmm3, QWORD PTR [rsi+1] ;
punpcklbw xmm1, xmm0 ;
pmullw xmm1, [rax] ;
punpcklbw xmm3, xmm0 ;
pmullw xmm3, [rax+16] ;
paddw xmm1, xmm3 ;
paddw xmm1, xmm4 ;
psraw xmm1, xmm_filter_shift ;
movdqa xmm3, xmm5 ; previous row's first-pass output
movdqa xmm5, xmm1 ; keep this row for the next iteration
pmullw xmm3, [rdx] ;
pmullw xmm1, [rdx+16] ;
paddw xmm1, xmm3 ;
paddw xmm1, xmm4 ;
psraw xmm1, xmm_filter_shift ;
movq xmm3, QWORD PTR [rdi] ;
punpcklbw xmm3, xmm0 ;
psubw xmm1, xmm3 ; diff = filtered ref - src
paddw xmm6, xmm1 ;
pmaddwd xmm1, xmm1 ;
paddd xmm7, xmm1 ;
lea rsi, [rsi + rbx] ;ref_pixels_per_line
%if ABI_IS_32BIT
add rdi, dword ptr arg(3) ;src_pixels_per_line
%else
lea rdi, [rdi + r9]
%endif
sub rcx, 1 ;
jnz filter_block2d_bil_var_sse2_loop ;
jmp filter_block2d_bil_variance
; xoffset == 0: second-pass (vertical) filtering only
filter_block2d_bil_var_sse2_sp_only:
movsxd rdx, dword ptr arg(6) ; yoffset
cmp rdx, 0 ; skip all if both xoffset=0 and yoffset=0
je filter_block2d_bil_var_sse2_full_pixel
shl rdx, 5
lea rdx, [rdx + rcx] ; VFilter
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
pxor xmm0, xmm0 ;
movq xmm1, QWORD PTR [rsi] ; prime with row 0
punpcklbw xmm1, xmm0 ;
movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
lea rsi, [rsi + rax]
filter_block2d_bil_sp_only_loop:
movq xmm3, QWORD PTR [rsi] ;
punpcklbw xmm3, xmm0 ;
movdqa xmm5, xmm3
pmullw xmm1, [rdx] ;
pmullw xmm3, [rdx+16] ;
paddw xmm1, xmm3 ;
paddw xmm1, xmm4 ;
psraw xmm1, xmm_filter_shift ;
movq xmm3, QWORD PTR [rdi] ;
punpcklbw xmm3, xmm0 ;
psubw xmm1, xmm3 ;
paddw xmm6, xmm1 ;
pmaddwd xmm1, xmm1 ;
paddd xmm7, xmm1 ;
movdqa xmm1, xmm5 ; current row becomes previous row
lea rsi, [rsi + rax] ;ref_pixels_per_line
lea rdi, [rdi + rbx] ;src_pixels_per_line
sub rcx, 1 ;
jnz filter_block2d_bil_sp_only_loop ;
jmp filter_block2d_bil_variance
; xoffset == 0 && yoffset == 0: plain full-pixel differencing
filter_block2d_bil_var_sse2_full_pixel:
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
pxor xmm0, xmm0 ;
filter_block2d_bil_full_pixel_loop:
movq xmm1, QWORD PTR [rsi] ;
punpcklbw xmm1, xmm0 ;
movq xmm2, QWORD PTR [rdi] ;
punpcklbw xmm2, xmm0 ;
psubw xmm1, xmm2 ;
paddw xmm6, xmm1 ;
pmaddwd xmm1, xmm1 ;
paddd xmm7, xmm1 ;
lea rsi, [rsi + rax] ;ref_pixels_per_line
lea rdi, [rdi + rbx] ;src_pixels_per_line
sub rcx, 1 ;
jnz filter_block2d_bil_full_pixel_loop ;
jmp filter_block2d_bil_variance
; yoffset == 0: first-pass (horizontal) filtering only (HFilter still in rax)
filter_block2d_bil_var_sse2_fp_only:
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line
pxor xmm0, xmm0 ;
movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
filter_block2d_bil_fp_only_loop:
movq xmm1, QWORD PTR [rsi] ;
movq xmm3, QWORD PTR [rsi+1] ;
punpcklbw xmm1, xmm0 ;
pmullw xmm1, [rax] ;
punpcklbw xmm3, xmm0 ;
pmullw xmm3, [rax+16] ;
paddw xmm1, xmm3 ;
paddw xmm1, xmm4 ;
psraw xmm1, xmm_filter_shift ;
movq xmm3, QWORD PTR [rdi] ;
punpcklbw xmm3, xmm0 ;
psubw xmm1, xmm3 ;
paddw xmm6, xmm1 ;
pmaddwd xmm1, xmm1 ;
paddd xmm7, xmm1 ;
lea rsi, [rsi + rdx]
lea rdi, [rdi + rbx] ;src_pixels_per_line
sub rcx, 1 ;
jnz filter_block2d_bil_fp_only_loop ;
jmp filter_block2d_bil_variance
; fold the word sums / dword squared sums down to scalars and store them
filter_block2d_bil_variance:
movdq2q mm6, xmm6 ;
movdq2q mm7, xmm7 ;
psrldq xmm6, 8
psrldq xmm7, 8
movdq2q mm2, xmm6
movdq2q mm3, xmm7
paddw mm6, mm2
paddd mm7, mm3
pxor mm3, mm3 ;
pxor mm2, mm2 ;
punpcklwd mm2, mm6 ;
punpckhwd mm3, mm6 ;
paddd mm2, mm3 ;
movq mm6, mm2 ;
psrlq mm6, 32 ;
paddd mm2, mm6 ;
psrad mm2, 16 ; sign-extend the word sums packed in the dword high halves
movq mm4, mm7 ;
psrlq mm4, 32 ;
paddd mm4, mm7 ;
mov rsi, arg(7) ; sum
mov rdi, arg(8) ; sumsquared
movd [rsi], mm2 ; xsum
movd [rdi], mm4 ; xxsum
; begin epilog
pop rbx
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;void vp9_half_horiz_vert_variance16x_h_sse2
;(
; unsigned char *ref_ptr,
; int ref_pixels_per_line,
; unsigned char *src_ptr,
; int src_pixels_per_line,
; unsigned int Height,
; int *sum,
; unsigned int *sumsquared
;)
; Half-pel (1/2, 1/2) case for 16-wide blocks: each ref row is averaged with
; its right neighbour (pavgb) and with the previous averaged row, then the
; difference vs src is accumulated into sum and sum-of-squares.
global sym(vp9_half_horiz_vert_variance16x_h_sse2) PRIVATE
sym(vp9_half_horiz_vert_variance16x_h_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
; end prolog
pxor xmm6, xmm6 ; error accumulator
pxor xmm7, xmm7 ; sse accumulator
mov rsi, arg(0) ;ref_ptr ;
mov rdi, arg(2) ;src_ptr ;
movsxd rcx, dword ptr arg(4) ;Height ;
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
pxor xmm0, xmm0 ; constant zero for byte->word unpack
movdqu xmm5, XMMWORD PTR [rsi]
movdqu xmm3, XMMWORD PTR [rsi+1]
pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) horizontal line 1
lea rsi, [rsi + rax]
.half_horiz_vert_variance16x_h_1:
movdqu xmm1, XMMWORD PTR [rsi] ;
movdqu xmm2, XMMWORD PTR [rsi+1] ;
pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm3) horizontal line i+1
pavgb xmm5, xmm1 ; xmm = vertical average of the above
movdqa xmm4, xmm5
punpcklbw xmm5, xmm0 ; xmm5 = words of above
punpckhbw xmm4, xmm0
movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
punpcklbw xmm3, xmm0 ; xmm3 = words of above
psubw xmm5, xmm3 ; xmm5 -= xmm3
movq xmm3, QWORD PTR [rdi+8]
punpcklbw xmm3, xmm0
psubw xmm4, xmm3
paddw xmm6, xmm5 ; xmm6 += accumulated column differences
paddw xmm6, xmm4
pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
pmaddwd xmm4, xmm4
paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
paddd xmm7, xmm4
movdqa xmm5, xmm1 ; save xmm1 for use on the next row
lea rsi, [rsi + rax]
lea rdi, [rdi + rdx]
sub rcx, 1 ;
jnz .half_horiz_vert_variance16x_h_1 ;
; fold word sums / dword squared sums down to scalars
pxor xmm1, xmm1
pxor xmm5, xmm5
punpcklwd xmm0, xmm6
punpckhwd xmm1, xmm6
psrad xmm0, 16 ; sign-extend the word sums
psrad xmm1, 16
paddd xmm0, xmm1
movdqa xmm1, xmm0
movdqa xmm6, xmm7
punpckldq xmm6, xmm5
punpckhdq xmm7, xmm5
paddd xmm6, xmm7
punpckldq xmm0, xmm5
punpckhdq xmm1, xmm5
paddd xmm0, xmm1
movdqa xmm7, xmm6
movdqa xmm1, xmm0
psrldq xmm7, 8
psrldq xmm1, 8
paddd xmm6, xmm7
paddd xmm0, xmm1
mov rsi, arg(5) ;[Sum]
mov rdi, arg(6) ;[SSE]
movd [rsi], xmm0
movd [rdi], xmm6
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;void vp9_half_vert_variance16x_h_sse2
;(
; unsigned char *ref_ptr,
; int ref_pixels_per_line,
; unsigned char *src_ptr,
; int src_pixels_per_line,
; unsigned int Height,
; int *sum,
; unsigned int *sumsquared
;)
; Vertical half-pel case for 16-wide blocks: each ref row is averaged with
; the row above it (pavgb), then the difference vs src is accumulated into
; sum and sum-of-squares.
global sym(vp9_half_vert_variance16x_h_sse2) PRIVATE
sym(vp9_half_vert_variance16x_h_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
; end prolog
pxor xmm6, xmm6 ; error accumulator
pxor xmm7, xmm7 ; sse accumulator
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
movdqu xmm5, XMMWORD PTR [rsi] ; prime with row 0
lea rsi, [rsi + rax ]
pxor xmm0, xmm0 ; constant zero for byte->word unpack
.half_vert_variance16x_h_1:
movdqu xmm3, XMMWORD PTR [rsi]
pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3)
movdqa xmm4, xmm5
punpcklbw xmm5, xmm0
punpckhbw xmm4, xmm0
movq xmm2, QWORD PTR [rdi]
punpcklbw xmm2, xmm0
psubw xmm5, xmm2
movq xmm2, QWORD PTR [rdi+8]
punpcklbw xmm2, xmm0
psubw xmm4, xmm2
paddw xmm6, xmm5 ; xmm6 += accumulated column differences
paddw xmm6, xmm4
pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
pmaddwd xmm4, xmm4
paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
paddd xmm7, xmm4
movdqa xmm5, xmm3 ; current row becomes previous row
lea rsi, [rsi + rax]
lea rdi, [rdi + rdx]
sub rcx, 1
jnz .half_vert_variance16x_h_1
; fold word sums / dword squared sums down to scalars
pxor xmm1, xmm1
pxor xmm5, xmm5
punpcklwd xmm0, xmm6
punpckhwd xmm1, xmm6
psrad xmm0, 16 ; sign-extend the word sums
psrad xmm1, 16
paddd xmm0, xmm1
movdqa xmm1, xmm0
movdqa xmm6, xmm7
punpckldq xmm6, xmm5
punpckhdq xmm7, xmm5
paddd xmm6, xmm7
punpckldq xmm0, xmm5
punpckhdq xmm1, xmm5
paddd xmm0, xmm1
movdqa xmm7, xmm6
movdqa xmm1, xmm0
psrldq xmm7, 8
psrldq xmm1, 8
paddd xmm6, xmm7
paddd xmm0, xmm1
mov rsi, arg(5) ;[Sum]
mov rdi, arg(6) ;[SSE]
movd [rsi], xmm0
movd [rdi], xmm6
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;void vp9_half_horiz_variance16x_h_sse2
;(
; unsigned char *ref_ptr,
; int ref_pixels_per_line,
; unsigned char *src_ptr,
; int src_pixels_per_line,
; unsigned int Height,
; int *sum,
; unsigned int *sumsquared
;)
; Horizontal half-pel case for 16-wide blocks: each ref row is averaged with
; its right neighbour (pavgb), then the difference vs src is accumulated
; into sum and sum-of-squares.
global sym(vp9_half_horiz_variance16x_h_sse2) PRIVATE
sym(vp9_half_horiz_variance16x_h_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
; end prolog
pxor xmm6, xmm6 ; error accumulator
pxor xmm7, xmm7 ; sse accumulator
mov rsi, arg(0) ;ref_ptr ;
mov rdi, arg(2) ;src_ptr ;
movsxd rcx, dword ptr arg(4) ;Height ;
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
pxor xmm0, xmm0 ; constant zero for byte->word unpack
.half_horiz_variance16x_h_1:
movdqu xmm5, XMMWORD PTR [rsi] ; xmm5 = s0,s1,s2..s15
movdqu xmm3, XMMWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s16
pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3)
movdqa xmm1, xmm5
punpcklbw xmm5, xmm0 ; xmm5 = words of above
punpckhbw xmm1, xmm0
movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
punpcklbw xmm3, xmm0 ; xmm3 = words of above
movq xmm2, QWORD PTR [rdi+8]
punpcklbw xmm2, xmm0
psubw xmm5, xmm3 ; xmm5 -= xmm3
psubw xmm1, xmm2
paddw xmm6, xmm5 ; xmm6 += accumulated column differences
paddw xmm6, xmm1
pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
pmaddwd xmm1, xmm1
paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
paddd xmm7, xmm1
lea rsi, [rsi + rax]
lea rdi, [rdi + rdx]
sub rcx, 1 ;
jnz .half_horiz_variance16x_h_1 ;
; fold word sums / dword squared sums down to scalars
pxor xmm1, xmm1
pxor xmm5, xmm5
punpcklwd xmm0, xmm6
punpckhwd xmm1, xmm6
psrad xmm0, 16 ; sign-extend the word sums
psrad xmm1, 16
paddd xmm0, xmm1
movdqa xmm1, xmm0
movdqa xmm6, xmm7
punpckldq xmm6, xmm5
punpckhdq xmm7, xmm5
paddd xmm6, xmm7
punpckldq xmm0, xmm5
punpckhdq xmm1, xmm5
paddd xmm0, xmm1
movdqa xmm7, xmm6
movdqa xmm1, xmm0
psrldq xmm7, 8
psrldq xmm1, 8
paddd xmm6, xmm7
paddd xmm0, xmm1
mov rsi, arg(5) ;[Sum]
mov rdi, arg(6) ;[SSE]
movd [rsi], xmm0
movd [rdi], xmm6
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
SECTION_RODATA
; rounding constant: short xmm_bi_rd[8] = { 64, 64, 64, 64, 64, 64, 64, 64 };
; (1 << (xmm_filter_shift - 1), added before the >> 7)
align 16
xmm_bi_rd:
times 8 dw 64
; two-tap bilinear filter table, indexed by (offset << 5): each 32-byte row
; holds 8 words of the first tap followed by 8 words of the second tap;
; tap pairs always sum to 128.
align 16
bilinear_filters_sse2:
dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0
dw 120, 120, 120, 120, 120, 120, 120, 120, 8, 8, 8, 8, 8, 8, 8, 8
dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16
dw 104, 104, 104, 104, 104, 104, 104, 104, 24, 24, 24, 24, 24, 24, 24, 24
dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32
dw 88, 88, 88, 88, 88, 88, 88, 88, 40, 40, 40, 40, 40, 40, 40, 40
dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48
dw 72, 72, 72, 72, 72, 72, 72, 72, 56, 56, 56, 56, 56, 56, 56, 56
dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
dw 56, 56, 56, 56, 56, 56, 56, 56, 72, 72, 72, 72, 72, 72, 72, 72
dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80
dw 40, 40, 40, 40, 40, 40, 40, 40, 88, 88, 88, 88, 88, 88, 88, 88
dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96
dw 24, 24, 24, 24, 24, 24, 24, 24, 104, 104, 104, 104, 104, 104, 104, 104
dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112
dw 8, 8, 8, 8, 8, 8, 8, 8, 120, 120, 120, 120, 120, 120, 120, 120

View File

@ -202,438 +202,6 @@ sym(vp9_filter_block1dc_v6_mmx):
pop rbp
ret
;void bilinear_predict8x8_mmx
;(
; unsigned char *src_ptr,
; int src_pixels_per_line,
; int xoffset,
; int yoffset,
; unsigned char *dst_ptr,
; int dst_pitch
;)
; 8x8 bilinear predictor: two-tap horizontal filter followed by a two-tap
; vertical filter, taps taken from vp9_bilinear_filters_8x_mmx at
; (offset << 5).  mm7 carries the previous row's horizontal result.
global sym(vp9_bilinear_predict8x8_mmx) PRIVATE
sym(vp9_bilinear_predict8x8_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
GET_GOT rbx
push rsi
push rdi
; end prolog
;const short *HFilter = bilinear_filters_mmx[xoffset];
;const short *VFilter = bilinear_filters_mmx[yoffset];
movsxd rax, dword ptr arg(2) ;xoffset
mov rdi, arg(4) ;dst_ptr ;
shl rax, 5 ; offset * 32
lea rcx, [GLOBAL(sym(vp9_bilinear_filters_8x_mmx))]
add rax, rcx ; HFilter
mov rsi, arg(0) ;src_ptr ;
movsxd rdx, dword ptr arg(5) ;dst_pitch
movq mm1, [rax] ; first horizontal tap
movq mm2, [rax+16] ; second horizontal tap
movsxd rax, dword ptr arg(3) ;yoffset
pxor mm0, mm0 ; constant zero for byte->word unpack
shl rax, 5 ; offset*32
add rax, rcx ; VFilter
lea rcx, [rdi+rdx*8] ; end-of-destination sentinel (8 rows)
movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ;
; get the first horizontal line done ;
movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
movq mm4, mm3 ; make a copy of current line
punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
punpckhbw mm4, mm0 ;
pmullw mm3, mm1 ;
pmullw mm4, mm1 ;
movq mm5, [rsi+1] ;
movq mm6, mm5 ;
punpcklbw mm5, mm0 ;
punpckhbw mm6, mm0 ;
pmullw mm5, mm2 ;
pmullw mm6, mm2 ;
paddw mm3, mm5 ;
paddw mm4, mm6 ;
paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
psraw mm3, VP9_FILTER_SHIFT ; xmm3 /= 128
paddw mm4, [GLOBAL(rd)] ;
psraw mm4, VP9_FILTER_SHIFT ;
movq mm7, mm3 ; save filtered row for the vertical pass
packuswb mm7, mm4 ;
add rsi, rdx ; next line
.next_row_8x8:
movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
movq mm4, mm3 ; make a copy of current line
punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
punpckhbw mm4, mm0 ;
pmullw mm3, mm1 ;
pmullw mm4, mm1 ;
movq mm5, [rsi+1] ;
movq mm6, mm5 ;
punpcklbw mm5, mm0 ;
punpckhbw mm6, mm0 ;
pmullw mm5, mm2 ;
pmullw mm6, mm2 ;
paddw mm3, mm5 ;
paddw mm4, mm6 ;
movq mm5, mm7 ; previous row's horizontal result
movq mm6, mm7 ;
punpcklbw mm5, mm0 ;
punpckhbw mm6, mm0
pmullw mm5, [rax] ;
pmullw mm6, [rax] ;
paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
psraw mm3, VP9_FILTER_SHIFT ; xmm3 /= 128
paddw mm4, [GLOBAL(rd)] ;
psraw mm4, VP9_FILTER_SHIFT ;
movq mm7, mm3 ; keep this row for the next iteration
packuswb mm7, mm4 ;
pmullw mm3, [rax+16] ;
pmullw mm4, [rax+16] ;
paddw mm3, mm5 ;
paddw mm4, mm6 ;
paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
psraw mm3, VP9_FILTER_SHIFT ; xmm3 /= 128
paddw mm4, [GLOBAL(rd)] ;
psraw mm4, VP9_FILTER_SHIFT ;
packuswb mm3, mm4
movq [rdi], mm3 ; store the results in the destination
%if ABI_IS_32BIT
add rsi, rdx ; next line
add rdi, dword ptr arg(5) ;dst_pitch ;
%else
movsxd r8, dword ptr arg(5) ;dst_pitch
add rsi, rdx ; next line
add rdi, r8 ;dst_pitch
%endif
cmp rdi, rcx ; reached the sentinel?
jne .next_row_8x8
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
;void bilinear_predict8x4_mmx
;(
; unsigned char *src_ptr,
; int src_pixels_per_line,
; int xoffset,
; int yoffset,
; unsigned char *dst_ptr,
; int dst_pitch
;)
; 8x4 bilinear predictor: identical structure to the 8x8 version above, but
; the destination sentinel covers only 4 rows.
global sym(vp9_bilinear_predict8x4_mmx) PRIVATE
sym(vp9_bilinear_predict8x4_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
GET_GOT rbx
push rsi
push rdi
; end prolog
;const short *HFilter = bilinear_filters_mmx[xoffset];
;const short *VFilter = bilinear_filters_mmx[yoffset];
movsxd rax, dword ptr arg(2) ;xoffset
mov rdi, arg(4) ;dst_ptr ;
lea rcx, [GLOBAL(sym(vp9_bilinear_filters_8x_mmx))]
shl rax, 5
mov rsi, arg(0) ;src_ptr ;
add rax, rcx ; HFilter
movsxd rdx, dword ptr arg(5) ;dst_pitch
movq mm1, [rax] ; first horizontal tap
movq mm2, [rax+16] ; second horizontal tap
movsxd rax, dword ptr arg(3) ;yoffset
pxor mm0, mm0 ; constant zero for byte->word unpack
shl rax, 5
add rax, rcx ; VFilter
lea rcx, [rdi+rdx*4] ; end-of-destination sentinel (4 rows)
movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ;
; get the first horizontal line done ;
movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
movq mm4, mm3 ; make a copy of current line
punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
punpckhbw mm4, mm0 ;
pmullw mm3, mm1 ;
pmullw mm4, mm1 ;
movq mm5, [rsi+1] ;
movq mm6, mm5 ;
punpcklbw mm5, mm0 ;
punpckhbw mm6, mm0 ;
pmullw mm5, mm2 ;
pmullw mm6, mm2 ;
paddw mm3, mm5 ;
paddw mm4, mm6 ;
paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
psraw mm3, VP9_FILTER_SHIFT ; xmm3 /= 128
paddw mm4, [GLOBAL(rd)] ;
psraw mm4, VP9_FILTER_SHIFT ;
movq mm7, mm3 ; save filtered row for the vertical pass
packuswb mm7, mm4 ;
add rsi, rdx ; next line
.next_row_8x4:
movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
movq mm4, mm3 ; make a copy of current line
punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
punpckhbw mm4, mm0 ;
pmullw mm3, mm1 ;
pmullw mm4, mm1 ;
movq mm5, [rsi+1] ;
movq mm6, mm5 ;
punpcklbw mm5, mm0 ;
punpckhbw mm6, mm0 ;
pmullw mm5, mm2 ;
pmullw mm6, mm2 ;
paddw mm3, mm5 ;
paddw mm4, mm6 ;
movq mm5, mm7 ; previous row's horizontal result
movq mm6, mm7 ;
punpcklbw mm5, mm0 ;
punpckhbw mm6, mm0
pmullw mm5, [rax] ;
pmullw mm6, [rax] ;
paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
psraw mm3, VP9_FILTER_SHIFT ; xmm3 /= 128
paddw mm4, [GLOBAL(rd)] ;
psraw mm4, VP9_FILTER_SHIFT ;
movq mm7, mm3 ; keep this row for the next iteration
packuswb mm7, mm4 ;
pmullw mm3, [rax+16] ;
pmullw mm4, [rax+16] ;
paddw mm3, mm5 ;
paddw mm4, mm6 ;
paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
psraw mm3, VP9_FILTER_SHIFT ; xmm3 /= 128
paddw mm4, [GLOBAL(rd)] ;
psraw mm4, VP9_FILTER_SHIFT ;
packuswb mm3, mm4
movq [rdi], mm3 ; store the results in the destination
%if ABI_IS_32BIT
add rsi, rdx ; next line
add rdi, dword ptr arg(5) ;dst_pitch ;
%else
movsxd r8, dword ptr arg(5) ;dst_pitch
add rsi, rdx ; next line
add rdi, r8
%endif
cmp rdi, rcx ; reached the sentinel?
jne .next_row_8x4
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
;void bilinear_predict4x4_mmx
;(
; unsigned char *src_ptr,
; int src_pixels_per_line,
; int xoffset,
; int yoffset,
; unsigned char *dst_ptr,
; int dst_pitch
;)
; 4x4 bilinear predictor: same two-pass scheme as the 8-wide versions but
; operating on 4-byte loads/stores (movd), so only the low half of each
; MMX register is used.
global sym(vp9_bilinear_predict4x4_mmx) PRIVATE
sym(vp9_bilinear_predict4x4_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
GET_GOT rbx
push rsi
push rdi
; end prolog
;const short *HFilter = bilinear_filters_mmx[xoffset];
;const short *VFilter = bilinear_filters_mmx[yoffset];
movsxd rax, dword ptr arg(2) ;xoffset
mov rdi, arg(4) ;dst_ptr ;
lea rcx, [GLOBAL(sym(vp9_bilinear_filters_8x_mmx))]
shl rax, 5
add rax, rcx ; HFilter
mov rsi, arg(0) ;src_ptr ;
movsxd rdx, dword ptr arg(5) ;dst_pitch
movq mm1, [rax] ; first horizontal tap
movq mm2, [rax+16] ; second horizontal tap
movsxd rax, dword ptr arg(3) ;yoffset
pxor mm0, mm0 ; constant zero for byte->word unpack
shl rax, 5
add rax, rcx ; VFilter
lea rcx, [rdi+rdx*4] ; end-of-destination sentinel (4 rows)
movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ;
; get the first horizontal line done ;
movd mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
pmullw mm3, mm1 ;
movd mm5, [rsi+1] ;
punpcklbw mm5, mm0 ;
pmullw mm5, mm2 ;
paddw mm3, mm5 ;
paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
psraw mm3, VP9_FILTER_SHIFT ; xmm3 /= 128
movq mm7, mm3 ; save filtered row for the vertical pass
packuswb mm7, mm0 ;
add rsi, rdx ; next line
.next_row_4x4:
movd mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
pmullw mm3, mm1 ;
movd mm5, [rsi+1] ;
punpcklbw mm5, mm0 ;
pmullw mm5, mm2 ;
paddw mm3, mm5 ;
movq mm5, mm7 ; previous row's horizontal result
punpcklbw mm5, mm0 ;
pmullw mm5, [rax] ;
paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
psraw mm3, VP9_FILTER_SHIFT ; xmm3 /= 128
movq mm7, mm3 ; keep this row for the next iteration
packuswb mm7, mm0 ;
pmullw mm3, [rax+16] ;
paddw mm3, mm5 ;
paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
psraw mm3, VP9_FILTER_SHIFT ; xmm3 /= 128
packuswb mm3, mm0
movd [rdi], mm3 ; store the results in the destination
%if ABI_IS_32BIT
add rsi, rdx ; next line
add rdi, dword ptr arg(5) ;dst_pitch ;
%else
movsxd r8, dword ptr arg(5) ;dst_pitch ;
add rsi, rdx ; next line
add rdi, r8
%endif
cmp rdi, rcx ; reached the sentinel?
jne .next_row_4x4
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
SECTION_RODATA
align 16
rd:
@ -698,30 +266,3 @@ sym(vp9_six_tap_mmx):
times 8 dw -6
times 8 dw 0
align 16
; two-tap bilinear filter table for the MMX predictors, indexed by
; (offset << 5): each 32-byte entry is 8 words of the first tap followed by
; 8 words of the second tap; tap pairs always sum to 128.
global HIDDEN_DATA(sym(vp9_bilinear_filters_8x_mmx))
sym(vp9_bilinear_filters_8x_mmx):
times 8 dw 128
times 8 dw 0
times 8 dw 112
times 8 dw 16
times 8 dw 96
times 8 dw 32
times 8 dw 80
times 8 dw 48
times 8 dw 64
times 8 dw 64
times 8 dw 48
times 8 dw 80
times 8 dw 32
times 8 dw 96
times 8 dw 16
times 8 dw 112

View File

@ -0,0 +1,90 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#define HALFNDX 8
void vp9_half_horiz_variance16x_h_sse2(const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int *sum,
unsigned int *sumsquared);
void vp9_half_vert_variance16x_h_sse2(const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int *sum,
unsigned int *sumsquared);
void vp9_half_horiz_vert_variance16x_h_sse2(const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int *sum,
unsigned int *sumsquared);
void vp9_filter_block2d_bil_var_sse2(const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int xoffset,
int yoffset,
int *sum,
unsigned int *sumsquared);
/* Sub-pixel variance for a 16x2 block.  Dispatches to the specialized
 * half-pel SSE2 kernels when the offsets land exactly on half-pel
 * positions; otherwise runs the general bilinear filter on each 8-wide
 * half and combines the partial sums.  Writes the SSE through *sse and
 * returns the variance (sse - mean^2, over 32 pixels). */
unsigned int vp9_sub_pixel_variance16x2_sse2(const unsigned char *src_ptr,
                                             int src_pixels_per_line,
                                             int xoffset,
                                             int yoffset,
                                             const unsigned char *dst_ptr,
                                             int dst_pixels_per_line,
                                             unsigned int *sse) {
  int diff_sum = 0;
  unsigned int sq_sum = 0;

  if (xoffset == HALFNDX && yoffset == 0) {
    /* Half-pel horizontal only. */
    vp9_half_horiz_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                      dst_ptr, dst_pixels_per_line, 2,
                                      &diff_sum, &sq_sum);
  } else if (xoffset == 0 && yoffset == HALFNDX) {
    /* Half-pel vertical only. */
    vp9_half_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                     dst_ptr, dst_pixels_per_line, 2,
                                     &diff_sum, &sq_sum);
  } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
    /* Half-pel in both directions. */
    vp9_half_horiz_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                           dst_ptr, dst_pixels_per_line, 2,
                                           &diff_sum, &sq_sum);
  } else {
    /* General case: filter the two 8-wide halves separately. */
    int diff_sum_hi;
    unsigned int sq_sum_hi;

    vp9_filter_block2d_bil_var_sse2(src_ptr, src_pixels_per_line,
                                    dst_ptr, dst_pixels_per_line, 2,
                                    xoffset, yoffset,
                                    &diff_sum, &sq_sum);
    vp9_filter_block2d_bil_var_sse2(src_ptr + 8, src_pixels_per_line,
                                    dst_ptr + 8, dst_pixels_per_line, 2,
                                    xoffset, yoffset,
                                    &diff_sum_hi, &sq_sum_hi);
    diff_sum += diff_sum_hi;
    sq_sum += sq_sum_hi;
  }

  *sse = sq_sum;
  /* 16x2 block = 32 pixels, hence the >> 5 for mean^2. */
  return sq_sum - (((unsigned int)diff_sum * diff_sum) >> 5);
}

View File

@ -25,10 +25,6 @@ extern prototype_subpixel_predict(vp9_sixtap_predict8x8_mmx);
extern prototype_subpixel_predict(vp9_sixtap_predict8x4_mmx);
extern prototype_subpixel_predict(vp9_sixtap_predict4x4_mmx);
extern prototype_subpixel_predict(vp9_bilinear_predict16x16_mmx);
extern prototype_subpixel_predict(vp9_bilinear_predict8x8_mmx);
extern prototype_subpixel_predict(vp9_bilinear_predict8x4_mmx);
extern prototype_subpixel_predict(vp9_bilinear_predict4x4_mmx);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp9_subpix_sixtap16x16
@ -46,15 +42,6 @@ extern prototype_subpixel_predict(vp9_bilinear_predict4x4_mmx);
#undef vp9_subpix_bilinear16x16
#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_mmx
#undef vp9_subpix_bilinear8x8
#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_mmx
#undef vp9_subpix_bilinear8x4
#define vp9_subpix_bilinear8x4 vp9_bilinear_predict8x4_mmx
#undef vp9_subpix_bilinear4x4
#define vp9_subpix_bilinear4x4 vp9_bilinear_predict4x4_mmx
#endif
#endif

View File

@ -8,9 +8,9 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP9_DECODER_VP9_DBOOLHUFF_H_
#define VP9_DECODER_VP9_DBOOLHUFF_H_
#include <stddef.h>
#include <limits.h>
#include "./vpx_config.h"
@ -33,7 +33,7 @@ typedef struct {
unsigned int range;
} BOOL_DECODER;
DECLARE_ALIGNED(16, extern const unsigned char, vp9_norm[256]);
DECLARE_ALIGNED(16, extern const uint8_t, vp9_norm[256]);
int vp9_start_decode(BOOL_DECODER *br,
const unsigned char *source,
@ -152,4 +152,4 @@ static int bool_error(BOOL_DECODER *br) {
extern int vp9_decode_unsigned_max(BOOL_DECODER *br, int max);
#endif
#endif // VP9_DECODER_VP9_DBOOLHUFF_H_

View File

@ -14,7 +14,7 @@
#include "vp9/common/vp9_entropymode.h"
#include "vp9/decoder/vp9_onyxd_int.h"
#include "vp9/common/vp9_findnearmv.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_seg_common.h"
#include "vp9/common/vp9_pred_common.h"
#include "vp9/common/vp9_entropy.h"
@ -51,7 +51,6 @@ static int read_ymode(vp9_reader *bc, const vp9_prob *p) {
return treed_read(bc, vp9_ymode_tree, p);
}
#if CONFIG_SUPERBLOCKS
static int read_sb_ymode(vp9_reader *bc, const vp9_prob *p) {
return treed_read(bc, vp9_sb_ymode_tree, p);
}
@ -59,7 +58,6 @@ static int read_sb_ymode(vp9_reader *bc, const vp9_prob *p) {
static int read_kf_sb_ymode(vp9_reader *bc, const vp9_prob *p) {
return treed_read(bc, vp9_uv_mode_tree, p);
}
#endif
static int read_kf_mb_ymode(vp9_reader *bc, const vp9_prob *p) {
return treed_read(bc, vp9_kf_ymode_tree, p);
@ -122,7 +120,21 @@ static void kfread_modes(VP9D_COMP *pbi,
m->mbmi.segment_id = 0;
if (pbi->mb.update_mb_segmentation_map) {
read_mb_segid(bc, &m->mbmi, &pbi->mb);
pbi->common.last_frame_seg_map[map_index] = m->mbmi.segment_id;
if (m->mbmi.sb_type) {
const int nmbs = 1 << m->mbmi.sb_type;
const int ymbs = MIN(cm->mb_rows - mb_row, nmbs);
const int xmbs = MIN(cm->mb_cols - mb_col, nmbs);
int x, y;
for (y = 0; y < ymbs; y++) {
for (x = 0; x < xmbs; x++) {
cm->last_frame_seg_map[map_index + x + y * cm->mb_cols] =
m->mbmi.segment_id;
}
}
} else {
cm->last_frame_seg_map[map_index] = m->mbmi.segment_id;
}
}
m->mbmi.mb_skip_coeff = 0;
@ -144,25 +156,18 @@ static void kfread_modes(VP9D_COMP *pbi,
m->mbmi.mb_skip_coeff = 0;
}
#if CONFIG_SUPERBLOCKS
if (m->mbmi.encoded_as_sb) {
if (m->mbmi.sb_type) {
y_mode = (MB_PREDICTION_MODE) read_kf_sb_ymode(bc,
pbi->common.sb_kf_ymode_prob[pbi->common.kf_ymode_probs_index]);
} else
#endif
y_mode = (MB_PREDICTION_MODE) read_kf_mb_ymode(bc,
pbi->common.kf_ymode_prob[pbi->common.kf_ymode_probs_index]);
#if CONFIG_COMP_INTRA_PRED
m->mbmi.second_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
#endif
} else {
y_mode = (MB_PREDICTION_MODE) read_kf_mb_ymode(bc,
pbi->common.kf_ymode_prob[pbi->common.kf_ymode_probs_index]);
}
m->mbmi.ref_frame = INTRA_FRAME;
if ((m->mbmi.mode = y_mode) == B_PRED) {
int i = 0;
#if CONFIG_COMP_INTRA_PRED
int use_comp_pred = vp9_read(bc, DEFAULT_COMP_INTRA_PROB);
#endif
do {
const B_PREDICTION_MODE A = above_block_mode(m, i, mis);
const B_PREDICTION_MODE L = left_block_mode(m, i);
@ -170,15 +175,6 @@ static void kfread_modes(VP9D_COMP *pbi,
m->bmi[i].as_mode.first =
(B_PREDICTION_MODE) read_kf_bmode(
bc, pbi->common.kf_bmode_prob [A] [L]);
#if CONFIG_COMP_INTRA_PRED
if (use_comp_pred) {
m->bmi[i].as_mode.second =
(B_PREDICTION_MODE) read_kf_bmode(
bc, pbi->common.kf_bmode_prob [A] [L]);
} else {
m->bmi[i].as_mode.second = (B_PREDICTION_MODE)(B_DC_PRED - 1);
}
#endif
} while (++i < 16);
}
if ((m->mbmi.mode = y_mode) == I8X8_PRED) {
@ -191,26 +187,22 @@ static void kfread_modes(VP9D_COMP *pbi,
m->bmi[ib + 1].as_mode.first = mode8x8;
m->bmi[ib + 4].as_mode.first = mode8x8;
m->bmi[ib + 5].as_mode.first = mode8x8;
#if CONFIG_COMP_INTRA_PRED
m->bmi[ib + 0].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
m->bmi[ib + 1].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
m->bmi[ib + 4].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
m->bmi[ib + 5].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
#endif
}
} else
m->mbmi.uv_mode = (MB_PREDICTION_MODE)read_uv_mode(bc,
pbi->common.kf_uv_mode_prob[m->mbmi.mode]);
#if CONFIG_COMP_INTRA_PRED
m->mbmi.second_uv_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
#endif
if (cm->txfm_mode == TX_MODE_SELECT && m->mbmi.mb_skip_coeff == 0 &&
m->mbmi.mode <= I8X8_PRED) {
// FIXME(rbultje) code ternary symbol once all experiments are merged
m->mbmi.txfm_size = vp9_read(bc, cm->prob_tx[0]);
if (m->mbmi.txfm_size != TX_4X4 && m->mbmi.mode != I8X8_PRED)
if (m->mbmi.txfm_size != TX_4X4 && m->mbmi.mode != I8X8_PRED) {
m->mbmi.txfm_size += vp9_read(bc, cm->prob_tx[1]);
if (m->mbmi.txfm_size != TX_8X8 && m->mbmi.sb_type)
m->mbmi.txfm_size += vp9_read(bc, cm->prob_tx[2]);
}
} else if (cm->txfm_mode >= ALLOW_32X32 && m->mbmi.sb_type) {
m->mbmi.txfm_size = TX_32X32;
} else if (cm->txfm_mode >= ALLOW_16X16 && m->mbmi.mode <= TM_PRED) {
m->mbmi.txfm_size = TX_16X16;
} else if (cm->txfm_mode >= ALLOW_8X8 && m->mbmi.mode != B_PRED) {
@ -478,11 +470,9 @@ static MV_REFERENCE_FRAME read_ref_frame(VP9D_COMP *pbi,
return (MV_REFERENCE_FRAME)ref_frame;
}
#if CONFIG_SUPERBLOCKS
static MB_PREDICTION_MODE read_sb_mv_ref(vp9_reader *bc, const vp9_prob *p) {
return (MB_PREDICTION_MODE) treed_read(bc, vp9_sb_mv_ref_tree, p);
}
#endif
static MB_PREDICTION_MODE read_mv_ref(vp9_reader *bc, const vp9_prob *p) {
return (MB_PREDICTION_MODE) treed_read(bc, vp9_mv_ref_tree, p);
@ -532,12 +522,6 @@ static void mb_mode_mv_init(VP9D_COMP *pbi, vp9_reader *bc) {
if (!cm->kf_ymode_probs_update)
cm->kf_ymode_probs_index = vp9_read_literal(bc, 3);
} else {
#if CONFIG_PRED_FILTER
cm->pred_filter_mode = (vp9_prob)vp9_read_literal(bc, 2);
if (cm->pred_filter_mode == 2)
cm->prob_pred_filter_off = (vp9_prob)vp9_read_literal(bc, 8);
#endif
if (cm->mcomp_filter_type == SWITCHABLE)
read_switchable_interp_probs(pbi, bc);
#if CONFIG_COMP_INTERINTRA_PRED
@ -572,7 +556,6 @@ static void mb_mode_mv_init(VP9D_COMP *pbi, vp9_reader *bc) {
} while (++i < VP9_YMODES - 1);
}
#if CONFIG_SUPERBLOCKS
if (vp9_read_bit(bc)) {
int i = 0;
@ -580,12 +563,6 @@ static void mb_mode_mv_init(VP9D_COMP *pbi, vp9_reader *bc) {
cm->fc.sb_ymode_prob[i] = (vp9_prob) vp9_read_literal(bc, 8);
} while (++i < VP9_I32X32_MODES - 1);
}
#endif
#if CONFIG_NEW_MVREF
// Temp defaults probabilities for ecnoding the MV ref id signal
vpx_memset(xd->mb_mv_ref_id_probs, 192, sizeof(xd->mb_mv_ref_id_probs));
#endif
read_nmvprobs(bc, nmvc, xd->allow_high_precision_mv);
}
@ -633,38 +610,38 @@ static void read_mb_segment_id(VP9D_COMP *pbi,
else {
read_mb_segid(bc, mbmi, xd);
}
#if CONFIG_SUPERBLOCKS
if (mbmi->encoded_as_sb) {
cm->last_frame_seg_map[index] = mbmi->segment_id;
if (mb_col + 1 < cm->mb_cols)
cm->last_frame_seg_map[index + 1] = mbmi->segment_id;
if (mb_row + 1 < cm->mb_rows) {
cm->last_frame_seg_map[index + cm->mb_cols] = mbmi->segment_id;
if (mb_col + 1 < cm->mb_cols)
cm->last_frame_seg_map[index + cm->mb_cols + 1] = mbmi->segment_id;
if (mbmi->sb_type) {
const int nmbs = 1 << mbmi->sb_type;
const int ymbs = MIN(cm->mb_rows - mb_row, nmbs);
const int xmbs = MIN(cm->mb_cols - mb_col, nmbs);
int x, y;
for (y = 0; y < ymbs; y++) {
for (x = 0; x < xmbs; x++) {
cm->last_frame_seg_map[index + x + y * cm->mb_cols] =
mbmi->segment_id;
}
}
} else
#endif
{
} else {
cm->last_frame_seg_map[index] = mbmi->segment_id;
}
} else {
#if CONFIG_SUPERBLOCKS
if (mbmi->encoded_as_sb) {
mbmi->segment_id = cm->last_frame_seg_map[index];
if (mb_col < cm->mb_cols - 1)
mbmi->segment_id = mbmi->segment_id &&
cm->last_frame_seg_map[index + 1];
if (mb_row < cm->mb_rows - 1) {
mbmi->segment_id = mbmi->segment_id &&
cm->last_frame_seg_map[index + cm->mb_cols];
if (mb_col < cm->mb_cols - 1)
mbmi->segment_id = mbmi->segment_id &&
cm->last_frame_seg_map[index + cm->mb_cols + 1];
if (mbmi->sb_type) {
const int nmbs = 1 << mbmi->sb_type;
const int ymbs = MIN(cm->mb_rows - mb_row, nmbs);
const int xmbs = MIN(cm->mb_cols - mb_col, nmbs);
unsigned segment_id = -1;
int x, y;
for (y = 0; y < ymbs; y++) {
for (x = 0; x < xmbs; x++) {
segment_id = MIN(segment_id,
cm->last_frame_seg_map[index + x +
y * cm->mb_cols]);
}
}
} else
#endif
{
mbmi->segment_id = segment_id;
} else {
mbmi->segment_id = cm->last_frame_seg_map[index];
}
}
@ -689,6 +666,7 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
int mb_to_right_edge;
int mb_to_top_edge;
int mb_to_bottom_edge;
const int mb_size = 1 << mi->mbmi.sb_type;
mb_to_top_edge = xd->mb_to_top_edge;
mb_to_bottom_edge = xd->mb_to_bottom_edge;
@ -703,18 +681,8 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
xd->mb_to_left_edge =
mb_to_left_edge = -((mb_col * 16) << 3);
mb_to_left_edge -= LEFT_TOP_MARGIN;
#if CONFIG_SUPERBLOCKS
if (mi->mbmi.encoded_as_sb) {
xd->mb_to_right_edge =
mb_to_right_edge = ((pbi->common.mb_cols - 2 - mb_col) * 16) << 3;
} else {
#endif
xd->mb_to_right_edge =
mb_to_right_edge = ((pbi->common.mb_cols - 1 - mb_col) * 16) << 3;
#if CONFIG_SUPERBLOCKS
}
#endif
xd->mb_to_right_edge =
mb_to_right_edge = ((pbi->common.mb_cols - mb_size - mb_col) * 16) << 3;
mb_to_right_edge += RIGHT_BOTTOM_MARGIN;
// Make sure the MACROBLOCKD mode info pointer is pointed at the
@ -756,10 +724,10 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
int recon_y_stride, recon_yoffset;
int recon_uv_stride, recon_uvoffset;
MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame;
{
int ref_fb_idx;
MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame;
/* Select the appropriate reference frame for this MB */
if (ref_frame == LAST_FRAME)
@ -788,14 +756,33 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
ref_frame, mbmi->ref_mvs[ref_frame],
cm->ref_frame_sign_bias);
vp9_find_best_ref_mvs(xd,
xd->pre.y_buffer,
recon_y_stride,
mbmi->ref_mvs[ref_frame],
&best_mv, &nearest, &nearby);
vp9_mv_ref_probs(&pbi->common, mv_ref_p,
mbmi->mb_mode_context[ref_frame]);
// Is the segment level mode feature enabled for this segment
if (vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_MODE)) {
mbmi->mode =
vp9_get_segdata(xd, mbmi->segment_id, SEG_LVL_MODE);
} else {
if (mbmi->sb_type)
mbmi->mode = read_sb_mv_ref(bc, mv_ref_p);
else
mbmi->mode = read_mv_ref(bc, mv_ref_p);
vp9_accum_mv_refs(&pbi->common, mbmi->mode,
mbmi->mb_mode_context[ref_frame]);
}
if (mbmi->mode != ZEROMV) {
vp9_find_best_ref_mvs(xd,
xd->pre.y_buffer,
recon_y_stride,
mbmi->ref_mvs[ref_frame],
&nearest, &nearby);
best_mv.as_int = (mbmi->ref_mvs[ref_frame][0]).as_int;
}
#ifdef DEC_DEBUG
if (dec_debug)
printf("[D %d %d] %d %d %d %d\n", ref_frame,
@ -804,32 +791,6 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
#endif
}
// Is the segment level mode feature enabled for this segment
if (vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_MODE)) {
mbmi->mode =
vp9_get_segdata(xd, mbmi->segment_id, SEG_LVL_MODE);
} else {
#if CONFIG_SUPERBLOCKS
if (mbmi->encoded_as_sb) {
mbmi->mode = read_sb_mv_ref(bc, mv_ref_p);
} else
#endif
mbmi->mode = read_mv_ref(bc, mv_ref_p);
vp9_accum_mv_refs(&pbi->common, mbmi->mode,
mbmi->mb_mode_context[mbmi->ref_frame]);
}
#if CONFIG_PRED_FILTER
if (mbmi->mode >= NEARESTMV && mbmi->mode < SPLITMV) {
// Is the prediction filter enabled
if (cm->pred_filter_mode == 2)
mbmi->pred_filter_enabled =
vp9_read(bc, cm->prob_pred_filter_off);
else
mbmi->pred_filter_enabled = cm->pred_filter_mode;
}
#endif
if (mbmi->mode >= NEARESTMV && mbmi->mode <= SPLITMV)
{
if (cm->mcomp_filter_type == SWITCHABLE) {
@ -877,13 +838,15 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
mbmi->ref_mvs[mbmi->second_ref_frame],
cm->ref_frame_sign_bias);
vp9_find_best_ref_mvs(xd,
xd->second_pre.y_buffer,
recon_y_stride,
mbmi->ref_mvs[mbmi->second_ref_frame],
&best_mv_second,
&nearest_second,
&nearby_second);
if (mbmi->mode != ZEROMV) {
vp9_find_best_ref_mvs(xd,
xd->second_pre.y_buffer,
recon_y_stride,
mbmi->ref_mvs[mbmi->second_ref_frame],
&nearest_second,
&nearby_second);
best_mv_second = mbmi->ref_mvs[mbmi->second_ref_frame][0];
}
}
} else {
@ -916,6 +879,29 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
#endif
}
#if CONFIG_NEW_MVREF
// if ((mbmi->mode == NEWMV) || (mbmi->mode == SPLITMV))
if (mbmi->mode == NEWMV) {
int best_index;
MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame;
// Encode the index of the choice.
best_index =
vp9_read_mv_ref_id(bc, xd->mb_mv_ref_probs[ref_frame]);
best_mv.as_int = mbmi->ref_mvs[ref_frame][best_index].as_int;
if (mbmi->second_ref_frame > 0) {
ref_frame = mbmi->second_ref_frame;
// Encode the index of the choice.
best_index =
vp9_read_mv_ref_id(bc, xd->mb_mv_ref_probs[ref_frame]);
best_mv_second.as_int = mbmi->ref_mvs[ref_frame][best_index].as_int;
}
}
#endif
mbmi->uv_mode = DC_PRED;
switch (mbmi->mode) {
case SPLITMV: {
@ -1072,19 +1058,6 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
case NEWMV:
#if CONFIG_NEW_MVREF
{
int best_index;
MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame;
// Encode the index of the choice.
best_index =
vp9_read_mv_ref_id(bc, xd->mb_mv_ref_id_probs[ref_frame]);
best_mv.as_int = mbmi->ref_mvs[ref_frame][best_index].as_int;
}
#endif
read_nmv(bc, &mv->as_mv, &best_mv.as_mv, nmvc);
read_nmv_fp(bc, &mv->as_mv, &best_mv.as_mv, nmvc,
xd->allow_high_precision_mv);
@ -1106,18 +1079,6 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
mb_to_bottom_edge);
if (mbmi->second_ref_frame > 0) {
#if CONFIG_NEW_MVREF
{
int best_index;
MV_REFERENCE_FRAME ref_frame = mbmi->second_ref_frame;
// Encode the index of the choice.
best_index =
vp9_read_mv_ref_id(bc, xd->mb_mv_ref_id_probs[ref_frame]);
best_mv_second.as_int = mbmi->ref_mvs[ref_frame][best_index].as_int;
}
#endif
read_nmv(bc, &mbmi->mv[1].as_mv, &best_mv_second.as_mv, nmvc);
read_nmv_fp(bc, &mbmi->mv[1].as_mv, &best_mv_second.as_mv, nmvc,
xd->allow_high_precision_mv);
@ -1144,27 +1105,19 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
if (vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_MODE)) {
mbmi->mode = (MB_PREDICTION_MODE)
vp9_get_segdata(xd, mbmi->segment_id, SEG_LVL_MODE);
#if CONFIG_SUPERBLOCKS
} else if (mbmi->encoded_as_sb) {
} else if (mbmi->sb_type) {
mbmi->mode = (MB_PREDICTION_MODE)
read_sb_ymode(bc, pbi->common.fc.sb_ymode_prob);
pbi->common.fc.sb_ymode_counts[mbmi->mode]++;
#endif
} else {
mbmi->mode = (MB_PREDICTION_MODE)
read_ymode(bc, pbi->common.fc.ymode_prob);
pbi->common.fc.ymode_counts[mbmi->mode]++;
}
#if CONFIG_COMP_INTRA_PRED
mbmi->second_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
#endif
// If MB mode is BPRED read the block modes
if (mbmi->mode == B_PRED) {
int j = 0;
#if CONFIG_COMP_INTRA_PRED
int use_comp_pred = vp9_read(bc, DEFAULT_COMP_INTRA_PROB);
#endif
do {
int m;
m = mi->bmi[j].as_mode.first = (B_PREDICTION_MODE)
@ -1173,13 +1126,6 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
if (m == B_CONTEXT_PRED) m -= CONTEXT_PRED_REPLACEMENTS;
#endif
pbi->common.fc.bmode_counts[m]++;
#if CONFIG_COMP_INTRA_PRED
if (use_comp_pred) {
mi->bmi[j].as_mode.second = (B_PREDICTION_MODE)read_bmode(bc, pbi->common.fc.bmode_prob);
} else {
mi->bmi[j].as_mode.second = (B_PREDICTION_MODE)(B_DC_PRED - 1);
}
#endif
} while (++j < 16);
}
@ -1194,22 +1140,12 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
mi->bmi[ib + 4].as_mode.first = mode8x8;
mi->bmi[ib + 5].as_mode.first = mode8x8;
pbi->common.fc.i8x8_mode_counts[mode8x8]++;
#if CONFIG_COMP_INTRA_PRED
mi->bmi[ib + 0].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
mi->bmi[ib + 1].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
mi->bmi[ib + 4].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
mi->bmi[ib + 5].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
#endif
}
} else {
mbmi->uv_mode = (MB_PREDICTION_MODE)read_uv_mode(
bc, pbi->common.fc.uv_mode_prob[mbmi->mode]);
pbi->common.fc.uv_mode_counts[mbmi->mode][mbmi->uv_mode]++;
}
#if CONFIG_COMP_INTRA_PRED
mbmi->second_uv_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
#endif
}
if (cm->txfm_mode == TX_MODE_SELECT && mbmi->mb_skip_coeff == 0 &&
@ -1219,8 +1155,13 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
// FIXME(rbultje) code ternary symbol once all experiments are merged
mbmi->txfm_size = vp9_read(bc, cm->prob_tx[0]);
if (mbmi->txfm_size != TX_4X4 && mbmi->mode != I8X8_PRED &&
mbmi->mode != SPLITMV)
mbmi->mode != SPLITMV) {
mbmi->txfm_size += vp9_read(bc, cm->prob_tx[1]);
if (mbmi->sb_type && mbmi->txfm_size != TX_8X8)
mbmi->txfm_size += vp9_read(bc, cm->prob_tx[2]);
}
} else if (mbmi->sb_type && cm->txfm_mode >= ALLOW_32X32) {
mbmi->txfm_size = TX_32X32;
} else if (cm->txfm_mode >= ALLOW_16X16 &&
((mbmi->ref_frame == INTRA_FRAME && mbmi->mode <= TM_PRED) ||
(mbmi->ref_frame != INTRA_FRAME && mbmi->mode != SPLITMV))) {

View File

@ -8,6 +8,8 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP9_DECODER_VP9_DECODEMV_H_
#define VP9_DECODER_VP9_DECODEMV_H_
#include "vp9/decoder/vp9_onyxd_int.h"
@ -17,3 +19,5 @@ void vp9_decode_mb_mode_mv(VP9D_COMP* const pbi,
int mb_col,
BOOL_DECODER* const bc);
void vp9_decode_mode_mvs_init(VP9D_COMP* const pbi, BOOL_DECODER* const bc);
#endif // VP9_DECODER_VP9_DECODEMV_H_

View File

@ -10,17 +10,19 @@
#include "vp9/decoder/vp9_onyxd_int.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_header.h"
#include "vp9/common/vp9_reconintra.h"
#include "vp9/common/vp9_reconintra4x4.h"
#include "vp9/common/vp9_reconinter.h"
#include "vp9/common/vp9_entropy.h"
#include "vp9/decoder/vp9_decodframe.h"
#include "vp9/decoder/vp9_detokenize.h"
#include "vp9/common/vp9_invtrans.h"
#include "vp9/common/vp9_alloccommon.h"
#include "vp9/common/vp9_entropymode.h"
#include "vp9/common/vp9_quant_common.h"
#include "vpx_scale/vpxscale.h"
#include "vpx_scale/vpx_scale.h"
#include "vp9/common/vp9_setupintrarecon.h"
#include "vp9/decoder/vp9_decodemv.h"
@ -81,17 +83,17 @@ void vp9_init_de_quantizer(VP9D_COMP *pbi) {
VP9_COMMON *const pc = &pbi->common;
for (Q = 0; Q < QINDEX_RANGE; Q++) {
pc->Y1dequant[Q][0] = (short)vp9_dc_quant(Q, pc->y1dc_delta_q);
pc->Y2dequant[Q][0] = (short)vp9_dc2quant(Q, pc->y2dc_delta_q);
pc->UVdequant[Q][0] = (short)vp9_dc_uv_quant(Q, pc->uvdc_delta_q);
pc->Y1dequant[Q][0] = (int16_t)vp9_dc_quant(Q, pc->y1dc_delta_q);
pc->Y2dequant[Q][0] = (int16_t)vp9_dc2quant(Q, pc->y2dc_delta_q);
pc->UVdequant[Q][0] = (int16_t)vp9_dc_uv_quant(Q, pc->uvdc_delta_q);
/* all the ac values =; */
for (i = 1; i < 16; i++) {
int rc = vp9_default_zig_zag1d[i];
int rc = vp9_default_zig_zag1d_4x4[i];
pc->Y1dequant[Q][rc] = (short)vp9_ac_yquant(Q);
pc->Y2dequant[Q][rc] = (short)vp9_ac2quant(Q, pc->y2ac_delta_q);
pc->UVdequant[Q][rc] = (short)vp9_ac_uv_quant(Q, pc->uvac_delta_q);
pc->Y1dequant[Q][rc] = (int16_t)vp9_ac_yquant(Q);
pc->Y2dequant[Q][rc] = (int16_t)vp9_ac2quant(Q, pc->y2ac_delta_q);
pc->UVdequant[Q][rc] = (int16_t)vp9_ac_uv_quant(Q, pc->uvac_delta_q);
}
}
}
@ -170,20 +172,25 @@ static void mb_init_dequantizer(VP9D_COMP *pbi, MACROBLOCKD *xd) {
*/
static void skip_recon_mb(VP9D_COMP *pbi, MACROBLOCKD *xd) {
if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
#if CONFIG_SUPERBLOCKS
if (xd->mode_info_context->mbmi.encoded_as_sb) {
if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB64X64) {
vp9_build_intra_predictors_sb64uv_s(xd);
vp9_build_intra_predictors_sb64y_s(xd);
} else if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB32X32) {
vp9_build_intra_predictors_sbuv_s(xd);
vp9_build_intra_predictors_sby_s(xd);
} else {
#endif
vp9_build_intra_predictors_mbuv_s(xd);
vp9_build_intra_predictors_mby_s(xd);
#if CONFIG_SUPERBLOCKS
vp9_build_intra_predictors_mbuv_s(xd);
vp9_build_intra_predictors_mby_s(xd);
}
#endif
} else {
#if CONFIG_SUPERBLOCKS
if (xd->mode_info_context->mbmi.encoded_as_sb) {
if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB64X64) {
vp9_build_inter64x64_predictors_sb(xd,
xd->dst.y_buffer,
xd->dst.u_buffer,
xd->dst.v_buffer,
xd->dst.y_stride,
xd->dst.uv_stride);
} else if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB32X32) {
vp9_build_inter32x32_predictors_sb(xd,
xd->dst.y_buffer,
xd->dst.u_buffer,
@ -191,35 +198,32 @@ static void skip_recon_mb(VP9D_COMP *pbi, MACROBLOCKD *xd) {
xd->dst.y_stride,
xd->dst.uv_stride);
} else {
#endif
vp9_build_1st_inter16x16_predictors_mb(xd,
xd->dst.y_buffer,
xd->dst.u_buffer,
xd->dst.v_buffer,
xd->dst.y_stride,
xd->dst.uv_stride);
if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
vp9_build_2nd_inter16x16_predictors_mb(xd,
vp9_build_1st_inter16x16_predictors_mb(xd,
xd->dst.y_buffer,
xd->dst.u_buffer,
xd->dst.v_buffer,
xd->dst.y_stride,
xd->dst.uv_stride);
}
#if CONFIG_COMP_INTERINTRA_PRED
else if (xd->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME) {
vp9_build_interintra_16x16_predictors_mb(xd,
if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
vp9_build_2nd_inter16x16_predictors_mb(xd,
xd->dst.y_buffer,
xd->dst.u_buffer,
xd->dst.v_buffer,
xd->dst.y_stride,
xd->dst.uv_stride);
}
}
#if CONFIG_COMP_INTERINTRA_PRED
else if (xd->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME) {
vp9_build_interintra_16x16_predictors_mb(xd,
xd->dst.y_buffer,
xd->dst.u_buffer,
xd->dst.v_buffer,
xd->dst.y_stride,
xd->dst.uv_stride);
}
#endif
#if CONFIG_SUPERBLOCKS
}
#endif
}
}
@ -283,10 +287,10 @@ static void decode_8x8(VP9D_COMP *pbi, MACROBLOCKD *xd,
for (i = 0; i < 4; i++) {
int ib = vp9_i8x8_block[i];
int idx = (ib & 0x02) ? (ib + 2) : ib;
short *q = xd->block[idx].qcoeff;
short *dq = xd->block[0].dequant;
unsigned char *pre = xd->block[ib].predictor;
unsigned char *dst = *(xd->block[ib].base_dst) + xd->block[ib].dst;
int16_t *q = xd->block[idx].qcoeff;
int16_t *dq = xd->block[0].dequant;
uint8_t *pre = xd->block[ib].predictor;
uint8_t *dst = *(xd->block[ib].base_dst) + xd->block[ib].dst;
int stride = xd->dst.y_stride;
BLOCKD *b = &xd->block[ib];
if (xd->mode_info_context->mbmi.mode == I8X8_PRED) {
@ -414,9 +418,6 @@ static void decode_4x4(VP9D_COMP *pbi, MACROBLOCKD *xd,
assert(get_2nd_order_usage(xd) == 0);
for (i = 0; i < 16; i++) {
int b_mode;
#if CONFIG_COMP_INTRA_PRED
int b_mode2;
#endif
BLOCKD *b = &xd->block[i];
b_mode = xd->mode_info_context->bmi[i].as_mode.first;
#if CONFIG_NEWBINTRAMODES
@ -425,17 +426,8 @@ static void decode_4x4(VP9D_COMP *pbi, MACROBLOCKD *xd,
#endif
if (!xd->mode_info_context->mbmi.mb_skip_coeff)
eobtotal += vp9_decode_coefs_4x4(pbi, xd, bc, PLANE_TYPE_Y_WITH_DC, i);
#if CONFIG_COMP_INTRA_PRED
b_mode2 = xd->mode_info_context->bmi[i].as_mode.second;
if (b_mode2 == (B_PREDICTION_MODE)(B_DC_PRED - 1)) {
#endif
vp9_intra4x4_predict(b, b_mode, b->predictor);
#if CONFIG_COMP_INTRA_PRED
} else {
vp9_comp_intra4x4_predict(b, b_mode, b_mode2, b->predictor);
}
#endif
vp9_intra4x4_predict(b, b_mode, b->predictor);
tx_type = get_tx_type_4x4(xd, b);
if (tx_type != DCT_DCT) {
vp9_ht_dequant_idct_add_c(tx_type, b->qcoeff,
@ -446,12 +438,12 @@ static void decode_4x4(VP9D_COMP *pbi, MACROBLOCKD *xd,
vp9_dequant_idct_add(b->qcoeff, b->dequant, b->predictor,
*(b->base_dst) + b->dst, 16, b->dst_stride);
}
xd->above_context->y2 = 1;
xd->left_context->y2 = 1;
}
if (!xd->mode_info_context->mbmi.mb_skip_coeff) {
vp9_decode_mb_tokens_4x4_uv(pbi, xd, bc);
}
xd->above_context->y2 = 0;
xd->left_context->y2 = 0;
vp9_build_intra_predictors_mbuv(xd);
pbi->idct_add_uv_block(xd->qcoeff + 16 * 16,
xd->block[16].dequant,
@ -546,10 +538,10 @@ static void decode_4x4(VP9D_COMP *pbi, MACROBLOCKD *xd,
}
}
#if CONFIG_SUPERBLOCKS
static void decode_16x16_sb(VP9D_COMP *pbi, MACROBLOCKD *xd,
BOOL_DECODER* const bc, int n) {
int x_idx = n & 1, y_idx = n >> 1;
BOOL_DECODER* const bc, int n,
int maska, int shiftb) {
int x_idx = n & maska, y_idx = n >> shiftb;
TX_TYPE tx_type = get_tx_type_16x16(xd, &xd->block[0]);
if (tx_type != DCT_DCT) {
vp9_ht_dequant_idct_add_16x16_c(
@ -573,17 +565,18 @@ static void decode_16x16_sb(VP9D_COMP *pbi, MACROBLOCKD *xd,
};
static void decode_8x8_sb(VP9D_COMP *pbi, MACROBLOCKD *xd,
BOOL_DECODER* const bc, int n) {
BOOL_DECODER* const bc, int n,
int maska, int shiftb) {
int x_idx = n & maska, y_idx = n >> shiftb;
BLOCKD *b = &xd->block[24];
int x_idx = n & 1, y_idx = n >> 1;
TX_TYPE tx_type = get_tx_type_8x8(xd, &xd->block[0]);
if (tx_type != DCT_DCT) {
int i;
for (i = 0; i < 4; i++) {
int ib = vp9_i8x8_block[i];
int idx = (ib & 0x02) ? (ib + 2) : ib;
short *q = xd->block[idx].qcoeff;
short *dq = xd->block[0].dequant;
int16_t *q = xd->block[idx].qcoeff;
int16_t *dq = xd->block[0].dequant;
int stride = xd->dst.y_stride;
BLOCKD *b = &xd->block[ib];
tx_type = get_tx_type_8x8(xd, &xd->block[ib]);
@ -634,9 +627,10 @@ static void decode_8x8_sb(VP9D_COMP *pbi, MACROBLOCKD *xd,
};
static void decode_4x4_sb(VP9D_COMP *pbi, MACROBLOCKD *xd,
BOOL_DECODER* const bc, int n) {
BOOL_DECODER* const bc, int n,
int maska, int shiftb) {
int x_idx = n & maska, y_idx = n >> shiftb;
BLOCKD *b = &xd->block[24];
int x_idx = n & 1, y_idx = n >> 1;
TX_TYPE tx_type = get_tx_type_4x4(xd, &xd->block[0]);
if (tx_type != DCT_DCT) {
int i;
@ -689,15 +683,142 @@ static void decode_4x4_sb(VP9D_COMP *pbi, MACROBLOCKD *xd,
xd->dst.uv_stride, xd->eobs + 16, xd);
};
static void decode_superblock(VP9D_COMP *pbi, MACROBLOCKD *xd,
int mb_row, unsigned int mb_col,
BOOL_DECODER* const bc) {
static void decode_superblock64(VP9D_COMP *pbi, MACROBLOCKD *xd,
int mb_row, unsigned int mb_col,
BOOL_DECODER* const bc) {
int i, n, eobtotal;
TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
VP9_COMMON *const pc = &pbi->common;
MODE_INFO *orig_mi = xd->mode_info_context;
const int mis = pc->mode_info_stride;
assert(xd->mode_info_context->mbmi.encoded_as_sb);
assert(xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB64X64);
if (pbi->common.frame_type != KEY_FRAME)
vp9_setup_interp_filters(xd, xd->mode_info_context->mbmi.interp_filter, pc);
// re-initialize macroblock dequantizer before detokenization
if (xd->segmentation_enabled)
mb_init_dequantizer(pbi, xd);
if (xd->mode_info_context->mbmi.mb_skip_coeff) {
int n;
vp9_reset_mb_tokens_context(xd);
for (n = 1; n <= 3; n++) {
if (mb_col < pc->mb_cols - n)
xd->above_context += n;
if (mb_row < pc->mb_rows - n)
xd->left_context += n;
vp9_reset_mb_tokens_context(xd);
if (mb_col < pc->mb_cols - n)
xd->above_context -= n;
if (mb_row < pc->mb_rows - n)
xd->left_context -= n;
}
/* Special case: Force the loopfilter to skip when eobtotal and
* mb_skip_coeff are zero.
*/
skip_recon_mb(pbi, xd);
return;
}
/* do prediction */
if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
vp9_build_intra_predictors_sb64y_s(xd);
vp9_build_intra_predictors_sb64uv_s(xd);
} else {
vp9_build_inter64x64_predictors_sb(xd, xd->dst.y_buffer,
xd->dst.u_buffer, xd->dst.v_buffer,
xd->dst.y_stride, xd->dst.uv_stride);
}
/* dequantization and idct */
if (xd->mode_info_context->mbmi.txfm_size == TX_32X32) {
for (n = 0; n < 4; n++) {
const int x_idx = n & 1, y_idx = n >> 1;
if (mb_col + x_idx * 2 >= pc->mb_cols ||
mb_row + y_idx * 2 >= pc->mb_rows)
continue;
xd->left_context = pc->left_context + (y_idx << 1);
xd->above_context = pc->above_context + mb_col + (x_idx << 1);
xd->mode_info_context = orig_mi + x_idx * 2 + y_idx * 2 * mis;
eobtotal = vp9_decode_sb_tokens(pbi, xd, bc);
if (eobtotal == 0) { // skip loopfilter
xd->mode_info_context->mbmi.mb_skip_coeff = 1;
if (mb_col + 1 < pc->mb_cols)
xd->mode_info_context[1].mbmi.mb_skip_coeff = 1;
if (mb_row + 1 < pc->mb_rows) {
xd->mode_info_context[mis].mbmi.mb_skip_coeff = 1;
if (mb_col + 1 < pc->mb_cols)
xd->mode_info_context[mis + 1].mbmi.mb_skip_coeff = 1;
}
} else {
vp9_dequant_idct_add_32x32(xd->sb_coeff_data.qcoeff, xd->block[0].dequant,
xd->dst.y_buffer + x_idx * 32 +
xd->dst.y_stride * y_idx * 32,
xd->dst.y_buffer + x_idx * 32 +
xd->dst.y_stride * y_idx * 32,
xd->dst.y_stride, xd->dst.y_stride,
xd->eobs[0]);
vp9_dequant_idct_add_uv_block_16x16_c(xd->sb_coeff_data.qcoeff + 1024,
xd->block[16].dequant,
xd->dst.u_buffer + x_idx * 16 +
xd->dst.uv_stride * y_idx * 16,
xd->dst.v_buffer + x_idx * 16 +
xd->dst.uv_stride * y_idx * 16,
xd->dst.uv_stride, xd->eobs + 16);
}
}
} else {
for (n = 0; n < 16; n++) {
int x_idx = n & 3, y_idx = n >> 2;
if (mb_col + x_idx >= pc->mb_cols || mb_row + y_idx >= pc->mb_rows)
continue;
xd->above_context = pc->above_context + mb_col + x_idx;
xd->left_context = pc->left_context + y_idx;
xd->mode_info_context = orig_mi + x_idx + y_idx * mis;
for (i = 0; i < 25; i++) {
xd->block[i].eob = 0;
xd->eobs[i] = 0;
}
eobtotal = vp9_decode_mb_tokens(pbi, xd, bc);
if (eobtotal == 0) { // skip loopfilter
xd->mode_info_context->mbmi.mb_skip_coeff = 1;
continue;
}
if (tx_size == TX_16X16) {
decode_16x16_sb(pbi, xd, bc, n, 3, 2);
} else if (tx_size == TX_8X8) {
decode_8x8_sb(pbi, xd, bc, n, 3, 2);
} else {
decode_4x4_sb(pbi, xd, bc, n, 3, 2);
}
}
}
xd->above_context = pc->above_context + mb_col;
xd->left_context = pc->left_context;
xd->mode_info_context = orig_mi;
}
static void decode_superblock32(VP9D_COMP *pbi, MACROBLOCKD *xd,
int mb_row, unsigned int mb_col,
BOOL_DECODER* const bc) {
int i, n, eobtotal;
TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
VP9_COMMON *const pc = &pbi->common;
MODE_INFO *orig_mi = xd->mode_info_context;
const int mis = pc->mode_info_stride;
assert(xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB32X32);
if (pbi->common.frame_type != KEY_FRAME)
vp9_setup_interp_filters(xd, xd->mode_info_context->mbmi.interp_filter, pc);
@ -736,41 +857,62 @@ static void decode_superblock(VP9D_COMP *pbi, MACROBLOCKD *xd,
}
/* dequantization and idct */
for (n = 0; n < 4; n++) {
int x_idx = n & 1, y_idx = n >> 1;
if (mb_col + x_idx >= pc->mb_cols || mb_row + y_idx >= pc->mb_rows)
continue;
xd->above_context = pc->above_context + mb_col + x_idx;
xd->left_context = pc->left_context + y_idx;
xd->mode_info_context = orig_mi + x_idx + y_idx * pc->mode_info_stride;
for (i = 0; i < 25; i++) {
xd->block[i].eob = 0;
xd->eobs[i] = 0;
}
eobtotal = vp9_decode_mb_tokens(pbi, xd, bc);
if (xd->mode_info_context->mbmi.txfm_size == TX_32X32) {
eobtotal = vp9_decode_sb_tokens(pbi, xd, bc);
if (eobtotal == 0) { // skip loopfilter
xd->mode_info_context->mbmi.mb_skip_coeff = 1;
continue;
}
if (tx_size == TX_16X16) {
decode_16x16_sb(pbi, xd, bc, n);
} else if (tx_size == TX_8X8) {
decode_8x8_sb(pbi, xd, bc, n);
if (mb_col + 1 < pc->mb_cols)
xd->mode_info_context[1].mbmi.mb_skip_coeff = 1;
if (mb_row + 1 < pc->mb_rows) {
xd->mode_info_context[mis].mbmi.mb_skip_coeff = 1;
if (mb_col + 1 < pc->mb_cols)
xd->mode_info_context[mis + 1].mbmi.mb_skip_coeff = 1;
}
} else {
decode_4x4_sb(pbi, xd, bc, n);
vp9_dequant_idct_add_32x32(xd->sb_coeff_data.qcoeff, xd->block[0].dequant,
xd->dst.y_buffer, xd->dst.y_buffer,
xd->dst.y_stride, xd->dst.y_stride,
xd->eobs[0]);
vp9_dequant_idct_add_uv_block_16x16_c(xd->sb_coeff_data.qcoeff + 1024,
xd->block[16].dequant,
xd->dst.u_buffer, xd->dst.v_buffer,
xd->dst.uv_stride, xd->eobs + 16);
}
}
} else {
for (n = 0; n < 4; n++) {
int x_idx = n & 1, y_idx = n >> 1;
xd->above_context = pc->above_context + mb_col;
xd->left_context = pc->left_context;
xd->mode_info_context = orig_mi;
if (mb_col + x_idx >= pc->mb_cols || mb_row + y_idx >= pc->mb_rows)
continue;
xd->above_context = pc->above_context + mb_col + x_idx;
xd->left_context = pc->left_context + y_idx + (mb_row & 2);
xd->mode_info_context = orig_mi + x_idx + y_idx * mis;
for (i = 0; i < 25; i++) {
xd->block[i].eob = 0;
xd->eobs[i] = 0;
}
eobtotal = vp9_decode_mb_tokens(pbi, xd, bc);
if (eobtotal == 0) { // skip loopfilter
xd->mode_info_context->mbmi.mb_skip_coeff = 1;
continue;
}
if (tx_size == TX_16X16) {
decode_16x16_sb(pbi, xd, bc, n, 1, 1);
} else if (tx_size == TX_8X8) {
decode_8x8_sb(pbi, xd, bc, n, 1, 1);
} else {
decode_4x4_sb(pbi, xd, bc, n, 1, 1);
}
}
xd->above_context = pc->above_context + mb_col;
xd->left_context = pc->left_context + (mb_row & 2);
xd->mode_info_context = orig_mi;
}
}
#endif
static void decode_macroblock(VP9D_COMP *pbi, MACROBLOCKD *xd,
int mb_row, unsigned int mb_col,
@ -780,9 +922,7 @@ static void decode_macroblock(VP9D_COMP *pbi, MACROBLOCKD *xd,
int i;
int tx_size;
#if CONFIG_SUPERBLOCKS
assert(!xd->mode_info_context->mbmi.encoded_as_sb);
#endif
assert(!xd->mode_info_context->mbmi.sb_type);
// re-initialize macroblock dequantizer before detokenization
if (xd->segmentation_enabled)
@ -904,192 +1044,176 @@ static int get_delta_q(vp9_reader *bc, int prev, int *q_update) {
FILE *vpxlog = 0;
#endif
static void set_offsets(VP9D_COMP *pbi, int block_size,
int mb_row, int mb_col) {
VP9_COMMON *const cm = &pbi->common;
MACROBLOCKD *const xd = &pbi->mb;
const int mis = cm->mode_info_stride;
const int idx = mis * mb_row + mb_col;
const int dst_fb_idx = cm->new_fb_idx;
const int recon_y_stride = cm->yv12_fb[dst_fb_idx].y_stride;
const int recon_uv_stride = cm->yv12_fb[dst_fb_idx].uv_stride;
const int recon_yoffset = mb_row * 16 * recon_y_stride + 16 * mb_col;
const int recon_uvoffset = mb_row * 8 * recon_uv_stride + 8 * mb_col;
xd->mode_info_context = cm->mi + idx;
xd->mode_info_context->mbmi.sb_type = block_size >> 5;
xd->prev_mode_info_context = cm->prev_mi + idx;
xd->above_context = cm->above_context + mb_col;
xd->left_context = cm->left_context + (mb_row & 3);
/* Distance of Mb to the various image edges.
* These are specified to 8th pel as they are always compared to
* values that are in 1/8th pel units
*/
block_size >>= 4; // in mb units
xd->mb_to_top_edge = -((mb_row * 16)) << 3;
xd->mb_to_left_edge = -((mb_col * 16) << 3);
xd->mb_to_bottom_edge = ((cm->mb_rows - block_size - mb_row) * 16) << 3;
xd->mb_to_right_edge = ((cm->mb_cols - block_size - mb_col) * 16) << 3;
xd->up_available = (mb_row != 0);
xd->left_available = (mb_col != 0);
xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
}
static void set_refs(VP9D_COMP *pbi, int block_size,
int mb_row, int mb_col) {
VP9_COMMON *const cm = &pbi->common;
MACROBLOCKD *const xd = &pbi->mb;
MODE_INFO *mi = xd->mode_info_context;
MB_MODE_INFO *const mbmi = &mi->mbmi;
if (mbmi->ref_frame > INTRA_FRAME) {
int ref_fb_idx, ref_yoffset, ref_uvoffset, ref_y_stride, ref_uv_stride;
/* Select the appropriate reference frame for this MB */
if (mbmi->ref_frame == LAST_FRAME)
ref_fb_idx = cm->lst_fb_idx;
else if (mbmi->ref_frame == GOLDEN_FRAME)
ref_fb_idx = cm->gld_fb_idx;
else
ref_fb_idx = cm->alt_fb_idx;
ref_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
ref_yoffset = mb_row * 16 * ref_y_stride + 16 * mb_col;
xd->pre.y_buffer = cm->yv12_fb[ref_fb_idx].y_buffer + ref_yoffset;
ref_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
ref_uvoffset = mb_row * 8 * ref_uv_stride + 8 * mb_col;
xd->pre.u_buffer = cm->yv12_fb[ref_fb_idx].u_buffer + ref_uvoffset;
xd->pre.v_buffer = cm->yv12_fb[ref_fb_idx].v_buffer + ref_uvoffset;
/* propagate errors from reference frames */
xd->corrupted |= cm->yv12_fb[ref_fb_idx].corrupted;
if (mbmi->second_ref_frame > INTRA_FRAME) {
int second_ref_fb_idx;
/* Select the appropriate reference frame for this MB */
if (mbmi->second_ref_frame == LAST_FRAME)
second_ref_fb_idx = cm->lst_fb_idx;
else if (mbmi->second_ref_frame == GOLDEN_FRAME)
second_ref_fb_idx = cm->gld_fb_idx;
else
second_ref_fb_idx = cm->alt_fb_idx;
xd->second_pre.y_buffer =
cm->yv12_fb[second_ref_fb_idx].y_buffer + ref_yoffset;
xd->second_pre.u_buffer =
cm->yv12_fb[second_ref_fb_idx].u_buffer + ref_uvoffset;
xd->second_pre.v_buffer =
cm->yv12_fb[second_ref_fb_idx].v_buffer + ref_uvoffset;
/* propagate errors from reference frames */
xd->corrupted |= cm->yv12_fb[second_ref_fb_idx].corrupted;
}
}
if (mbmi->sb_type) {
const int n_mbs = 1 << mbmi->sb_type;
const int y_mbs = MIN(n_mbs, cm->mb_rows - mb_row);
const int x_mbs = MIN(n_mbs, cm->mb_cols - mb_col);
const int mis = cm->mode_info_stride;
int x, y;
for (y = 0; y < y_mbs; y++) {
for (x = !y; x < x_mbs; x++) {
mi[y * mis + x] = *mi;
}
}
}
}
/* Decode a row of Superblocks (2x2 region of MBs) */
static void
decode_sb_row(VP9D_COMP *pbi, VP9_COMMON *pc, int mbrow, MACROBLOCKD *xd,
BOOL_DECODER* const bc) {
int i;
int sb_col;
int mb_row, mb_col;
int recon_yoffset, recon_uvoffset;
int ref_fb_idx = pc->lst_fb_idx;
int dst_fb_idx = pc->new_fb_idx;
int recon_y_stride = pc->yv12_fb[ref_fb_idx].y_stride;
int recon_uv_stride = pc->yv12_fb[ref_fb_idx].uv_stride;
int row_delta[4] = { 0, +1, 0, -1};
int col_delta[4] = { +1, -1, +1, +1};
int sb_cols = (pc->mb_cols + 1) >> 1;
static void decode_sb_row(VP9D_COMP *pbi, VP9_COMMON *pc,
int mb_row, MACROBLOCKD *xd,
BOOL_DECODER* const bc) {
int mb_col;
// For a SB there are 2 left contexts, each pertaining to a MB row within
vpx_memset(pc->left_context, 0, sizeof(pc->left_context));
mb_row = mbrow;
mb_col = 0;
for (sb_col = 0; sb_col < sb_cols; sb_col++) {
#if CONFIG_SUPERBLOCKS
MODE_INFO *mi = xd->mode_info_context;
mi->mbmi.encoded_as_sb = vp9_read(bc, pc->sb_coded);
#endif
// Process the 4 MBs within the SB in the order:
// top-left, top-right, bottom-left, bottom-right
for (i = 0; i < 4; i++) {
int dy = row_delta[i];
int dx = col_delta[i];
int offset_extended = dy * xd->mode_info_stride + dx;
xd->mb_index = i;
#if CONFIG_SUPERBLOCKS
mi = xd->mode_info_context;
#endif
if ((mb_row >= pc->mb_rows) || (mb_col >= pc->mb_cols)) {
// MB lies outside frame, skip on to next
mb_row += dy;
mb_col += dx;
xd->mode_info_context += offset_extended;
xd->prev_mode_info_context += offset_extended;
continue;
}
#if CONFIG_SUPERBLOCKS
if (i)
mi->mbmi.encoded_as_sb = 0;
#endif
// Set above context pointer
xd->above_context = pc->above_context + mb_col;
xd->left_context = pc->left_context + (i >> 1);
/* Distance of Mb to the various image edges.
* These are specified to 8th pel as they are always compared to
* values that are in 1/8th pel units
*/
xd->mb_to_top_edge = -((mb_row * 16)) << 3;
xd->mb_to_left_edge = -((mb_col * 16) << 3);
#if CONFIG_SUPERBLOCKS
if (mi->mbmi.encoded_as_sb) {
xd->mb_to_bottom_edge = ((pc->mb_rows - 2 - mb_row) * 16) << 3;
xd->mb_to_right_edge = ((pc->mb_cols - 2 - mb_col) * 16) << 3;
} else {
#endif
xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3;
xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3;
#if CONFIG_SUPERBLOCKS
}
#endif
#ifdef DEC_DEBUG
dec_debug = (pbi->common.current_video_frame == 1 &&
mb_row == 2 && mb_col == 8);
if (dec_debug)
#if CONFIG_SUPERBLOCKS
printf("Enter Debug %d %d sb %d\n", mb_row, mb_col,
mi->mbmi.encoded_as_sb);
#else
printf("Enter Debug %d %d\n", mb_row, mb_col);
#endif
#endif
xd->up_available = (mb_row != 0);
xd->left_available = (mb_col != 0);
recon_yoffset = (mb_row * recon_y_stride * 16) + (mb_col * 16);
recon_uvoffset = (mb_row * recon_uv_stride * 8) + (mb_col * 8);
xd->dst.y_buffer = pc->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
xd->dst.u_buffer = pc->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
xd->dst.v_buffer = pc->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
for (mb_col = 0; mb_col < pc->mb_cols; mb_col += 4) {
if (vp9_read(bc, pc->sb64_coded)) {
set_offsets(pbi, 64, mb_row, mb_col);
vp9_decode_mb_mode_mv(pbi, xd, mb_row, mb_col, bc);
set_refs(pbi, 64, mb_row, mb_col);
decode_superblock64(pbi, xd, mb_row, mb_col, bc);
xd->corrupted |= bool_error(bc);
} else {
int j;
update_blockd_bmi(xd);
#ifdef DEC_DEBUG
if (dec_debug)
printf("Hello\n");
#endif
for (j = 0; j < 4; j++) {
const int x_idx_sb = (j & 1) << 1, y_idx_sb = j & 2;
/* Select the appropriate reference frame for this MB */
if (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME)
ref_fb_idx = pc->lst_fb_idx;
else if (xd->mode_info_context->mbmi.ref_frame == GOLDEN_FRAME)
ref_fb_idx = pc->gld_fb_idx;
else
ref_fb_idx = pc->alt_fb_idx;
if (mb_row + y_idx_sb >= pc->mb_rows ||
mb_col + x_idx_sb >= pc->mb_cols) {
// MB lies outside frame, skip on to next
continue;
}
xd->pre.y_buffer = pc->yv12_fb[ref_fb_idx].y_buffer + recon_yoffset;
xd->pre.u_buffer = pc->yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
xd->pre.v_buffer = pc->yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
xd->sb_index = j;
if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
int second_ref_fb_idx;
if (vp9_read(bc, pc->sb32_coded)) {
set_offsets(pbi, 32, mb_row + y_idx_sb, mb_col + x_idx_sb);
vp9_decode_mb_mode_mv(pbi,
xd, mb_row + y_idx_sb, mb_col + x_idx_sb, bc);
set_refs(pbi, 32, mb_row + y_idx_sb, mb_col + x_idx_sb);
decode_superblock32(pbi,
xd, mb_row + y_idx_sb, mb_col + x_idx_sb, bc);
xd->corrupted |= bool_error(bc);
} else {
int i;
/* Select the appropriate reference frame for this MB */
if (xd->mode_info_context->mbmi.second_ref_frame == LAST_FRAME)
second_ref_fb_idx = pc->lst_fb_idx;
else if (xd->mode_info_context->mbmi.second_ref_frame ==
GOLDEN_FRAME)
second_ref_fb_idx = pc->gld_fb_idx;
else
second_ref_fb_idx = pc->alt_fb_idx;
// Process the 4 MBs within the SB in the order:
// top-left, top-right, bottom-left, bottom-right
for (i = 0; i < 4; i++) {
const int x_idx = x_idx_sb + (i & 1), y_idx = y_idx_sb + (i >> 1);
xd->second_pre.y_buffer =
pc->yv12_fb[second_ref_fb_idx].y_buffer + recon_yoffset;
xd->second_pre.u_buffer =
pc->yv12_fb[second_ref_fb_idx].u_buffer + recon_uvoffset;
xd->second_pre.v_buffer =
pc->yv12_fb[second_ref_fb_idx].v_buffer + recon_uvoffset;
}
if (mb_row + y_idx >= pc->mb_rows ||
mb_col + x_idx >= pc->mb_cols) {
// MB lies outside frame, skip on to next
continue;
}
if (xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME) {
/* propagate errors from reference frames */
xd->corrupted |= pc->yv12_fb[ref_fb_idx].corrupted;
}
set_offsets(pbi, 16, mb_row + y_idx, mb_col + x_idx);
xd->mb_index = i;
vp9_decode_mb_mode_mv(pbi, xd, mb_row + y_idx, mb_col + x_idx, bc);
update_blockd_bmi(xd);
set_refs(pbi, 16, mb_row + y_idx, mb_col + x_idx);
vp9_intra_prediction_down_copy(xd);
decode_macroblock(pbi, xd, mb_row, mb_col, bc);
#if CONFIG_SUPERBLOCKS
if (xd->mode_info_context->mbmi.encoded_as_sb) {
if (mb_col < pc->mb_cols - 1)
mi[1] = mi[0];
if (mb_row < pc->mb_rows - 1) {
mi[pc->mode_info_stride] = mi[0];
if (mb_col < pc->mb_cols - 1)
mi[pc->mode_info_stride + 1] = mi[0];
/* check if the boolean decoder has suffered an error */
xd->corrupted |= bool_error(bc);
}
}
}
if (xd->mode_info_context->mbmi.encoded_as_sb) {
decode_superblock(pbi, xd, mb_row, mb_col, bc);
} else {
#endif
vp9_intra_prediction_down_copy(xd);
decode_macroblock(pbi, xd, mb_row, mb_col, bc);
#if CONFIG_SUPERBLOCKS
}
#endif
/* check if the boolean decoder has suffered an error */
xd->corrupted |= bool_error(bc);
#if CONFIG_SUPERBLOCKS
if (mi->mbmi.encoded_as_sb) {
assert(!i);
mb_col += 2;
xd->mode_info_context += 2;
xd->prev_mode_info_context += 2;
break;
}
#endif
// skip to next MB
xd->mode_info_context += offset_extended;
xd->prev_mode_info_context += offset_extended;
mb_row += dy;
mb_col += dx;
}
}
/* skip prediction column */
xd->mode_info_context += 1 - (pc->mb_cols & 0x1) + xd->mode_info_stride;
xd->prev_mode_info_context += 1 - (pc->mb_cols & 0x1) + xd->mode_info_stride;
}
static unsigned int read_partition_size(const unsigned char *cx_size) {
@ -1212,14 +1336,13 @@ static void init_frame(VP9D_COMP *pbi) {
}
static void read_coef_probs_common(
BOOL_DECODER* const bc,
vp9_prob coef_probs[BLOCK_TYPES][COEF_BANDS]
[PREV_COEF_CONTEXTS][ENTROPY_NODES]) {
static void read_coef_probs_common(BOOL_DECODER* const bc,
vp9_coeff_probs *coef_probs,
int block_types) {
int i, j, k, l;
if (vp9_read_bit(bc)) {
for (i = 0; i < BLOCK_TYPES; i++) {
for (i = 0; i < block_types; i++) {
for (j = !i; j < COEF_BANDS; j++) {
/* NB: This j loop starts from 1 on block type i == 0 */
for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
@ -1242,16 +1365,20 @@ static void read_coef_probs_common(
static void read_coef_probs(VP9D_COMP *pbi, BOOL_DECODER* const bc) {
VP9_COMMON *const pc = &pbi->common;
read_coef_probs_common(bc, pc->fc.coef_probs);
read_coef_probs_common(bc, pc->fc.hybrid_coef_probs);
read_coef_probs_common(bc, pc->fc.coef_probs_4x4, BLOCK_TYPES_4X4);
read_coef_probs_common(bc, pc->fc.hybrid_coef_probs_4x4, BLOCK_TYPES_4X4);
if (pbi->common.txfm_mode != ONLY_4X4) {
read_coef_probs_common(bc, pc->fc.coef_probs_8x8);
read_coef_probs_common(bc, pc->fc.hybrid_coef_probs_8x8);
read_coef_probs_common(bc, pc->fc.coef_probs_8x8, BLOCK_TYPES_8X8);
read_coef_probs_common(bc, pc->fc.hybrid_coef_probs_8x8, BLOCK_TYPES_8X8);
}
if (pbi->common.txfm_mode > ALLOW_8X8) {
read_coef_probs_common(bc, pc->fc.coef_probs_16x16);
read_coef_probs_common(bc, pc->fc.hybrid_coef_probs_16x16);
read_coef_probs_common(bc, pc->fc.coef_probs_16x16, BLOCK_TYPES_16X16);
read_coef_probs_common(bc, pc->fc.hybrid_coef_probs_16x16,
BLOCK_TYPES_16X16);
}
if (pbi->common.txfm_mode > ALLOW_16X16) {
read_coef_probs_common(bc, pc->fc.coef_probs_32x32, BLOCK_TYPES_32X32);
}
}
@ -1437,15 +1564,17 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
}
}
#if CONFIG_SUPERBLOCKS
pc->sb_coded = vp9_read_literal(&header_bc, 8);
#endif
pc->sb64_coded = vp9_read_literal(&header_bc, 8);
pc->sb32_coded = vp9_read_literal(&header_bc, 8);
/* Read the loop filter level and type */
pc->txfm_mode = vp9_read_literal(&header_bc, 2);
if (pc->txfm_mode == 3)
pc->txfm_mode += vp9_read_bit(&header_bc);
if (pc->txfm_mode == TX_MODE_SELECT) {
pc->prob_tx[0] = vp9_read_literal(&header_bc, 8);
pc->prob_tx[1] = vp9_read_literal(&header_bc, 8);
pc->prob_tx[2] = vp9_read_literal(&header_bc, 8);
}
pc->filter_type = (LOOPFILTERTYPE) vp9_read_bit(&header_bc);
@ -1577,6 +1706,33 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
}
}
#if CONFIG_NEW_MVREF
// If Key frame reset mv ref id probabilities to defaults
if (pc->frame_type == KEY_FRAME) {
// Defaults probabilities for encoding the MV ref id signal
vpx_memset(xd->mb_mv_ref_probs, VP9_DEFAULT_MV_REF_PROB,
sizeof(xd->mb_mv_ref_probs));
} else {
// Read any mv_ref index probability updates
int i, j;
for (i = 0; i < MAX_REF_FRAMES; ++i) {
// Skip the dummy entry for intra ref frame.
if (i == INTRA_FRAME) {
continue;
}
// Read any updates to probabilities
for (j = 0; j < MAX_MV_REF_CANDIDATES - 1; ++j) {
if (vp9_read(&header_bc, VP9_MVREF_UPDATE_PROB)) {
xd->mb_mv_ref_probs[i][j] =
(vp9_prob)vp9_read_literal(&header_bc, 8);
}
}
}
}
#endif
if (0) {
FILE *z = fopen("decodestats.stt", "a");
fprintf(z, "%6d F:%d,G:%d,A:%d,L:%d,Q:%d\n",
@ -1589,10 +1745,10 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
fclose(z);
}
vp9_copy(pbi->common.fc.pre_coef_probs,
pbi->common.fc.coef_probs);
vp9_copy(pbi->common.fc.pre_hybrid_coef_probs,
pbi->common.fc.hybrid_coef_probs);
vp9_copy(pbi->common.fc.pre_coef_probs_4x4,
pbi->common.fc.coef_probs_4x4);
vp9_copy(pbi->common.fc.pre_hybrid_coef_probs_4x4,
pbi->common.fc.hybrid_coef_probs_4x4);
vp9_copy(pbi->common.fc.pre_coef_probs_8x8,
pbi->common.fc.coef_probs_8x8);
vp9_copy(pbi->common.fc.pre_hybrid_coef_probs_8x8,
@ -1601,10 +1757,10 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
pbi->common.fc.coef_probs_16x16);
vp9_copy(pbi->common.fc.pre_hybrid_coef_probs_16x16,
pbi->common.fc.hybrid_coef_probs_16x16);
vp9_copy(pbi->common.fc.pre_coef_probs_32x32,
pbi->common.fc.coef_probs_32x32);
vp9_copy(pbi->common.fc.pre_ymode_prob, pbi->common.fc.ymode_prob);
#if CONFIG_SUPERBLOCKS
vp9_copy(pbi->common.fc.pre_sb_ymode_prob, pbi->common.fc.sb_ymode_prob);
#endif
vp9_copy(pbi->common.fc.pre_uv_mode_prob, pbi->common.fc.uv_mode_prob);
vp9_copy(pbi->common.fc.pre_bmode_prob, pbi->common.fc.bmode_prob);
vp9_copy(pbi->common.fc.pre_i8x8_mode_prob, pbi->common.fc.i8x8_mode_prob);
@ -1614,16 +1770,15 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
pbi->common.fc.pre_interintra_prob = pbi->common.fc.interintra_prob;
#endif
pbi->common.fc.pre_nmvc = pbi->common.fc.nmvc;
vp9_zero(pbi->common.fc.coef_counts);
vp9_zero(pbi->common.fc.hybrid_coef_counts);
vp9_zero(pbi->common.fc.coef_counts_4x4);
vp9_zero(pbi->common.fc.hybrid_coef_counts_4x4);
vp9_zero(pbi->common.fc.coef_counts_8x8);
vp9_zero(pbi->common.fc.hybrid_coef_counts_8x8);
vp9_zero(pbi->common.fc.coef_counts_16x16);
vp9_zero(pbi->common.fc.hybrid_coef_counts_16x16);
vp9_zero(pbi->common.fc.coef_counts_32x32);
vp9_zero(pbi->common.fc.ymode_counts);
#if CONFIG_SUPERBLOCKS
vp9_zero(pbi->common.fc.sb_ymode_counts);
#endif
vp9_zero(pbi->common.fc.uv_mode_counts);
vp9_zero(pbi->common.fc.bmode_counts);
vp9_zero(pbi->common.fc.i8x8_mode_counts);
@ -1662,12 +1817,8 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
vpx_memset(pc->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * pc->mb_cols);
// Resset the macroblock mode info context to the start of the list
xd->mode_info_context = pc->mi;
xd->prev_mode_info_context = pc->prev_mi;
/* Decode a row of superblocks */
for (mb_row = 0; mb_row < pc->mb_rows; mb_row += 2) {
for (mb_row = 0; mb_row < pc->mb_rows; mb_row += 4) {
decode_sb_row(pbi, pc, mb_row, xd, &residual_bc);
}
corrupt_tokens |= xd->corrupted;

View File

@ -16,4 +16,4 @@ struct VP9Decompressor;
extern void vp9_init_de_quantizer(struct VP9Decompressor *pbi);
#endif // __INC_DECODFRAME_H
#endif // VP9_DECODER_VP9_DECODFRAME_H_

View File

@ -13,20 +13,14 @@
#include "vp9/decoder/vp9_dequantize.h"
#include "vpx_mem/vpx_mem.h"
#include "vp9/decoder/vp9_onyxd_int.h"
#include "vp9/common/vp9_common.h"
static void add_residual(const int16_t *diff, const uint8_t *pred, int pitch,
uint8_t *dest, int stride, int width, int height) {
int r, c;
for (r = 0; r < height; r++) {
for (c = 0; c < width; c++) {
int a = diff[c] + pred[c];
if (a < 0)
a = 0;
else if (a > 255)
a = 255;
dest[c] = (uint8_t) a;
dest[c] = clip_pixel(diff[c] + pred[c]);
}
dest += stride;
@ -42,14 +36,7 @@ static void add_constant_residual(const int16_t diff, const uint8_t *pred,
for (r = 0; r < height; r++) {
for (c = 0; c < width; c++) {
int a = diff + pred[c];
if (a < 0)
a = 0;
else if (a > 255)
a = 255;
dest[c] = (uint8_t) a;
dest[c] = clip_pixel(diff + pred[c]);
}
dest += stride;
@ -204,7 +191,7 @@ void vp9_dequantize_b_2x2_c(BLOCKD *d) {
void vp9_dequant_idct_add_8x8_c(int16_t *input, const int16_t *dq,
uint8_t *pred, uint8_t *dest, int pitch,
int stride, int dc, uint16_t eobs) {
int stride, int dc, int eob) {
int16_t output[64];
int16_t *diff_ptr = output;
int i;
@ -220,10 +207,10 @@ void vp9_dequant_idct_add_8x8_c(int16_t *input, const int16_t *dq,
* TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c.
* Combine that with code here.
*/
if (eobs == 0) {
if (eob == 0) {
/* All 0 DCT coefficient */
vp9_copy_mem8x8(pred, pitch, dest, stride);
} else if (eobs == 1) {
} else if (eob == 1) {
/* DC only DCT coefficient. */
int16_t out;
@ -236,7 +223,7 @@ void vp9_dequant_idct_add_8x8_c(int16_t *input, const int16_t *dq,
input[0] = 0;
add_constant_residual(out, pred, pitch, dest, stride, 8, 8);
} else if (eobs <= 10) {
} else if (eob <= 10) {
input[1] = input[1] * dq[1];
input[2] = input[2] * dq[1];
input[3] = input[3] * dq[1];
@ -301,17 +288,17 @@ void vp9_ht_dequant_idct_add_16x16_c(TX_TYPE tx_type, int16_t *input,
void vp9_dequant_idct_add_16x16_c(int16_t *input, const int16_t *dq,
uint8_t *pred, uint8_t *dest, int pitch,
int stride, uint16_t eobs) {
int stride, int eob) {
int16_t output[256];
int16_t *diff_ptr = output;
int i;
/* The calculation can be simplified if there are not many non-zero dct
* coefficients. Use eobs to separate different cases. */
if (eobs == 0) {
if (eob == 0) {
/* All 0 DCT coefficient */
vp9_copy_mem16x16(pred, pitch, dest, stride);
} else if (eobs == 1) {
} else if (eob == 1) {
/* DC only DCT coefficient. */
int16_t out;
@ -324,7 +311,7 @@ void vp9_dequant_idct_add_16x16_c(int16_t *input, const int16_t *dq,
input[0] = 0;
add_constant_residual(out, pred, pitch, dest, stride, 16, 16);
} else if (eobs <= 10) {
} else if (eob <= 10) {
input[0]= input[0] * dq[0];
input[1] = input[1] * dq[1];
input[2] = input[2] * dq[1];
@ -360,3 +347,28 @@ void vp9_dequant_idct_add_16x16_c(int16_t *input, const int16_t *dq,
add_residual(diff_ptr, pred, pitch, dest, stride, 16, 16);
}
}
void vp9_dequant_idct_add_32x32_c(int16_t *input, const int16_t *dq,
uint8_t *pred, uint8_t *dest, int pitch,
int stride, int eob) {
int16_t output[1024];
int i;
input[0]= input[0] * dq[0] / 2;
for (i = 1; i < 1024; i++)
input[i] = input[i] * dq[1] / 2;
vp9_short_idct32x32_c(input, output, 64);
vpx_memset(input, 0, 2048);
add_residual(output, pred, pitch, dest, stride, 32, 32);
}
void vp9_dequant_idct_add_uv_block_16x16_c(int16_t *q, const int16_t *dq,
uint8_t *dstu,
uint8_t *dstv,
int stride,
uint16_t *eobs) {
vp9_dequant_idct_add_16x16_c(q, dq, dstu, dstu, stride, stride, eobs[0]);
vp9_dequant_idct_add_16x16_c(q + 256, dq,
dstv, dstv, stride, stride, eobs[4]);
}

View File

@ -14,90 +14,88 @@
#include "vp9/common/vp9_blockd.h"
#if CONFIG_LOSSLESS
extern void vp9_dequant_idct_add_lossless_c(short *input, const short *dq,
extern void vp9_dequant_idct_add_lossless_c(int16_t *input, const int16_t *dq,
unsigned char *pred,
unsigned char *output,
int pitch, int stride);
extern void vp9_dequant_dc_idct_add_lossless_c(short *input, const short *dq,
extern void vp9_dequant_dc_idct_add_lossless_c(int16_t *input, const int16_t *dq,
unsigned char *pred,
unsigned char *output,
int pitch, int stride, int dc);
extern void vp9_dequant_dc_idct_add_y_block_lossless_c(short *q,
const short *dq,
extern void vp9_dequant_dc_idct_add_y_block_lossless_c(int16_t *q,
const int16_t *dq,
unsigned char *pre,
unsigned char *dst,
int stride,
unsigned short *eobs,
const short *dc);
extern void vp9_dequant_idct_add_y_block_lossless_c(short *q, const short *dq,
uint16_t *eobs,
const int16_t *dc);
extern void vp9_dequant_idct_add_y_block_lossless_c(int16_t *q, const int16_t *dq,
unsigned char *pre,
unsigned char *dst,
int stride,
unsigned short *eobs);
extern void vp9_dequant_idct_add_uv_block_lossless_c(short *q, const short *dq,
uint16_t *eobs);
extern void vp9_dequant_idct_add_uv_block_lossless_c(int16_t *q, const int16_t *dq,
unsigned char *pre,
unsigned char *dst_u,
unsigned char *dst_v,
int stride,
unsigned short *eobs);
uint16_t *eobs);
#endif
typedef void (*vp9_dequant_idct_add_fn_t)(short *input, const short *dq,
typedef void (*vp9_dequant_idct_add_fn_t)(int16_t *input, const int16_t *dq,
unsigned char *pred, unsigned char *output, int pitch, int stride);
typedef void(*vp9_dequant_dc_idct_add_fn_t)(short *input, const short *dq,
typedef void(*vp9_dequant_dc_idct_add_fn_t)(int16_t *input, const int16_t *dq,
unsigned char *pred, unsigned char *output, int pitch, int stride, int dc);
typedef void(*vp9_dequant_dc_idct_add_y_block_fn_t)(short *q, const short *dq,
unsigned char *pre, unsigned char *dst, int stride, unsigned short *eobs,
const short *dc);
typedef void(*vp9_dequant_idct_add_y_block_fn_t)(short *q, const short *dq,
unsigned char *pre, unsigned char *dst, int stride, unsigned short *eobs);
typedef void(*vp9_dequant_idct_add_uv_block_fn_t)(short *q, const short *dq,
typedef void(*vp9_dequant_dc_idct_add_y_block_fn_t)(int16_t *q, const int16_t *dq,
unsigned char *pre, unsigned char *dst, int stride, uint16_t *eobs,
const int16_t *dc);
typedef void(*vp9_dequant_idct_add_y_block_fn_t)(int16_t *q, const int16_t *dq,
unsigned char *pre, unsigned char *dst, int stride, uint16_t *eobs);
typedef void(*vp9_dequant_idct_add_uv_block_fn_t)(int16_t *q, const int16_t *dq,
unsigned char *pre, unsigned char *dst_u, unsigned char *dst_v, int stride,
unsigned short *eobs);
uint16_t *eobs);
void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, short *input, const short *dq,
void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, int16_t *input, const int16_t *dq,
unsigned char *pred, unsigned char *dest,
int pitch, int stride, uint16_t eobs);
void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, short *input,
const short *dq, unsigned char *pred,
void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, int16_t *input,
const int16_t *dq, unsigned char *pred,
unsigned char *dest, int pitch, int stride,
uint16_t eobs);
void vp9_ht_dequant_idct_add_16x16_c(TX_TYPE tx_type, short *input,
const short *dq, unsigned char *pred,
void vp9_ht_dequant_idct_add_16x16_c(TX_TYPE tx_type, int16_t *input,
const int16_t *dq, unsigned char *pred,
unsigned char *dest,
int pitch, int stride, uint16_t eobs);
#if CONFIG_SUPERBLOCKS
void vp9_dequant_dc_idct_add_y_block_8x8_inplace_c(short *q, const short *dq,
void vp9_dequant_dc_idct_add_y_block_8x8_inplace_c(int16_t *q, const int16_t *dq,
unsigned char *dst,
int stride,
unsigned short *eobs,
const short *dc,
uint16_t *eobs,
const int16_t *dc,
MACROBLOCKD *xd);
void vp9_dequant_dc_idct_add_y_block_4x4_inplace_c(short *q, const short *dq,
void vp9_dequant_dc_idct_add_y_block_4x4_inplace_c(int16_t *q, const int16_t *dq,
unsigned char *dst,
int stride,
unsigned short *eobs,
const short *dc,
uint16_t *eobs,
const int16_t *dc,
MACROBLOCKD *xd);
void vp9_dequant_idct_add_uv_block_8x8_inplace_c(short *q, const short *dq,
void vp9_dequant_idct_add_uv_block_8x8_inplace_c(int16_t *q, const int16_t *dq,
unsigned char *dstu,
unsigned char *dstv,
int stride,
unsigned short *eobs,
uint16_t *eobs,
MACROBLOCKD *xd);
void vp9_dequant_idct_add_uv_block_4x4_inplace_c(short *q, const short *dq,
void vp9_dequant_idct_add_uv_block_4x4_inplace_c(int16_t *q, const int16_t *dq,
unsigned char *dstu,
unsigned char *dstv,
int stride,
unsigned short *eobs,
uint16_t *eobs,
MACROBLOCKD *xd);
#endif
#endif

View File

@ -9,13 +9,11 @@
*/
#include "vp9/common/vp9_type_aliases.h"
#include "vp9/common/vp9_blockd.h"
#include "vp9/decoder/vp9_onyxd_int.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/mem.h"
#include "vp9/decoder/vp9_detokenize.h"
#include "vp9/common/vp9_seg_common.h"
#define EOB_CONTEXT_NODE 0
@ -55,59 +53,38 @@
#define CAT5_PROB3 157
#define CAT5_PROB4 180
static const unsigned char cat6_prob[14] =
{ 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0 };
static const vp9_prob cat6_prob[15] = {
254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0
};
void vp9_reset_mb_tokens_context(MACROBLOCKD* const xd) {
/* Clear entropy contexts */
if ((xd->mode_info_context->mbmi.mode != B_PRED &&
xd->mode_info_context->mbmi.mode != I8X8_PRED &&
xd->mode_info_context->mbmi.mode != SPLITMV)
|| xd->mode_info_context->mbmi.txfm_size == TX_16X16) {
vpx_memset(xd->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
} else {
vpx_memset(xd->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) - 1);
vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) - 1);
xd->above_context->y2 = 1;
xd->left_context->y2 = 1;
}
}
DECLARE_ALIGNED(16, extern const unsigned char, vp9_norm[256]);
DECLARE_ALIGNED(16, extern const uint8_t, vp9_norm[256]);
static int get_signed(BOOL_DECODER *br, int value_to_sign) {
const int split = (br->range + 1) >> 1;
const VP9_BD_VALUE bigsplit = (VP9_BD_VALUE)split << (VP9_BD_VALUE_SIZE - 8);
int v;
if (br->count < 0)
vp9_bool_decoder_fill(br);
if (br->value < bigsplit) {
br->range = split;
v = value_to_sign;
} else {
br->range = br->range - split;
br->value = br->value - bigsplit;
v = -value_to_sign;
}
br->range += br->range;
br->value += br->value;
--br->count;
return v;
return decode_bool(br, 128) ? -value_to_sign : value_to_sign;
}
#if CONFIG_NEWCOEFCONTEXT
#define PT pn
#define INCREMENT_COUNT(token) \
do { \
coef_counts[type][coef_bands[c]][pn][token]++; \
pn = pt = vp9_prev_token_class[token]; \
if (c < seg_eob - 1 && NEWCOEFCONTEXT_BAND_COND(coef_bands[c + 1])) \
pn = vp9_get_coef_neighbor_context( \
qcoeff_ptr, nodc, neighbors, scan[c + 1]); \
} while (0)
#else
#define PT pt
#define INCREMENT_COUNT(token) \
do { \
coef_counts[coef_bands[c]][pt][token]++; \
pt = vp9_prev_token_class[token]; \
coef_counts[type][coef_bands[c]][pt][token]++; \
pt = vp9_prev_token_class[token]; \
} while (0)
#endif /* CONFIG_NEWCOEFCONTEXT */
#define WRITE_COEF_CONTINUE(val, token) \
{ \
qcoeff_ptr[scan[c]] = (INT16) get_signed(br, val); \
qcoeff_ptr[scan[c]] = (int16_t) get_signed(br, val); \
INCREMENT_COUNT(token); \
c++; \
continue; \
@ -116,7 +93,7 @@ static int get_signed(BOOL_DECODER *br, int value_to_sign) {
#define ADJUST_COEF(prob, bits_count) \
do { \
if (vp9_read(br, prob)) \
val += (UINT16)(1 << bits_count);\
val += (uint16_t)(1 << bits_count);\
} while (0);
static int decode_coefs(VP9D_COMP *dx, const MACROBLOCKD *xd,
@ -124,51 +101,65 @@ static int decode_coefs(VP9D_COMP *dx, const MACROBLOCKD *xd,
ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
PLANE_TYPE type,
TX_TYPE tx_type,
int seg_eob, INT16 *qcoeff_ptr,
int seg_eob, int16_t *qcoeff_ptr,
const int *const scan, TX_SIZE txfm_size,
const int *coef_bands) {
FRAME_CONTEXT *const fc = &dx->common.fc;
int pt, c = (type == PLANE_TYPE_Y_NO_DC);
vp9_prob (*coef_probs)[PREV_COEF_CONTEXTS][ENTROPY_NODES], *prob;
unsigned int (*coef_counts)[PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
#if CONFIG_NEWCOEFCONTEXT
const int *neighbors;
int pn;
#endif
int nodc = (type == PLANE_TYPE_Y_NO_DC);
int pt, c = nodc;
vp9_coeff_probs *coef_probs;
vp9_prob *prob;
vp9_coeff_count *coef_counts;
switch (txfm_size) {
default:
case TX_4X4:
if (tx_type == DCT_DCT) {
coef_probs = fc->coef_probs[type];
coef_counts = fc->coef_counts[type];
coef_probs = fc->coef_probs_4x4;
coef_counts = fc->coef_counts_4x4;
} else {
coef_probs = fc->hybrid_coef_probs[type];
coef_counts = fc->hybrid_coef_counts[type];
coef_probs = fc->hybrid_coef_probs_4x4;
coef_counts = fc->hybrid_coef_counts_4x4;
}
break;
case TX_8X8:
if (tx_type == DCT_DCT) {
coef_probs = fc->coef_probs_8x8[type];
coef_counts = fc->coef_counts_8x8[type];
coef_probs = fc->coef_probs_8x8;
coef_counts = fc->coef_counts_8x8;
} else {
coef_probs = fc->hybrid_coef_probs_8x8[type];
coef_counts = fc->hybrid_coef_counts_8x8[type];
coef_probs = fc->hybrid_coef_probs_8x8;
coef_counts = fc->hybrid_coef_counts_8x8;
}
break;
case TX_16X16:
if (tx_type == DCT_DCT) {
coef_probs = fc->coef_probs_16x16[type];
coef_counts = fc->coef_counts_16x16[type];
coef_probs = fc->coef_probs_16x16;
coef_counts = fc->coef_counts_16x16;
} else {
coef_probs = fc->hybrid_coef_probs_16x16[type];
coef_counts = fc->hybrid_coef_counts_16x16[type];
coef_probs = fc->hybrid_coef_probs_16x16;
coef_counts = fc->hybrid_coef_counts_16x16;
}
break;
case TX_32X32:
coef_probs = fc->coef_probs_32x32;
coef_counts = fc->coef_counts_32x32;
break;
}
VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
#if CONFIG_NEWCOEFCONTEXT
pn = pt;
neighbors = vp9_get_coef_neighbors_handle(scan);
#endif
while (1) {
int val;
const uint8_t *cat6 = cat6_prob;
if (c >= seg_eob) break;
prob = coef_probs[coef_bands[c]][pt];
prob = coef_probs[type][coef_bands[c]][PT];
if (!vp9_read(br, prob[EOB_CONTEXT_NODE]))
break;
SKIP_START:
@ -176,7 +167,7 @@ SKIP_START:
if (!vp9_read(br, prob[ZERO_CONTEXT_NODE])) {
INCREMENT_COUNT(ZERO_TOKEN);
++c;
prob = coef_probs[coef_bands[c]][pt];
prob = coef_probs[type][coef_bands[c]][PT];
goto SKIP_START;
}
// ONE_CONTEXT_NODE_0_
@ -240,7 +231,7 @@ SKIP_START:
}
if (c < seg_eob)
coef_counts[coef_bands[c]][pt][DCT_EOB_TOKEN]++;
coef_counts[type][coef_bands[c]][PT][DCT_EOB_TOKEN]++;
a[0] = l[0] = (c > !type);
@ -256,38 +247,120 @@ static int get_eob(MACROBLOCKD* const xd, int segment_id, int eob_max) {
return eob;
}
int vp9_decode_sb_tokens(VP9D_COMP* const pbi,
MACROBLOCKD* const xd,
BOOL_DECODER* const bc) {
ENTROPY_CONTEXT* const A = (ENTROPY_CONTEXT *)xd->above_context;
ENTROPY_CONTEXT* const L = (ENTROPY_CONTEXT *)xd->left_context;
ENTROPY_CONTEXT* const A1 = (ENTROPY_CONTEXT *)(&xd->above_context[1]);
ENTROPY_CONTEXT* const L1 = (ENTROPY_CONTEXT *)(&xd->left_context[1]);
uint16_t *const eobs = xd->eobs;
const int segment_id = xd->mode_info_context->mbmi.segment_id;
int c, i, eobtotal = 0, seg_eob;
// Luma block
#if CONFIG_CNVCONTEXT
ENTROPY_CONTEXT above_ec = (A[0] + A[1] + A[2] + A[3] +
A1[0] + A1[1] + A1[2] + A1[3]) != 0;
ENTROPY_CONTEXT left_ec = (L[0] + L[1] + L[2] + L[3] +
L1[0] + L1[1] + L1[2] + L1[3]) != 0;
#else
ENTROPY_CONTEXT above_ec = A[0];
ENTROPY_CONTEXT left_ec = L[0];
#endif
eobs[0] = c = decode_coefs(pbi, xd, bc, &above_ec, &left_ec,
PLANE_TYPE_Y_WITH_DC,
DCT_DCT, get_eob(xd, segment_id, 1024),
xd->sb_coeff_data.qcoeff,
vp9_default_zig_zag1d_32x32,
TX_32X32, vp9_coef_bands_32x32);
A[1] = A[2] = A[3] = A[0] = above_ec;
L[1] = L[2] = L[3] = L[0] = left_ec;
A1[1] = A1[2] = A1[3] = A1[0] = above_ec;
L1[1] = L1[2] = L1[3] = L1[0] = left_ec;
eobtotal += c;
// 16x16 chroma blocks
seg_eob = get_eob(xd, segment_id, 256);
for (i = 16; i < 24; i += 4) {
ENTROPY_CONTEXT* const a = A + vp9_block2above[TX_16X16][i];
ENTROPY_CONTEXT* const l = L + vp9_block2left[TX_16X16][i];
ENTROPY_CONTEXT* const a1 = A1 + vp9_block2above[TX_16X16][i];
ENTROPY_CONTEXT* const l1 = L1 + vp9_block2left[TX_16X16][i];
#if CONFIG_CNVCONTEXT
above_ec = (a[0] + a[1] + a1[0] + a1[1]) != 0;
left_ec = (l[0] + l[1] + l1[0] + l1[1]) != 0;
#else
above_ec = a[0];
left_ec = l[0];
#endif
eobs[i] = c = decode_coefs(pbi, xd, bc,
&above_ec, &left_ec,
PLANE_TYPE_UV,
DCT_DCT, seg_eob,
xd->sb_coeff_data.qcoeff + 1024 + (i - 16) * 64,
vp9_default_zig_zag1d_16x16,
TX_16X16, vp9_coef_bands_16x16);
a1[1] = a1[0] = a[1] = a[0] = above_ec;
l1[1] = l1[0] = l[1] = l[0] = left_ec;
eobtotal += c;
}
// no Y2 block
A[8] = L[8] = A1[8] = L1[8] = 0;
return eobtotal;
}
static int vp9_decode_mb_tokens_16x16(VP9D_COMP* const pbi,
MACROBLOCKD* const xd,
BOOL_DECODER* const bc) {
ENTROPY_CONTEXT* const A = (ENTROPY_CONTEXT *)xd->above_context;
ENTROPY_CONTEXT* const L = (ENTROPY_CONTEXT *)xd->left_context;
unsigned short* const eobs = xd->eobs;
uint16_t *const eobs = xd->eobs;
const int segment_id = xd->mode_info_context->mbmi.segment_id;
int c, i, eobtotal = 0, seg_eob;
// Luma block
eobs[0] = c = decode_coefs(pbi, xd, bc, A, L, PLANE_TYPE_Y_WITH_DC,
#if CONFIG_CNVCONTEXT
ENTROPY_CONTEXT above_ec = (A[0] + A[1] + A[2] + A[3]) != 0;
ENTROPY_CONTEXT left_ec = (L[0] + L[1] + L[2] + L[3]) != 0;
#else
ENTROPY_CONTEXT above_ec = A[0];
ENTROPY_CONTEXT left_ec = L[0];
#endif
eobs[0] = c = decode_coefs(pbi, xd, bc, &above_ec, &left_ec,
PLANE_TYPE_Y_WITH_DC,
get_tx_type(xd, &xd->block[0]),
get_eob(xd, segment_id, 256),
xd->qcoeff, vp9_default_zig_zag1d_16x16,
TX_16X16, vp9_coef_bands_16x16);
A[1] = A[2] = A[3] = A[0];
L[1] = L[2] = L[3] = L[0];
A[1] = A[2] = A[3] = A[0] = above_ec;
L[1] = L[2] = L[3] = L[0] = left_ec;
eobtotal += c;
// 8x8 chroma blocks
seg_eob = get_eob(xd, segment_id, 64);
for (i = 16; i < 24; i += 4) {
ENTROPY_CONTEXT* const a = A + vp9_block2above_8x8[i];
ENTROPY_CONTEXT* const l = L + vp9_block2left_8x8[i];
eobs[i] = c = decode_coefs(pbi, xd, bc, a, l, PLANE_TYPE_UV,
ENTROPY_CONTEXT* const a = A + vp9_block2above[TX_8X8][i];
ENTROPY_CONTEXT* const l = L + vp9_block2left[TX_8X8][i];
#if CONFIG_CNVCONTEXT
above_ec = (a[0] + a[1]) != 0;
left_ec = (l[0] + l[1]) != 0;
#else
above_ec = a[0];
left_ec = l[0];
#endif
eobs[i] = c = decode_coefs(pbi, xd, bc,
&above_ec, &left_ec,
PLANE_TYPE_UV,
DCT_DCT, seg_eob, xd->block[i].qcoeff,
vp9_default_zig_zag1d_8x8,
TX_8X8, vp9_coef_bands_8x8);
a[1] = a[0];
l[1] = l[0];
a[1] = a[0] = above_ec;
l[1] = l[0] = left_ec;
eobtotal += c;
}
A[8] = 0;
@ -300,7 +373,7 @@ static int vp9_decode_mb_tokens_8x8(VP9D_COMP* const pbi,
BOOL_DECODER* const bc) {
ENTROPY_CONTEXT *const A = (ENTROPY_CONTEXT *)xd->above_context;
ENTROPY_CONTEXT *const L = (ENTROPY_CONTEXT *)xd->left_context;
unsigned short *const eobs = xd->eobs;
uint16_t *const eobs = xd->eobs;
PLANE_TYPE type;
int c, i, eobtotal = 0, seg_eob;
const int segment_id = xd->mode_info_context->mbmi.segment_id;
@ -308,18 +381,19 @@ static int vp9_decode_mb_tokens_8x8(VP9D_COMP* const pbi,
int has_2nd_order = get_2nd_order_usage(xd);
// 2nd order DC block
if (has_2nd_order) {
ENTROPY_CONTEXT *const a = A + vp9_block2above_8x8[24];
ENTROPY_CONTEXT *const l = L + vp9_block2left_8x8[24];
ENTROPY_CONTEXT *const a = A + vp9_block2above[TX_8X8][24];
ENTROPY_CONTEXT *const l = L + vp9_block2left[TX_8X8][24];
eobs[24] = c = decode_coefs(pbi, xd, bc, a, l, PLANE_TYPE_Y2,
DCT_DCT, get_eob(xd, segment_id, 4),
xd->block[24].qcoeff,
vp9_default_zig_zag1d, TX_8X8, vp9_coef_bands);
vp9_default_zig_zag1d_4x4, TX_8X8,
vp9_coef_bands_4x4);
eobtotal += c - 4;
type = PLANE_TYPE_Y_NO_DC;
} else {
xd->above_context->y2 = 1;
xd->left_context->y2 = 1;
xd->above_context->y2 = 0;
xd->left_context->y2 = 0;
eobs[24] = 0;
type = PLANE_TYPE_Y_WITH_DC;
}
@ -327,17 +401,23 @@ static int vp9_decode_mb_tokens_8x8(VP9D_COMP* const pbi,
// luma blocks
seg_eob = get_eob(xd, segment_id, 64);
for (i = 0; i < 16; i += 4) {
ENTROPY_CONTEXT *const a = A + vp9_block2above_8x8[i];
ENTROPY_CONTEXT *const l = L + vp9_block2left_8x8[i];
eobs[i] = c = decode_coefs(pbi, xd, bc, a, l, type,
ENTROPY_CONTEXT *const a = A + vp9_block2above[TX_8X8][i];
ENTROPY_CONTEXT *const l = L + vp9_block2left[TX_8X8][i];
#if CONFIG_CNVCONTEXT
ENTROPY_CONTEXT above_ec = (a[0] + a[1]) != 0;
ENTROPY_CONTEXT left_ec = (l[0] + l[1]) != 0;
#else
ENTROPY_CONTEXT above_ec = a[0];
ENTROPY_CONTEXT left_ec = l[0];
#endif
eobs[i] = c = decode_coefs(pbi, xd, bc, &above_ec, &left_ec, type,
type == PLANE_TYPE_Y_WITH_DC ?
get_tx_type(xd, xd->block + i) : DCT_DCT,
seg_eob, xd->block[i].qcoeff,
vp9_default_zig_zag1d_8x8,
TX_8X8, vp9_coef_bands_8x8);
a[1] = a[0];
l[1] = l[0];
a[1] = a[0] = above_ec;
l[1] = l[0] = left_ec;
eobtotal += c;
}
@ -347,25 +427,34 @@ static int vp9_decode_mb_tokens_8x8(VP9D_COMP* const pbi,
// use 4x4 transform for U, V components in I8X8/splitmv prediction mode
seg_eob = get_eob(xd, segment_id, 16);
for (i = 16; i < 24; i++) {
ENTROPY_CONTEXT *const a = A + vp9_block2above[i];
ENTROPY_CONTEXT *const l = L + vp9_block2left[i];
ENTROPY_CONTEXT *const a = A + vp9_block2above[TX_4X4][i];
ENTROPY_CONTEXT *const l = L + vp9_block2left[TX_4X4][i];
eobs[i] = c = decode_coefs(pbi, xd, bc, a, l, PLANE_TYPE_UV,
DCT_DCT, seg_eob, xd->block[i].qcoeff,
vp9_default_zig_zag1d, TX_4X4, vp9_coef_bands);
vp9_default_zig_zag1d_4x4, TX_4X4,
vp9_coef_bands_4x4);
eobtotal += c;
}
} else {
for (i = 16; i < 24; i += 4) {
ENTROPY_CONTEXT *const a = A + vp9_block2above_8x8[i];
ENTROPY_CONTEXT *const l = L + vp9_block2left_8x8[i];
eobs[i] = c = decode_coefs(pbi, xd, bc, a, l, PLANE_TYPE_UV,
ENTROPY_CONTEXT *const a = A + vp9_block2above[TX_8X8][i];
ENTROPY_CONTEXT *const l = L + vp9_block2left[TX_8X8][i];
#if CONFIG_CNVCONTEXT
ENTROPY_CONTEXT above_ec = (a[0] + a[1]) != 0;
ENTROPY_CONTEXT left_ec = (l[0] + l[1]) != 0;
#else
ENTROPY_CONTEXT above_ec = a[0];
ENTROPY_CONTEXT left_ec = l[0];
#endif
eobs[i] = c = decode_coefs(pbi, xd, bc,
&above_ec, &left_ec,
PLANE_TYPE_UV,
DCT_DCT, seg_eob, xd->block[i].qcoeff,
vp9_default_zig_zag1d_8x8,
TX_8X8, vp9_coef_bands_8x8);
a[1] = a[0];
l[1] = l[0];
a[1] = a[0] = above_ec;
l[1] = l[0] = left_ec;
eobtotal += c;
}
}
@ -373,50 +462,77 @@ static int vp9_decode_mb_tokens_8x8(VP9D_COMP* const pbi,
return eobtotal;
}
static int decode_coefs_4x4(VP9D_COMP *dx, MACROBLOCKD *xd,
BOOL_DECODER* const bc,
PLANE_TYPE type, int i, int seg_eob,
TX_TYPE tx_type, const int *scan) {
ENTROPY_CONTEXT *const A = (ENTROPY_CONTEXT *)xd->above_context;
ENTROPY_CONTEXT *const L = (ENTROPY_CONTEXT *)xd->left_context;
ENTROPY_CONTEXT *const a = A + vp9_block2above[TX_4X4][i];
ENTROPY_CONTEXT *const l = L + vp9_block2left[TX_4X4][i];
uint16_t *const eobs = xd->eobs;
int c;
c = decode_coefs(dx, xd, bc, a, l, type, tx_type, seg_eob,
xd->block[i].qcoeff, scan, TX_4X4, vp9_coef_bands_4x4);
eobs[i] = c;
return c;
}
static int decode_coefs_4x4_y(VP9D_COMP *dx, MACROBLOCKD *xd,
BOOL_DECODER* const bc,
PLANE_TYPE type, int i, int seg_eob) {
const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
get_tx_type(xd, &xd->block[i]) : DCT_DCT;
const int *scan;
switch (tx_type) {
case ADST_DCT:
scan = vp9_row_scan_4x4;
break;
case DCT_ADST:
scan = vp9_col_scan_4x4;
break;
default:
scan = vp9_default_zig_zag1d_4x4;
break;
}
return decode_coefs_4x4(dx, xd, bc, type, i, seg_eob, tx_type, scan);
}
int vp9_decode_coefs_4x4(VP9D_COMP *dx, MACROBLOCKD *xd,
BOOL_DECODER* const bc,
PLANE_TYPE type, int i) {
ENTROPY_CONTEXT *const A = (ENTROPY_CONTEXT *)xd->above_context;
ENTROPY_CONTEXT *const L = (ENTROPY_CONTEXT *)xd->left_context;
ENTROPY_CONTEXT *const a = A + vp9_block2above[i];
ENTROPY_CONTEXT *const l = L + vp9_block2left[i];
INT16 *qcoeff_ptr = &xd->qcoeff[0];
const int *scan = vp9_default_zig_zag1d;
unsigned short *const eobs = xd->eobs;
int segment_id = xd->mode_info_context->mbmi.segment_id;
int c, seg_eob = get_eob(xd, segment_id, 16);
TX_TYPE tx_type = DCT_DCT;
const int segment_id = xd->mode_info_context->mbmi.segment_id;
const int seg_eob = get_eob(xd, segment_id, 16);
if (type == PLANE_TYPE_Y_WITH_DC)
tx_type = get_tx_type_4x4(xd, &xd->block[i]);
switch (tx_type) {
case ADST_DCT :
scan = vp9_row_scan;
break;
return decode_coefs_4x4_y(dx, xd, bc, type, i, seg_eob);
}
case DCT_ADST :
scan = vp9_col_scan;
break;
static int decode_mb_tokens_4x4_uv(VP9D_COMP* const dx,
MACROBLOCKD* const xd,
BOOL_DECODER* const bc,
int seg_eob) {
int eobtotal = 0, i;
default :
scan = vp9_default_zig_zag1d;
break;
// chroma blocks
for (i = 16; i < 24; i++) {
eobtotal += decode_coefs_4x4(dx, xd, bc, PLANE_TYPE_UV, i, seg_eob,
DCT_DCT, vp9_default_zig_zag1d_4x4);
}
eobs[i] = c = decode_coefs(dx, xd, bc, a, l, type,
tx_type, seg_eob, qcoeff_ptr + i * 16,
scan, TX_4X4, vp9_coef_bands);
return c;
return eobtotal;
}
int vp9_decode_mb_tokens_4x4_uv(VP9D_COMP* const dx,
MACROBLOCKD* const xd,
BOOL_DECODER* const bc) {
int eobtotal = 0, i;
const int segment_id = xd->mode_info_context->mbmi.segment_id;
const int seg_eob = get_eob(xd, segment_id, 16);
for (i = 16; i < 24; i++)
eobtotal += vp9_decode_coefs_4x4(dx, xd, bc, PLANE_TYPE_UV, i);
return eobtotal;
return decode_mb_tokens_4x4_uv(dx, xd, bc, seg_eob);
}
static int vp9_decode_mb_tokens_4x4(VP9D_COMP* const dx,
@ -424,24 +540,31 @@ static int vp9_decode_mb_tokens_4x4(VP9D_COMP* const dx,
BOOL_DECODER* const bc) {
int i, eobtotal = 0;
PLANE_TYPE type;
const int segment_id = xd->mode_info_context->mbmi.segment_id;
const int seg_eob = get_eob(xd, segment_id, 16);
const int has_2nd_order = get_2nd_order_usage(xd);
int has_2nd_order = get_2nd_order_usage(xd);
// 2nd order DC block
if (has_2nd_order) {
eobtotal += vp9_decode_coefs_4x4(dx, xd, bc, PLANE_TYPE_Y2, 24) - 16;
eobtotal += decode_coefs_4x4(dx, xd, bc, PLANE_TYPE_Y2, 24, seg_eob,
DCT_DCT, vp9_default_zig_zag1d_4x4) - 16;
type = PLANE_TYPE_Y_NO_DC;
} else {
xd->above_context->y2 = 1;
xd->left_context->y2 = 1;
xd->above_context->y2 = 0;
xd->left_context->y2 = 0;
xd->eobs[24] = 0;
type = PLANE_TYPE_Y_WITH_DC;
}
// luma blocks
for (i = 0; i < 16; ++i) {
eobtotal += vp9_decode_coefs_4x4(dx, xd, bc, type, i);
eobtotal += decode_coefs_4x4_y(dx, xd, bc, type, i, seg_eob);
}
return eobtotal + vp9_decode_mb_tokens_4x4_uv(dx, xd, bc);
// chroma blocks
eobtotal += decode_mb_tokens_4x4_uv(dx, xd, bc, seg_eob);
return eobtotal;
}
int vp9_decode_mb_tokens(VP9D_COMP* const dx,

View File

@ -23,7 +23,11 @@ int vp9_decode_coefs_4x4(VP9D_COMP *dx, MACROBLOCKD *xd,
int vp9_decode_mb_tokens(VP9D_COMP* const, MACROBLOCKD* const,
BOOL_DECODER* const);
int vp9_decode_sb_tokens(VP9D_COMP* const pbi,
MACROBLOCKD* const xd,
BOOL_DECODER* const bc);
int vp9_decode_mb_tokens_4x4_uv(VP9D_COMP* const dx, MACROBLOCKD* const xd,
BOOL_DECODER* const bc);
#endif /* DETOKENIZE_H */
#endif // VP9_DECODER_VP9_DETOKENIZE_H_

View File

@ -14,11 +14,11 @@
#include "vp9/decoder/vp9_dequantize.h"
#endif
void vp9_dequant_dc_idct_add_y_block_c(short *q, const short *dq,
unsigned char *pre,
unsigned char *dst,
int stride, unsigned short *eobs,
const short *dc) {
void vp9_dequant_dc_idct_add_y_block_c(int16_t *q, const int16_t *dq,
uint8_t *pre,
uint8_t *dst,
int stride, uint16_t *eobs,
const int16_t *dc) {
int i, j;
for (i = 0; i < 4; i++) {
@ -39,12 +39,12 @@ void vp9_dequant_dc_idct_add_y_block_c(short *q, const short *dq,
}
}
#if CONFIG_SUPERBLOCKS
void vp9_dequant_dc_idct_add_y_block_4x4_inplace_c(short *q, const short *dq,
unsigned char *dst,
void vp9_dequant_dc_idct_add_y_block_4x4_inplace_c(int16_t *q,
const int16_t *dq,
uint8_t *dst,
int stride,
unsigned short *eobs,
const short *dc,
uint16_t *eobs,
const int16_t *dc,
MACROBLOCKD *xd) {
int i, j;
@ -63,12 +63,11 @@ void vp9_dequant_dc_idct_add_y_block_4x4_inplace_c(short *q, const short *dq,
dst += 4 * stride - 16;
}
}
#endif
void vp9_dequant_idct_add_y_block_c(short *q, const short *dq,
unsigned char *pre,
unsigned char *dst,
int stride, unsigned short *eobs) {
void vp9_dequant_idct_add_y_block_c(int16_t *q, const int16_t *dq,
uint8_t *pre,
uint8_t *dst,
int stride, uint16_t *eobs) {
int i, j;
for (i = 0; i < 4; i++) {
@ -90,10 +89,10 @@ void vp9_dequant_idct_add_y_block_c(short *q, const short *dq,
}
}
void vp9_dequant_idct_add_uv_block_c(short *q, const short *dq,
unsigned char *pre, unsigned char *dstu,
unsigned char *dstv, int stride,
unsigned short *eobs) {
void vp9_dequant_idct_add_uv_block_c(int16_t *q, const int16_t *dq,
uint8_t *pre, uint8_t *dstu,
uint8_t *dstv, int stride,
uint16_t *eobs) {
int i, j;
for (i = 0; i < 2; i++) {
@ -133,12 +132,11 @@ void vp9_dequant_idct_add_uv_block_c(short *q, const short *dq,
}
}
#if CONFIG_SUPERBLOCKS
void vp9_dequant_idct_add_uv_block_4x4_inplace_c(short *q, const short *dq,
unsigned char *dstu,
unsigned char *dstv,
void vp9_dequant_idct_add_uv_block_4x4_inplace_c(int16_t *q, const int16_t *dq,
uint8_t *dstu,
uint8_t *dstv,
int stride,
unsigned short *eobs,
uint16_t *eobs,
MACROBLOCKD *xd) {
int i, j;
@ -174,13 +172,12 @@ void vp9_dequant_idct_add_uv_block_4x4_inplace_c(short *q, const short *dq,
dstv += 4 * stride - 8;
}
}
#endif
void vp9_dequant_dc_idct_add_y_block_8x8_c(short *q, const short *dq,
unsigned char *pre,
unsigned char *dst,
int stride, unsigned short *eobs,
const short *dc,
void vp9_dequant_dc_idct_add_y_block_8x8_c(int16_t *q, const int16_t *dq,
uint8_t *pre,
uint8_t *dst,
int stride, uint16_t *eobs,
const int16_t *dc,
MACROBLOCKD *xd) {
q[0] = dc[0];
vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride, 1, xd->eobs[0]);
@ -199,12 +196,12 @@ void vp9_dequant_dc_idct_add_y_block_8x8_c(short *q, const short *dq,
xd->eobs[12]);
}
#if CONFIG_SUPERBLOCKS
void vp9_dequant_dc_idct_add_y_block_8x8_inplace_c(short *q, const short *dq,
unsigned char *dst,
void vp9_dequant_dc_idct_add_y_block_8x8_inplace_c(int16_t *q,
const int16_t *dq,
uint8_t *dst,
int stride,
unsigned short *eobs,
const short *dc,
uint16_t *eobs,
const int16_t *dc,
MACROBLOCKD *xd) {
q[0] = dc[0];
vp9_dequant_idct_add_8x8_c(q, dq, dst, dst, stride, stride, 1, xd->eobs[0]);
@ -223,15 +220,14 @@ void vp9_dequant_dc_idct_add_y_block_8x8_inplace_c(short *q, const short *dq,
dst + 8 * stride + 8, stride, stride, 1,
xd->eobs[12]);
}
#endif
void vp9_dequant_idct_add_y_block_8x8_c(short *q, const short *dq,
unsigned char *pre,
unsigned char *dst,
int stride, unsigned short *eobs,
void vp9_dequant_idct_add_y_block_8x8_c(int16_t *q, const int16_t *dq,
uint8_t *pre,
uint8_t *dst,
int stride, uint16_t *eobs,
MACROBLOCKD *xd) {
unsigned char *origdest = dst;
unsigned char *origpred = pre;
uint8_t *origdest = dst;
uint8_t *origpred = pre;
vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride, 0, xd->eobs[0]);
vp9_dequant_idct_add_8x8_c(&q[64], dq, origpred + 8,
@ -243,11 +239,11 @@ void vp9_dequant_idct_add_y_block_8x8_c(short *q, const short *dq,
xd->eobs[12]);
}
void vp9_dequant_idct_add_uv_block_8x8_c(short *q, const short *dq,
unsigned char *pre,
unsigned char *dstu,
unsigned char *dstv,
int stride, unsigned short *eobs,
void vp9_dequant_idct_add_uv_block_8x8_c(int16_t *q, const int16_t *dq,
uint8_t *pre,
uint8_t *dstu,
uint8_t *dstv,
int stride, uint16_t *eobs,
MACROBLOCKD *xd) {
vp9_dequant_idct_add_8x8_c(q, dq, pre, dstu, 8, stride, 0, xd->eobs[16]);
@ -257,12 +253,11 @@ void vp9_dequant_idct_add_uv_block_8x8_c(short *q, const short *dq,
vp9_dequant_idct_add_8x8_c(q, dq, pre, dstv, 8, stride, 0, xd->eobs[20]);
}
#if CONFIG_SUPERBLOCKS
void vp9_dequant_idct_add_uv_block_8x8_inplace_c(short *q, const short *dq,
unsigned char *dstu,
unsigned char *dstv,
void vp9_dequant_idct_add_uv_block_8x8_inplace_c(int16_t *q, const int16_t *dq,
uint8_t *dstu,
uint8_t *dstv,
int stride,
unsigned short *eobs,
uint16_t *eobs,
MACROBLOCKD *xd) {
vp9_dequant_idct_add_8x8_c(q, dq, dstu, dstu, stride, stride, 0,
xd->eobs[16]);
@ -271,15 +266,14 @@ void vp9_dequant_idct_add_uv_block_8x8_inplace_c(short *q, const short *dq,
vp9_dequant_idct_add_8x8_c(q, dq, dstv, dstv, stride, stride, 0,
xd->eobs[20]);
}
#endif
#if CONFIG_LOSSLESS
void vp9_dequant_dc_idct_add_y_block_lossless_c(short *q, const short *dq,
unsigned char *pre,
unsigned char *dst,
void vp9_dequant_dc_idct_add_y_block_lossless_c(int16_t *q, const int16_t *dq,
uint8_t *pre,
uint8_t *dst,
int stride,
unsigned short *eobs,
const short *dc) {
uint16_t *eobs,
const int16_t *dc) {
int i, j;
for (i = 0; i < 4; i++) {
@ -300,10 +294,10 @@ void vp9_dequant_dc_idct_add_y_block_lossless_c(short *q, const short *dq,
}
}
void vp9_dequant_idct_add_y_block_lossless_c(short *q, const short *dq,
unsigned char *pre,
unsigned char *dst,
int stride, unsigned short *eobs) {
void vp9_dequant_idct_add_y_block_lossless_c(int16_t *q, const int16_t *dq,
uint8_t *pre,
uint8_t *dst,
int stride, uint16_t *eobs) {
int i, j;
for (i = 0; i < 4; i++) {
@ -325,12 +319,12 @@ void vp9_dequant_idct_add_y_block_lossless_c(short *q, const short *dq,
}
}
void vp9_dequant_idct_add_uv_block_lossless_c(short *q, const short *dq,
unsigned char *pre,
unsigned char *dstu,
unsigned char *dstv,
void vp9_dequant_idct_add_uv_block_lossless_c(int16_t *q, const int16_t *dq,
uint8_t *pre,
uint8_t *dstu,
uint8_t *dstv,
int stride,
unsigned short *eobs) {
uint16_t *eobs) {
int i, j;
for (i = 0; i < 2; i++) {

View File

@ -8,17 +8,13 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP9_COMMON_VP9_ONYXD_H_
#define VP9_COMMON_VP9_ONYXD_H_
/* Create/destroy static data structures. */
#ifdef __cplusplus
extern "C"
{
extern "C" {
#endif
#include "vp9/common/vp9_type_aliases.h"
#include "vpx_scale/yv12config.h"
#include "vp9/common/vp9_ppflags.h"
#include "vpx_ports/mem.h"
@ -65,4 +61,4 @@ extern "C"
}
#endif
#endif // __INC_ONYXD_H
#endif // VP9_COMMON_VP9_ONYXD_H_

View File

@ -13,7 +13,7 @@
#if CONFIG_POSTPROC
#include "vp9/common/vp9_postproc.h"
#endif
#include "vp9/common/vp9_onyxd.h"
#include "vp9/decoder/vp9_onyxd.h"
#include "vp9/decoder/vp9_onyxd_int.h"
#include "vpx_mem/vpx_mem.h"
#include "vp9/common/vp9_alloccommon.h"
@ -23,7 +23,7 @@
#include <assert.h>
#include "vp9/common/vp9_quant_common.h"
#include "vpx_scale/vpxscale.h"
#include "vpx_scale/vpx_scale.h"
#include "vp9/common/vp9_systemdependent.h"
#include "vpx_ports/vpx_timer.h"
#include "vp9/decoder/vp9_decodframe.h"
@ -37,7 +37,7 @@ static void ref_cnt_fb(int *buf, int *idx, int new_idx);
#if WRITE_RECON_BUFFER == 1
static void recon_write_yuv_frame(char *name, YV12_BUFFER_CONFIG *s) {
FILE *yuv_file = fopen((char *)name, "ab");
unsigned char *src = s->y_buffer;
uint8_t *src = s->y_buffer;
int h = s->y_height;
do {
@ -382,7 +382,7 @@ int vp9_receive_compressed_data(VP9D_PTR ptr, unsigned long size,
if (cm->filter_level) {
/* Apply the loop filter if appropriate. */
vp9_loop_filter_frame(cm, &pbi->mb);
vp9_loop_filter_frame(cm, &pbi->mb, cm->filter_level, 0);
}
vp8_yv12_extend_frame_borders(cm->frame_to_show);
}

View File

@ -8,11 +8,10 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP9_DECODER_VP9_ONYXD_INT_H_
#define VP9_DECODER_VP9_ONYXD_INT_H_
#include "./vpx_config.h"
#include "vp9/common/vp9_onyxd.h"
#include "vp9/decoder/vp9_onyxd.h"
#include "vp9/decoder/vp9_treereader.h"
#include "vp9/common/vp9_onyxc_int.h"
#include "vp9/decoder/vp9_dequantize.h"
@ -35,22 +34,22 @@ typedef struct {
typedef struct {
int const *scan;
int const *scan_8x8;
UINT8 const *ptr_block2leftabove;
uint8_t const *ptr_block2leftabove;
vp9_tree_index const *vp9_coef_tree_ptr;
unsigned char *norm_ptr;
UINT8 *ptr_coef_bands_x;
UINT8 *ptr_coef_bands_x_8x8;
uint8_t *ptr_coef_bands_x;
uint8_t *ptr_coef_bands_x_8x8;
ENTROPY_CONTEXT_PLANES *A;
ENTROPY_CONTEXT_PLANES *L;
INT16 *qcoeff_start_ptr;
int16_t *qcoeff_start_ptr;
vp9_prob const *coef_probs[BLOCK_TYPES];
vp9_prob const *coef_probs_4x4[BLOCK_TYPES_4X4];
vp9_prob const *coef_probs_8x8[BLOCK_TYPES_8X8];
vp9_prob const *coef_probs_16X16[BLOCK_TYPES_16X16];
UINT8 eob[25];
uint8_t eob[25];
} DETOK;
@ -103,4 +102,4 @@ int vp9_decode_frame(VP9D_COMP *cpi, const unsigned char **p_data_end);
} while(0)
#endif
#endif // __INC_ONYXD_INT_H
#endif // VP9_DECODER_VP9_TREEREADER_H_

View File

@ -1,15 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP9_DECODER_VP9_RECONINTRA_MT_H_
#define VP9_DECODER_VP9_RECONINTRA_MT_H_
#endif

View File

@ -34,4 +34,4 @@ static int treed_read(vp9_reader *const r, /* !!! must return a 0 or 1 !!! */
return -i;
}
#endif /* tree_reader_h */
#endif // VP9_DECODER_VP9_TREEREADER_H_

File diff suppressed because it is too large Load Diff

Some files were not shown because too many files have changed in this diff Show More