Compare commits

...

144 Commits

Author SHA1 Message Date
Peter de Rivaz
fd05fb0c21 Corrected optimization of 8x8 DCT code
The 8x8 DCT uses a fast version whenever possible.
There was a mistake in the checking code which
meant sometimes the fast version was used when it
was not safe to do so.
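
A minimal C sketch of the kind of guard this message describes; the helper name and the bound are assumptions, not libvpx's actual code:

#include <stdint.h>

/* The fast 8x8 forward DCT is only safe when every input residual is small
 * enough that the intermediate arithmetic cannot overflow; the bug was in
 * a check of this shape, letting the fast path run on unsafe inputs. */
static int fdct8x8_fast_is_safe(const int16_t *input, int stride,
                                int max_abs) {
  int r, c;
  for (r = 0; r < 8; ++r) {
    for (c = 0; c < 8; ++c) {
      const int v = input[r * stride + c];
      if (v > max_abs || v < -max_abs)
        return 0;  /* potential overflow: caller must use the safe path */
    }
  }
  return 1;  /* every residual fits: the fast version may be used */
}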

Change-Id: I154c84c9e2d836764768a11082947ca30f4b5ab7
2014-12-11 15:54:23 +00:00
Debargha Mukherjee
39da55a49f Merge "Added tests for high bitdepth variance sse2 functions" into highbitdepth 2014-11-10 14:42:26 -08:00
Peter de Rivaz
6d741e4d76 Added tests for high bitdepth variance sse2 functions
Change-Id: I72735e2e07464a0f7e44968fb14a195c84a58992
2014-11-10 20:42:24 +00:00
Peter de Rivaz
db7192e0b0 Added highbitdepth sse2 acceleration for quantize and block error
Change-Id: Idef18f90b111a0d0c9546543d3347e551908fd78
2014-11-10 10:47:39 -08:00
Peter de Rivaz
08d2f54800 Fixed idct16x16_10 highbitdepth transform
In the case when there are only non-zero coefficients
in the first 4x4 block a special routine is called.
The highbitdepth optimized version of this routine
examined the wrong positions when deciding whether
to call an assembler or C inverse transform.
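
A hedged sketch of the decision being fixed; the bound, the stride handling and the names are illustrative:

#include <stdint.h>

typedef int32_t tran_low_t;  /* high bitdepth coefficient type */

/* With nonzero coefficients confined to the first 4x4 corner of the 16x16
 * array, only those positions need checking before taking the assembler
 * path; examining the wrong positions lets unsafe inputs through. */
static void idct16x16_10_add_checked(
    const tran_low_t *input, uint16_t *dest, int stride, int bd, int bound,
    void (*asm_version)(const tran_low_t *, uint16_t *, int, int),
    void (*c_version)(const tran_low_t *, uint16_t *, int, int)) {
  int r, c, safe = 1;
  for (r = 0; r < 4 && safe; ++r) {    /* rows 0..3 of the 16-wide array */
    for (c = 0; c < 4; ++c) {          /* columns 0..3 */
      const tran_low_t v = input[r * 16 + c];
      if (v > bound || v < -bound) {
        safe = 0;
        break;
      }
    }
  }
  if (safe)
    asm_version(input, dest, stride, bd);
  else
    c_version(input, dest, stride, bd);
}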

Change-Id: I62da663ca11775dadb66e402e42f4a1cb1927893
2014-11-10 16:17:49 +00:00
Deb Mukherjee
a1b726117f Iadst transforms to use internal low precision
Change-Id: I266777d40c300bc53b45b205144520b85b0d6e58
2014-11-06 13:57:04 -08:00
Peter de Rivaz
005d80cd05 Added high bitdepth sse2 transform functions
Change-Id: If359f0e9a71bca9c2ba685a87a355873536bb282
2014-11-06 11:50:47 -08:00
Peter de Rivaz
d7422b2b1e Added sse2 acceleration for highbitdepth variance
Change-Id: I446bdf3a405e4e9d2aa633d6281d66ea0cdfd79f
2014-11-04 10:06:06 -08:00
Peter de Rivaz
454342d4e7 Refactored idct routines and headers
This change is made in preparation for a
subsequent patch which adds acceleration
for the highbitdepth transform functions.

The highbitdepth transform functions attempt
to use 16/32bit sse instructions where possible,
but fall back to the C implementations if
potential overflow is detected.  For this reason
the dct routines are made global so they can be
called from the acceleration functions in the
subsequent patch.
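
A sketch of that try-SIMD-else-C pattern, with assumed helper names and an assumed magnitude bound:

#include <stdint.h>

typedef int32_t tran_low_t;

static int max_abs_coeff(const tran_low_t *coeff, int n) {
  int i, m = 0;
  for (i = 0; i < n; ++i) {
    const int a = coeff[i] < 0 ? -coeff[i] : coeff[i];
    if (a > m) m = a;
  }
  return m;
}

/* Run the 16/32-bit SSE version only when coefficient magnitudes rule out
 * intermediate overflow; otherwise call the exact C implementation. */
static void idct_add_checked(
    const tran_low_t *coeff, uint16_t *dst, int stride, int bd, int n,
    int sse_bound,
    void (*sse_version)(const tran_low_t *, uint16_t *, int, int),
    void (*c_version)(const tran_low_t *, uint16_t *, int, int)) {
  if (max_abs_coeff(coeff, n) < sse_bound)
    sse_version(coeff, dst, stride, bd);
  else
    c_version(coeff, dst, stride, bd);  /* potential overflow: stay exact */
}

The inverse-transform test thresholds quoted in the diffs below (3155 for 16x16, 6201 for 8x8) are bounds of exactly this kind.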

Change-Id: Ia921f191bf6936ccba4f13e8461624b120c1f665
2014-10-24 08:37:39 +01:00
Debargha Mukherjee
cda2ad0121 Merge "Fixed calling of highbd transform." into highbitdepth 2014-10-22 12:57:43 -07:00
Debargha Mukherjee
6b378b8868 Merge "Tidy up of highbitdepth loopfilter and convolution" into highbitdepth 2014-10-22 09:58:29 -07:00
Peter de Rivaz
123f29d1d7 Tidy up of highbitdepth loopfilter and convolution
Change-Id: I65531cc55d3d6949e164e2e26f92ee44a1921f7e
2014-10-22 09:43:03 +01:00
Peter de Rivaz
4230c2306c Fixed calling of highbd transform.
This patch does not change behaviour because
vp9_fwht4x4 is identical to vp9_highbd_fwht4x4,
but it becomes important when accelerations are made
to vp9_highbd_fwht4x4 in a later patch.

Change-Id: I2b790316cdd498727c4951a9e591edb291de3ac8
2014-10-22 09:26:39 +01:00
Alex Converse
5b76018057 Merge "Added highbitdepth sse2 SAD acceleration and tests" into highbitdepth 2014-10-20 09:55:41 -07:00
Peter de Rivaz
b1a6f6b9cb Added highbitdepth sse2 SAD acceleration and tests
Change-Id: I9f09e404e3136951e5cc15bf40b915c1fe10b620
2014-10-20 09:51:01 +01:00
Debargha Mukherjee
a92f987a6b Merge "Add highbitdepth function for vp9_avg_8x8" into highbitdepth 2014-10-16 14:39:14 -07:00
Peter de Rivaz
1bf87dc353 Add highbitdepth function for vp9_avg_8x8
Change-Id: I6903e4e4cb57d90590725c8a1c64c23da7ae65e8
2014-10-16 14:36:07 +01:00
Deb Mukherjee
b84bf3323b Fix in bit-shift operation for highbitdepth decode
Fixes a bug introduced in a previous refactoring patch.

Change-Id: I243e74637cfd7a997c7a1fef03b06c290dd0dee6
2014-10-15 10:28:30 -07:00
Deb Mukherjee
563aeba901 Merge 'origin/master' into highbitdepth
Conflicts:
	examples/vp9_spatial_svc_encoder.c
	examples/vpx_temporal_svc_encoder.c
	test/convolve_test.cc
	test/dct16x16_test.cc
	test/dct32x32_test.cc
	test/fdct4x4_test.cc
	test/fdct8x8_test.cc
	test/lpf_8_test.cc
	test/partial_idct_test.cc
	test/test.mk
	test/vp9_intrapred_test.cc
	tools_common.c
	vp8/vp8_cx_iface.c
	vp9/common/vp9_alloccommon.c
	vp9/common/vp9_common.h
	vp9/common/vp9_convolve.c
	vp9/common/vp9_convolve.h
	vp9/common/vp9_entropy.c
	vp9/common/vp9_entropy.h
	vp9/common/vp9_idct.c
	vp9/common/vp9_idct.h
	vp9/common/vp9_loopfilter.c
	vp9/common/vp9_loopfilter_filters.c
	vp9/common/vp9_onyxc_int.h
	vp9/common/vp9_postproc.c
	vp9/common/vp9_quant_common.c
	vp9/common/vp9_quant_common.h
	vp9/common/vp9_reconinter.c
	vp9/common/vp9_reconinter.h
	vp9/common/vp9_reconintra.c
	vp9/common/vp9_rtcd_defs.pl
	vp9/common/vp9_scale.c
	vp9/common/vp9_scale.h
	vp9/common/x86/vp9_asm_stubs.c
	vp9/common/x86/vp9_high_intrapred_sse2.asm
	vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c
	vp9/common/x86/vp9_high_subpixel_8t_sse2.asm
	vp9/common/x86/vp9_high_subpixel_bilinear_sse2.asm
	vp9/decoder/vp9_decodeframe.c
	vp9/decoder/vp9_detokenize.c
	vp9/encoder/vp9_aq_complexity.c
	vp9/encoder/vp9_aq_cyclicrefresh.c
	vp9/encoder/vp9_aq_variance.c
	vp9/encoder/vp9_bitstream.c
	vp9/encoder/vp9_block.h
	vp9/encoder/vp9_context_tree.h
	vp9/encoder/vp9_dct.c
	vp9/encoder/vp9_encodeframe.c
	vp9/encoder/vp9_encodemb.c
	vp9/encoder/vp9_encoder.c
	vp9/encoder/vp9_encoder.h
	vp9/encoder/vp9_extend.c
	vp9/encoder/vp9_firstpass.c
	vp9/encoder/vp9_lookahead.c
	vp9/encoder/vp9_lookahead.h
	vp9/encoder/vp9_mcomp.c
	vp9/encoder/vp9_picklpf.c
	vp9/encoder/vp9_pickmode.c
	vp9/encoder/vp9_quantize.c
	vp9/encoder/vp9_quantize.h
	vp9/encoder/vp9_ratectrl.c
	vp9/encoder/vp9_rd.c
	vp9/encoder/vp9_rdopt.c
	vp9/encoder/vp9_resize.c
	vp9/encoder/vp9_resize.h
	vp9/encoder/vp9_sad.c
	vp9/encoder/vp9_ssim.c
	vp9/encoder/vp9_ssim.h
	vp9/encoder/vp9_temporal_filter.c
	vp9/encoder/vp9_tokenize.c
	vp9/encoder/vp9_tokenize.h
	vp9/encoder/vp9_variance.c
	vp9/encoder/vp9_variance.h
	vp9/vp9_common.mk
	vp9/vp9_cx_iface.c
	vp9/vp9_dx_iface.c
	vp9/vp9_iface_common.h
	vpx/src/svc_encodeframe.c
	vpx/src/vpx_image.c
	vpx/vp8dx.h
	vpx/vpx_codec.h
	vpx/vpx_encoder.h
	vpx/vpx_image.h
	vpx_mem/vpx_mem.c
	vpx_mem/vpx_mem.h
	vpx_scale/generic/yv12config.c
	vpx_scale/generic/yv12extend.c
	vpx_scale/yv12config.h
	vpxdec.c
	vpxenc.c

Change-Id: I699c833f7da96569d2581af7e045ce523bf72d3b
2014-10-14 10:08:15 -07:00
Debargha Mukherjee
93657ee6ec Merge "Add bit_depth to internal image structure" into highbitdepth 2014-09-22 10:03:09 -07:00
Peter de Rivaz
3df0e78eae Add bit_depth to internal image structure
This change is made to allow show_existing_frame
to display an image of a different bit depth to
the frames currently being decoded.

Change-Id: Ia150bde2a4ed56fe3892a53fd7b5b9f5182ad59f
2014-09-22 16:02:30 +01:00
Peter de Rivaz
e27a93fe25 Fix for when profile switches back to 8bit
Change-Id: I820027d8aebef67cb8380c47475ee59e19d7a97b
2014-09-22 10:02:55 +01:00
Deb Mukherjee
159247f30c Merge "Add missing sse code for some functions" into highbitdepth 2014-09-16 16:54:25 -07:00
Deb Mukherjee
06cfc9f3e1 Add missing sse code for some functions
Change-Id: I9830217108a74f546e707ab82a15e278319f4360
2014-09-16 11:37:37 -07:00
Deb Mukherjee
85dba225a8 Merge "Fix 2x scaling bug" into highbitdepth 2014-09-16 09:39:36 -07:00
Deb Mukherjee
53ef87e0c7 Fix asserts in high_convolve
Change-Id: I89fb190ef22d8edf7622dea5ef1c0e47faa95d23
2014-09-15 11:33:27 -07:00
Deb Mukherjee
09faceb706 Fix 2x scaling bug
Reported by Alexander Voronov

Change-Id: I463ba323cc9f4e345b7e6f759565725737793086
2014-09-15 09:48:35 -07:00
Deb Mukherjee
75c8fc2412 Deprecate vpx_bit_depth_to_bps function
Function not needed.

Change-Id: Icaad4cbd218ddd8e3720a7d9aad6b69aeffa369b
2014-09-12 16:40:26 -07:00
Deb Mukherjee
635bb7aed4 Change macros to be compatible with master branch
Changes image fmt and codec capability macro names.

Change-Id: Ic07fbd8de307cb1bfcf9ed4b4bc3feaa5767c53b
2014-09-12 11:47:39 -07:00
Deb Mukherjee
dd0a5ecd2c Minor Refactor of variance functions
Change-Id: If27a938cefeb6e8439f897900e17e2e85a2239d1
2014-09-11 23:10:49 -07:00
Deb Mukherjee
3d170a834a change data type in vp9_idct.c
Change-Id: I2efb51e0e94fd4d42334d86c7dcee08d1a1cc672
2014-09-11 20:14:35 -07:00
Debargha Mukherjee
1db0b0ff1b Merge "Reinitialize dequantizer when switching from 10/12 bit to 8 bit." into highbitdepth 2014-09-11 20:05:33 -07:00
Alexander Voronov
902529c595 Reinitialize dequantizer when switching from 10/12 bit to 8 bit.
Change-Id: Id294cf8d314a3f8aaf4ca2a6b3da052cc898a78c
2014-09-11 21:00:51 +04:00
Alexander Voronov
7dc3cdba0c Make --output-bit-depth option work with variable resolution.
The buffer for bit-depth conversion was allocated only on the first frame,
so a subsequent frame with a resolution different from the first one led
to a decoding error. With this change the decoder reallocates the buffer
in that case.
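
A minimal sketch of the fix's shape; the struct and function names are illustrative:

#include <stdint.h>
#include <stdlib.h>

typedef struct {
  uint16_t *data;
  int width, height;
} ConversionBuffer;

/* Reallocate whenever a frame arrives with a different resolution, instead
 * of allocating once for the first frame.  Returns 0 on success. */
static int conversion_buffer_ensure(ConversionBuffer *cb, int w, int h) {
  if (cb->data == NULL || cb->width != w || cb->height != h) {
    free(cb->data);
    cb->data = (uint16_t *)malloc((size_t)w * h * sizeof(*cb->data));
    if (cb->data == NULL) return -1;
    cb->width = w;
    cb->height = h;
  }
  return 0;
}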

Change-Id: I3a701ca8df53a60246354876856624e70efe81aa
2014-09-10 16:30:30 +04:00
Alexander Voronov
f9b4008020 Fix chroma plane size in bit-depth conversion.
Change-Id: Ie75ef8f934d02b57b543ca308ae304458d3962a8
2014-09-10 15:03:50 +04:00
Alexander Voronov
64fca22b4d Fix Visual Studio build.
Change-Id: I6fc8cf7af19b884366ec45e0b8f1e500015b38b8
2014-09-09 11:35:12 +04:00
Peter de Rivaz
b94a475d7b In profile 3 we need 2 bytes to show a previous frame
Change-Id: I208632dd94dfc45ab78312e26fee569270ce0ba8
2014-09-01 16:09:07 +01:00
Deb Mukherjee
325fe4a430 Merge "Replace CONVERT_TO_SHORTPTR/BYTEPTR" into highbitdepth 2014-08-29 12:24:07 -07:00
Deb Mukherjee
484a7f31a6 Replace CONVERT_TO_SHORTPTR/BYTEPTR
Uses uintptr_t instead of uint32_t or uint64_t

Change-Id: Id21204969d2723e6c64c29a5d39ec85bfbba9ea8
2014-08-29 10:05:50 -07:00
Deb Mukherjee
059a721d92 Change vpx bit depth enum type to be more natural
Also moves bit_depth_to_bps function to the vpx level.

Change-Id: I7c24f80b2b5bd2fcc6552b61aacee4c7954cc54b
2014-08-28 23:57:57 -07:00
Deb Mukherjee
23b5c58174 Set scaled image format correctly
Change-Id: Ic4ced4208375ca31f8adb73a5ee9ddd7da50dfac
2014-08-22 23:33:29 -07:00
Deb Mukherjee
02118dcb3b Makes high_quant enabled by default
With this change, when --enable-vp9-high is used, high quantization is
used by default, unless it is turned off with --disable-high-quant.

Change-Id: I8a127428181d5dd726a4f3f225ea80f3215201ba
2014-08-19 14:51:00 -07:00
James Hutchinson
3489c19d2b Merge branch 'master' into highbitdepth
Conflicts:
	configure
	test/convolve_test.cc
	test/dct16x16_test.cc
	test/dct32x32_test.cc
	test/fdct4x4_test.cc
	test/fdct8x8_test.cc
	test/partial_idct_test.cc
	third_party/libyuv/README.libvpx
	vp9/common/vp9_enums.h
	vp9/common/vp9_idct.h
	vp9/common/vp9_rtcd_defs.pl
	vp9/decoder/vp9_decodeframe.c
	vp9/encoder/vp9_bitstream.c
	vp9/encoder/vp9_encodeframe.c
	vp9/encoder/vp9_encoder.c
	vp9/encoder/vp9_encoder.h
	vp9/encoder/vp9_extend.c
	vp9/encoder/vp9_quantize.c
	vp9/encoder/vp9_rd.c
	vp9/encoder/vp9_rdopt.c
	vp9/vp9_cx_iface.c
	vp9/vp9_dx_iface.c
	vp9/vp9_iface_common.h
	vpx/vp8cx.h
	vpx_scale/generic/yv12config.c
	vpxdec.c
	vpxenc.c

Change-Id: If4104c5a7cd0a29dd0bed7c3804837ba40ba7e0c
2014-08-19 10:58:02 -07:00
Deb Mukherjee
9595633751 Merge "Hdr change for profiles > 1 for intra-only frames" into highbitdepth 2014-08-07 11:19:55 -07:00
Deb Mukherjee
cea11ebb1e Hdr change for profiles > 1 for intra-only frames
Adds bitdepth, color sampling and color space information to the header
for intra-only frames in profiles > 0.
Also enforces exclusive use of profiles 1 and 3 for non-420 color
sampling.

Change-Id: I92b0630d5193fdbc6e71fa909d684521c12b5d99
2014-08-07 09:55:19 -07:00
Yaowu Xu
60ee54352e fix build errors in vs2012
Macro expansion in VS2012 does not work with the existing code; the change
in this commit helps VS2012 properly expand the macros and avoids build
errors in the test_libvpx project.

Change-Id: I642921462ee4869ea07f90795d7de0f6eaf7655b
2014-08-06 14:47:50 -07:00
Debargha Mukherjee
4b11ea5e32 Merge "Merged vp9_high and high_transforms options" into highbitdepth 2014-08-04 11:47:05 -07:00
Peter de Rivaz
27dc02de95 Remove warnings about uninitialized variables
Change-Id: I25c8b393239e9a5ba5a0f5a5c5897bb38915f33e
2014-07-29 16:21:43 +01:00
Peter de Rivaz
b7327fd3ea Merged vp9_high and high_transforms options
Using --enable-vp9-high now automatically uses
the high precision transform code.

(Before you needed to specify both
--enable-vp9-high and --enable-high-transforms)

Change-Id: I742d5b82601bc38eb81c95d7ecd3f78b9ff0df57
2014-07-29 15:59:06 +01:00
Peter de Rivaz
6791a9b1d5 Added high bitdepth to SVC example
Change-Id: I24498bace25b01f796f530699b58aa1e460f5ebc
2014-07-22 10:59:59 +01:00
Peter de Rivaz
3edc408011 Added high bitdepth support to temporal example
Change-Id: I80f802ecf82ecf18952f025643a56e1986def887
2014-07-22 10:53:50 +01:00
James Hutchinson
41c8641b6b Merge branch 'master' into highbitdepth
Conflicts:
    configure
    test/convolve_test.cc
    test/dct16x16_test.cc
    test/dct32x32_test.cc
    test/fdct4x4_test.cc
    test/fdct8x8_test.cc
    vp9/common/vp9_alloccommon.c
    vp9/common/vp9_entropy.c
    vp9/common/vp9_enums.h
    vp9/common/vp9_quant_common.c
    vp9/common/vp9_quant_common.h
    vp9/common/vp9_rtcd_defs.pl
    vp9/common/vp9_scale.c
    vp9/decoder/vp9_decodeframe.c
    vp9/decoder/vp9_decodeframe.h
    vp9/decoder/vp9_detokenize.c
    vp9/encoder/vp9_aq_complexity.c
    vp9/encoder/vp9_bitstream.c
    vp9/encoder/vp9_encodeframe.c
    vp9/encoder/vp9_encodemb.c
    vp9/encoder/vp9_encoder.c
    vp9/encoder/vp9_firstpass.c
    vp9/encoder/vp9_mcomp.c
    vp9/encoder/vp9_pickmode.c
    vp9/encoder/vp9_quantize.c
    vp9/encoder/vp9_ratectrl.c
    vp9/encoder/vp9_rd.c
    vp9/encoder/vp9_rdopt.c
    vp9/encoder/vp9_temporal_filter.c
    vp9/encoder/vp9_tokenize.c
    vp9/vp9_cx_iface.c
    vp9/vp9_dx_iface.c
    vpx/vpx_codec.h
    vpx/vpx_image.h
    vpx_scale/generic/yv12config.c
    vpxdec.c
    vpxenc.c
    vpxenc.h
    y4menc.c
    y4minput.c

Change-Id: I53b19ea1d9818a4440481920065d70164348d02e
2014-07-18 14:58:26 -07:00
Deb Mukherjee
0a904797f6 Merge "Support for raw yuv input in 422/444 sampling" into highbitdepth 2014-07-16 10:59:14 -07:00
Deb Mukherjee
cc3e6d68df Support for raw yuv input in 422/444 sampling
Adds options --i422 and --i444 which, along with --input-bit-depth,
will allow raw video input in 422 and 444 sampling at the given
bit-depth. For the decoder, the new option --rawvideo allows
decoding to raw yuv at the color sampling of the decoded bit-stream.

Change-Id: I5b3979be66c0dd2672391400850c97260cc8e1e8
2014-07-16 09:21:16 -07:00
Deb Mukherjee
aba709a5ec Change extra bit prob tables for HBD
Improves coding performance a little.

Change-Id: I9cb3279cec201274ecaa972f270156677e1bdf98
2014-07-16 03:53:39 -07:00
Deb Mukherjee
b0bcd57d12 Generalize read_yuv_frame
Change-Id: I52288ff1a01e6b68fc7eb85425e750b909e56d0c
2014-07-12 01:37:51 -07:00
Deb Mukherjee
78df9ead21 Fix for raw yuv input
Sets fmt correctly for raw yuv inputs, and reads frames
correctly.

Change-Id: I05405b2265545d5c6d1cf7b086db9548cbe5d8ba
2014-07-12 00:19:57 -07:00
Deb Mukherjee
cba05a9ecb Merge "Tokenization separated for 10 and 12 bit" into highbitdepth 2014-07-11 10:50:17 -07:00
Deb Mukherjee
87d40bbafc Tokenization separated for 10 and 12 bit
Enables tuning the probabilities separately, and also
reduces the number of bool decodes for 10-bit cat 6
tokens.

Change-Id: Ib237ad07983f43048ba47245efe8fa53c0d25819
2014-07-11 07:00:14 -07:00
Deb Mukherjee
da82f64342 Set default bit_depth to 8 for vp8
Also adds some cleanups

Change-Id: I86af7590397e5a281b3808a36031642c1705a17a
2014-07-11 06:22:43 -07:00
Peter de Rivaz
02c3221593 Allow more combinations for output bitdepth
Change-Id: I1e5de85320d825ffdf90be53720e581f72a5f701
2014-07-10 11:50:25 +01:00
Deb Mukherjee
139ac07ae7 Changes quantization tables to improve quality
Changes the quantization tables so that the scale factor relative to the
8-bit version is 1 at the low-q end and increases to 4 (16) at the
high-q end for the 10-bit (12-bit) version, using a logarithmic-like
function.

This change narrows the quality gap against the no-high-quant version
for both 10-bit and 12-bit. 10-bit is virtually identical in performance
to the no-high-quant version, while the gap for 12-bit is reduced to
the 0.3% range.
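
A purely illustrative curve with the properties described above; the real tables are precomputed constants, not this formula:

#include <math.h>

/* Scale factor of the 10/12-bit quantizer relative to the 8-bit table:
 * 1.0 at the lowest qindex, rising on a logarithmic-like ramp to 4
 * (10-bit) or 16 (12-bit) at the highest. */
static double hbd_quant_scale(int qindex, int bit_depth) {
  const double max_scale = (bit_depth == 10) ? 4.0 : 16.0;
  return exp(log(max_scale) * qindex / 255.0);
}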

Change-Id: I08cc329a77aeb19cc232f8e4458f188c6ae65363
2014-07-09 16:20:15 -07:00
Debargha Mukherjee
209885c785 Merge "Added SSE2 high bitdepth loopfilter" into highbitdepth 2014-07-08 09:49:47 -07:00
Debargha Mukherjee
000ea5e446 Merge "Fixes for highbitdepth on 32bit x86" into highbitdepth 2014-07-08 09:34:20 -07:00
Peter de Rivaz
c62b82d7c9 Added SSE2 high bitdepth loopfilter
Change-Id: I4e5ecaaf956d30bd2e5301d57f02e277100ceb7f
2014-07-08 16:49:59 +01:00
Peter de Rivaz
7e802a5bba Fixes for highbitdepth on 32bit x86
Some of the assembler routines only work on 64-bit
architectures. This patch makes such routines fall
back to the C implementations.

Change-Id: Ia1e59d9ce5856eca0d56ab59fbc9436fa2838745
2014-07-08 16:12:25 +01:00
Deb Mukherjee
8a50b60c9c Change quantizer to start from 4
Fixes lossless quantizer

Change-Id: I2b9d00afc65c5a374f7b988c5082738d4b1a0bea
2014-07-08 08:10:22 -07:00
Deb Mukherjee
d5f6cd1b38 Changes quantization values for 10/12 bit
Improves performance by eliminating some values at the very low
end of the quantizer range, where entropy coding is not very
efficient.

Change-Id: I3cacc7352dc2e58cfe8448d89a693e992ef93ee7
2014-07-07 12:30:43 -07:00
Peter de Rivaz
91c222491e Alternate high bitdepth quantizer changes
In this proposal, the qindex range is kept at 0 to 255
but the values are remapped to cover an extended range of
quantizer values.

This simplifies the code and bitstream compared to the 8-bit version.

Change-Id: I0dda61388cef41e21a0d5c34d817c862de637580
2014-07-03 13:53:02 +01:00
Debargha Mukherjee
2cf1232bd6 Merge "Modify initial value for avg_frame_qindex" into highbitdepth 2014-07-02 14:30:32 -07:00
Peter de Rivaz
3cd669c548 Modify initial value for avg_frame_qindex
In high bitdepth mode there is an extended quantizer range.
This means that it takes longer for the avg_frame_qindex
to ramp up and this results in coding loss for short
low bitrate sequences.

This patch adds a boost in high bitdepth mode to compensate
for this effect.

This helps with the bowing_cif.y4m sequence at 50kbps.

Change-Id: Ie1575d88e8de4f0297cf86da50eb36dfc5442c70
2014-07-02 20:50:21 +01:00
Peter de Rivaz
56f2cb9478 Added config-enable-hw-high
In the inverse transform code, overflow results in
implementation-defined behaviour.

Hardware implementations are expected to wrap on
overflow in order to use the smallest data path
possible.  The new config option makes the C reference code
use wrap on overflow.

Encoders are recommended to not generate bitstreams that
result in overflow in the inverse transform.
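
A sketch of what "wrap on overflow" means for the C reference; the width parameter and helper name are assumptions:

#include <stdint.h>

/* Truncate an intermediate sum to a fixed signed width, as a minimal
 * hardware data path would, instead of letting it grow. */
static int32_t wrap_signed(int64_t v, int width) {
  const int64_t mask = ((int64_t)1 << width) - 1;
  int64_t w = v & mask;
  if (w & ((int64_t)1 << (width - 1)))
    w -= (int64_t)1 << width;  /* sign-extend back into range */
  return (int32_t)w;
}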

Change-Id: I73836ccfb925b4dbfb29c257a3784fa7733db877
2014-07-02 20:20:08 +01:00
Peter de Rivaz
6dd6bb6602 Added SSE2 intra prediction high bitdepth
Change-Id: I2c02a6b42378e130f693aadbe92e85727b28ec7c
2014-07-02 15:07:16 +01:00
Deb Mukherjee
225c60848b Adds comment why avx2 removed for high transforms
Change-Id: I87efe103017e80414082830987343a053e1e8cd8
2014-07-02 06:05:12 -07:00
Peter de Rivaz
67dd2eaff7 Added SSE2 high bitdepth convolution
Change-Id: I525cda28b273aa7ef4fcb2c8f690273cb719562c
2014-07-02 12:11:35 +01:00
Debargha Mukherjee
d61fbfd760 Merge "AVX2 optimization is not valid with HIGH_TRANSFORMS" into highbitdepth 2014-07-01 09:49:21 -07:00
Peter de Rivaz
6c9426809f AVX2 optimization is not valid with HIGH_TRANSFORMS
With CONFIG_HIGH_TRANSFORMS, the transform coefficients
are held in 32bit values, so the assembler code for
vp9_block_error can no longer be used.

Change-Id: I2c7d8a81a32fe4dfcac045588ea1c9e89a267f1f
2014-07-01 15:55:28 +01:00
Deb Mukherjee
7f0eaadadc Merge "Extending y4m writing" into highbitdepth 2014-06-30 16:49:18 -07:00
Deb Mukherjee
7051f38e5a Extending y4m writing
Also adds some cleanups.

Change-Id: I0b35c6645236c7ca3f05e3ed9eae4571fd8249d2
2014-06-27 15:35:36 -07:00
Peter de Rivaz
d21a89895f Use high_precision_mv for highbitdepth
The use of high precision motion vectors depends
on the qindex level.  For highbitdepth it improves PSNR
to use high precision motion vectors for more values of
qindex.
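
A sketch of the policy; the cutoff values are invented for illustration, not the tuned ones:

/* 1/8-pel motion vectors are enabled below a qindex threshold; in high
 * bitdepth mode the threshold is raised so more frames qualify. */
static int use_high_precision_mv(int qindex, int is_highbitdepth) {
  const int cutoff = is_highbitdepth ? 180 : 120;  /* illustrative numbers */
  return qindex < cutoff;
}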

Change-Id: I1dc5a4cc38dc0ac71a2a37dfd478f37d7c361b19
2014-06-24 12:25:33 -07:00
Debargha Mukherjee
149c891dac Merge "Merge branch 'master' into highbitdepth" into highbitdepth 2014-06-24 11:28:56 -07:00
James Hutchinson
939f871ccc Merge branch 'master' into highbitdepth
Conflicts:
	configure
	test/dct16x16_test.cc
	test/dct32x32_test.cc
	test/fdct4x4_test.cc
	test/fdct8x8_test.cc
	test/partial_idct_test.cc
	vp9/common/vp9_blockd.h
	vp9/common/vp9_idct.h
	vp9/common/vp9_rtcd_defs.pl
	vp9/decoder/vp9_decodeframe.c
	vp9/encoder/vp9_aq_complexity.c
	vp9/encoder/vp9_block.h
	vp9/encoder/vp9_dct.c
	vp9/encoder/vp9_encodeframe.c
	vp9/encoder/vp9_encodemb.c
	vp9/encoder/vp9_encoder.c
	vp9/encoder/vp9_firstpass.c
	vp9/encoder/vp9_pickmode.c
	vp9/encoder/vp9_quantize.c
	vp9/encoder/vp9_quantize.h
	vp9/encoder/vp9_ratectrl.c
	vp9/vp9_cx_iface.c

Change-Id: I402e1e91c6207c41a5bc1508ccfceec62196772b
2014-06-24 09:58:17 -07:00
Peter de Rivaz
6eb5e8ded6 Fix fps messages for 8bit mode
The addition of the high bitdepth image shifting
broke the fps measurement for the 8bit encoder.

Change-Id: I70144fe5c9a821b582d461091c616b0a3f666cbe
2014-06-24 14:20:27 +01:00
Peter de Rivaz
a13f2137fa Corrected check for valid upshifts
When converting 8-bit input streams to 10 or 12 bitdepth
an error message was incorrectly triggering.

Change-Id: I6252e2a6c8304863cc71b8764830c713ccee2ff2
2014-06-19 09:12:03 +01:00
Debargha Mukherjee
8ba0eeba1b Merge "Extended quantizer range for high bitdepth" into highbitdepth 2014-06-19 00:00:55 -07:00
Peter de Rivaz
8ca39ede47 Extended quantizer range for high bitdepth
These changes allow 10 and 12 bit depth streams
to encode at higher quality by using a finer
quantizer.  Category 6 tokens now transmit 18
extra bits instead of 14 in order to be able to
encode the greater range of output coefficients.

The extended quantizer range is only used when
configured with the following options:
--enable-vp9-high
--enable-high-transforms
--enable-high-quant
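
The token change above, reduced to its essential constant; the flag name is illustrative:

/* Category-6 tokens carry the residue of very large coefficients in raw
 * extra bits, so the wider coefficient range needs more of them. */
static int cat6_extra_bits(int extended_quant_range) {
  return extended_quant_range ? 18 : 14;  /* per this commit */
}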

Change-Id: I58d2981676d67b65cc022e98cf443603d38ba6ff
2014-06-18 13:36:36 +01:00
Debargha Mukherjee
db55558e0b Merge "Allow encoding 10 bit input at 12 bitdepth" into highbitdepth 2014-06-17 09:43:37 -07:00
Peter de Rivaz
4c017c00e6 Allow encoding 10 bit input at 12 bitdepth
Change-Id: I85c3a08e97e738237527762d323ce22522ded304
2014-06-16 10:58:37 +01:00
Deb Mukherjee
20e745e8e4 Cosmetic cleanups
Use bit_depth_to_bps() inline function for consistency

Change-Id: Id79c8a82d40eab8fd87526a165837f7618e33993
2014-06-12 17:13:24 -07:00
Debargha Mukherjee
e0305995f3 Merge "Corrected highbitdepth aq variance" into highbitdepth 2014-06-12 08:31:32 -07:00
Peter de Rivaz
7ea0334f9a Corrected highbitdepth aq variance
Change-Id: Idfa10a8f92e2a4bd4c75cda68f3b800f119a4b1e
2014-06-12 13:44:23 +01:00
Peter de Rivaz
65d2615208 Fixed overflow in highbitdepth deblocking
Change-Id: Ia9f592e9ca3a1b8414c6bb39541a7fbdada4702a
2014-06-12 13:35:10 +01:00
Deb Mukherjee
9b33e1088f Adding error checking for 422/444 inputs.
For 422 and 444 inputs, adds checks for profile.

Change-Id: I1d8e1120d4214101ba9c27b81d4381dc61b22de5
2014-06-11 11:43:58 -07:00
Deb Mukherjee
8b72b71c1c Merge "Reworks high-bit-depth profiles" into highbitdepth 2014-06-11 06:04:14 -07:00
Deb Mukherjee
093a32ffd7 Reworks high-bit-depth profiles
Splits profile 2 into profiles 2 and 3, where profile 2
only supports 420 sampling, while profile 3 adds 422/444 and
alpha. Keeps room for further expansion.

Also makes some minor changes in the decoder parameters,
replacing --convert-to-8bit with --output-bit-depth.
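
A sketch of the resulting profile mapping, consistent with this message and the header-change commit above; the function name is illustrative:

/* Profiles 2 and 3 are the high bitdepth counterparts of 0 and 1; the
 * odd-numbered profiles carry the non-420 samplings. */
static int vp9_profile_for(int bit_depth, int subsampling_x,
                           int subsampling_y) {
  const int is_420 = (subsampling_x == 1 && subsampling_y == 1);
  if (bit_depth > 8) return is_420 ? 2 : 3;
  return is_420 ? 0 : 1;
}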

Change-Id: I713525880512de6c36698d212795db1543c1d0dd
2014-06-10 17:30:45 -07:00
Peter de Rivaz
321bd42060 Improved highbitdepth RDO
Change-Id: I5bc6e94d9f3f64b2467f357da0d097347ad5f0c6
2014-06-10 16:52:31 -07:00
Deb Mukherjee
f08489e609 Better block error computation
Shift is applied after high-precision arithmetic rather than
before.
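
A sketch of the reordering, shaped like the vp9_highbd_block_error functions exercised by the new error_block_test.cc later on this page; the rounding detail is an assumption:

#include <stdint.h>

typedef int32_t tran_low_t;

/* Accumulate squared differences at 64-bit precision first, then apply
 * the bit-depth downshift once at the end, instead of shifting each term
 * before summation. */
static int64_t highbd_block_error_sketch(const tran_low_t *coeff,
                                         const tran_low_t *dqcoeff,
                                         intptr_t block_size, int64_t *ssz,
                                         int bd) {
  const int shift = 2 * (bd - 8);
  const int64_t rounding = shift > 0 ? (int64_t)1 << (shift - 1) : 0;
  int64_t error = 0, sqcoeff = 0;
  intptr_t i;
  for (i = 0; i < block_size; ++i) {
    const int64_t diff = coeff[i] - dqcoeff[i];
    error += diff * diff;
    sqcoeff += (int64_t)coeff[i] * coeff[i];
  }
  *ssz = (sqcoeff + rounding) >> shift;  /* shift after the full-precision sum */
  return (error + rounding) >> shift;
}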

Change-Id: Ibd178fe8d10600935f6d5e790e89f3b2f8b4afcf
2014-06-09 17:41:42 -07:00
Peter de Rivaz
4320ac26ee Fixed overflow in high transforms
Added extremal forward and inverse tests
for the 8x8 DCT.

Change-Id: I5445c6449b0a9bda1359072617b915446510db69
2014-06-09 09:53:45 +01:00
Deb Mukherjee
e91d29dea3 Alternate rounding
Improves performance on derf by 0.89% for 10-bit internal
and by 0.55% for 12-bit internal, both for 8-bit sources.

Change-Id: I181fd9fb10e2259233d67cdd7933fb3cae334afc
2014-06-06 05:10:31 -07:00
Peter de Rivaz
091829d376 Scaled deblocking limits for highbitdepth
Slightly improves PSNR for some sequences.
Also fixes a bug.

Change-Id: Ibc4c2f5f5c280470c99dad642d153bd91cd90798
2014-06-05 03:22:28 -07:00
Peter de Rivaz
666fd1300c Added high precision transforms
The high precision transforms are only used if
configured with --enable-high-transforms.

It gives greater precision in the transform.
This gives PSNR improvements when encoding
true 10 and 12 bit streams.

At the moment, the quantizer used is shifted
up by 2/4 for 10/12 bits so that the quantized
coefficients fit in the current token range.
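
The shift described above, as a one-line helper (name illustrative):

static int shifted_quant(int q8, int bit_depth) {
  return q8 << (bit_depth - 8);  /* << 2 for 10-bit, << 4 for 12-bit */
}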

Change-Id: Ia9c19a417cf030b8a7a889fcb3f5788bfca8215f
2014-06-05 01:09:32 -07:00
Debargha Mukherjee
47354ee2f4 Merge "Fixed C postproc implementation" into highbitdepth 2014-06-04 11:23:40 -07:00
Debargha Mukherjee
0c95fcc25c Merge "Corrected highbitdepth temporal filter" into highbitdepth 2014-06-04 11:22:45 -07:00
Peter de Rivaz
3224fd3c66 Fixed C postproc implementation
Destination stride was not used correctly.

Change-Id: I503d037608fdcde3f433a87e103426ad4a6b9ef4
2014-06-04 17:28:24 +01:00
Peter de Rivaz
822c27cd42 Corrected highbitdepth temporal filter
Change-Id: Ic9aa3672ce0fe257133938b8e3a93f28d3cbb877
2014-06-04 16:07:24 +01:00
Peter de Rivaz
42a1a3e3ba Fix temporal filter for 422
Change-Id: I8f2fb8b8860010f0460a200c31edec4a17c773d8
2014-06-04 13:17:46 +01:00
Deb Mukherjee
51790ab228 Some code cleanups
Removes duplicate enums and other cosmetic changes.

Change-Id: Ic8b47534ac3b2b554a79ff1437fbe5f0503a5732
2014-06-03 16:20:23 -07:00
Peter de Rivaz
1ff621ec99 Added support for 10/12bit y4m
vpxenc now accepts high bitdepth y4m files
vpxdec now produces high bitdepth y4m files
(Only if the bitstream is high bitdepth)
Can force 8bit output via --convert-to-8bit

Change-Id: Ife9fc40772aceed32ba47d7ca81024ed09563721
2014-06-03 17:46:06 +01:00
Peter de Rivaz
b7649e15c2 Improved highbitdepth temporal filter
Increasing the strength when filtering
frames for highbitdepth gives a PSNR gain.

Change-Id: Iee2f776c1ae7b8da2ca4d5e58bb11fd0be2483f4
2014-06-03 14:22:43 +01:00
Peter de Rivaz
a79f06696d Changed rounding in high bitdepth
Doing the rounding before the subtraction
gives PSNR improvements.

Change-Id: Ic6fd16a9df9b9fdc2a4fa4140e8fa0994bd261a0
2014-06-03 14:07:10 +01:00
Peter de Rivaz
2baec56312 Tidied high bitdepth variance
Refactored code to match current style on master.
Also fixed a bug where some sse results were
not being shifted for high bitdepth.
Also increased internal bitdepth for variance to
avoid saturation.
Also added rounding for variance when computing
shifted results.
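
A sketch of the shift-with-rounding part of this tidy-up; the helper name is an assumption:

#include <stdint.h>

/* sse measured on b-bit samples scales by 2^(2*(b-8)) relative to 8-bit,
 * so the 8-bit-equivalent result is shifted down with rounding; the raw
 * accumulation itself is kept wide (uint64_t) to avoid saturation. */
static uint32_t sse_to_8bit_scale(uint64_t sse, int bd) {
  const int shift = 2 * (bd - 8);
  if (shift == 0) return (uint32_t)sse;
  return (uint32_t)((sse + ((uint64_t)1 << (shift - 1))) >> shift);
}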

Change-Id: I322bbc1b9abe82c8ef72ab97991720240ddf755c
2014-06-03 12:47:17 +01:00
Deb Mukherjee
eb863b46f3 Reworks PSNR/SSIM to work on source bit-depth
Change-Id: Ifcd31186b67a57d57abd112d64d163c7b76728e9
2014-05-29 17:52:28 -07:00
Deb Mukherjee
5a2a78117f Adds input bit depth parameter and refactoring
Adds a placeholder input bit depth parameter. Also
implements the bit-depth parameter and the new parameter
directly using config parameters rather than controls,
which makes them more convenient to handle.

Change-Id: Ie5bdc2d8eb5627d7a5f520b3d241aac5395dcf3d
2014-05-27 22:34:30 -07:00
Peter de Rivaz
cab30216a5 Merge branch 'master' into highbitdepth
Merges from master (May 26) to highbitdepth.

Change-Id: I553888a7b169b48e7bea07325d1127627a8f944e
2014-05-27 09:53:35 -07:00
Peter de Rivaz
efd115c415 Merge commit '9e7b09bc' into highbitdepth
Change-Id: I0376c867e7abfa7713ac6e7a4e604c8384fff58b
2014-05-23 14:06:31 -07:00
Deb Mukherjee
edd1fa0487 Rename test-high-internal to test-16bit-internal
More intuitive parameter naming.

Change-Id: Ie99ed54f5e832aa4c3893612c396b2b78722e275
2014-05-18 05:38:26 -07:00
Deb Mukherjee
747f0e3b8e Deprecates --input-shift parameter
Deprecates the --input-shift parameter and instead derives it from
bit-depth, assuming an 8-bit input source. Eventually this needs to be
derived from the input-bit-depth parameter once we support high
bit depth input. Another parameter, --test-high-internal, is added
to force the use of 16-bit internal buffers for testing purposes in
profiles 0 and 1.

Also --bit-depth parameter now uses values 8/10/12 which is more
intuitive than 0/1/2.

Also includes some cleanups.
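
The derivation described above, as a sketch (function name illustrative):

static int derived_input_shift(int bit_depth) {
  /* assuming an 8-bit source: 0 for 8-bit, 2 for 10-bit, 4 for 12-bit */
  return bit_depth - 8;
}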

Change-Id: I0bdd6d9caae8bb339d217551bb35a001057805ec
2014-05-16 22:32:22 -07:00
Debargha Mukherjee
094f0024c3 Merge changes Ida70ca48,Ieb2945bb into highbitdepth
* changes:
  Added rounding when using --output-shift
  Changed --output-shift option
2014-05-15 14:36:17 -07:00
Debargha Mukherjee
018173cf91 Merge "Fixed temporal filter rounding" into highbitdepth 2014-05-15 14:35:50 -07:00
Peter de Rivaz
87a571a34e Fixed temporal filter rounding
The temporal filter goes wrong
when strength is equal to 0.

See Issue 787.

Change-Id: I983c5983c34359ca78743a2434fb536c3a9b3e72
2014-05-15 16:23:55 +01:00
Peter de Rivaz
e037f0fa0c Fixed --test-decode for high bitdepth
Change-Id: I02b72dafad80a22ff0546a64c4a0fe757ac66861
2014-05-15 16:13:11 +01:00
Peter de Rivaz
4cba43ac3d Added rounding when using --output-shift
PSNR values are slightly improved by using
rounding when shifting images back to 8bit.
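
A sketch of the per-sample conversion with the rounding added; the clip and the name are assumptions:

#include <stdint.h>

/* Add half a step before shifting instead of truncating (shift >= 1);
 * rounding can reach 256, so the result is clipped to 8 bits. */
static uint8_t downshift_with_rounding(uint16_t v, int shift) {
  const unsigned out = (v + (1u << (shift - 1))) >> shift;
  return (uint8_t)(out > 255 ? 255 : out);
}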

Change-Id: Ida70ca48588f933a92f906cd1ebc8e88134c07f5
2014-05-14 13:41:13 -07:00
Peter de Rivaz
aa2d8ca7e2 Changed --output-shift option
--output-shift now infers the amount to shift
from the bitdepth of the decoded file.

Option has been renamed to
--convert-to-8bit

Change-Id: Ieb2945bb282bb81c52a0c4f1b691c82bec5ec18c
2014-05-14 21:32:35 +01:00
Peter de Rivaz
c4a5ef1ced Added rounding to highbitdepth subtraction
This improves encoding performance for high
bitrate sequences when using 10 or 12bit.

Change-Id: I358d30a69251d58589c075b7d52c0d9ae76b26ee
2014-05-12 11:54:09 +01:00
Peter de Rivaz
99e1518a16 Added high bitdepth decoder scaling support
Change-Id: Id88da51d7f200ff347658140be3b5f6fe2d78121
2014-05-12 10:54:47 +01:00
Peter de Rivaz
83e566029c Changed debug function to also work with 16bit YUV
Change-Id: I14cd322d6e360dcd6499e0dc9cfdf44d9f6336e8
2014-05-12 10:52:48 +01:00
Peter de Rivaz
2535202902 Decoder can now output 16bit YUV
Change-Id: I9c390030571388fe4e9a463a3ee959364c9c7386
2014-05-12 10:49:36 +01:00
Peter de Rivaz
a9ed7c4434 Added high bitdepth support for decoder postproc
Change-Id: Id1fdf5c3c75b6b92a144f06284a7e0051345e60b
2014-05-12 10:46:44 +01:00
Peter de Rivaz
6956f43a90 Fixed bug in scale_and_extend_frame for 16bit
When configured in high bitdepth mode,
scale_and_extend_frame was always calling the
16bit code instead of dynamically switching.

Change-Id: I0542398d214a091d0740615689c786026aacedd6
2014-05-08 15:09:07 +01:00
Peter de Rivaz
9d427884e9 Fixed libyuv image copy for high bitdepth
When scaling and extending frames in high bitdepth,
there is a special case when no scaling is required.
This triggers a code path that calls CopyPlane16.
In this case the high bitdepth code was not
copying enough bytes.
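
A sketch of the corrected copy; strides here are in 16-bit elements, and the name is illustrative:

#include <stdint.h>
#include <string.h>

/* A plane of 16-bit samples needs width * 2 bytes per row; the bug copied
 * only width bytes. */
static void copy_plane_16(const uint16_t *src, int src_stride,
                          uint16_t *dst, int dst_stride,
                          int width, int height) {
  int y;
  for (y = 0; y < height; ++y) {
    memcpy(dst, src, (size_t)width * sizeof(*src));
    src += src_stride;
    dst += dst_stride;
  }
}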

Change-Id: I0fe2dc667ca8d7b0d03c0290a5716d53309c8198
2014-05-08 15:04:51 +01:00
Peter de Rivaz
ed352156c9 Added missing 8-bit image formats to vpx_image
This is needed for the high bitdepth work because
we use vpx_image to allocate an 8-bit output buffer.

Change-Id: I5404a628e9d207bf0a3a94bdb514611c836b68bd
2014-05-06 05:12:10 -07:00
Peter de Rivaz
df64b3d04a Added high bitdepth resize plane function.
Change-Id: I9b9d95fbed19f711a54a22d3181aca109a69f8f3
2014-05-06 05:12:00 -07:00
Peter de Rivaz
11f75a26c1 Use 16bit scaling function in vpxenc.
Change-Id: Ib8b25b558c0a0c1bc87863f592ea043cf65cc792
2014-05-06 05:11:51 -07:00
Peter de Rivaz
81758337d5 Added high bitdepth ssim functions.
Change-Id: Ib52d6882e1b9e58cd41d2771258a7a8c959730ec
2014-05-06 13:10:19 +01:00
Peter de Rivaz
4e016f6e21 Added call to high bitdepth sse in partition fn.
Change-Id: I647522cf1a633ec9a9db12f163c75f572081ecb7
2014-05-06 02:13:27 -07:00
Peter de Rivaz
22fbe77710 Corrected block_variance computation for high bitdepth.
Change-Id: I84ffe759b2a32ba208f915684a7a75f7f78ffa0b
2014-05-06 02:13:19 -07:00
Peter de Rivaz
d26ae35646 Fixed PSNR calculation for high bitdepth.
Change-Id: I7b60bd8b1c7a67aabf152f2f6f0c1ff5a7fbb43c
2014-05-06 02:12:48 -07:00
Peter de Rivaz
0b8ae49a05 Add control to read bit_depth
This is needed to calculate high bitdepth PSNR in vpxenc.c.

Change-Id: I60cea0e0a263e33ee1a8706517131c4b9fa1aafd
2014-05-06 02:12:21 -07:00
Peter de Rivaz
d868780eb9 Added high bitdepth support to postprocessing deblock and denoise.
Change-Id: I68d5521349dde2bc1832562cfd6f879966b8fcf1
2014-05-06 10:10:47 +01:00
Peter de Rivaz
bc0bc688c2 Added 16bit scaling functions to libyuv
This is an unoptimized C implementation of a 16bit
scaling function.

Change-Id: I4241442dde3cbf347988c555776a5cdd0189bb4d
2014-05-01 12:05:39 +01:00
Peter de Rivaz
a242def2aa Adding sad function generation macros
Tidying up the high bitdepth SAD implementation to match
the corresponding change 69783 on master.

Change-Id: I9c415a996a3ff237b2d25c57ad874284a45793fc
2014-05-01 11:49:44 +01:00
Peter de Rivaz
a0c772e381 Fix SAD adjustment to stop overrunning array
Change-Id: Ib98f5688abc80ebbaa6512f9b052b40640507744
2014-05-01 11:34:14 +01:00
Peter de Rivaz
d3e62b846b Template macros to generate subpix variance functions.
Tidying up the high bitdepth variance implementation to match
the corresponding change 69840 on master.

Change-Id: I213d28950e63cef7b9664639bc266f6a6a99c5f5
2014-05-01 10:23:12 +01:00
Peter de Rivaz
bdd7f74c3f Added initial support for 16-bit framebuffers.
Changes in this patch are only enabled if configured with
--enable-experimental --enable-vp9_high

Using an encoder command-line argument of --input-shift=0 tells the coder
to work with 16-bit framebuffers.
The output should be identical to before. Some features (such as input
image resizing) are not yet supported in 16bit mode.

Specifically, the behavior of the input-shift parameter is as follows:
* No argument : Behaviour as before, using 8bit frame buffers
* --experimental-bitstream --profile=2 --input-shift=0: Uses
  16bit frame buffers to store 8-bit data, should give identical output
  to before.
* --experimental-bitstream --profile=2 --input-shift=2 --bit-depth=1: Uses
  16bit frame buffers to store 10-bit data, encodes a version 2 stream
  with bitdepth 10
* --experimental-bitstream --profile=2 --input-shift=4 --bit-depth=2: Uses
  16bit frame buffers to store 12-bit data, encodes a version 2 stream
  with bitdepth 12

The decoder has an --output-shift argument which should be used when
decoding profile 2 streams.

So far support for the following has been added:
Intra filtering
Deblocking
Motion compensation
Variance calculation
Sad calculation
Transform

Change-Id: If345c88234aafdd40caea0d88935b1f07aaebe22
2014-04-28 22:11:19 -07:00
35 changed files with 9424 additions and 2644 deletions

test/dct16x16_test.cc

@@ -264,6 +264,8 @@ typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
typedef std::tr1::tuple<FdctFunc, IdctFunc, int, vpx_bit_depth_t> Dct16x16Param;
typedef std::tr1::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t> Ht16x16Param;
typedef std::tr1::tuple<IdctFunc, IdctFunc, int, vpx_bit_depth_t>
Idct16x16Param;
void fdct16x16_ref(const int16_t *in, tran_low_t *out, int stride,
int /*tx_type*/) {
@@ -311,6 +313,32 @@ void iht16x16_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
void iht16x16_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
vp9_highbd_iht16x16_256_add_c(in, out, stride, tx_type, 12);
}
void idct16x16_10_add_10_c(const tran_low_t *in, uint8_t *out, int stride) {
vp9_highbd_idct16x16_10_add_c(in, out, stride, 10);
}
void idct16x16_10_add_12_c(const tran_low_t *in, uint8_t *out, int stride) {
vp9_highbd_idct16x16_10_add_c(in, out, stride, 12);
}
#if HAVE_SSE2
void idct16x16_256_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
vp9_highbd_idct16x16_256_add_sse2(in, out, stride, 10);
}
void idct16x16_256_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
vp9_highbd_idct16x16_256_add_sse2(in, out, stride, 12);
}
void idct16x16_10_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
vp9_highbd_idct16x16_10_add_sse2(in, out, stride, 10);
}
void idct16x16_10_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
vp9_highbd_idct16x16_10_add_sse2(in, out, stride, 12);
}
#endif
#endif
class Trans16x16TestBase {
@@ -540,7 +568,7 @@ class Trans16x16TestBase {
reference_16x16_dct_2d(in, out_r);
for (int j = 0; j < kNumCoeffs; ++j)
coeff[j] = round(out_r[j]);
coeff[j] = static_cast<tran_low_t>(round(out_r[j]));
if (bit_depth_ == VPX_BITS_8) {
ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, 16));
@@ -565,6 +593,62 @@ class Trans16x16TestBase {
}
}
}
void CompareInvReference(IdctFunc ref_txfm, int thresh) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
const int count_test_block = 10000;
const int eob = 10;
const int16_t *scan = vp9_default_scan_orders[TX_16X16].scan;
DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, uint8_t, ref, kNumCoeffs);
#if CONFIG_VP9_HIGHBITDEPTH
DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, uint16_t, ref16, kNumCoeffs);
#endif
for (int i = 0; i < count_test_block; ++i) {
for (int j = 0; j < kNumCoeffs; ++j) {
if (j < eob) {
// Random values less than the threshold, either positive or negative
coeff[scan[j]] = rnd(thresh) * (1-2*(i%2));
} else {
coeff[scan[j]] = 0;
}
if (bit_depth_ == VPX_BITS_8) {
dst[j] = 0;
ref[j] = 0;
#if CONFIG_VP9_HIGHBITDEPTH
} else {
dst16[j] = 0;
ref16[j] = 0;
#endif
}
}
if (bit_depth_ == VPX_BITS_8) {
ref_txfm(coeff, ref, pitch_);
ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
} else {
#if CONFIG_VP9_HIGHBITDEPTH
ref_txfm(coeff, CONVERT_TO_BYTEPTR(ref16), pitch_);
ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16),
pitch_));
#endif
}
for (int j = 0; j < kNumCoeffs; ++j) {
#if CONFIG_VP9_HIGHBITDEPTH
const uint32_t diff =
bit_depth_ == VPX_BITS_8 ? dst[j] - ref[j] : dst16[j] - ref16[j];
#else
const uint32_t diff = dst[j] - ref[j];
#endif
const uint32_t error = diff * diff;
EXPECT_EQ(0u, error)
<< "Error: 16x16 IDCT Comparison has error " << error
<< " at index " << j;
}
}
}
int pitch_;
int tx_type_;
vpx_bit_depth_t bit_depth_;
@@ -590,10 +674,10 @@ class Trans16x16DCT
mask_ = (1 << bit_depth_) - 1;
#if CONFIG_VP9_HIGHBITDEPTH
switch (bit_depth_) {
case 10:
case VPX_BITS_10:
inv_txfm_ref = idct16x16_10_ref;
break;
case 12:
case VPX_BITS_12:
inv_txfm_ref = idct16x16_12_ref;
break;
default:
@@ -703,6 +787,37 @@ TEST_P(Trans16x16HT, QuantCheck) {
RunQuantCheck(429, 729);
}
class InvTrans16x16DCT
: public Trans16x16TestBase,
public ::testing::TestWithParam<Idct16x16Param> {
public:
virtual ~InvTrans16x16DCT() {}
virtual void SetUp() {
ref_txfm_ = GET_PARAM(0);
inv_txfm_ = GET_PARAM(1);
thresh_ = GET_PARAM(2);
bit_depth_ = GET_PARAM(3);
pitch_ = 16;
mask_ = (1 << bit_depth_) - 1;
}
virtual void TearDown() { libvpx_test::ClearSystemState(); }
protected:
void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {}
void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
inv_txfm_(out, dst, stride);
}
IdctFunc ref_txfm_;
IdctFunc inv_txfm_;
int thresh_;
};
TEST_P(InvTrans16x16DCT, CompareReference) {
CompareInvReference(ref_txfm_, thresh_);
}
using std::tr1::make_tuple;
#if CONFIG_VP9_HIGHBITDEPTH
@@ -772,6 +887,51 @@ INSTANTIATE_TEST_CASE_P(
VPX_BITS_8)));
#endif
#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(
SSE2, Trans16x16DCT,
::testing::Values(
make_tuple(&vp9_highbd_fdct16x16_sse2,
&idct16x16_10, 0, VPX_BITS_10),
make_tuple(&vp9_highbd_fdct16x16_c,
&idct16x16_256_add_10_sse2, 0, VPX_BITS_10),
make_tuple(&vp9_highbd_fdct16x16_sse2,
&idct16x16_12, 0, VPX_BITS_12),
make_tuple(&vp9_highbd_fdct16x16_c,
&idct16x16_256_add_12_sse2, 0, VPX_BITS_12),
make_tuple(&vp9_fdct16x16_sse2,
&vp9_idct16x16_256_add_c, 0, VPX_BITS_8)));
INSTANTIATE_TEST_CASE_P(
SSE2, Trans16x16HT,
::testing::Values(
make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_10, 0, VPX_BITS_10),
make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_10, 1, VPX_BITS_10),
make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_10, 2, VPX_BITS_10),
make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_10, 3, VPX_BITS_10),
make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_12, 0, VPX_BITS_12),
make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_12, 1, VPX_BITS_12),
make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_12, 2, VPX_BITS_12),
make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_12, 3, VPX_BITS_12),
make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_c, 0, VPX_BITS_8),
make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8),
make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8),
make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_c, 3,
VPX_BITS_8)));
// Optimizations take effect at a threshold of 3155, so we use a value close to
// that to test both branches.
INSTANTIATE_TEST_CASE_P(
SSE2, InvTrans16x16DCT,
::testing::Values(
make_tuple(&idct16x16_10_add_10_c,
&idct16x16_10_add_10_sse2, 3167, VPX_BITS_10),
make_tuple(&idct16x16_10,
&idct16x16_256_add_10_sse2, 3167, VPX_BITS_10),
make_tuple(&idct16x16_10_add_12_c,
&idct16x16_10_add_12_sse2, 3167, VPX_BITS_12),
make_tuple(&idct16x16_12,
&idct16x16_256_add_12_sse2, 3167, VPX_BITS_12)));
#endif
#if HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(
SSSE3, Trans16x16DCT,

test/dct32x32_test.cc

@@ -79,6 +79,10 @@ typedef std::tr1::tuple<FwdTxfmFunc, InvTxfmFunc, int, vpx_bit_depth_t>
Trans32x32Param;
#if CONFIG_VP9_HIGHBITDEPTH
void idct32x32_8(const tran_low_t *in, uint8_t *out, int stride) {
vp9_highbd_idct32x32_1024_add_c(in, out, stride, 8);
}
void idct32x32_10(const tran_low_t *in, uint8_t *out, int stride) {
vp9_highbd_idct32x32_1024_add_c(in, out, stride, 10);
}
@@ -114,7 +118,7 @@ TEST_P(Trans32x32Test, AccuracyCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
uint32_t max_error = 0;
int64_t total_error = 0;
const int count_test_block = 1000;
const int count_test_block = 10000;
DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, test_temp_block, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
@@ -127,7 +131,7 @@ TEST_P(Trans32x32Test, AccuracyCheck) {
for (int i = 0; i < count_test_block; ++i) {
// Initialize a test block with input range [-mask_, mask_].
for (int j = 0; j < kNumCoeffs; ++j) {
if (bit_depth_ == 8) {
if (bit_depth_ == VPX_BITS_8) {
src[j] = rnd.Rand8();
dst[j] = rnd.Rand8();
test_input_block[j] = src[j] - dst[j];
@@ -282,7 +286,7 @@ TEST_P(Trans32x32Test, InverseAccuracy) {
reference_32x32_dct_2d(in, out_r);
for (int j = 0; j < kNumCoeffs; ++j)
coeff[j] = round(out_r[j]);
coeff[j] = static_cast<tran_low_t>(round(out_r[j]));
if (bit_depth_ == VPX_BITS_8) {
ASM_REGISTER_STATE_CHECK(inv_txfm_(coeff, dst, 32));
#if CONFIG_VP9_HIGHBITDEPTH
@@ -353,6 +357,22 @@ INSTANTIATE_TEST_CASE_P(
&vp9_idct32x32_1024_add_sse2, 1, VPX_BITS_8)));
#endif
#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(
SSE2, Trans32x32Test,
::testing::Values(
make_tuple(&vp9_highbd_fdct32x32_sse2, &idct32x32_10, 0, VPX_BITS_10),
make_tuple(&vp9_highbd_fdct32x32_rd_sse2, &idct32x32_10, 1,
VPX_BITS_10),
make_tuple(&vp9_highbd_fdct32x32_sse2, &idct32x32_12, 0, VPX_BITS_12),
make_tuple(&vp9_highbd_fdct32x32_rd_sse2, &idct32x32_12, 1,
VPX_BITS_12),
make_tuple(&vp9_fdct32x32_sse2, &vp9_idct32x32_1024_add_c, 0,
VPX_BITS_8),
make_tuple(&vp9_fdct32x32_rd_sse2, &vp9_idct32x32_1024_add_c, 1,
VPX_BITS_8)));
#endif
#if HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(
AVX2, Trans32x32Test,

test/error_block_test.cc (new file, 146 lines)

@@ -0,0 +1,146 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <cmath>
#include <cstdlib>
#include <string>
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "test/acm_random.h"
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
#include "test/util.h"
#include "./vpx_config.h"
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_entropy.h"
#include "vpx/vpx_integer.h"
using libvpx_test::ACMRandom;
namespace {
#if CONFIG_VP9_HIGHBITDEPTH
const int number_of_iterations = 1000;
typedef int64_t (*ErrorBlockFunc)(const tran_low_t *coeff,
const tran_low_t *dqcoeff, intptr_t block_size,
int64_t *ssz, int bps);
typedef std::tr1::tuple<ErrorBlockFunc, ErrorBlockFunc, vpx_bit_depth_t>
ErrorBlockParam;
class ErrorBlockTest
: public ::testing::TestWithParam<ErrorBlockParam> {
public:
virtual ~ErrorBlockTest() {}
virtual void SetUp() {
error_block_op_ = GET_PARAM(0);
ref_error_block_op_ = GET_PARAM(1);
bit_depth_ = GET_PARAM(2);
}
virtual void TearDown() { libvpx_test::ClearSystemState(); }
protected:
vpx_bit_depth_t bit_depth_;
ErrorBlockFunc error_block_op_;
ErrorBlockFunc ref_error_block_op_;
};
TEST_P(ErrorBlockTest, OperationCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff, 4096);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, dqcoeff, 4096);
int err_count_total = 0;
int first_failure = -1;
intptr_t block_size;
int64_t ssz;
int64_t ret;
int64_t ref_ssz;
int64_t ref_ret;
for (int i = 0; i < number_of_iterations; ++i) {
int err_count = 0;
block_size = 16 << (i % 9); // All block sizes from 4x4, 8x4 ..64x64
for (int j = 0; j < block_size; j++) {
coeff[j] = rnd(2<<20)-(1<<20);
dqcoeff[j] = rnd(2<<20)-(1<<20);
}
ref_ret = ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz,
bit_depth_);
ASM_REGISTER_STATE_CHECK(ret = error_block_op_(coeff, dqcoeff, block_size,
&ssz, bit_depth_));
err_count += (ref_ret != ret) | (ref_ssz != ssz);
if (err_count && !err_count_total) {
first_failure = i;
}
err_count_total += err_count;
}
EXPECT_EQ(0, err_count_total)
<< "Error: Error Block Test, C output doesn't match SSE2 output. "
<< "First failed at test case " << first_failure;
}
TEST_P(ErrorBlockTest, ExtremeValues) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff, 4096);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, dqcoeff, 4096);
int err_count_total = 0;
int first_failure = -1;
intptr_t block_size;
int64_t ssz;
int64_t ret;
int64_t ref_ssz;
int64_t ref_ret;
int max_val = ((1<<20)-1);
for (int i = 0; i < number_of_iterations; ++i) {
int err_count = 0;
int k = (i / 9) % 5;
// Change the maximum coeff value, to test different bit boundaries
if ( k == 4 && (i % 9) == 0 ) {
max_val >>= 1;
}
block_size = 16 << (i % 9); // All block sizes from 4x4, 8x4 ..64x64
for (int j = 0; j < block_size; j++) {
if (k < 4) { // Test at maximum values
coeff[j] = k % 2 ? max_val : -max_val;
dqcoeff[j] = (k >> 1) % 2 ? max_val : -max_val;
} else {
coeff[j] = rnd(2 << 14) - (1 << 14);
dqcoeff[j] = rnd(2 << 14) - (1 << 14);
}
}
ref_ret = ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz,
bit_depth_);
ASM_REGISTER_STATE_CHECK(ret = error_block_op_(coeff, dqcoeff, block_size,
&ssz, bit_depth_));
err_count += (ref_ret != ret) | (ref_ssz != ssz);
if (err_count && !err_count_total) {
first_failure = i;
}
err_count_total += err_count;
}
EXPECT_EQ(0, err_count_total)
<< "Error: Error Block Test, C output doesn't match SSE2 output. "
<< "First failed at test case " << first_failure;
}
using std::tr1::make_tuple;
#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P(
SSE2_C_COMPARE, ErrorBlockTest,
::testing::Values(
make_tuple(&vp9_highbd_block_error_sse2,
&vp9_highbd_block_error_c, VPX_BITS_10),
make_tuple(&vp9_highbd_block_error_sse2,
&vp9_highbd_block_error_c, VPX_BITS_12),
make_tuple(&vp9_highbd_block_error_sse2,
&vp9_highbd_block_error_c, VPX_BITS_8)));
#endif // HAVE_SSE2
#endif // CONFIG_VP9_HIGHBITDEPTH
} // namespace

test/fdct4x4_test.cc

@@ -75,6 +75,16 @@ void iwht4x4_10(const tran_low_t *in, uint8_t *out, int stride) {
void iwht4x4_12(const tran_low_t *in, uint8_t *out, int stride) {
vp9_highbd_iwht4x4_16_add_c(in, out, stride, 12);
}
#if HAVE_SSE2
void idct4x4_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
vp9_highbd_idct4x4_16_add_sse2(in, out, stride, 10);
}
void idct4x4_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
vp9_highbd_idct4x4_16_add_sse2(in, out, stride, 12);
}
#endif
#endif
class Trans4x4TestBase {
@@ -496,4 +506,31 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 3, VPX_BITS_8)));
#endif
#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(
SSE2, Trans4x4DCT,
::testing::Values(
make_tuple(&vp9_highbd_fdct4x4_c, &idct4x4_10_sse2, 0, VPX_BITS_10),
make_tuple(&vp9_highbd_fdct4x4_sse2, &idct4x4_10_sse2, 0, VPX_BITS_10),
make_tuple(&vp9_highbd_fdct4x4_c, &idct4x4_12_sse2, 0, VPX_BITS_12),
make_tuple(&vp9_highbd_fdct4x4_sse2, &idct4x4_12_sse2, 0, VPX_BITS_12),
make_tuple(&vp9_fdct4x4_sse2, &vp9_idct4x4_16_add_c, 0,
VPX_BITS_8)));
INSTANTIATE_TEST_CASE_P(
SSE2, Trans4x4HT,
::testing::Values(
make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_10, 0, VPX_BITS_10),
make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_10, 1, VPX_BITS_10),
make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_10, 2, VPX_BITS_10),
make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_10, 3, VPX_BITS_10),
make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_12, 0, VPX_BITS_12),
make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_12, 1, VPX_BITS_12),
make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_12, 2, VPX_BITS_12),
make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_12, 3, VPX_BITS_12),
make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8),
make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8),
make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8),
make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8)));
#endif
} // namespace

test/fdct8x8_test.cc

@@ -71,6 +71,7 @@ typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
typedef std::tr1::tuple<FdctFunc, IdctFunc, int, vpx_bit_depth_t> Dct8x8Param;
typedef std::tr1::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t> Ht8x8Param;
typedef std::tr1::tuple<IdctFunc, IdctFunc, int, vpx_bit_depth_t> Idct8x8Param;
void fdct8x8_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) {
vp9_fdct8x8_c(in, out, stride);
@@ -96,6 +97,32 @@ void iht8x8_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
void iht8x8_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
vp9_highbd_iht8x8_64_add_c(in, out, stride, tx_type, 12);
}
void idct8x8_10_add_10_c(const tran_low_t *in, uint8_t *out, int stride) {
vp9_highbd_idct8x8_10_add_c(in, out, stride, 10);
}
void idct8x8_10_add_12_c(const tran_low_t *in, uint8_t *out, int stride) {
vp9_highbd_idct8x8_10_add_c(in, out, stride, 12);
}
#if HAVE_SSE2
void idct8x8_10_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
vp9_highbd_idct8x8_10_add_sse2(in, out, stride, 10);
}
void idct8x8_10_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
vp9_highbd_idct8x8_10_add_sse2(in, out, stride, 12);
}
void idct8x8_64_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
vp9_highbd_idct8x8_64_add_sse2(in, out, stride, 10);
}
void idct8x8_64_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
vp9_highbd_idct8x8_64_add_sse2(in, out, stride, 12);
}
#endif
#endif
class FwdTrans8x8TestBase {
@@ -146,9 +173,10 @@ class FwdTrans8x8TestBase {
memset(count_sign_block, 0, sizeof(count_sign_block));
for (int i = 0; i < count_test_block; ++i) {
// Initialize a test block with input range [-15, 15].
// Initialize a test block with input range [-mask_/16, mask_/16].
for (int j = 0; j < 64; ++j)
test_input_block[j] = (rnd.Rand8() >> 4) - (rnd.Rand8() >> 4);
test_input_block[j] = ((rnd.Rand16() & mask_) >> 4) -
((rnd.Rand16() & mask_) >> 4);
ASM_REGISTER_STATE_CHECK(
RunFwdTxfm(test_input_block, test_output_block, pitch_));
@@ -188,7 +216,7 @@ class FwdTrans8x8TestBase {
#endif
for (int i = 0; i < count_test_block; ++i) {
// Initialize a test block with input range [-255, 255].
// Initialize a test block with input range [-mask_, mask_].
for (int j = 0; j < 64; ++j) {
if (bit_depth_ == VPX_BITS_8) {
src[j] = rnd.Rand8();
@@ -427,6 +455,63 @@ class FwdTrans8x8TestBase {
}
}
}
void CompareInvReference(IdctFunc ref_txfm, int thresh) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
const int count_test_block = 10000;
const int eob = 12;
DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, uint8_t, ref, kNumCoeffs);
#if CONFIG_VP9_HIGHBITDEPTH
DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, uint16_t, ref16, kNumCoeffs);
#endif
const int16_t *scan = vp9_default_scan_orders[TX_8X8].scan;
for (int i = 0; i < count_test_block; ++i) {
for (int j = 0; j < kNumCoeffs; ++j) {
if (j < eob) {
// Random values less than the threshold, either positive or negative
coeff[scan[j]] = rnd(thresh) * (1-2*(i%2));
} else {
coeff[scan[j]] = 0;
}
if (bit_depth_ == VPX_BITS_8) {
dst[j] = 0;
ref[j] = 0;
#if CONFIG_VP9_HIGHBITDEPTH
} else {
dst16[j] = 0;
ref16[j] = 0;
#endif
}
}
if (bit_depth_ == VPX_BITS_8) {
ref_txfm(coeff, ref, pitch_);
ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
#if CONFIG_VP9_HIGHBITDEPTH
} else {
ref_txfm(coeff, CONVERT_TO_BYTEPTR(ref16), pitch_);
ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16),
pitch_));
#endif
}
for (int j = 0; j < kNumCoeffs; ++j) {
#if CONFIG_VP9_HIGHBITDEPTH
const uint32_t diff =
bit_depth_ == VPX_BITS_8 ? dst[j] - ref[j] : dst16[j] - ref16[j];
#else
const uint32_t diff = dst[j] - ref[j];
#endif
const uint32_t error = diff * diff;
EXPECT_EQ(0u, error)
<< "Error: 8x8 IDCT has error " << error
<< " at index " << j;
}
}
}
int pitch_;
int tx_type_;
FhtFunc fwd_txfm_ref;
@@ -526,6 +611,38 @@ TEST_P(FwdTrans8x8HT, ExtremalCheck) {
RunExtremalCheck();
}
class InvTrans8x8DCT
: public FwdTrans8x8TestBase,
public ::testing::TestWithParam<Idct8x8Param> {
public:
virtual ~InvTrans8x8DCT() {}
virtual void SetUp() {
ref_txfm_ = GET_PARAM(0);
inv_txfm_ = GET_PARAM(1);
thresh_ = GET_PARAM(2);
pitch_ = 8;
bit_depth_ = GET_PARAM(3);
mask_ = (1 << bit_depth_) - 1;
}
virtual void TearDown() { libvpx_test::ClearSystemState(); }
protected:
void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
inv_txfm_(out, dst, stride);
}
void RunFwdTxfm(int16_t *out, tran_low_t *dst, int stride) {}
IdctFunc ref_txfm_;
IdctFunc inv_txfm_;
int thresh_;
};
TEST_P(InvTrans8x8DCT, CompareReference) {
CompareInvReference(ref_txfm_, thresh_);
}
using std::tr1::make_tuple;
#if CONFIG_VP9_HIGHBITDEPTH
@@ -598,6 +715,45 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 3, VPX_BITS_8)));
#endif
#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(
SSE2, FwdTrans8x8DCT,
::testing::Values(
make_tuple(&vp9_highbd_fdct8x8_c,
&idct8x8_64_add_10_sse2, 12, VPX_BITS_10),
make_tuple(&vp9_highbd_fdct8x8_sse2,
&idct8x8_64_add_10_sse2, 12, VPX_BITS_10),
make_tuple(&vp9_highbd_fdct8x8_c,
&idct8x8_64_add_12_sse2, 12, VPX_BITS_12),
make_tuple(&vp9_highbd_fdct8x8_sse2,
&idct8x8_64_add_12_sse2, 12, VPX_BITS_12),
make_tuple(&vp9_fdct8x8_sse2, &vp9_idct8x8_64_add_c, 0, VPX_BITS_8)));
INSTANTIATE_TEST_CASE_P(
SSE2, FwdTrans8x8HT,
::testing::Values(
make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 0, VPX_BITS_8),
make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 1, VPX_BITS_8),
make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 2, VPX_BITS_8),
make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 3, VPX_BITS_8)));
// Optimizations take effect at a threshold of 6201, so we use a value close to
// that to test both branches.
INSTANTIATE_TEST_CASE_P(
SSE2, InvTrans8x8DCT,
::testing::Values(
make_tuple(&idct8x8_10_add_10_c,
&idct8x8_10_add_10_sse2, 6225, VPX_BITS_10),
make_tuple(&idct8x8_10,
&idct8x8_64_add_10_sse2, 6225, VPX_BITS_10),
make_tuple(&idct8x8_10_add_12_c,
&idct8x8_10_add_12_sse2, 6225, VPX_BITS_12),
make_tuple(&idct8x8_12,
&idct8x8_64_add_12_sse2, 6225, VPX_BITS_12)));
#endif
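The idct8x8_*_add_* names in the InvTrans8x8DCT tuples are thin wrappers that bind a fixed bit depth onto the four-argument highbitdepth functions so they match the IdctFunc signature. A minimal sketch of one such wrapper (the real definitions live earlier in this test file; the exact names here are assumptions for illustration):

// Hypothetical wrapper: fixes bd = 10 so the highbitdepth SSE2 inverse
// transform can be called through the three-argument IdctFunc pointer.
void idct8x8_10_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
  vp9_highbd_idct8x8_10_add_sse2(in, out, stride, 10);
}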
#if HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_VP9_HIGHBITDEPTH && \
!CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(


@@ -23,6 +23,8 @@
#include "vp9/common/vp9_entropy.h"
#include "vpx/vpx_integer.h"
#define MAX_LOOP_FILTER 63
using libvpx_test::ACMRandom;
namespace {
@@ -51,8 +53,9 @@ typedef void (*dual_loop_op_t)(uint8_t *s, int p, const uint8_t *blimit0,
const uint8_t *thresh1);
#endif // CONFIG_VP9_HIGHBITDEPTH
typedef std::tr1::tuple<loop_op_t, loop_op_t, int> loop8_param_t;
typedef std::tr1::tuple<dual_loop_op_t, dual_loop_op_t, int> dualloop8_param_t;
typedef std::tr1::tuple<loop_op_t, loop_op_t, vpx_bit_depth_t> loop8_param_t;
typedef std::tr1::tuple<dual_loop_op_t, dual_loop_op_t,
vpx_bit_depth_t> dualloop8_param_t;
#if HAVE_SSE2
#if CONFIG_VP9_HIGHBITDEPTH
@@ -119,7 +122,7 @@ class Loop8Test6Param : public ::testing::TestWithParam<loop8_param_t> {
virtual void TearDown() { libvpx_test::ClearSystemState(); }
protected:
int bit_depth_;
vpx_bit_depth_t bit_depth_;
int mask_;
loop_op_t loopfilter_op_;
loop_op_t ref_loopfilter_op_;
@@ -138,7 +141,7 @@ class Loop8Test9Param : public ::testing::TestWithParam<dualloop8_param_t> {
virtual void TearDown() { libvpx_test::ClearSystemState(); }
protected:
int bit_depth_;
vpx_bit_depth_t bit_depth_;
int mask_;
dual_loop_op_t loopfilter_op_;
dual_loop_op_t ref_loopfilter_op_;
@@ -148,7 +151,7 @@ TEST_P(Loop8Test6Param, OperationCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
const int count_test_block = number_of_iterations;
#if CONFIG_VP9_HIGHBITDEPTH
int32_t bd = bit_depth_;
vpx_bit_depth_t bd = bit_depth_;
DECLARE_ALIGNED_ARRAY(16, uint16_t, s, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_s, kNumCoeffs);
#else
@@ -160,11 +163,18 @@ TEST_P(Loop8Test6Param, OperationCheck) {
for (int i = 0; i < count_test_block; ++i) {
int err_count = 0;
uint8_t tmp = rnd.Rand8();
// mblim <= 3 * MAX_LOOP_FILTER + 4
while (tmp > 3 * MAX_LOOP_FILTER + 4) {
tmp = rnd.Rand8();
}
DECLARE_ALIGNED(16, const uint8_t, blimit[16]) = {
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
};
tmp = rnd.Rand8();
while (tmp > MAX_LOOP_FILTER) { // lim <= MAX_LOOP_FILTER
tmp = rnd.Rand8();
}
DECLARE_ALIGNED(16, const uint8_t, limit[16]) = {
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
@@ -211,7 +221,7 @@ TEST_P(Loop8Test6Param, OperationCheck) {
ASM_REGISTER_STATE_CHECK(
loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count, bd));
#else
ref_loopfilter_op_(ref_s+8+p*8, p, blimit, limit, thresh, count);
ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh, count);
ASM_REGISTER_STATE_CHECK(
loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count));
#endif // CONFIG_VP9_HIGHBITDEPTH
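The draw-and-reject clamping above repeats for every blimit/limit/thresh operand in these tests. A helper capturing the pattern might look like the sketch below (hypothetical; the tests keep the pattern inline because the operand arrays are const locals):

// Hypothetical helper: rejection-samples a byte no larger than max_val and
// replicates it across a 16-byte SIMD operand.
static void fill_clamped(ACMRandom *rnd, uint8_t max_val, uint8_t vec[16]) {
  uint8_t tmp = rnd->Rand8();
  while (tmp > max_val) tmp = rnd->Rand8();
  memset(vec, tmp, 16);  // assumes <string.h>/<cstring> is available
}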
@@ -234,7 +244,7 @@ TEST_P(Loop8Test6Param, ValueCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
const int count_test_block = number_of_iterations;
#if CONFIG_VP9_HIGHBITDEPTH
const int32_t bd = bit_depth_;
vpx_bit_depth_t bd = bit_depth_;
DECLARE_ALIGNED_ARRAY(16, uint16_t, s, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_s, kNumCoeffs);
#else
@@ -246,11 +256,17 @@ TEST_P(Loop8Test6Param, ValueCheck) {
for (int i = 0; i < count_test_block; ++i) {
int err_count = 0;
uint8_t tmp = rnd.Rand8();
while (tmp > 3 * MAX_LOOP_FILTER + 4) { // mblim <= 3 * MAX_LOOP_FILTER + 4
tmp = rnd.Rand8();
}
DECLARE_ALIGNED(16, const uint8_t, blimit[16]) = {
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
};
tmp = rnd.Rand8();
while (tmp > MAX_LOOP_FILTER) { // lim <= MAX_LOOP_FILTER
tmp = rnd.Rand8();
}
DECLARE_ALIGNED(16, const uint8_t, limit[16]) = {
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
@@ -271,7 +287,7 @@ TEST_P(Loop8Test6Param, ValueCheck) {
ASM_REGISTER_STATE_CHECK(
loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count, bd));
#else
ref_loopfilter_op_(ref_s+8+p*8, p, blimit, limit, thresh, count);
ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh, count);
ASM_REGISTER_STATE_CHECK(
loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count));
#endif // CONFIG_VP9_HIGHBITDEPTH
@@ -293,7 +309,7 @@ TEST_P(Loop8Test9Param, OperationCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
const int count_test_block = number_of_iterations;
#if CONFIG_VP9_HIGHBITDEPTH
const int32_t bd = bit_depth_;
vpx_bit_depth_t bd = bit_depth_;
DECLARE_ALIGNED_ARRAY(16, uint16_t, s, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_s, kNumCoeffs);
#else
@@ -305,11 +321,19 @@ TEST_P(Loop8Test9Param, OperationCheck) {
for (int i = 0; i < count_test_block; ++i) {
int err_count = 0;
uint8_t tmp = rnd.Rand8();
// mblim <= 3 * MAX_LOOP_FILTER + 4
while (tmp > 3 * MAX_LOOP_FILTER + 4) {
tmp = rnd.Rand8();
}
DECLARE_ALIGNED(16, const uint8_t, blimit0[16]) = {
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
};
tmp = rnd.Rand8();
// lim <= MAX_LOOP_FILTER
while (tmp > MAX_LOOP_FILTER) {
tmp = rnd.Rand8();
}
DECLARE_ALIGNED(16, const uint8_t, limit0[16]) = {
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
@@ -320,11 +344,18 @@ TEST_P(Loop8Test9Param, OperationCheck) {
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
};
tmp = rnd.Rand8();
// mblim <= 3 * MAX_LOOP_FILTER + 4
while (tmp > 3 * MAX_LOOP_FILTER + 4) {
tmp = rnd.Rand8();
}
DECLARE_ALIGNED(16, const uint8_t, blimit1[16]) = {
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
};
tmp = rnd.Rand8();
while (tmp > MAX_LOOP_FILTER) { // lim <= MAX_LOOP_FILTER
tmp = rnd.Rand8();
}
DECLARE_ALIGNED(16, const uint8_t, limit1[16]) = {
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
@@ -407,11 +438,18 @@ TEST_P(Loop8Test9Param, ValueCheck) {
for (int i = 0; i < count_test_block; ++i) {
int err_count = 0;
uint8_t tmp = rnd.Rand8();
// mblim <= 3 * MAX_LOOP_FILTER + 4
while (tmp > 3 * MAX_LOOP_FILTER + 4) {
tmp = rnd.Rand8();
}
DECLARE_ALIGNED(16, const uint8_t, blimit0[16]) = {
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
};
tmp = rnd.Rand8();
while (tmp > MAX_LOOP_FILTER) { // lim <= MAX_LOOP_FILTER
tmp = rnd.Rand8();
}
DECLARE_ALIGNED(16, const uint8_t, limit0[16]) = {
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
@@ -422,11 +460,17 @@ TEST_P(Loop8Test9Param, ValueCheck) {
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
};
tmp = rnd.Rand8();
while (tmp > 3 * MAX_LOOP_FILTER + 4) { // mblim <= 3 * MAX_LOOP_FILTER + 4
tmp = rnd.Rand8();
}
DECLARE_ALIGNED(16, const uint8_t, blimit1[16]) = {
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
};
tmp = rnd.Rand8();
while (tmp > MAX_LOOP_FILTER) { // lim <= MAX_LOOP_FILTER
tmp = rnd.Rand8();
}
DECLARE_ALIGNED(16, const uint8_t, limit1[16]) = {
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
@@ -442,7 +486,7 @@ TEST_P(Loop8Test9Param, ValueCheck) {
ref_s[j] = s[j];
}
#if CONFIG_VP9_HIGHBITDEPTH
const int32_t bd = bit_depth_;
vpx_bit_depth_t bd = bit_depth_;
ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit0, limit0, thresh0,
blimit1, limit1, thresh1, bd);
ASM_REGISTER_STATE_CHECK(
@@ -477,48 +521,51 @@ INSTANTIATE_TEST_CASE_P(
SSE2_C_COMPARE_SINGLE, Loop8Test6Param,
::testing::Values(
make_tuple(&vp9_highbd_lpf_horizontal_4_sse2,
&vp9_highbd_lpf_horizontal_4_c, 8),
&vp9_highbd_lpf_horizontal_4_c, VPX_BITS_8),
make_tuple(&vp9_highbd_lpf_vertical_4_sse2,
&vp9_highbd_lpf_vertical_4_c, 8),
&vp9_highbd_lpf_vertical_4_c, VPX_BITS_8),
make_tuple(&vp9_highbd_lpf_horizontal_8_sse2,
&vp9_highbd_lpf_horizontal_8_c, 8),
&vp9_highbd_lpf_horizontal_8_c, VPX_BITS_8),
make_tuple(&vp9_highbd_lpf_horizontal_16_sse2,
&vp9_highbd_lpf_horizontal_16_c, 8),
&vp9_highbd_lpf_horizontal_16_c, VPX_BITS_8),
make_tuple(&vp9_highbd_lpf_vertical_8_sse2,
&vp9_highbd_lpf_vertical_8_c, 8),
&vp9_highbd_lpf_vertical_8_c, VPX_BITS_8),
make_tuple(&wrapper_vertical_16_sse2,
&wrapper_vertical_16_c, 8),
&wrapper_vertical_16_c, VPX_BITS_8),
make_tuple(&vp9_highbd_lpf_horizontal_4_sse2,
&vp9_highbd_lpf_horizontal_4_c, 10),
&vp9_highbd_lpf_horizontal_4_c, VPX_BITS_10),
make_tuple(&vp9_highbd_lpf_vertical_4_sse2,
&vp9_highbd_lpf_vertical_4_c, 10),
&vp9_highbd_lpf_vertical_4_c, VPX_BITS_10),
make_tuple(&vp9_highbd_lpf_horizontal_8_sse2,
&vp9_highbd_lpf_horizontal_8_c, 10),
&vp9_highbd_lpf_horizontal_8_c, VPX_BITS_10),
make_tuple(&vp9_highbd_lpf_horizontal_16_sse2,
&vp9_highbd_lpf_horizontal_16_c, 10),
&vp9_highbd_lpf_horizontal_16_c, VPX_BITS_10),
make_tuple(&vp9_highbd_lpf_vertical_8_sse2,
&vp9_highbd_lpf_vertical_8_c, 10),
&vp9_highbd_lpf_vertical_8_c, VPX_BITS_10),
make_tuple(&wrapper_vertical_16_sse2,
&wrapper_vertical_16_c, 10),
&wrapper_vertical_16_c, VPX_BITS_10),
make_tuple(&vp9_highbd_lpf_horizontal_4_sse2,
&vp9_highbd_lpf_horizontal_4_c, 12),
&vp9_highbd_lpf_horizontal_4_c, VPX_BITS_12),
make_tuple(&vp9_highbd_lpf_vertical_4_sse2,
&vp9_highbd_lpf_vertical_4_c, 12),
&vp9_highbd_lpf_vertical_4_c, VPX_BITS_12),
make_tuple(&vp9_highbd_lpf_horizontal_8_sse2,
&vp9_highbd_lpf_horizontal_8_c, 12),
&vp9_highbd_lpf_horizontal_8_c, VPX_BITS_12),
make_tuple(&vp9_highbd_lpf_horizontal_16_sse2,
&vp9_highbd_lpf_horizontal_16_c, 12),
&vp9_highbd_lpf_horizontal_16_c, VPX_BITS_12),
make_tuple(&vp9_highbd_lpf_vertical_8_sse2,
&vp9_highbd_lpf_vertical_8_c, 12),
&vp9_highbd_lpf_vertical_8_c, VPX_BITS_12),
make_tuple(&wrapper_vertical_16_sse2,
&wrapper_vertical_16_c, 12)));
&wrapper_vertical_16_c, VPX_BITS_12)));
#else
INSTANTIATE_TEST_CASE_P(
SSE2_C_COMPARE_SINGLE, Loop8Test6Param,
::testing::Values(
make_tuple(&vp9_lpf_horizontal_8_sse2, &vp9_lpf_horizontal_8_c, 8),
make_tuple(&vp9_lpf_horizontal_16_sse2, &vp9_lpf_horizontal_16_c, 8),
make_tuple(&vp9_lpf_vertical_8_sse2, &vp9_lpf_vertical_8_c, 8)));
make_tuple(&vp9_lpf_horizontal_8_sse2, &vp9_lpf_horizontal_8_c,
VPX_BITS_8),
make_tuple(&vp9_lpf_horizontal_16_sse2, &vp9_lpf_horizontal_16_c,
VPX_BITS_8),
make_tuple(&vp9_lpf_vertical_8_sse2, &vp9_lpf_vertical_8_c,
VPX_BITS_8)));
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif
@@ -528,60 +575,61 @@ INSTANTIATE_TEST_CASE_P(
SSE2_C_COMPARE_DUAL, Loop8Test6Param,
::testing::Values(
make_tuple(&wrapper_vertical_16_dual_sse2,
&wrapper_vertical_16_dual_c, 8),
&wrapper_vertical_16_dual_c, VPX_BITS_8),
make_tuple(&wrapper_vertical_16_dual_sse2,
&wrapper_vertical_16_dual_c, 10),
&wrapper_vertical_16_dual_c, VPX_BITS_10),
make_tuple(&wrapper_vertical_16_dual_sse2,
&wrapper_vertical_16_dual_c, 12)));
&wrapper_vertical_16_dual_c, VPX_BITS_12)));
#else
INSTANTIATE_TEST_CASE_P(
SSE2_C_COMPARE_DUAL, Loop8Test6Param,
::testing::Values(
make_tuple(&wrapper_vertical_16_sse2, &wrapper_vertical_16_c, 8)));
make_tuple(&wrapper_vertical_16_sse2, &wrapper_vertical_16_c,
VPX_BITS_8)));
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // HAVE_SSE2
#if HAVE_SSE2
#if CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(
SSE_C_COMPARE_DUAL, Loop8Test9Param,
SSE2_C_COMPARE_DUAL, Loop8Test9Param,
::testing::Values(
make_tuple(&vp9_highbd_lpf_horizontal_4_dual_sse2,
&vp9_highbd_lpf_horizontal_4_dual_c, 8),
&vp9_highbd_lpf_horizontal_4_dual_c, VPX_BITS_8),
make_tuple(&vp9_highbd_lpf_horizontal_8_dual_sse2,
&vp9_highbd_lpf_horizontal_8_dual_c, 8),
&vp9_highbd_lpf_horizontal_8_dual_c, VPX_BITS_8),
make_tuple(&vp9_highbd_lpf_vertical_4_dual_sse2,
&vp9_highbd_lpf_vertical_4_dual_c, 8),
&vp9_highbd_lpf_vertical_4_dual_c, VPX_BITS_8),
make_tuple(&vp9_highbd_lpf_vertical_8_dual_sse2,
&vp9_highbd_lpf_vertical_8_dual_c, 8),
&vp9_highbd_lpf_vertical_8_dual_c, VPX_BITS_8),
make_tuple(&vp9_highbd_lpf_horizontal_4_dual_sse2,
&vp9_highbd_lpf_horizontal_4_dual_c, 10),
&vp9_highbd_lpf_horizontal_4_dual_c, VPX_BITS_10),
make_tuple(&vp9_highbd_lpf_horizontal_8_dual_sse2,
&vp9_highbd_lpf_horizontal_8_dual_c, 10),
&vp9_highbd_lpf_horizontal_8_dual_c, VPX_BITS_10),
make_tuple(&vp9_highbd_lpf_vertical_4_dual_sse2,
&vp9_highbd_lpf_vertical_4_dual_c, 10),
&vp9_highbd_lpf_vertical_4_dual_c, VPX_BITS_10),
make_tuple(&vp9_highbd_lpf_vertical_8_dual_sse2,
&vp9_highbd_lpf_vertical_8_dual_c, 10),
&vp9_highbd_lpf_vertical_8_dual_c, VPX_BITS_10),
make_tuple(&vp9_highbd_lpf_horizontal_4_dual_sse2,
&vp9_highbd_lpf_horizontal_4_dual_c, 12),
&vp9_highbd_lpf_horizontal_4_dual_c, VPX_BITS_12),
make_tuple(&vp9_highbd_lpf_horizontal_8_dual_sse2,
&vp9_highbd_lpf_horizontal_8_dual_c, 12),
&vp9_highbd_lpf_horizontal_8_dual_c, VPX_BITS_12),
make_tuple(&vp9_highbd_lpf_vertical_4_dual_sse2,
&vp9_highbd_lpf_vertical_4_dual_c, 12),
&vp9_highbd_lpf_vertical_4_dual_c, VPX_BITS_12),
make_tuple(&vp9_highbd_lpf_vertical_8_dual_sse2,
&vp9_highbd_lpf_vertical_8_dual_c, 12)));
&vp9_highbd_lpf_vertical_8_dual_c, VPX_BITS_12)));
#else
INSTANTIATE_TEST_CASE_P(
SSE_C_COMPARE_DUAL, Loop8Test9Param,
SSE2_C_COMPARE_DUAL, Loop8Test9Param,
::testing::Values(
make_tuple(&vp9_lpf_horizontal_4_dual_sse2,
&vp9_lpf_horizontal_4_dual_c, 8),
&vp9_lpf_horizontal_4_dual_c, VPX_BITS_8),
make_tuple(&vp9_lpf_horizontal_8_dual_sse2,
&vp9_lpf_horizontal_8_dual_c, 8),
&vp9_lpf_horizontal_8_dual_c, VPX_BITS_8),
make_tuple(&vp9_lpf_vertical_4_dual_sse2,
&vp9_lpf_vertical_4_dual_c, 8),
&vp9_lpf_vertical_4_dual_c, VPX_BITS_8),
make_tuple(&vp9_lpf_vertical_8_dual_sse2,
&vp9_lpf_vertical_8_dual_c, 8)));
&vp9_lpf_vertical_8_dual_c, VPX_BITS_8)));
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif
} // namespace

test/quantize_test.cc (new file)

@@ -0,0 +1,353 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "test/acm_random.h"
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
#include "test/util.h"
#include "./vpx_config.h"
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_entropy.h"
#include "vpx/vpx_integer.h"
using libvpx_test::ACMRandom;
namespace {
#if CONFIG_VP9_HIGHBITDEPTH
const int number_of_iterations = 100;
typedef void (*QuantizeFunc)(const tran_low_t *coeff, intptr_t count,
int skip_block, const int16_t *zbin,
const int16_t *round, const int16_t *quant,
const int16_t *quant_shift,
tran_low_t *qcoeff, tran_low_t *dqcoeff,
const int16_t *dequant, int zbin_oq_value,
uint16_t *eob, const int16_t *scan,
const int16_t *iscan);
typedef std::tr1::tuple<QuantizeFunc, QuantizeFunc, vpx_bit_depth_t>
QuantizeParam;
class QuantizeTest : public ::testing::TestWithParam<QuantizeParam> {
public:
virtual ~QuantizeTest() {}
virtual void SetUp() {
quantize_op_ = GET_PARAM(0);
ref_quantize_op_ = GET_PARAM(1);
bit_depth_ = GET_PARAM(2);
mask_ = (1 << bit_depth_) - 1;
}
virtual void TearDown() { libvpx_test::ClearSystemState(); }
protected:
vpx_bit_depth_t bit_depth_;
int mask_;
QuantizeFunc quantize_op_;
QuantizeFunc ref_quantize_op_;
};
class Quantize32Test : public ::testing::TestWithParam<QuantizeParam> {
public:
virtual ~Quantize32Test() {}
virtual void SetUp() {
quantize_op_ = GET_PARAM(0);
ref_quantize_op_ = GET_PARAM(1);
bit_depth_ = GET_PARAM(2);
mask_ = (1 << bit_depth_) - 1;
}
virtual void TearDown() { libvpx_test::ClearSystemState(); }
protected:
vpx_bit_depth_t bit_depth_;
int mask_;
QuantizeFunc quantize_op_;
QuantizeFunc ref_quantize_op_;
};
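In both fixtures, mask_ = (1 << bit_depth_) - 1 confines the random coefficients to the nominal range of the configured depth, relying on VPX_BITS_8/10/12 having the numeric values 8/10/12. For illustration only (not part of the test):

// Masks produced by SetUp() for each supported depth.
// VPX_BITS_8  -> (1 << 8)  - 1 = 0x0FF
// VPX_BITS_10 -> (1 << 10) - 1 = 0x3FF
// VPX_BITS_12 -> (1 << 12) - 1 = 0xFFF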
TEST_P(QuantizeTest, OperationCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
int zbin_oq_value = 0;
DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff_ptr, 256);
DECLARE_ALIGNED_ARRAY(16, int16_t, zbin_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, int16_t, round_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, int16_t, quant_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, int16_t, quant_shift_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, qcoeff_ptr, 256);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, dqcoeff_ptr, 256);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_qcoeff_ptr, 256);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_dqcoeff_ptr, 256);
DECLARE_ALIGNED_ARRAY(16, int16_t, dequant_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, uint16_t, eob_ptr, 1);
DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_eob_ptr, 1);
int err_count_total = 0;
int first_failure = -1;
for (int i = 0; i < number_of_iterations; ++i) {
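// The first iteration (i == 0) exercises the skip_block path of both
// implementations.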
int skip_block = i == 0;
TX_SIZE sz = (TX_SIZE)(i % 3);  // TX_4X4, TX_8X8, TX_16X16
TX_TYPE tx_type = (TX_TYPE)((i >> 2) % 3);
const scan_order *scan_order = &vp9_scan_orders[sz][tx_type];
int count = (4 << sz) * (4 << sz); // 16, 64, 256
int err_count = 0;
*eob_ptr = rnd.Rand16();
*ref_eob_ptr = *eob_ptr;
for (int j = 0; j < count; j++) {
coeff_ptr[j] = rnd.Rand16() & mask_;
}
for (int j = 0; j < 2; j++) {
zbin_ptr[j] = rnd.Rand16() & mask_;
round_ptr[j] = rnd.Rand16();
quant_ptr[j] = rnd.Rand16();
quant_shift_ptr[j] = rnd.Rand16();
dequant_ptr[j] = rnd.Rand16();
}
ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr,
quant_ptr, quant_shift_ptr, ref_qcoeff_ptr,
ref_dqcoeff_ptr, dequant_ptr, zbin_oq_value,
ref_eob_ptr, scan_order->scan, scan_order->iscan);
ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_ptr, count, skip_block,
zbin_ptr, round_ptr, quant_ptr,
quant_shift_ptr, qcoeff_ptr,
dqcoeff_ptr, dequant_ptr,
zbin_oq_value, eob_ptr,
scan_order->scan, scan_order->iscan));
for (int j = 0; j < count; ++j) {
err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) |
(ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
}
err_count += (*ref_eob_ptr != *eob_ptr);
if (err_count && !err_count_total) {
first_failure = i;
}
err_count_total += err_count;
}
EXPECT_EQ(0, err_count_total)
<< "Error: Quantization Test, C output doesn't match SSE2 output. "
<< "First failed at test case " << first_failure;
}
TEST_P(Quantize32Test, OperationCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
int zbin_oq_value = 0;
DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff_ptr, 1024);
DECLARE_ALIGNED_ARRAY(16, int16_t, zbin_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, int16_t, round_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, int16_t, quant_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, int16_t, quant_shift_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, qcoeff_ptr, 1024);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, dqcoeff_ptr, 1024);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_qcoeff_ptr, 1024);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_dqcoeff_ptr, 1024);
DECLARE_ALIGNED_ARRAY(16, int16_t, dequant_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, uint16_t, eob_ptr, 1);
DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_eob_ptr, 1);
int err_count_total = 0;
int first_failure = -1;
for (int i = 0; i < number_of_iterations; ++i) {
int skip_block = i == 0;
TX_SIZE sz = TX_32X32;
TX_TYPE tx_type = (TX_TYPE)(i % 4);
const scan_order *scan_order = &vp9_scan_orders[sz][tx_type];
int count = (4 << sz) * (4 << sz); // 1024
int err_count = 0;
*eob_ptr = rnd.Rand16();
*ref_eob_ptr = *eob_ptr;
for (int j = 0; j < count; j++) {
coeff_ptr[j] = rnd.Rand16() & mask_;
}
for (int j = 0; j < 2; j++) {
zbin_ptr[j] = rnd.Rand16() & mask_;
round_ptr[j] = rnd.Rand16();
quant_ptr[j] = rnd.Rand16();
quant_shift_ptr[j] = rnd.Rand16();
dequant_ptr[j] = rnd.Rand16();
}
ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr,
quant_ptr, quant_shift_ptr, ref_qcoeff_ptr,
ref_dqcoeff_ptr, dequant_ptr, zbin_oq_value,
ref_eob_ptr, scan_order->scan, scan_order->iscan);
ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_ptr, count, skip_block,
zbin_ptr, round_ptr, quant_ptr,
quant_shift_ptr, qcoeff_ptr,
dqcoeff_ptr, dequant_ptr,
zbin_oq_value, eob_ptr,
scan_order->scan, scan_order->iscan));
for (int j = 0; j < count; ++j) {
err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) |
(ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
}
err_count += (*ref_eob_ptr != *eob_ptr);
if (err_count && !err_count_total) {
first_failure = i;
}
err_count_total += err_count;
}
EXPECT_EQ(0, err_count_total)
<< "Error: Quantization Test, C output doesn't match SSE2 output. "
<< "First failed at test case " << first_failure;
}
TEST_P(QuantizeTest, EOBCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
int zbin_oq_value = 0;
DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff_ptr, 256);
DECLARE_ALIGNED_ARRAY(16, int16_t, zbin_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, int16_t, round_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, int16_t, quant_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, int16_t, quant_shift_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, qcoeff_ptr, 256);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, dqcoeff_ptr, 256);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_qcoeff_ptr, 256);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_dqcoeff_ptr, 256);
DECLARE_ALIGNED_ARRAY(16, int16_t, dequant_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, uint16_t, eob_ptr, 1);
DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_eob_ptr, 1);
int err_count_total = 0;
int first_failure = -1;
for (int i = 0; i < number_of_iterations; ++i) {
int skip_block = i == 0;
TX_SIZE sz = (TX_SIZE)(i % 3);  // TX_4X4, TX_8X8, TX_16X16
TX_TYPE tx_type = (TX_TYPE)((i >> 2) % 3);
const scan_order *scan_order = &vp9_scan_orders[sz][tx_type];
int count = (4 << sz) * (4 << sz); // 16, 64, 256
int err_count = 0;
*eob_ptr = rnd.Rand16();
*ref_eob_ptr = *eob_ptr;
for (int j = 0; j < count; j++) {
coeff_ptr[j] = 0;
}
// Two random entries
coeff_ptr[rnd(count)] = rnd.Rand16() & mask_;
coeff_ptr[rnd(count)] = rnd.Rand16() & mask_;
for (int j = 0; j < 2; j++) {
zbin_ptr[j] = rnd.Rand16() & mask_;
round_ptr[j] = rnd.Rand16();
quant_ptr[j] = rnd.Rand16();
quant_shift_ptr[j] = rnd.Rand16();
dequant_ptr[j] = rnd.Rand16();
}
ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr,
quant_ptr, quant_shift_ptr, ref_qcoeff_ptr,
ref_dqcoeff_ptr, dequant_ptr, zbin_oq_value,
ref_eob_ptr, scan_order->scan, scan_order->iscan);
ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_ptr, count, skip_block,
zbin_ptr, round_ptr, quant_ptr,
quant_shift_ptr, qcoeff_ptr,
dqcoeff_ptr, dequant_ptr,
zbin_oq_value, eob_ptr,
scan_order->scan, scan_order->iscan));
for (int j = 0; j < count; ++j) {
err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) |
(ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
}
err_count += (*ref_eob_ptr != *eob_ptr);
if (err_count && !err_count_total) {
first_failure = i;
}
err_count_total += err_count;
}
EXPECT_EQ(0, err_count_total)
<< "Error: Quantization Test, C output doesn't match SSE2 output. "
<< "First failed at test case " << first_failure;
}
TEST_P(Quantize32Test, EOBCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
int zbin_oq_value = 0;
DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff_ptr, 1024);
DECLARE_ALIGNED_ARRAY(16, int16_t, zbin_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, int16_t, round_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, int16_t, quant_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, int16_t, quant_shift_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, qcoeff_ptr, 1024);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, dqcoeff_ptr, 1024);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_qcoeff_ptr, 1024);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_dqcoeff_ptr, 1024);
DECLARE_ALIGNED_ARRAY(16, int16_t, dequant_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, uint16_t, eob_ptr, 1);
DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_eob_ptr, 1);
int err_count_total = 0;
int first_failure = -1;
for (int i = 0; i < number_of_iterations; ++i) {
int skip_block = i == 0;
TX_SIZE sz = TX_32X32;
TX_TYPE tx_type = (TX_TYPE)(i % 4);
const scan_order *scan_order = &vp9_scan_orders[sz][tx_type];
int count = (4 << sz) * (4 << sz); // 1024
int err_count = 0;
*eob_ptr = rnd.Rand16();
*ref_eob_ptr = *eob_ptr;
for (int j = 0; j < count; j++) {
coeff_ptr[j] = 0;
}
// Two random entries
coeff_ptr[rnd(count)] = rnd.Rand16() & mask_;
coeff_ptr[rnd(count)] = rnd.Rand16() & mask_;
for (int j = 0; j < 2; j++) {
zbin_ptr[j] = rnd.Rand16() & mask_;
round_ptr[j] = rnd.Rand16();
quant_ptr[j] = rnd.Rand16();
quant_shift_ptr[j] = rnd.Rand16();
dequant_ptr[j] = rnd.Rand16();
}
ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr,
quant_ptr, quant_shift_ptr, ref_qcoeff_ptr,
ref_dqcoeff_ptr, dequant_ptr, zbin_oq_value,
ref_eob_ptr, scan_order->scan, scan_order->iscan);
ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_ptr, count, skip_block,
zbin_ptr, round_ptr, quant_ptr,
quant_shift_ptr, qcoeff_ptr,
dqcoeff_ptr, dequant_ptr,
zbin_oq_value, eob_ptr,
scan_order->scan, scan_order->iscan));
for (int j = 0; j < count; ++j) {
err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) |
(ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
}
err_count += (*ref_eob_ptr != *eob_ptr);
if (err_count && !err_count_total) {
first_failure = i;
}
err_count_total += err_count;
}
EXPECT_EQ(0, err_count_total)
<< "Error: Quantization Test, C output doesn't match SSE2 output. "
<< "First failed at test case " << first_failure;
}
using std::tr1::make_tuple;
#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P(
SSE2_C_COMPARE, QuantizeTest,
::testing::Values(
make_tuple(&vp9_highbd_quantize_b_sse2,
&vp9_highbd_quantize_b_c, VPX_BITS_8),
make_tuple(&vp9_highbd_quantize_b_sse2,
&vp9_highbd_quantize_b_c, VPX_BITS_10),
make_tuple(&vp9_highbd_quantize_b_sse2,
&vp9_highbd_quantize_b_c, VPX_BITS_12)));
INSTANTIATE_TEST_CASE_P(
SSE2_C_COMPARE, Quantize32Test,
::testing::Values(
make_tuple(&vp9_highbd_quantize_b_32x32_sse2,
&vp9_highbd_quantize_b_32x32_c, VPX_BITS_8),
make_tuple(&vp9_highbd_quantize_b_32x32_sse2,
&vp9_highbd_quantize_b_32x32_c, VPX_BITS_10),
make_tuple(&vp9_highbd_quantize_b_32x32_sse2,
&vp9_highbd_quantize_b_32x32_c, VPX_BITS_12)));
#endif // HAVE_SSE2
#endif // CONFIG_VP9_HIGHBITDEPTH
} // namespace
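Once test.mk picks up the new file (see below), these cases run from the usual gtest binary with a parameterized-test filter, e.g. something like --gtest_filter='SSE2_C_COMPARE/Quantize*'; the exact binary name (commonly test_libvpx) depends on the build setup.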

File diff suppressed because it is too large

@@ -134,6 +134,8 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += lpf_8_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_avg_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += quantize_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += error_block_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9) += vp9_intrapred_test.cc
ifeq ($(CONFIG_VP9_ENCODER),yes)

File diff suppressed because it is too large

@@ -276,7 +276,7 @@ static void highbd_img_upshift(vpx_image_t *dst, vpx_image_t *src,
// Note the offset is 1 less than half.
const int offset = input_shift > 0 ? (1 << (input_shift - 1)) - 1 : 0;
int plane;
if (dst->w != src->w || dst->h != src->h ||
if (dst->d_w != src->d_w || dst->d_h != src->d_h ||
dst->x_chroma_shift != src->x_chroma_shift ||
dst->y_chroma_shift != src->y_chroma_shift ||
dst->fmt != src->fmt || input_shift < 0) {
@@ -293,12 +293,12 @@ static void highbd_img_upshift(vpx_image_t *dst, vpx_image_t *src,
break;
}
for (plane = 0; plane < 3; plane++) {
int w = src->w;
int h = src->h;
int w = src->d_w;
int h = src->d_h;
int x, y;
if (plane) {
w >>= src->x_chroma_shift;
h >>= src->y_chroma_shift;
w = (w + src->x_chroma_shift) >> src->x_chroma_shift;
h = (h + src->y_chroma_shift) >> src->y_chroma_shift;
}
for (y = 0; y < h; y++) {
uint16_t *p_src =
@@ -316,7 +316,7 @@ static void lowbd_img_upshift(vpx_image_t *dst, vpx_image_t *src,
// Note the offset is 1 less than half.
const int offset = input_shift > 0 ? (1 << (input_shift - 1)) - 1 : 0;
int plane;
if (dst->w != src->w || dst->h != src->h ||
if (dst->d_w != src->d_w || dst->d_h != src->d_h ||
dst->x_chroma_shift != src->x_chroma_shift ||
dst->y_chroma_shift != src->y_chroma_shift ||
dst->fmt != src->fmt + VPX_IMG_FMT_HIGHBITDEPTH ||
@@ -334,8 +334,8 @@ static void lowbd_img_upshift(vpx_image_t *dst, vpx_image_t *src,
break;
}
for (plane = 0; plane < 3; plane++) {
int w = src->w;
int h = src->h;
int w = src->d_w;
int h = src->d_h;
int x, y;
if (plane) {
w = (w + src->x_chroma_shift) >> src->x_chroma_shift;
@@ -384,8 +384,8 @@ void vpx_img_truncate_16_to_8(vpx_image_t *dst, vpx_image_t *src) {
int h = src->d_h;
int x, y;
if (plane) {
w >>= src->x_chroma_shift;
h >>= src->y_chroma_shift;
w = (w + src->x_chroma_shift) >> src->x_chroma_shift;
h = (h + src->y_chroma_shift) >> src->y_chroma_shift;
}
for (y = 0; y < h; y++) {
uint16_t *p_src =

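The rounding change above matters for odd-sized planes. With 4:2:0 subsampling the chroma shift is 1, and a plane of width 5 holds 3 chroma samples, which the old right shift silently truncated to 2:

// Illustration only: chroma width for w = 5, x_chroma_shift = 1.
// old: 5 >> 1       = 2  (loses the last chroma column)
// new: (5 + 1) >> 1 = 3  (rounds up; unchanged when the shift is 0)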

@@ -15,36 +15,6 @@
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_idct.h"
#if CONFIG_EMULATE_HARDWARE
// When CONFIG_EMULATE_HARDWARE is 1 the transform performs a
// non-normative method to handle overflows. A stream that causes
// overflows in the inverse transform is considered invalid in VP9,
// and a hardware implementer is free to choose any reasonable
// method to handle overflows. However to aid in hardware
// verification they can use a specific implementation of the
// WRAPLOW() macro below that is identical to their intended
// hardware implementation (and also use configure options to trigger
// the C-implementation of the transform).
//
// The particular WRAPLOW implementation below performs strict
// overflow wrapping to match common hardware implementations.
// bd of 8 uses trans_low with 16bits, need to remove 16bits
// bd of 10 uses trans_low with 18bits, need to remove 14bits
// bd of 12 uses trans_low with 20bits, need to remove 12bits
// bd of x uses trans_low with 8+x bits, need to remove 24-x bits
#define WRAPLOW(x, bd) ((((int32_t)(x)) << (24 - bd)) >> (24 - bd))
#else
#define WRAPLOW(x, bd) (x)
#endif // CONFIG_EMULATE_HARDWARE
#if CONFIG_VP9_HIGHBITDEPTH
static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans,
int bd) {
trans = WRAPLOW(trans, bd);
return clip_pixel_highbd(WRAPLOW(dest + trans, bd), bd);
}
#endif // CONFIG_VP9_HIGHBITDEPTH
static INLINE uint8_t clip_pixel_add(uint8_t dest, tran_high_t trans) {
trans = WRAPLOW(trans, 8);
return clip_pixel(WRAPLOW(dest + trans, 8));
@@ -276,10 +246,10 @@ void vp9_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
static void iadst4(const tran_low_t *input, tran_low_t *output) {
tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
tran_high_t x0 = input[0];
tran_high_t x1 = input[1];
tran_high_t x2 = input[2];
tran_high_t x3 = input[3];
tran_low_t x0 = input[0];
tran_low_t x1 = input[1];
tran_low_t x2 = input[2];
tran_low_t x3 = input[3];
if (!(x0 | x1 | x2 | x3)) {
output[0] = output[1] = output[2] = output[3] = 0;
@@ -295,24 +265,19 @@ static void iadst4(const tran_low_t *input, tran_low_t *output) {
s6 = sinpi_4_9 * x3;
s7 = x0 - x2 + x3;
x0 = s0 + s3 + s5;
x1 = s1 - s4 - s6;
x2 = sinpi_3_9 * s7;
x3 = s2;
s0 = x0 + x3;
s1 = x1 + x3;
s2 = x2;
s3 = x0 + x1 - x3;
s0 = s0 + s3 + s5;
s1 = s1 - s4 - s6;
s3 = s2;
s2 = sinpi_3_9 * s7;
// 1-D transform scaling factor is sqrt(2).
// The overall dynamic range is 14b (input) + 14b (multiplication scaling)
// + 1b (addition) = 29b.
// Hence the output bit depth is 15b.
output[0] = WRAPLOW(dct_const_round_shift(s0), 8);
output[1] = WRAPLOW(dct_const_round_shift(s1), 8);
output[0] = WRAPLOW(dct_const_round_shift(s0 + s3), 8);
output[1] = WRAPLOW(dct_const_round_shift(s1 + s3), 8);
output[2] = WRAPLOW(dct_const_round_shift(s2), 8);
output[3] = WRAPLOW(dct_const_round_shift(s3), 8);
output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3), 8);
}
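Moving the additions inside dct_const_round_shift() is what lets the x variables narrow to tran_low_t: the wide intermediate sums now exist only as tran_high_t arguments to the rounding shift. For reference, the rounding helper amounts to a divide by 2^14 with round-to-nearest; a sketch consistent with vp9_idct.h, where DCT_CONST_BITS is 14:

// Sketch of the rounding step applied to every iadst output above.
static INLINE tran_high_t dct_const_round_shift(tran_high_t input) {
  return (input + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS;
}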
void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride,
@@ -1545,7 +1510,7 @@ void vp9_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
}
}
static void highbd_idct4(const tran_low_t *input, tran_low_t *output, int bd) {
void vp9_highbd_idct4(const tran_low_t *input, tran_low_t *output, int bd) {
tran_low_t step[4];
tran_high_t temp1, temp2;
(void) bd;
@@ -1576,7 +1541,7 @@ void vp9_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
// Rows
for (i = 0; i < 4; ++i) {
highbd_idct4(input, outptr, bd);
vp9_highbd_idct4(input, outptr, bd);
input += 4;
outptr += 4;
}
@@ -1585,7 +1550,7 @@ void vp9_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
for (i = 0; i < 4; ++i) {
for (j = 0; j < 4; ++j)
temp_in[j] = out[j * 4 + i];
highbd_idct4(temp_in, temp_out, bd);
vp9_highbd_idct4(temp_in, temp_out, bd);
for (j = 0; j < 4; ++j) {
dest[j * stride + i] = highbd_clip_pixel_add(
dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
@@ -1612,7 +1577,7 @@ void vp9_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
}
}
static void highbd_idct8(const tran_low_t *input, tran_low_t *output, int bd) {
void vp9_highbd_idct8(const tran_low_t *input, tran_low_t *output, int bd) {
tran_low_t step1[8], step2[8];
tran_high_t temp1, temp2;
// stage 1
@@ -1630,7 +1595,7 @@ static void highbd_idct8(const tran_low_t *input, tran_low_t *output, int bd) {
step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
// stage 2 & stage 3 - even half
highbd_idct4(step1, step1, bd);
vp9_highbd_idct4(step1, step1, bd);
// stage 2 - odd half
step2[4] = WRAPLOW(step1[4] + step1[5], bd);
@@ -1667,7 +1632,7 @@ void vp9_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
// First transform rows.
for (i = 0; i < 8; ++i) {
highbd_idct8(input, outptr, bd);
vp9_highbd_idct8(input, outptr, bd);
input += 8;
outptr += 8;
}
@@ -1676,7 +1641,7 @@ void vp9_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
for (i = 0; i < 8; ++i) {
for (j = 0; j < 8; ++j)
temp_in[j] = out[j * 8 + i];
highbd_idct8(temp_in, temp_out, bd);
vp9_highbd_idct8(temp_in, temp_out, bd);
for (j = 0; j < 8; ++j) {
dest[j * stride + i] = highbd_clip_pixel_add(
dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
@@ -1702,10 +1667,10 @@ void vp9_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
static void highbd_iadst4(const tran_low_t *input, tran_low_t *output, int bd) {
tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
tran_high_t x0 = input[0];
tran_high_t x1 = input[1];
tran_high_t x2 = input[2];
tran_high_t x3 = input[3];
tran_low_t x0 = input[0];
tran_low_t x1 = input[1];
tran_low_t x2 = input[2];
tran_low_t x3 = input[3];
(void) bd;
if (!(x0 | x1 | x2 | x3)) {
@@ -1720,34 +1685,29 @@ static void highbd_iadst4(const tran_low_t *input, tran_low_t *output, int bd) {
s4 = sinpi_1_9 * x2;
s5 = sinpi_2_9 * x3;
s6 = sinpi_4_9 * x3;
s7 = x0 - x2 + x3;
s7 = (tran_high_t)(x0 - x2 + x3);
x0 = s0 + s3 + s5;
x1 = s1 - s4 - s6;
x2 = sinpi_3_9 * s7;
x3 = s2;
s0 = x0 + x3;
s1 = x1 + x3;
s2 = x2;
s3 = x0 + x1 - x3;
s0 = s0 + s3 + s5;
s1 = s1 - s4 - s6;
s3 = s2;
s2 = sinpi_3_9 * s7;
// 1-D transform scaling factor is sqrt(2).
// The overall dynamic range is 14b (input) + 14b (multiplication scaling)
// + 1b (addition) = 29b.
// Hence the output bit depth is 15b.
output[0] = WRAPLOW(dct_const_round_shift(s0), bd);
output[1] = WRAPLOW(dct_const_round_shift(s1), bd);
output[0] = WRAPLOW(dct_const_round_shift(s0 + s3), bd);
output[1] = WRAPLOW(dct_const_round_shift(s1 + s3), bd);
output[2] = WRAPLOW(dct_const_round_shift(s2), bd);
output[3] = WRAPLOW(dct_const_round_shift(s3), bd);
output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3), bd);
}
void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
int stride, int tx_type, int bd) {
const highbd_transform_2d IHT_4[] = {
{ highbd_idct4, highbd_idct4 }, // DCT_DCT = 0
{ highbd_iadst4, highbd_idct4 }, // ADST_DCT = 1
{ highbd_idct4, highbd_iadst4 }, // DCT_ADST = 2
{ vp9_highbd_idct4, vp9_highbd_idct4 }, // DCT_DCT = 0
{ highbd_iadst4, vp9_highbd_idct4 }, // ADST_DCT = 1
{ vp9_highbd_idct4, highbd_iadst4 }, // DCT_ADST = 2
{ highbd_iadst4, highbd_iadst4 } // ADST_ADST = 3
};
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
@@ -1779,14 +1739,14 @@ void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
static void highbd_iadst8(const tran_low_t *input, tran_low_t *output, int bd) {
tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
tran_high_t x0 = input[7];
tran_high_t x1 = input[0];
tran_high_t x2 = input[5];
tran_high_t x3 = input[2];
tran_high_t x4 = input[3];
tran_high_t x5 = input[4];
tran_high_t x6 = input[1];
tran_high_t x7 = input[6];
tran_low_t x0 = input[7];
tran_low_t x1 = input[0];
tran_low_t x2 = input[5];
tran_low_t x3 = input[2];
tran_low_t x4 = input[3];
tran_low_t x5 = input[4];
tran_low_t x6 = input[1];
tran_low_t x7 = input[6];
(void) bd;
if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
@@ -1854,9 +1814,9 @@ static void highbd_iadst8(const tran_low_t *input, tran_low_t *output, int bd) {
}
static const highbd_transform_2d HIGH_IHT_8[] = {
{ highbd_idct8, highbd_idct8 }, // DCT_DCT = 0
{ highbd_iadst8, highbd_idct8 }, // ADST_DCT = 1
{ highbd_idct8, highbd_iadst8 }, // DCT_ADST = 2
{ vp9_highbd_idct8, vp9_highbd_idct8 }, // DCT_DCT = 0
{ highbd_iadst8, vp9_highbd_idct8 }, // ADST_DCT = 1
{ vp9_highbd_idct8, highbd_iadst8 }, // DCT_ADST = 2
{ highbd_iadst8, highbd_iadst8 } // ADST_ADST = 3
};
@@ -1899,7 +1859,7 @@ void vp9_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
// First transform rows.
// Only the first 4 rows have non-zero coefs.
for (i = 0; i < 4; ++i) {
highbd_idct8(input, outptr, bd);
vp9_highbd_idct8(input, outptr, bd);
input += 8;
outptr += 8;
}
@@ -1907,7 +1867,7 @@ void vp9_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
for (i = 0; i < 8; ++i) {
for (j = 0; j < 8; ++j)
temp_in[j] = out[j * 8 + i];
highbd_idct8(temp_in, temp_out, bd);
vp9_highbd_idct8(temp_in, temp_out, bd);
for (j = 0; j < 8; ++j) {
dest[j * stride + i] = highbd_clip_pixel_add(
dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
@@ -1915,7 +1875,7 @@ void vp9_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
}
}
static void highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd) {
void vp9_highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd) {
tran_low_t step1[16], step2[16];
tran_high_t temp1, temp2;
(void) bd;
@@ -2091,7 +2051,7 @@ void vp9_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
// First transform rows.
for (i = 0; i < 16; ++i) {
highbd_idct16(input, outptr, bd);
vp9_highbd_idct16(input, outptr, bd);
input += 16;
outptr += 16;
}
@@ -2100,7 +2060,7 @@ void vp9_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
for (i = 0; i < 16; ++i) {
for (j = 0; j < 16; ++j)
temp_in[j] = out[j * 16 + i];
highbd_idct16(temp_in, temp_out, bd);
vp9_highbd_idct16(temp_in, temp_out, bd);
for (j = 0; j < 16; ++j) {
dest[j * stride + i] = highbd_clip_pixel_add(
dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
@@ -2113,22 +2073,22 @@ static void highbd_iadst16(const tran_low_t *input, tran_low_t *output,
tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
tran_high_t s9, s10, s11, s12, s13, s14, s15;
tran_high_t x0 = input[15];
tran_high_t x1 = input[0];
tran_high_t x2 = input[13];
tran_high_t x3 = input[2];
tran_high_t x4 = input[11];
tran_high_t x5 = input[4];
tran_high_t x6 = input[9];
tran_high_t x7 = input[6];
tran_high_t x8 = input[7];
tran_high_t x9 = input[8];
tran_high_t x10 = input[5];
tran_high_t x11 = input[10];
tran_high_t x12 = input[3];
tran_high_t x13 = input[12];
tran_high_t x14 = input[1];
tran_high_t x15 = input[14];
tran_low_t x0 = input[15];
tran_low_t x1 = input[0];
tran_low_t x2 = input[13];
tran_low_t x3 = input[2];
tran_low_t x4 = input[11];
tran_low_t x5 = input[4];
tran_low_t x6 = input[9];
tran_low_t x7 = input[6];
tran_low_t x8 = input[7];
tran_low_t x9 = input[8];
tran_low_t x10 = input[5];
tran_low_t x11 = input[10];
tran_low_t x12 = input[3];
tran_low_t x13 = input[12];
tran_low_t x14 = input[1];
tran_low_t x15 = input[14];
(void) bd;
if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
@@ -2280,9 +2240,9 @@ static void highbd_iadst16(const tran_low_t *input, tran_low_t *output,
}
static const highbd_transform_2d HIGH_IHT_16[] = {
{ highbd_idct16, highbd_idct16 }, // DCT_DCT = 0
{ highbd_iadst16, highbd_idct16 }, // ADST_DCT = 1
{ highbd_idct16, highbd_iadst16 }, // DCT_ADST = 2
{ vp9_highbd_idct16, vp9_highbd_idct16 }, // DCT_DCT = 0
{ highbd_iadst16, vp9_highbd_idct16 }, // ADST_DCT = 1
{ vp9_highbd_idct16, highbd_iadst16 }, // DCT_ADST = 2
{ highbd_iadst16, highbd_iadst16 } // ADST_ADST = 3
};
@@ -2325,7 +2285,7 @@ void vp9_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
// First transform rows. Since all non-zero dct coefficients are in the
// upper-left 4x4 area, we only need to calculate the first 4 rows here.
for (i = 0; i < 4; ++i) {
highbd_idct16(input, outptr, bd);
vp9_highbd_idct16(input, outptr, bd);
input += 16;
outptr += 16;
}
@@ -2334,7 +2294,7 @@ void vp9_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
for (i = 0; i < 16; ++i) {
for (j = 0; j < 16; ++j)
temp_in[j] = out[j*16 + i];
highbd_idct16(temp_in, temp_out, bd);
vp9_highbd_idct16(temp_in, temp_out, bd);
for (j = 0; j < 16; ++j) {
dest[j * stride + i] = highbd_clip_pixel_add(
dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);


@@ -116,6 +116,28 @@ typedef struct {
} highbd_transform_2d;
#endif // CONFIG_VP9_HIGHBITDEPTH
#if CONFIG_EMULATE_HARDWARE
// When CONFIG_EMULATE_HARDWARE is 1 the transform uses a
// non-normative method to handle overflows. A stream that causes
// overflows in the inverse transform is considered invalid in VP9,
// and a hardware implementer is free to choose any reasonable
// method to handle them. However, to aid hardware verification,
// implementers can substitute a WRAPLOW() definition below that is
// identical to their intended hardware behavior (and use configure
// options to force the C implementation of the transform).
//
// The particular WRAPLOW implementation below performs strict
// overflow wrapping to match common hardware implementations.
// bd of 8 uses tran_low_t with 16 bits: remove the top 16 bits
// bd of 10 uses tran_low_t with 18 bits: remove the top 14 bits
// bd of 12 uses tran_low_t with 20 bits: remove the top 12 bits
// bd of x uses tran_low_t with 8+x bits: remove the top 24-x bits
#define WRAPLOW(x, bd) ((((int32_t)(x)) << (24 - bd)) >> (24 - bd))
#else
#define WRAPLOW(x, bd) (x)
#endif // CONFIG_EMULATE_HARDWARE
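Concretely, for bd = 8 the macro shifts left and then arithmetically right by 24 - 8 = 16 bits, wrapping values into the 16-bit two's-complement range the way a 16-bit hardware accumulator would:

// Illustration only (CONFIG_EMULATE_HARDWARE = 1, bd = 8):
// WRAPLOW(32768, 8) == -32768  (0x8000 sign-extended from bit 15)
// WRAPLOW(70000, 8) == 4464    (70000 mod 65536)
// With CONFIG_EMULATE_HARDWARE = 0, WRAPLOW(x, bd) is simply x.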
void vp9_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
int eob);
void vp9_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
@@ -135,6 +157,9 @@ void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
int stride, int eob);
#if CONFIG_VP9_HIGHBITDEPTH
void vp9_highbd_idct4(const tran_low_t *input, tran_low_t *output, int bd);
void vp9_highbd_idct8(const tran_low_t *input, tran_low_t *output, int bd);
void vp9_highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd);
void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
int eob, int bd);
void vp9_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
@@ -151,6 +176,11 @@ void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input,
uint8_t *dest, int stride, int eob, int bd);
void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input,
uint8_t *dest, int stride, int eob, int bd);
static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans,
int bd) {
trans = WRAPLOW(trans, bd);
return clip_pixel_highbd(WRAPLOW(dest + trans, bd), bd);
}
#endif // CONFIG_VP9_HIGHBITDEPTH
#ifdef __cplusplus
} // extern "C"


@@ -750,27 +750,12 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vp9_highbd_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct4x4_1_add/;
add_proto qw/void vp9_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct4x4_16_add/;
add_proto qw/void vp9_highbd_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct8x8_1_add/;
add_proto qw/void vp9_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct8x8_64_add/;
add_proto qw/void vp9_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct8x8_10_add/;
add_proto qw/void vp9_highbd_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct16x16_1_add/;
add_proto qw/void vp9_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct16x16_256_add/;
add_proto qw/void vp9_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct16x16_10_add/;
add_proto qw/void vp9_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct32x32_1024_add/;
@@ -796,6 +781,42 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vp9_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_iwht4x4_16_add/;
# Force C versions if CONFIG_EMULATE_HARDWARE is 1
if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
add_proto qw/void vp9_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct4x4_16_add/;
add_proto qw/void vp9_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct8x8_64_add/;
add_proto qw/void vp9_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct8x8_10_add/;
add_proto qw/void vp9_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct16x16_256_add/;
add_proto qw/void vp9_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct16x16_10_add/;
} else {
add_proto qw/void vp9_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct4x4_16_add sse2/;
add_proto qw/void vp9_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct8x8_64_add sse2/;
add_proto qw/void vp9_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct8x8_10_add sse2/;
add_proto qw/void vp9_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct16x16_256_add sse2/;
add_proto qw/void vp9_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct16x16_10_add sse2/;
}
}
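Each specialize line above determines what the generated vp9_rtcd.h binds at run time: under CONFIG_EMULATE_HARDWARE only the C symbol is registered, otherwise the SSE2 symbol can be selected when the CPU supports it. Roughly, the generated dispatch has the shape of the sketch below (the generated file's exact layout is an assumption here):

// Rough shape of the generated runtime dispatch (illustrative only).
void (*vp9_highbd_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest,
                                  int dest_stride, int bd);
static void setup_rtcd_internal(void) {
  vp9_highbd_idct8x8_64_add = vp9_highbd_idct8x8_64_add_c;
#if HAVE_SSE2
  if (x86_simd_caps() & HAS_SSE2)
    vp9_highbd_idct8x8_64_add = vp9_highbd_idct8x8_64_add_sse2;
#endif
}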
#
@@ -1114,6 +1135,11 @@ specialize qw/vp9_get_mb_ss/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_avg_8x8/, "const uint8_t *, int p";
specialize qw/vp9_avg_8x8 sse2/;
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/unsigned int vp9_highbd_avg_8x8/, "const uint8_t *, int p";
specialize qw/vp9_highbd_avg_8x8/;
}
# ENCODEMB INVOKE
add_proto qw/void vp9_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
@@ -1176,43 +1202,43 @@ if (vpx_config("CONFIG_INTERNAL_STATS") eq "yes") {
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/vp9_fht4x4/;
specialize qw/vp9_fht4x4 sse2/;
add_proto qw/void vp9_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/vp9_fht8x8/;
specialize qw/vp9_fht8x8 sse2/;
add_proto qw/void vp9_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/vp9_fht16x16/;
specialize qw/vp9_fht16x16 sse2/;
add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_fwht4x4/;
specialize qw/vp9_fwht4x4/, "$mmx_x86inc";
add_proto qw/void vp9_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_fdct4x4_1/;
specialize qw/vp9_fdct4x4_1 sse2/;
add_proto qw/void vp9_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_fdct4x4/;
specialize qw/vp9_fdct4x4 sse2/;
add_proto qw/void vp9_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_fdct8x8_1/;
specialize qw/vp9_fdct8x8_1 sse2/;
add_proto qw/void vp9_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_fdct8x8/;
specialize qw/vp9_fdct8x8 sse2/;
add_proto qw/void vp9_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_fdct16x16_1/;
specialize qw/vp9_fdct16x16_1 sse2/;
add_proto qw/void vp9_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_fdct16x16/;
specialize qw/vp9_fdct16x16 sse2/;
add_proto qw/void vp9_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_fdct32x32_1/;
specialize qw/vp9_fdct32x32_1 sse2/;
add_proto qw/void vp9_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_fdct32x32/;
specialize qw/vp9_fdct32x32 sse2/;
add_proto qw/void vp9_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_fdct32x32_rd/;
specialize qw/vp9_fdct32x32_rd sse2/;
} else {
add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/vp9_fht4x4 sse2/;
@@ -1278,34 +1304,34 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
# variance
add_proto qw/unsigned int vp9_highbd_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance32x16/;
specialize qw/vp9_highbd_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance16x32/;
specialize qw/vp9_highbd_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance64x32/;
specialize qw/vp9_highbd_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance32x64/;
specialize qw/vp9_highbd_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance32x32/;
specialize qw/vp9_highbd_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance64x64/;
specialize qw/vp9_highbd_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance16x16/;
specialize qw/vp9_highbd_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance16x8/;
specialize qw/vp9_highbd_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance8x16/;
specialize qw/vp9_highbd_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance8x8/;
specialize qw/vp9_highbd_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance8x4/;
@@ -1317,40 +1343,40 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_variance4x4/;
add_proto qw/void vp9_highbd_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
specialize qw/vp9_highbd_get8x8var/;
specialize qw/vp9_highbd_get8x8var/, "$sse2_x86inc";
add_proto qw/void vp9_highbd_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
specialize qw/vp9_highbd_get16x16var/;
specialize qw/vp9_highbd_get16x16var/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance32x16/;
specialize qw/vp9_highbd_10_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance16x32/;
specialize qw/vp9_highbd_10_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance64x32/;
specialize qw/vp9_highbd_10_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance32x64/;
specialize qw/vp9_highbd_10_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance32x32/;
specialize qw/vp9_highbd_10_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance64x64/;
specialize qw/vp9_highbd_10_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance16x16/;
specialize qw/vp9_highbd_10_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance16x8/;
specialize qw/vp9_highbd_10_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance8x16/;
specialize qw/vp9_highbd_10_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance8x8/;
specialize qw/vp9_highbd_10_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance8x4/;
@@ -1362,40 +1388,40 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_10_variance4x4/;
add_proto qw/void vp9_highbd_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
specialize qw/vp9_highbd_10_get8x8var/;
specialize qw/vp9_highbd_10_get8x8var/, "$sse2_x86inc";
add_proto qw/void vp9_highbd_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
specialize qw/vp9_highbd_10_get16x16var/;
specialize qw/vp9_highbd_10_get16x16var/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance32x16/;
specialize qw/vp9_highbd_12_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance16x32/;
specialize qw/vp9_highbd_12_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance64x32/;
specialize qw/vp9_highbd_12_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance32x64/;
specialize qw/vp9_highbd_12_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance32x32/;
specialize qw/vp9_highbd_12_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance64x64/;
specialize qw/vp9_highbd_12_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance16x16/;
specialize qw/vp9_highbd_12_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance16x8/;
specialize qw/vp9_highbd_12_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance8x16/;
specialize qw/vp9_highbd_12_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance8x8/;
specialize qw/vp9_highbd_12_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance8x4/;
@@ -1407,76 +1433,76 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_12_variance4x4/;
add_proto qw/void vp9_highbd_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
specialize qw/vp9_highbd_12_get8x8var/;
specialize qw/vp9_highbd_12_get8x8var/, "$sse2_x86inc";
add_proto qw/void vp9_highbd_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
specialize qw/vp9_highbd_12_get16x16var/;
specialize qw/vp9_highbd_12_get16x16var/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance64x64/;
specialize qw/vp9_highbd_sub_pixel_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance64x64/;
specialize qw/vp9_highbd_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance32x64/;
specialize qw/vp9_highbd_sub_pixel_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance32x64/;
specialize qw/vp9_highbd_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance64x32/;
specialize qw/vp9_highbd_sub_pixel_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance64x32/;
specialize qw/vp9_highbd_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance32x16/;
specialize qw/vp9_highbd_sub_pixel_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance32x16/;
specialize qw/vp9_highbd_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance16x32/;
specialize qw/vp9_highbd_sub_pixel_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance16x32/;
specialize qw/vp9_highbd_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance32x32/;
specialize qw/vp9_highbd_sub_pixel_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance32x32/;
specialize qw/vp9_highbd_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance16x16/;
specialize qw/vp9_highbd_sub_pixel_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance16x16/;
specialize qw/vp9_highbd_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance8x16/;
specialize qw/vp9_highbd_sub_pixel_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance8x16/;
specialize qw/vp9_highbd_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance16x8/;
specialize qw/vp9_highbd_sub_pixel_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance16x8/;
specialize qw/vp9_highbd_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance8x8/;
specialize qw/vp9_highbd_sub_pixel_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance8x8/;
specialize qw/vp9_highbd_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance8x4/;
specialize qw/vp9_highbd_sub_pixel_variance8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance8x4/;
specialize qw/vp9_highbd_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance4x8/;
@@ -1491,70 +1517,70 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_sub_pixel_avg_variance4x4/;
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance64x64/;
specialize qw/vp9_highbd_10_sub_pixel_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x64/;
specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance32x64/;
specialize qw/vp9_highbd_10_sub_pixel_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x64/;
specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance64x32/;
specialize qw/vp9_highbd_10_sub_pixel_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x32/;
specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance32x16/;
specialize qw/vp9_highbd_10_sub_pixel_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x16/;
specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance16x32/;
specialize qw/vp9_highbd_10_sub_pixel_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x32/;
specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance32x32/;
specialize qw/vp9_highbd_10_sub_pixel_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x32/;
specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance16x16/;
specialize qw/vp9_highbd_10_sub_pixel_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x16/;
specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance8x16/;
specialize qw/vp9_highbd_10_sub_pixel_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x16/;
specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance16x8/;
specialize qw/vp9_highbd_10_sub_pixel_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x8/;
specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance8x8/;
specialize qw/vp9_highbd_10_sub_pixel_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x8/;
specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance8x4/;
specialize qw/vp9_highbd_10_sub_pixel_variance8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x4/;
specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance4x8/;
@@ -1569,70 +1595,70 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_10_sub_pixel_avg_variance4x4/;
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance64x64/;
specialize qw/vp9_highbd_12_sub_pixel_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x64/;
specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance32x64/;
specialize qw/vp9_highbd_12_sub_pixel_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x64/;
specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance64x32/;
specialize qw/vp9_highbd_12_sub_pixel_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x32/;
specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance32x16/;
specialize qw/vp9_highbd_12_sub_pixel_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x16/;
specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance16x32/;
specialize qw/vp9_highbd_12_sub_pixel_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x32/;
specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance32x32/;
specialize qw/vp9_highbd_12_sub_pixel_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x32/;
specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance16x16/;
specialize qw/vp9_highbd_12_sub_pixel_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x16/;
specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance8x16/;
specialize qw/vp9_highbd_12_sub_pixel_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x16/;
specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance16x8/;
specialize qw/vp9_highbd_12_sub_pixel_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x8/;
specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance8x8/;
specialize qw/vp9_highbd_12_sub_pixel_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x8/;
specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance8x4/;
specialize qw/vp9_highbd_12_sub_pixel_variance8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x4/;
specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance4x8/;
@@ -1647,37 +1673,37 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_12_sub_pixel_avg_variance4x4/;
add_proto qw/unsigned int vp9_highbd_sad64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad64x64/;
specialize qw/vp9_highbd_sad64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad32x64/;
specialize qw/vp9_highbd_sad32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad64x32/;
specialize qw/vp9_highbd_sad64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad32x16/;
specialize qw/vp9_highbd_sad32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad16x32/;
specialize qw/vp9_highbd_sad16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad32x32/;
specialize qw/vp9_highbd_sad32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad16x16/;
specialize qw/vp9_highbd_sad16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad16x8/;
specialize qw/vp9_highbd_sad16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad8x16/;
specialize qw/vp9_highbd_sad8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad8x8/;
specialize qw/vp9_highbd_sad8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad8x4/;
specialize qw/vp9_highbd_sad8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad4x8/;
@@ -1686,37 +1712,37 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_sad4x4/;
add_proto qw/unsigned int vp9_highbd_sad64x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad64x64_avg/;
specialize qw/vp9_highbd_sad64x64_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad32x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad32x64_avg/;
specialize qw/vp9_highbd_sad32x64_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad64x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad64x32_avg/;
specialize qw/vp9_highbd_sad64x32_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad32x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad32x16_avg/;
specialize qw/vp9_highbd_sad32x16_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad16x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad16x32_avg/;
specialize qw/vp9_highbd_sad16x32_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad32x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad32x32_avg/;
specialize qw/vp9_highbd_sad32x32_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad16x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad16x16_avg/;
specialize qw/vp9_highbd_sad16x16_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad16x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad16x8_avg/;
specialize qw/vp9_highbd_sad16x8_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad8x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad8x16_avg/;
specialize qw/vp9_highbd_sad8x16_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad8x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad8x8_avg/;
specialize qw/vp9_highbd_sad8x8_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad8x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad8x4_avg/;
specialize qw/vp9_highbd_sad8x4_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad4x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad4x8_avg/;
@@ -1773,47 +1799,46 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_sad4x4x8/;
add_proto qw/void vp9_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad64x64x4d/;
specialize qw/vp9_highbd_sad64x64x4d sse2/;
add_proto qw/void vp9_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad32x64x4d/;
specialize qw/vp9_highbd_sad32x64x4d sse2/;
add_proto qw/void vp9_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad64x32x4d/;
specialize qw/vp9_highbd_sad64x32x4d sse2/;
add_proto qw/void vp9_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad32x16x4d/;
specialize qw/vp9_highbd_sad32x16x4d sse2/;
add_proto qw/void vp9_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad16x32x4d/;
specialize qw/vp9_highbd_sad16x32x4d sse2/;
add_proto qw/void vp9_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad32x32x4d/;
specialize qw/vp9_highbd_sad32x32x4d sse2/;
add_proto qw/void vp9_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad16x16x4d/;
specialize qw/vp9_highbd_sad16x16x4d sse2/;
add_proto qw/void vp9_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad16x8x4d/;
specialize qw/vp9_highbd_sad16x8x4d sse2/;
add_proto qw/void vp9_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad8x16x4d/;
specialize qw/vp9_highbd_sad8x16x4d sse2/;
add_proto qw/void vp9_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad8x8x4d/;
specialize qw/vp9_highbd_sad8x8x4d sse2/;
# TODO(jingning): need to convert these 4x8/8x4 functions into sse2 form
add_proto qw/void vp9_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad8x4x4d/;
specialize qw/vp9_highbd_sad8x4x4d sse2/;
add_proto qw/void vp9_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad4x8x4d/;
specialize qw/vp9_highbd_sad4x8x4d sse2/;
add_proto qw/void vp9_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad4x4x4d/;
specialize qw/vp9_highbd_sad4x4x4d sse2/;
add_proto qw/unsigned int vp9_highbd_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_mse16x16/;
specialize qw/vp9_highbd_mse16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_mse8x16/;
@@ -1822,10 +1847,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_mse16x8/;
add_proto qw/unsigned int vp9_highbd_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_mse8x8/;
specialize qw/vp9_highbd_mse8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_mse16x16/;
specialize qw/vp9_highbd_10_mse16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_mse8x16/;
@@ -1834,10 +1859,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_10_mse16x8/;
add_proto qw/unsigned int vp9_highbd_10_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_mse8x8/;
specialize qw/vp9_highbd_10_mse8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_mse16x16/;
specialize qw/vp9_highbd_12_mse16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_mse8x16/;
@@ -1846,12 +1871,12 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_12_mse16x8/;
add_proto qw/unsigned int vp9_highbd_12_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_mse8x8/;
specialize qw/vp9_highbd_12_mse8x8/, "$sse2_x86inc";
# ENCODEMB INVOKE
add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
specialize qw/vp9_highbd_block_error/;
specialize qw/vp9_highbd_block_error sse2/;
add_proto qw/void vp9_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
specialize qw/vp9_highbd_subtract_block/;
@@ -1863,10 +1888,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_quantize_fp_32x32/;
add_proto qw/void vp9_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_highbd_quantize_b/;
specialize qw/vp9_highbd_quantize_b sse2/;
add_proto qw/void vp9_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_highbd_quantize_b_32x32/;
specialize qw/vp9_highbd_quantize_b_32x32 sse2/;
#
# Structured Similarity (SSIM)
@@ -1878,40 +1903,40 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
# fdct functions
add_proto qw/void vp9_highbd_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/vp9_highbd_fht4x4/;
specialize qw/vp9_highbd_fht4x4 sse2/;
add_proto qw/void vp9_highbd_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/vp9_highbd_fht8x8/;
specialize qw/vp9_highbd_fht8x8 sse2/;
add_proto qw/void vp9_highbd_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/vp9_highbd_fht16x16/;
specialize qw/vp9_highbd_fht16x16 sse2/;
add_proto qw/void vp9_highbd_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_highbd_fwht4x4/;
add_proto qw/void vp9_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_highbd_fdct4x4/;
specialize qw/vp9_highbd_fdct4x4 sse2/;
add_proto qw/void vp9_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_highbd_fdct8x8_1/;
add_proto qw/void vp9_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_highbd_fdct8x8/;
specialize qw/vp9_highbd_fdct8x8 sse2/;
add_proto qw/void vp9_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_highbd_fdct16x16_1/;
add_proto qw/void vp9_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_highbd_fdct16x16/;
specialize qw/vp9_highbd_fdct16x16 sse2/;
add_proto qw/void vp9_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_highbd_fdct32x32_1/;
add_proto qw/void vp9_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_highbd_fdct32x32/;
specialize qw/vp9_highbd_fdct32x32 sse2/;
add_proto qw/void vp9_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_highbd_fdct32x32_rd/;
specialize qw/vp9_highbd_fdct32x32_rd sse2/;
add_proto qw/void vp9_highbd_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
specialize qw/vp9_highbd_temporal_filter_apply/;
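
Taken together, each add_proto line above declares one dispatchable entry point with the given C signature, and the specialize line that follows lists the optimized implementations the run-time dispatcher may select; the "$sse2_x86inc" form guards the x86inc.asm-backed SSE2 versions so they are only enabled on configurations where that assembly can be built. As a minimal sketch of what this expands to, assuming the usual rtcd.pl output shape (the _c/_sse2 suffixes and the HAS_SSE2 flag follow libvpx convention, but the flag value and setup_rtcd_sketch below are stand-ins for this sketch, not names taken from this diff):

#include <stdint.h>

/* A minimal sketch, not the generated vp9_rtcd.h verbatim; the real flag
   constants live in vpx_ports/x86.h and the real setup is emitted by
   rtcd.pl. */
#define HAS_SSE2 0x01 /* hypothetical bit value for this sketch */

unsigned int vp9_highbd_10_variance8x8_c(const uint8_t *src_ptr,
                                         int source_stride,
                                         const uint8_t *ref_ptr,
                                         int ref_stride, unsigned int *sse);
unsigned int vp9_highbd_10_variance8x8_sse2(const uint8_t *src_ptr,
                                            int source_stride,
                                            const uint8_t *ref_ptr,
                                            int ref_stride, unsigned int *sse);

/* One function pointer per add_proto entry. */
unsigned int (*vp9_highbd_10_variance8x8)(const uint8_t *src_ptr,
                                          int source_stride,
                                          const uint8_t *ref_ptr,
                                          int ref_stride, unsigned int *sse);

/* Start from the portable C fallback, then upgrade if SSE2 is present. */
static void setup_rtcd_sketch(int flags) {
  vp9_highbd_10_variance8x8 = vp9_highbd_10_variance8x8_c;
  if (flags & HAS_SSE2)
    vp9_highbd_10_variance8x8 = vp9_highbd_10_variance8x8_sse2;
}

In the hunks above only the specialize lines change; each entry gains an sse2 variant (directly or via "$sse2_x86inc"), while the prototypes themselves are untouched.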

View File

@@ -14,6 +14,10 @@
#include "vp9/common/vp9_loopfilter.h"
#include "vpx_ports/emmintrin_compat.h"
static INLINE __m128i highbd_abs_diff(__m128i a, __m128i b) {
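// _mm_subs_epu16 saturates at zero, so at most one of (a - b) and (b - a)
// is non-zero per lane; OR-ing the two yields |a - b| for each 16-bit pixel.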
return _mm_or_si128(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a));
}
static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) {
__m128i ubounded;
__m128i lbounded;
@@ -35,8 +39,126 @@ static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) {
return retval;
}
// TODO(debargha, peter): Break up large functions into smaller ones
// in this file.
static INLINE void get_hev_and_mask(const __m128i thresh, const __m128i limit,
const __m128i blimit, const __m128i zero,
const __m128i one, const __m128i ffff,
__m128i abs_p1p0, __m128i abs_q1q0,
__m128i abs_p0q0, __m128i abs_p1q1,
__m128i abs_p2p1, __m128i abs_q2q1,
__m128i abs_p3p2, __m128i abs_q3q2,
__m128i* hev, __m128i* mask) {
__m128i work0, work1, work2;
// highbd_hev_mask
work0 = _mm_max_epi16(abs_p1p0, abs_q1q0);
*hev = _mm_subs_epu16(work0, thresh);
*hev = _mm_xor_si128(_mm_cmpeq_epi16(*hev, zero), ffff);
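// hev lanes become all-ones where max(|p1 - p0|, |q1 - q0|) > thresh.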
work1 = _mm_adds_epu16(abs_p0q0, abs_p0q0); // abs(p0 - q0) * 2
work2 = _mm_srli_epi16(abs_p1q1, 1); // abs(p1 - q1) / 2
*mask = _mm_subs_epu16(_mm_adds_epu16(work1, work2), blimit);
*mask = _mm_xor_si128(_mm_cmpeq_epi16(*mask, zero), ffff);
*mask = _mm_and_si128(*mask, _mm_adds_epu16(limit, one));
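// Lanes where abs(p0 - q0) * 2 + abs(p1 - q1) / 2 exceeds blimit are forced
// to limit + 1, so the max/subtract sequence below still switches them off.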
*mask = _mm_max_epi16(work0, *mask);
work0 = _mm_max_epi16(abs_p2p1, abs_q2q1);
*mask = _mm_max_epi16(work0, *mask);
work0 = _mm_max_epi16(abs_p3p2, abs_q3q2);
*mask = _mm_max_epi16(work0, *mask);
*mask = _mm_subs_epu16(*mask, limit);
*mask = _mm_cmpeq_epi16(*mask, zero); // return ~mask
}
static INLINE void highbd_filter4_sse2(const __m128i mask, const __m128i hev,
const __m128i p1, const __m128i p0,
const __m128i q1, const __m128i q0,
__m128i *ps1, __m128i *ps0,
__m128i *qs0, __m128i *qs1,
int bd) {
const __m128i t4 = _mm_set1_epi16(4);
const __m128i t3 = _mm_set1_epi16(3);
const __m128i t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), bd - 8);
const __m128i t1 = _mm_set1_epi16(0x1);
__m128i filt, work, filter1, filter2;
*ps1 = _mm_subs_epi16(p1, t80);
*qs1 = _mm_subs_epi16(q1, t80);
*ps0 = _mm_subs_epi16(p0, t80);
*qs0 = _mm_subs_epi16(q0, t80);
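// Bias p1, p0, q0, q1 by t80 (0x80 scaled to bd) into the signed filter domain.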
filt = _mm_and_si128(
signed_char_clamp_bd_sse2(_mm_subs_epi16(*ps1, *qs1), bd), hev);
work = _mm_subs_epi16(*qs0, *ps0);
filt = _mm_adds_epi16(filt, work);
filt = _mm_adds_epi16(filt, work);
filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work), bd);
filt = _mm_and_si128(filt, mask);
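// filt now holds clamp((clamp(ps1 - qs1) & hev) + 3 * (qs0 - ps0)) & mask,
// matching the C reference (vp9_filter + 3 * (qs0 - ps0)) & mask.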
filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd);
filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd);
// Filter1 >> 3, Filter2 >> 3
filter1 = _mm_srai_epi16(filter1, 0x3);
filter2 = _mm_srai_epi16(filter2, 0x3);
*qs0 = _mm_adds_epi16(
signed_char_clamp_bd_sse2(_mm_subs_epi16(*qs0, filter1), bd),
t80);
*ps0 = _mm_adds_epi16(
signed_char_clamp_bd_sse2(_mm_adds_epi16(*ps0, filter2), bd),
t80);
filt = _mm_adds_epi16(filter1, t1);
filt = _mm_srai_epi16(filt, 1);
filt = _mm_andnot_si128(hev, filt);
*qs1 = _mm_adds_epi16(
signed_char_clamp_bd_sse2(_mm_subs_epi16(*qs1, filt), bd),
t80);
*ps1 = _mm_adds_epi16(
signed_char_clamp_bd_sse2(_mm_adds_epi16(*ps1, filt), bd),
t80);
}
static INLINE void apply_7tap_filter(const __m128i p3, const __m128i p2,
const __m128i p1, const __m128i p0,
const __m128i q0, const __m128i q1,
const __m128i q2, const __m128i q3,
uint16_t* flat_op2, uint16_t* flat_op1,
uint16_t* flat_op0, uint16_t* flat_oq0,
uint16_t* flat_oq1, uint16_t* flat_oq2) {
__m128i workp_a, workp_b, workp_shft;
const __m128i four = _mm_set1_epi16(4);
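// Adding four ahead of each >> 3 gives ROUND_POWER_OF_TWO rounding.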
workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
_mm_store_si128((__m128i *)flat_op2, workp_shft);
workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
_mm_store_si128((__m128i *)flat_op1, workp_shft);
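// From here each output updates the running sums in place: subtract the tap
// leaving the averaging window and add the one entering it.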
workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
_mm_store_si128((__m128i *)flat_op0, workp_shft);
workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
_mm_store_si128((__m128i *)flat_oq0, workp_shft);
workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
_mm_store_si128((__m128i *)flat_oq1, workp_shft);
workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
_mm_store_si128((__m128i *)flat_oq2, workp_shft);
}
static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s,
int p,
const uint8_t *_blimit,
@@ -45,6 +167,7 @@ static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s,
int bd) {
const __m128i zero = _mm_set1_epi16(0);
const __m128i one = _mm_set1_epi16(1);
const __m128i ffff = _mm_cmpeq_epi16(zero, zero);
const __m128i blimit = _mm_slli_epi16(
_mm_unpacklo_epi8(
_mm_load_si128((const __m128i *)_blimit), zero), bd - 8);
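// The 8-bit threshold is widened to 16 bits and scaled up to the bit depth.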
@@ -56,8 +179,8 @@ static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s,
__m128i q7, p7, q6, p6, q5, p5, q4, p4, q3, p3, q2, p2, q1, p1, q0, p0;
__m128i mask, hev, flat, flat2, abs_p1p0, abs_q1q0;
__m128i ps1, qs1, ps0, qs0;
__m128i abs_p0q0, abs_p1q1, ffff, work;
__m128i filt, work_a, filter1, filter2;
__m128i abs_p0q0, abs_p1q1, work;
__m128i abs_p2p1, abs_q2q1, abs_p3p2, abs_q3q2;
__m128i flat2_q6, flat2_p6, flat2_q5, flat2_p5, flat2_q4, flat2_p4;
__m128i flat2_q3, flat2_p3, flat2_q2, flat2_p2, flat2_q1, flat2_p1;
__m128i flat2_q0, flat2_p0;
@@ -65,7 +188,6 @@ static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s,
__m128i pixelFilter_p, pixelFilter_q;
__m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
__m128i sum_p7, sum_q7, sum_p3, sum_q3;
__m128i t4, t3, t80, t1;
__m128i eight, four;
q4 = _mm_load_si128((__m128i *)(s + 4 * p));
@@ -80,98 +202,25 @@ static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s,
p0 = _mm_load_si128((__m128i *)(s - 1 * p));
// highbd_filter_mask
abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1));
abs_p1p0 = highbd_abs_diff(p1, p0);
abs_q1q0 = highbd_abs_diff(q1, q0);
abs_p0q0 = highbd_abs_diff(p0, q0);
abs_p1q1 = highbd_abs_diff(p1, q1);
abs_p2p1 = highbd_abs_diff(p2, p1);
abs_q2q1 = highbd_abs_diff(q2, q1);
abs_p3p2 = highbd_abs_diff(p3, p2);
abs_q3q2 = highbd_abs_diff(q3, q2);
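// With the lane-wise differences precomputed once, the shared
// get_hev_and_mask helper below derives both masks from them.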
ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0);
abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));
abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1));
// highbd_hev_mask (in C code this is actually called from highbd_filter4)
flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
hev = _mm_subs_epu16(flat, thresh);
hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); // abs(p0 - q0) * 2
abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); // abs(p1 - q1) / 2
mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p1, p0),
_mm_subs_epu16(p0, p1)),
_mm_or_si128(_mm_subs_epu16(q1, q0),
_mm_subs_epu16(q0, q1)));
mask = _mm_max_epi16(work, mask);
work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p1),
_mm_subs_epu16(p1, p2)),
_mm_or_si128(_mm_subs_epu16(q2, q1),
_mm_subs_epu16(q1, q2)));
mask = _mm_max_epi16(work, mask);
work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p3, p2),
_mm_subs_epu16(p2, p3)),
_mm_or_si128(_mm_subs_epu16(q3, q2),
_mm_subs_epu16(q2, q3)));
mask = _mm_max_epi16(work, mask);
mask = _mm_subs_epu16(mask, limit);
mask = _mm_cmpeq_epi16(mask, zero); // return ~mask
get_hev_and_mask(thresh, limit, blimit, zero, one, ffff, abs_p1p0, abs_q1q0,
abs_p0q0, abs_p1q1, abs_p2p1, abs_q2q1, abs_p3p2, abs_q3q2,
&hev, &mask);
// lp filter
// highbd_filter4
t4 = _mm_set1_epi16(4);
t3 = _mm_set1_epi16(3);
t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), bd - 8);
t1 = _mm_set1_epi16(0x1);
ps1 = _mm_subs_epi16(p1, t80);
qs1 = _mm_subs_epi16(q1, t80);
ps0 = _mm_subs_epi16(p0, t80);
qs0 = _mm_subs_epi16(q0, t80);
filt = _mm_and_si128(
signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd), hev);
work_a = _mm_subs_epi16(qs0, ps0);
filt = _mm_adds_epi16(filt, work_a);
filt = _mm_adds_epi16(filt, work_a);
filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd);
filt = _mm_and_si128(filt, mask);
filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd);
filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd);
// Filter1 >> 3, Filter2 >> 3
filter1 = _mm_srai_epi16(filter1, 0x3);
filter2 = _mm_srai_epi16(filter2, 0x3);
qs0 = _mm_adds_epi16(
signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd),
t80);
ps0 = _mm_adds_epi16(
signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd),
t80);
filt = _mm_adds_epi16(filter1, t1);
filt = _mm_srai_epi16(filt, 1);
filt = _mm_andnot_si128(hev, filt);
qs1 = _mm_adds_epi16(
signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd),
t80);
ps1 = _mm_adds_epi16(
signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd),
t80);
// end highbd_filter4
// loopfilter done
highbd_filter4_sse2(mask, hev, p1, p0, q1, q0, &ps1, &ps0, &qs0, &qs1, bd);
// highbd_flat_mask4
flat = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p0),
_mm_subs_epu16(p0, p2)),
_mm_or_si128(_mm_subs_epu16(p3, p0),
_mm_subs_epu16(p0, p3)));
work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(q2, q0),
_mm_subs_epu16(q0, q2)),
_mm_or_si128(_mm_subs_epu16(q3, q0),
_mm_subs_epu16(q0, q3)));
flat = _mm_max_epi16(highbd_abs_diff(p2, p0), highbd_abs_diff(p3, p0));
work = _mm_max_epi16(highbd_abs_diff(q2, q0), highbd_abs_diff(q3, q0));
flat = _mm_max_epi16(work, flat);
work = _mm_max_epi16(abs_p1p0, abs_q1q0);
flat = _mm_max_epi16(work, flat);
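// flat accumulates the worst inner-tap deviation from p0/q0 for the
// flat_mask4 test.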
@@ -192,27 +241,15 @@ static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s,
// highbd_flat_mask5 (the arguments passed in are p0, q0, p4-p7 and q4-q7,
// but they are referred to as p0-p4 and q0-q4 inside that function)
flat2 = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p4, p0),
_mm_subs_epu16(p0, p4)),
_mm_or_si128(_mm_subs_epu16(q4, q0),
_mm_subs_epu16(q0, q4)));
flat2 = _mm_max_epi16(highbd_abs_diff(p4, p0), highbd_abs_diff(q4, q0));
work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p5, p0),
_mm_subs_epu16(p0, p5)),
_mm_or_si128(_mm_subs_epu16(q5, q0),
_mm_subs_epu16(q0, q5)));
work = _mm_max_epi16(highbd_abs_diff(p5, p0), highbd_abs_diff(q5, q0));
flat2 = _mm_max_epi16(work, flat2);
work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p6, p0),
_mm_subs_epu16(p0, p6)),
_mm_or_si128(_mm_subs_epu16(q6, q0),
_mm_subs_epu16(q0, q6)));
work = _mm_max_epi16(highbd_abs_diff(p6, p0), highbd_abs_diff(q6, q0));
flat2 = _mm_max_epi16(work, flat2);
work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p7, p0),
_mm_subs_epu16(p0, p7)),
_mm_or_si128(_mm_subs_epu16(q7, q0),
_mm_subs_epu16(q0, q7)));
work = _mm_max_epi16(highbd_abs_diff(p7, p0), highbd_abs_diff(q7, q0));
flat2 = _mm_max_epi16(work, flat2);
flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, bd - 8));
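// The wide-flat threshold is one scaled to the bit depth: 1 << (bd - 8).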
@@ -225,10 +262,8 @@ static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s,
eight = _mm_set1_epi16(8);
four = _mm_set1_epi16(4);
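// eight and four are the rounding terms for the >> 4 (wide filter) and
// >> 3 (filter8) averages computed from these running sums.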
pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6, p5),
_mm_add_epi16(p4, p3));
pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6, q5),
_mm_add_epi16(q4, q3));
pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6, p5), _mm_add_epi16(p4, p3));
pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6, q5), _mm_add_epi16(q4, q3));
pixetFilter_p2p1p0 = _mm_add_epi16(p0, _mm_add_epi16(p2, p1));
pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
@@ -237,9 +272,8 @@ static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s,
pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
pixelFilter_p = _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p,
pixelFilter_q));
pixetFilter_p2p1p0 = _mm_add_epi16(four,
_mm_add_epi16(pixetFilter_p2p1p0,
pixetFilter_q2q1q0));
pixetFilter_p2p1p0 = _mm_add_epi16(four, _mm_add_epi16(pixetFilter_p2p1p0,
pixetFilter_q2q1q0));
flat2_p0 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
_mm_add_epi16(p7, p0)), 4);
flat2_q0 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
@@ -486,6 +520,8 @@ void vp9_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
DECLARE_ALIGNED_ARRAY(16, uint16_t, flat_oq1, 16);
DECLARE_ALIGNED_ARRAY(16, uint16_t, flat_oq0, 16);
const __m128i zero = _mm_set1_epi16(0);
const __m128i one = _mm_set1_epi16(1);
const __m128i ffff = _mm_cmpeq_epi16(one, one);
const __m128i blimit = _mm_slli_epi16(
_mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero),
bd - 8);
@@ -504,74 +540,30 @@ void vp9_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
__m128i q1 = _mm_load_si128((__m128i *)(s + 1 * p));
__m128i p0 = _mm_load_si128((__m128i *)(s - 1 * p));
__m128i q0 = _mm_load_si128((__m128i *)(s + 0 * p));
const __m128i one = _mm_set1_epi16(1);
const __m128i ffff = _mm_cmpeq_epi16(one, one);
__m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
const __m128i four = _mm_set1_epi16(4);
__m128i workp_a, workp_b, workp_shft;
const __m128i t4 = _mm_set1_epi16(4);
const __m128i t3 = _mm_set1_epi16(3);
const __m128i t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), bd - 8);
const __m128i t1 = _mm_set1_epi16(0x1);
const __m128i ps1 = _mm_subs_epi16(p1, t80);
const __m128i ps0 = _mm_subs_epi16(p0, t80);
const __m128i qs0 = _mm_subs_epi16(q0, t80);
const __m128i qs1 = _mm_subs_epi16(q1, t80);
__m128i filt;
__m128i abs_p2p1, abs_q2q1, abs_p3p2, abs_q3q2;
__m128i ps0, ps1, qs0, qs1;
__m128i work_a;
__m128i filter1, filter2;
(void)count;
// filter_mask and hev_mask
abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0),
_mm_subs_epu16(p0, p1));
abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0),
_mm_subs_epu16(q0, q1));
abs_p1p0 = highbd_abs_diff(p1, p0);
abs_q1q0 = highbd_abs_diff(q1, q0);
abs_p0q0 = highbd_abs_diff(p0, q0);
abs_p1q1 = highbd_abs_diff(p1, q1);
abs_p2p1 = highbd_abs_diff(p2, p1);
abs_q2q1 = highbd_abs_diff(q2, q1);
abs_p3p2 = highbd_abs_diff(p3, p2);
abs_q3q2 = highbd_abs_diff(q3, q2);
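// Same up-front absolute-difference setup as the wide edge filter above;
// get_hev_and_mask reuses it below.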
abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0),
_mm_subs_epu16(q0, p0));
abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1),
_mm_subs_epu16(q1, p1));
flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
hev = _mm_subs_epu16(flat, thresh);
hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
// mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
// So taking maximums continues to work:
mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
mask = _mm_max_epi16(abs_p1p0, mask);
// mask |= (abs(p1 - p0) > limit) * -1;
mask = _mm_max_epi16(abs_q1q0, mask);
// mask |= (abs(q1 - q0) > limit) * -1;
work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p1),
_mm_subs_epu16(p1, p2)),
_mm_or_si128(_mm_subs_epu16(q2, q1),
_mm_subs_epu16(q1, q2)));
mask = _mm_max_epi16(work, mask);
work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p3, p2),
_mm_subs_epu16(p2, p3)),
_mm_or_si128(_mm_subs_epu16(q3, q2),
_mm_subs_epu16(q2, q3)));
mask = _mm_max_epi16(work, mask);
mask = _mm_subs_epu16(mask, limit);
mask = _mm_cmpeq_epi16(mask, zero);
get_hev_and_mask(thresh, limit, blimit, zero, one, ffff, abs_p1p0, abs_q1q0,
abs_p0q0, abs_p1q1, abs_p2p1, abs_q2q1, abs_p3p2, abs_q3q2,
&hev, &mask);
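// get_hev_and_mask() is assumed to fold the inline filter_mask/hev
// computation shown above into a shared helper, returning both result
// vectors through the out-parameters.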
// flat_mask4
flat = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p0),
_mm_subs_epu16(p0, p2)),
_mm_or_si128(_mm_subs_epu16(q2, q0),
_mm_subs_epu16(q0, q2)));
work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p3, p0),
_mm_subs_epu16(p0, p3)),
_mm_or_si128(_mm_subs_epu16(q3, q0),
_mm_subs_epu16(q0, q3)));
flat = _mm_max_epi16(highbd_abs_diff(p2, p0), highbd_abs_diff(q2, q0));
work = _mm_max_epi16(highbd_abs_diff(p3, p0), highbd_abs_diff(q3, q0));
flat = _mm_max_epi16(work, flat);
flat = _mm_max_epi16(abs_p1p0, flat);
flat = _mm_max_epi16(abs_q1q0, flat);
@@ -579,77 +571,20 @@ void vp9_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
flat = _mm_cmpeq_epi16(flat, zero);
flat = _mm_and_si128(flat, mask); // flat & mask
// Added before shift for rounding part of ROUND_POWER_OF_TWO
workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
_mm_store_si128((__m128i *)&flat_op2[0], workp_shft);
workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
_mm_store_si128((__m128i *)&flat_op1[0], workp_shft);
workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
_mm_store_si128((__m128i *)&flat_op0[0], workp_shft);
workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
_mm_store_si128((__m128i *)&flat_oq0[0], workp_shft);
workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
_mm_store_si128((__m128i *)&flat_oq1[0], workp_shft);
workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
_mm_store_si128((__m128i *)&flat_oq2[0], workp_shft);
// Apply 7-tap filter (result used if flat && mask) cf. highbd_filter8
apply_7tap_filter(p3, p2, p1, p0, q0, q1, q2, q3, &flat_op2[0], &flat_op1[0],
&flat_op0[0], &flat_oq0[0], &flat_oq1[0], &flat_oq2[0]);
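// For reference, the first value stored by the replaced sequence above is
//   flat_op2 = ROUND_POWER_OF_TWO(3 * p3 + 2 * p2 + p1 + p0 + q0, 3),
// built from workp_a/workp_b with _mm_add_epi16/_mm_srli_epi16; the helper
// is assumed to reproduce that 7-tap filter for all six outputs.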
// lp filter
filt = signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd);
filt = _mm_and_si128(filt, hev);
work_a = _mm_subs_epi16(qs0, ps0);
filt = _mm_adds_epi16(filt, work_a);
filt = _mm_adds_epi16(filt, work_a);
filt = _mm_adds_epi16(filt, work_a);
// (vp9_filter + 3 * (qs0 - ps0)) & mask
filt = signed_char_clamp_bd_sse2(filt, bd);
filt = _mm_and_si128(filt, mask);
highbd_filter4_sse2(mask, hev, p1, p0, q1, q0, &ps1, &ps0, &qs0, &qs1, bd);
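// highbd_filter4_sse2() is assumed to absorb the surrounding filter4 math
// (clamp(ps1 - qs1) & hev, plus 3 * (qs0 - ps0), then Filter1/Filter2 >> 3)
// and to return the filtered, t80-re-offset rows through ps1/ps0/qs0/qs1.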
filter1 = _mm_adds_epi16(filt, t4);
filter2 = _mm_adds_epi16(filt, t3);
// Filter1 >> 3
filter1 = signed_char_clamp_bd_sse2(filter1, bd);
filter1 = _mm_srai_epi16(filter1, 3);
// Filter2 >> 3
filter2 = signed_char_clamp_bd_sse2(filter2, bd);
filter2 = _mm_srai_epi16(filter2, 3);
// filt >> 1
filt = _mm_adds_epi16(filter1, t1);
filt = _mm_srai_epi16(filt, 1);
// filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
filt = _mm_andnot_si128(hev, filt);
work_a = signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd);
work_a = _mm_adds_epi16(work_a, t80);
q0 = _mm_load_si128((__m128i *)flat_oq0);
work_a = _mm_andnot_si128(flat, work_a);
work_a = _mm_andnot_si128(flat, qs0);
q0 = _mm_and_si128(flat, q0);
q0 = _mm_or_si128(work_a, q0);
work_a = signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd);
work_a = _mm_adds_epi16(work_a, t80);
q1 = _mm_load_si128((__m128i *)flat_oq1);
work_a = _mm_andnot_si128(flat, work_a);
work_a = _mm_andnot_si128(flat, qs1);
q1 = _mm_and_si128(flat, q1);
q1 = _mm_or_si128(work_a, q1);
@@ -659,17 +594,13 @@ void vp9_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
q2 = _mm_and_si128(flat, q2);
q2 = _mm_or_si128(work_a, q2);
work_a = signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd);
work_a = _mm_adds_epi16(work_a, t80);
p0 = _mm_load_si128((__m128i *)flat_op0);
work_a = _mm_andnot_si128(flat, work_a);
work_a = _mm_andnot_si128(flat, ps0);
p0 = _mm_and_si128(flat, p0);
p0 = _mm_or_si128(work_a, p0);
work_a = signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd);
work_a = _mm_adds_epi16(work_a, t80);
p1 = _mm_load_si128((__m128i *)flat_op1);
work_a = _mm_andnot_si128(flat, work_a);
work_a = _mm_andnot_si128(flat, ps1);
p1 = _mm_and_si128(flat, p1);
p1 = _mm_or_si128(work_a, p1);
@@ -715,7 +646,7 @@ void vp9_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
const __m128i thresh = _mm_slli_epi16(
_mm_unpacklo_epi8(
_mm_load_si128((const __m128i *)_thresh), zero), bd - 8);
__m128i mask, hev, flat;
__m128i mask, hev;
__m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
__m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
__m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
@@ -724,121 +655,36 @@ void vp9_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
__m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
__m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
__m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0),
_mm_subs_epu16(p0, p1));
const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0),
_mm_subs_epu16(q0, q1));
const __m128i ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0);
const __m128i ffff = _mm_cmpeq_epi16(zero, zero);
const __m128i one = _mm_set1_epi16(1);
__m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0),
_mm_subs_epu16(q0, p0));
__m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1),
_mm_subs_epu16(q1, p1));
__m128i work;
const __m128i t4 = _mm_set1_epi16(4);
const __m128i t3 = _mm_set1_epi16(3);
const __m128i t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), bd - 8);
const __m128i tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), bd - 8);
const __m128i tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), bd - 8);
const __m128i t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 16 - bd);
// equivalent to shifting 0x1f left by bitdepth - 8
// and setting new bits to 1
const __m128i t1 = _mm_set1_epi16(0x1);
const __m128i t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 16 - bd);
// equivalent to shifting 0x7f left by bitdepth - 8
// and setting new bits to 1
const __m128i ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * p)),
t80);
const __m128i ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * p)),
t80);
const __m128i qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * p)),
t80);
const __m128i qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * p)),
t80);
__m128i filt;
__m128i work_a;
__m128i filter1, filter2;
__m128i ps1, ps0, qs0, qs1;
__m128i abs_p1p0, abs_q1q0, abs_p0q0, abs_p1q1;
__m128i abs_p2p1, abs_q2q1, abs_p3p2, abs_q3q2;
(void)count;
// filter_mask and hev_mask
flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
hev = _mm_subs_epu16(flat, thresh);
hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
abs_p1p0 = highbd_abs_diff(p1, p0);
abs_q1q0 = highbd_abs_diff(q1, q0);
abs_p0q0 = highbd_abs_diff(p0, q0);
abs_p1q1 = highbd_abs_diff(p1, q1);
abs_p2p1 = highbd_abs_diff(p2, p1);
abs_q2q1 = highbd_abs_diff(q2, q1);
abs_p3p2 = highbd_abs_diff(p3, p2);
abs_q3q2 = highbd_abs_diff(q3, q2);
abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
// mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
// So taking maximums continues to work:
mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
mask = _mm_max_epi16(flat, mask);
// mask |= (abs(p1 - p0) > limit) * -1;
// mask |= (abs(q1 - q0) > limit) * -1;
work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p1),
_mm_subs_epu16(p1, p2)),
_mm_or_si128(_mm_subs_epu16(p3, p2),
_mm_subs_epu16(p2, p3)));
mask = _mm_max_epi16(work, mask);
work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(q2, q1),
_mm_subs_epu16(q1, q2)),
_mm_or_si128(_mm_subs_epu16(q3, q2),
_mm_subs_epu16(q2, q3)));
mask = _mm_max_epi16(work, mask);
mask = _mm_subs_epu16(mask, limit);
mask = _mm_cmpeq_epi16(mask, zero);
get_hev_and_mask(thresh, limit, blimit, zero, one, ffff, abs_p1p0, abs_q1q0,
abs_p0q0, abs_p1q1, abs_p2p1, abs_q2q1, abs_p3p2, abs_q3q2,
&hev, &mask);
// filter4
filt = signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd);
filt = _mm_and_si128(filt, hev);
work_a = _mm_subs_epi16(qs0, ps0);
filt = _mm_adds_epi16(filt, work_a);
filt = _mm_adds_epi16(filt, work_a);
filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd);
// (vp9_filter + 3 * (qs0 - ps0)) & mask
filt = _mm_and_si128(filt, mask);
highbd_filter4_sse2(mask, hev, p1, p0, q1, q0, &ps1, &ps0, &qs0, &qs1, bd);
filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd);
filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd);
// Filter1 >> 3
work_a = _mm_cmpgt_epi16(zero, filter1); // get the values that are <0
filter1 = _mm_srli_epi16(filter1, 3);
work_a = _mm_and_si128(work_a, tffe0); // sign bits for the values < 0
filter1 = _mm_and_si128(filter1, t1f); // clamp the range
filter1 = _mm_or_si128(filter1, work_a); // reinsert the sign bits
// Filter2 >> 3
work_a = _mm_cmpgt_epi16(zero, filter2);
filter2 = _mm_srli_epi16(filter2, 3);
work_a = _mm_and_si128(work_a, tffe0);
filter2 = _mm_and_si128(filter2, t1f);
filter2 = _mm_or_si128(filter2, work_a);
// filt >> 1
filt = _mm_adds_epi16(filter1, t1);
work_a = _mm_cmpgt_epi16(zero, filt);
filt = _mm_srli_epi16(filt, 1);
work_a = _mm_and_si128(work_a, tff80);
filt = _mm_and_si128(filt, t7f);
filt = _mm_or_si128(filt, work_a);
filt = _mm_andnot_si128(hev, filt);
q0 = _mm_adds_epi16(
signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd), t80);
q1 = _mm_adds_epi16(
signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd), t80);
p0 = _mm_adds_epi16(
signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd), t80);
p1 = _mm_adds_epi16(
signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd), t80);
_mm_storeu_si128((__m128i *)(s - 2 * p), p1);
_mm_storeu_si128((__m128i *)(s - 1 * p), p0);
_mm_storeu_si128((__m128i *)(s + 0 * p), q0);
_mm_storeu_si128((__m128i *)(s + 1 * p), q1);
_mm_storeu_si128((__m128i *)(s - 2 * p), ps1);
_mm_storeu_si128((__m128i *)(s - 1 * p), ps0);
_mm_storeu_si128((__m128i *)(s + 0 * p), qs0);
_mm_storeu_si128((__m128i *)(s + 1 * p), qs1);
}
void vp9_highbd_lpf_horizontal_4_dual_sse2(uint16_t *s, int p,


@@ -19,42 +19,34 @@
mov rcx, 0x00000040
movdqa xmm7, [rdx] ;load filters
pshuflw xmm0, xmm7, 0b ;k0
pshuflw xmm1, xmm7, 01010101b ;k1
pshuflw xmm2, xmm7, 10101010b ;k2
pshuflw xmm3, xmm7, 11111111b ;k3
pshuflw xmm10, xmm7, 0b ;k0
pshuflw xmm11, xmm7, 01010101b ;k1
pshuflw xmm12, xmm7, 10101010b ;k2
pshuflw xmm13, xmm7, 11111111b ;k3
psrldq xmm7, 8
pshuflw xmm4, xmm7, 0b ;k4
pshuflw xmm5, xmm7, 01010101b ;k5
pshuflw xmm6, xmm7, 10101010b ;k6
pshuflw xmm7, xmm7, 11111111b ;k7
punpcklwd xmm0, xmm6
punpcklwd xmm2, xmm5
punpcklwd xmm3, xmm4
punpcklwd xmm1, xmm7
punpcklwd xmm10, xmm6
punpcklwd xmm12, xmm5
punpcklwd xmm13, xmm4
punpcklwd xmm11, xmm7
movdqa k0k6, xmm0
movdqa k2k5, xmm2
movdqa k3k4, xmm3
movdqa k1k7, xmm1
movq xmm6, rcx
pshufd xmm6, xmm6, 0
movdqa krd, xmm6
movq xmm9, rcx
pshufd xmm9, xmm9, 0
;Compute max and min values of a pixel
mov rdx, 0x00010001
movsxd rcx, DWORD PTR arg(6) ;bps
movq xmm0, rdx
movq xmm14, rdx
movq xmm1, rcx
pshufd xmm0, xmm0, 0b
movdqa xmm2, xmm0
psllw xmm0, xmm1
psubw xmm0, xmm2
pxor xmm1, xmm1
movdqa max, xmm0 ;max value (for clamping)
movdqa min, xmm1 ;min value (for clamping)
pshufd xmm14, xmm14, 0b
movdqa xmm2, xmm14
psllw xmm14, xmm1
psubw xmm14, xmm2 ;max value (for clamping)
pxor xmm8, xmm8 ;min value (for clamping)
%endm
@@ -64,22 +56,22 @@
punpcklwd xmm2, xmm5
punpcklwd xmm3, xmm4
pmaddwd xmm0, k0k6 ;multiply the filter factors
pmaddwd xmm1, k1k7
pmaddwd xmm2, k2k5
pmaddwd xmm3, k3k4
pmaddwd xmm0, xmm10 ;multiply the filter factors
pmaddwd xmm1, xmm11
pmaddwd xmm2, xmm12
pmaddwd xmm3, xmm13
paddd xmm0, xmm1 ;sum
paddd xmm0, xmm2
paddd xmm0, xmm3
paddd xmm0, krd ;rounding
paddd xmm0, xmm9 ;rounding
psrad xmm0, 7 ;shift
packssdw xmm0, xmm0 ;pack to word
;clamp the values
pminsw xmm0, max
pmaxsw xmm0, min
pminsw xmm0, xmm14
pmaxsw xmm0, xmm8
%if %1
movq xmm1, [rdi]
@@ -95,42 +87,34 @@
mov rcx, 0x00000040
movdqa xmm7, [rdx] ;load filters
pshuflw xmm0, xmm7, 0b ;k0
pshuflw xmm10, xmm7, 0b ;k0
pshuflw xmm1, xmm7, 01010101b ;k1
pshuflw xmm2, xmm7, 10101010b ;k2
pshuflw xmm3, xmm7, 11111111b ;k3
pshuflw xmm12, xmm7, 10101010b ;k2
pshuflw xmm13, xmm7, 11111111b ;k3
pshufhw xmm4, xmm7, 0b ;k4
pshufhw xmm5, xmm7, 01010101b ;k5
pshufhw xmm6, xmm7, 10101010b ;k6
pshufhw xmm11, xmm7, 10101010b ;k6
pshufhw xmm7, xmm7, 11111111b ;k7
punpcklqdq xmm2, xmm2
punpcklqdq xmm3, xmm3
punpcklwd xmm0, xmm1
punpckhwd xmm6, xmm7
punpckhwd xmm2, xmm5
punpckhwd xmm3, xmm4
punpcklqdq xmm12, xmm12
punpcklqdq xmm13, xmm13
punpcklwd xmm10, xmm1
punpckhwd xmm11, xmm7
punpckhwd xmm12, xmm5
punpckhwd xmm13, xmm4
movdqa k0k1, xmm0 ;store filter factors on stack
movdqa k6k7, xmm6
movdqa k2k5, xmm2
movdqa k3k4, xmm3
movq xmm6, rcx
pshufd xmm6, xmm6, 0
movdqa krd, xmm6 ;rounding
movq xmm9, rcx
pshufd xmm9, xmm9, 0 ;rounding
;Compute max and min values of a pixel
mov rdx, 0x00010001
movsxd rcx, DWORD PTR arg(6) ;bps
movq xmm0, rdx
movq xmm14, rdx
movq xmm1, rcx
pshufd xmm0, xmm0, 0b
movdqa xmm2, xmm0
psllw xmm0, xmm1
psubw xmm0, xmm2
pxor xmm1, xmm1
movdqa max, xmm0 ;max value (for clamping)
movdqa min, xmm1 ;min value (for clamping)
pshufd xmm14, xmm14, 0b
movdqa xmm2, xmm14
psllw xmm14, xmm1
psubw xmm14, xmm2 ;max value (for clamping)
pxor xmm15, xmm15 ;min value (for clamping)
%endm
%macro LOAD_VERT_8 1
@@ -146,7 +130,7 @@
%endm
%macro HIGH_APPLY_FILTER_8 2
movdqu temp, xmm4
movdqa xmm8, xmm4
movdqa xmm4, xmm0
punpcklwd xmm0, xmm1
punpckhwd xmm4, xmm1
@@ -157,21 +141,21 @@
punpcklwd xmm2, xmm5
punpckhwd xmm7, xmm5
movdqu xmm5, temp
movdqu temp, xmm4
movdqa xmm5, xmm8
movdqa xmm8, xmm4
movdqa xmm4, xmm3
punpcklwd xmm3, xmm5
punpckhwd xmm4, xmm5
movdqu xmm5, temp
movdqa xmm5, xmm8
pmaddwd xmm0, k0k1
pmaddwd xmm5, k0k1
pmaddwd xmm6, k6k7
pmaddwd xmm1, k6k7
pmaddwd xmm2, k2k5
pmaddwd xmm7, k2k5
pmaddwd xmm3, k3k4
pmaddwd xmm4, k3k4
pmaddwd xmm0, xmm10
pmaddwd xmm5, xmm10
pmaddwd xmm6, xmm11
pmaddwd xmm1, xmm11
pmaddwd xmm2, xmm12
pmaddwd xmm7, xmm12
pmaddwd xmm3, xmm13
pmaddwd xmm4, xmm13
paddd xmm0, xmm6
paddd xmm0, xmm2
@@ -180,15 +164,15 @@
paddd xmm5, xmm7
paddd xmm5, xmm4
paddd xmm0, krd ;rounding
paddd xmm5, krd
paddd xmm0, xmm9 ;rounding
paddd xmm5, xmm9
psrad xmm0, 7 ;shift
psrad xmm5, 7
packssdw xmm0, xmm5 ;pack back to word
;clamp the values
pminsw xmm0, max
pmaxsw xmm0, min
pminsw xmm0, xmm14
pmaxsw xmm0, xmm15
%if %1
movdqu xmm1, [rdi + %2]
@@ -211,22 +195,12 @@ sym(vp9_highbd_filter_block1d4_v8_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
SAVE_XMM 14
push rsi
push rdi
push rbx
; end prolog
ALIGN_STACK 16, rax
sub rsp, 16 * 7
%define k0k6 [rsp + 16 * 0]
%define k2k5 [rsp + 16 * 1]
%define k3k4 [rsp + 16 * 2]
%define k1k7 [rsp + 16 * 3]
%define krd [rsp + 16 * 4]
%define max [rsp + 16 * 5]
%define min [rsp + 16 * 6]
HIGH_GET_FILTERS_4
mov rsi, arg(0) ;src_ptr
@@ -256,8 +230,6 @@ sym(vp9_highbd_filter_block1d4_v8_sse2):
dec rcx
jnz .loop
add rsp, 16 * 7
pop rsp
pop rbx
; begin epilog
pop rdi
@@ -281,23 +253,12 @@ sym(vp9_highbd_filter_block1d8_v8_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
SAVE_XMM 15
push rsi
push rdi
push rbx
; end prolog
ALIGN_STACK 16, rax
sub rsp, 16 * 8
%define k0k1 [rsp + 16 * 0]
%define k6k7 [rsp + 16 * 1]
%define k2k5 [rsp + 16 * 2]
%define k3k4 [rsp + 16 * 3]
%define krd [rsp + 16 * 4]
%define temp [rsp + 16 * 5]
%define max [rsp + 16 * 6]
%define min [rsp + 16 * 7]
HIGH_GET_FILTERS
movsxd rax, DWORD PTR arg(1) ;pixels_per_line
@@ -315,8 +276,6 @@ sym(vp9_highbd_filter_block1d8_v8_sse2):
dec rcx
jnz .loop
add rsp, 16 * 8
pop rsp
pop rbx
; begin epilog
pop rdi
@@ -340,23 +299,12 @@ sym(vp9_highbd_filter_block1d16_v8_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
SAVE_XMM 15
push rsi
push rdi
push rbx
; end prolog
ALIGN_STACK 16, rax
sub rsp, 16 * 8
%define k0k1 [rsp + 16 * 0]
%define k6k7 [rsp + 16 * 1]
%define k2k5 [rsp + 16 * 2]
%define k3k4 [rsp + 16 * 3]
%define krd [rsp + 16 * 4]
%define temp [rsp + 16 * 5]
%define max [rsp + 16 * 6]
%define min [rsp + 16 * 7]
HIGH_GET_FILTERS
movsxd rax, DWORD PTR arg(1) ;pixels_per_line
@@ -378,8 +326,6 @@ sym(vp9_highbd_filter_block1d16_v8_sse2):
dec rcx
jnz .loop
add rsp, 16 * 8
pop rsp
pop rbx
; begin epilog
pop rdi
@@ -394,22 +340,12 @@ sym(vp9_highbd_filter_block1d4_v8_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
SAVE_XMM 14
push rsi
push rdi
push rbx
; end prolog
ALIGN_STACK 16, rax
sub rsp, 16 * 7
%define k0k6 [rsp + 16 * 0]
%define k2k5 [rsp + 16 * 1]
%define k3k4 [rsp + 16 * 2]
%define k1k7 [rsp + 16 * 3]
%define krd [rsp + 16 * 4]
%define max [rsp + 16 * 5]
%define min [rsp + 16 * 6]
HIGH_GET_FILTERS_4
mov rsi, arg(0) ;src_ptr
@@ -439,8 +375,6 @@ sym(vp9_highbd_filter_block1d4_v8_avg_sse2):
dec rcx
jnz .loop
add rsp, 16 * 7
pop rsp
pop rbx
; begin epilog
pop rdi
@@ -455,23 +389,12 @@ sym(vp9_highbd_filter_block1d8_v8_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
SAVE_XMM 15
push rsi
push rdi
push rbx
; end prolog
ALIGN_STACK 16, rax
sub rsp, 16 * 8
%define k0k1 [rsp + 16 * 0]
%define k6k7 [rsp + 16 * 1]
%define k2k5 [rsp + 16 * 2]
%define k3k4 [rsp + 16 * 3]
%define krd [rsp + 16 * 4]
%define temp [rsp + 16 * 5]
%define max [rsp + 16 * 6]
%define min [rsp + 16 * 7]
HIGH_GET_FILTERS
movsxd rax, DWORD PTR arg(1) ;pixels_per_line
@@ -488,8 +411,6 @@ sym(vp9_highbd_filter_block1d8_v8_avg_sse2):
dec rcx
jnz .loop
add rsp, 16 * 8
pop rsp
pop rbx
; begin epilog
pop rdi
@@ -504,23 +425,12 @@ sym(vp9_highbd_filter_block1d16_v8_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
SAVE_XMM 15
push rsi
push rdi
push rbx
; end prolog
ALIGN_STACK 16, rax
sub rsp, 16 * 8
%define k0k1 [rsp + 16 * 0]
%define k6k7 [rsp + 16 * 1]
%define k2k5 [rsp + 16 * 2]
%define k3k4 [rsp + 16 * 3]
%define krd [rsp + 16 * 4]
%define temp [rsp + 16 * 5]
%define max [rsp + 16 * 6]
%define min [rsp + 16 * 7]
HIGH_GET_FILTERS
movsxd rax, DWORD PTR arg(1) ;pixels_per_line
@@ -541,8 +451,6 @@ sym(vp9_highbd_filter_block1d16_v8_avg_sse2):
dec rcx
jnz .loop
add rsp, 16 * 8
pop rsp
pop rbx
; begin epilog
pop rdi
@@ -566,21 +474,11 @@ sym(vp9_highbd_filter_block1d4_h8_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
SAVE_XMM 14
push rsi
push rdi
; end prolog
ALIGN_STACK 16, rax
sub rsp, 16 * 7
%define k0k6 [rsp + 16 * 0]
%define k2k5 [rsp + 16 * 1]
%define k3k4 [rsp + 16 * 2]
%define k1k7 [rsp + 16 * 3]
%define krd [rsp + 16 * 4]
%define max [rsp + 16 * 5]
%define min [rsp + 16 * 6]
HIGH_GET_FILTERS_4
mov rsi, arg(0) ;src_ptr
@@ -592,6 +490,16 @@ sym(vp9_highbd_filter_block1d4_h8_sse2):
lea rdx, [rdx + rdx]
movsxd rcx, DWORD PTR arg(4) ;output_height
;prefetch the source rows once before the filter loop
.load:
prefetcht0 [rsi - 6]
prefetcht0 [rsi + 17]
lea rsi, [rsi + rax]
dec rcx
jnz .load
mov rsi, arg(0) ;src_ptr
movsxd rcx, DWORD PTR arg(4) ;output_height
.loop:
movdqu xmm0, [rsi - 6] ;load src
movdqu xmm4, [rsi + 2]
@@ -616,9 +524,6 @@ sym(vp9_highbd_filter_block1d4_h8_sse2):
dec rcx
jnz .loop
add rsp, 16 * 7
pop rsp
; begin epilog
pop rdi
pop rsi
@@ -641,22 +546,11 @@ sym(vp9_highbd_filter_block1d8_h8_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
SAVE_XMM 15
push rsi
push rdi
; end prolog
ALIGN_STACK 16, rax
sub rsp, 16 * 8
%define k0k1 [rsp + 16 * 0]
%define k6k7 [rsp + 16 * 1]
%define k2k5 [rsp + 16 * 2]
%define k3k4 [rsp + 16 * 3]
%define krd [rsp + 16 * 4]
%define temp [rsp + 16 * 5]
%define max [rsp + 16 * 6]
%define min [rsp + 16 * 7]
HIGH_GET_FILTERS
movsxd rax, DWORD PTR arg(1) ;pixels_per_line
@@ -665,6 +559,16 @@ sym(vp9_highbd_filter_block1d8_h8_sse2):
lea rdx, [rdx + rdx]
movsxd rcx, DWORD PTR arg(4) ;output_height
;prefetch the source rows once before the filter loop
.load:
prefetcht0 [rsi - 6]
prefetcht0 [rsi + 23]
lea rsi, [rsi + rax]
dec rcx
jnz .load
mov rsi, arg(0) ;src_ptr
movsxd rcx, DWORD PTR arg(4) ;output_height
.loop:
movdqu xmm0, [rsi - 6] ;load src
movdqu xmm1, [rsi - 4]
@@ -682,9 +586,6 @@ sym(vp9_highbd_filter_block1d8_h8_sse2):
dec rcx
jnz .loop
add rsp, 16 * 8
pop rsp
; begin epilog
pop rdi
pop rsi
@@ -707,22 +608,11 @@ sym(vp9_highbd_filter_block1d16_h8_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
SAVE_XMM 15
push rsi
push rdi
; end prolog
ALIGN_STACK 16, rax
sub rsp, 16 * 8
%define k0k1 [rsp + 16 * 0]
%define k6k7 [rsp + 16 * 1]
%define k2k5 [rsp + 16 * 2]
%define k3k4 [rsp + 16 * 3]
%define krd [rsp + 16 * 4]
%define temp [rsp + 16 * 5]
%define max [rsp + 16 * 6]
%define min [rsp + 16 * 7]
HIGH_GET_FILTERS
movsxd rax, DWORD PTR arg(1) ;pixels_per_line
@@ -731,6 +621,16 @@ sym(vp9_highbd_filter_block1d16_h8_sse2):
lea rdx, [rdx + rdx]
movsxd rcx, DWORD PTR arg(4) ;output_height
;prefetch the source rows once before the filter loop
.load:
prefetcht0 [rsi - 6]
prefetcht0 [rsi + 31]
lea rsi, [rsi + rax]
dec rcx
jnz .load
mov rsi, arg(0) ;src_ptr
movsxd rcx, DWORD PTR arg(4) ;output_height
.loop:
movdqu xmm0, [rsi - 6] ;load src
movdqu xmm1, [rsi - 4]
@@ -759,9 +659,6 @@ sym(vp9_highbd_filter_block1d16_h8_sse2):
dec rcx
jnz .loop
add rsp, 16 * 8
pop rsp
; begin epilog
pop rdi
pop rsi
@@ -775,21 +672,11 @@ sym(vp9_highbd_filter_block1d4_h8_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
SAVE_XMM 14
push rsi
push rdi
; end prolog
ALIGN_STACK 16, rax
sub rsp, 16 * 7
%define k0k6 [rsp + 16 * 0]
%define k2k5 [rsp + 16 * 1]
%define k3k4 [rsp + 16 * 2]
%define k1k7 [rsp + 16 * 3]
%define krd [rsp + 16 * 4]
%define max [rsp + 16 * 5]
%define min [rsp + 16 * 6]
HIGH_GET_FILTERS_4
mov rsi, arg(0) ;src_ptr
@@ -801,6 +688,16 @@ sym(vp9_highbd_filter_block1d4_h8_avg_sse2):
lea rdx, [rdx + rdx]
movsxd rcx, DWORD PTR arg(4) ;output_height
;prefetch the source rows once before the filter loop
.load:
prefetcht0 [rsi - 6]
prefetcht0 [rsi + 17]
lea rsi, [rsi + rax]
dec rcx
jnz .load
mov rsi, arg(0) ;src_ptr
movsxd rcx, DWORD PTR arg(4) ;output_height
.loop:
movdqu xmm0, [rsi - 6] ;load src
movdqu xmm4, [rsi + 2]
@@ -825,9 +722,6 @@ sym(vp9_highbd_filter_block1d4_h8_avg_sse2):
dec rcx
jnz .loop
add rsp, 16 * 7
pop rsp
; begin epilog
pop rdi
pop rsi
@@ -841,22 +735,11 @@ sym(vp9_highbd_filter_block1d8_h8_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
SAVE_XMM 15
push rsi
push rdi
; end prolog
ALIGN_STACK 16, rax
sub rsp, 16 * 8
%define k0k1 [rsp + 16 * 0]
%define k6k7 [rsp + 16 * 1]
%define k2k5 [rsp + 16 * 2]
%define k3k4 [rsp + 16 * 3]
%define krd [rsp + 16 * 4]
%define temp [rsp + 16 * 5]
%define max [rsp + 16 * 6]
%define min [rsp + 16 * 7]
HIGH_GET_FILTERS
movsxd rax, DWORD PTR arg(1) ;pixels_per_line
@@ -865,6 +748,16 @@ sym(vp9_highbd_filter_block1d8_h8_avg_sse2):
lea rdx, [rdx + rdx]
movsxd rcx, DWORD PTR arg(4) ;output_height
;prefetch the source rows once before the filter loop
.load:
prefetcht0 [rsi - 6]
prefetcht0 [rsi + 23]
lea rsi, [rsi + rax]
dec rcx
jnz .load
mov rsi, arg(0) ;src_ptr
movsxd rcx, DWORD PTR arg(4) ;output_height
.loop:
movdqu xmm0, [rsi - 6] ;load src
movdqu xmm1, [rsi - 4]
@@ -882,8 +775,6 @@ sym(vp9_highbd_filter_block1d8_h8_avg_sse2):
dec rcx
jnz .loop
add rsp, 16 * 8
pop rsp
; begin epilog
pop rdi
@@ -898,22 +789,11 @@ sym(vp9_highbd_filter_block1d16_h8_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
SAVE_XMM 15
push rsi
push rdi
; end prolog
ALIGN_STACK 16, rax
sub rsp, 16 * 8
%define k0k1 [rsp + 16 * 0]
%define k6k7 [rsp + 16 * 1]
%define k2k5 [rsp + 16 * 2]
%define k3k4 [rsp + 16 * 3]
%define krd [rsp + 16 * 4]
%define temp [rsp + 16 * 5]
%define max [rsp + 16 * 6]
%define min [rsp + 16 * 7]
HIGH_GET_FILTERS
movsxd rax, DWORD PTR arg(1) ;pixels_per_line
@@ -922,6 +802,16 @@ sym(vp9_highbd_filter_block1d16_h8_avg_sse2):
lea rdx, [rdx + rdx]
movsxd rcx, DWORD PTR arg(4) ;output_height
;prefetch the source rows once before the filter loop
.load:
prefetcht0 [rsi - 6]
prefetcht0 [rsi + 31]
lea rsi, [rsi + rax]
dec rcx
jnz .load
mov rsi, arg(0) ;src_ptr
movsxd rcx, DWORD PTR arg(4) ;output_height
.loop:
movdqu xmm0, [rsi - 6] ;load src
movdqu xmm1, [rsi - 4]
@@ -950,9 +840,6 @@ sym(vp9_highbd_filter_block1d16_h8_avg_sse2):
dec rcx
jnz .loop
add rsp, 16 * 8
pop rsp
; begin epilog
pop rdi
pop rsi


@@ -9,6 +9,7 @@
*/
#include "vp9/common/x86/vp9_idct_intrin_sse2.h"
#include "vp9/common/vp9_idct.h"
#define RECON_AND_STORE4X4(dest, in_x) \
{ \
@@ -3985,3 +3986,573 @@ void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
dest += 8 - (stride * 32);
}
}
#if CONFIG_VP9_HIGHBITDEPTH
static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {
__m128i ubounded, retval;
const __m128i zero = _mm_set1_epi16(0);
const __m128i one = _mm_set1_epi16(1);
const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one);
ubounded = _mm_cmpgt_epi16(value, max);
retval = _mm_andnot_si128(ubounded, value);
ubounded = _mm_and_si128(ubounded, max);
retval = _mm_or_si128(retval, ubounded);
retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero));
return retval;
}
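// The branch-free select above clamps each lane to [0, (1 << bd) - 1]; a
// scalar sketch of the same operation (illustrative only):
//   static INLINE uint16_t clamp_high_c(int32_t value, int bd) {
//     const int32_t max = (1 << bd) - 1;
//     return (uint16_t)(value < 0 ? 0 : (value > max ? max : value));
//   }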
void vp9_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8,
int stride, int bd) {
tran_low_t out[4 * 4];
tran_low_t *outptr = out;
int i, j;
__m128i inptr[4];
__m128i sign_bits[2];
__m128i temp_mm, min_input, max_input;
int test;
uint16_t * dest = CONVERT_TO_SHORTPTR(dest8);
int optimised_cols = 0;
const __m128i zero = _mm_set1_epi16(0);
const __m128i eight = _mm_set1_epi16(8);
const __m128i max = _mm_set1_epi16(12043);
const __m128i min = _mm_set1_epi16(-12043);
// Load input into __m128i
inptr[0] = _mm_loadu_si128((const __m128i *)input);
inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4));
inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8));
inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12));
// Pack to 16 bits
inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]);
inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]);
max_input = _mm_max_epi16(inptr[0], inptr[1]);
min_input = _mm_min_epi16(inptr[0], inptr[1]);
max_input = _mm_cmpgt_epi16(max_input, max);
min_input = _mm_cmplt_epi16(min_input, min);
temp_mm = _mm_or_si128(max_input, min_input);
test = _mm_movemask_epi8(temp_mm);
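// A non-zero movemask means some coefficient lies outside [-12043, 12043],
// the range (assumed chosen for the 4x4 transform) inside which the 16-bit
// SSE2 path cannot overflow; in that case we fall back to the 32-bit C
// transform below.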
if (!test) {
// Do the row transform
idct4_sse2(inptr);
// Check the min & max values
max_input = _mm_max_epi16(inptr[0], inptr[1]);
min_input = _mm_min_epi16(inptr[0], inptr[1]);
max_input = _mm_cmpgt_epi16(max_input, max);
min_input = _mm_cmplt_epi16(min_input, min);
temp_mm = _mm_or_si128(max_input, min_input);
test = _mm_movemask_epi8(temp_mm);
if (test) {
transpose_4x4(inptr);
sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero);
sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero);
inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]);
inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]);
inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]);
inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]);
_mm_storeu_si128((__m128i*)outptr, inptr[0]);
_mm_storeu_si128((__m128i*)(outptr + 4), inptr[1]);
_mm_storeu_si128((__m128i*)(outptr + 8), inptr[2]);
_mm_storeu_si128((__m128i*)(outptr + 12), inptr[3]);
} else {
// Set to use the optimised transform for the column
optimised_cols = 1;
}
} else {
// Run the un-optimised row transform
for (i = 0; i < 4; ++i) {
vp9_highbd_idct4(input, outptr, bd);
input += 4;
outptr += 4;
}
}
if (optimised_cols) {
idct4_sse2(inptr);
// Final round and shift
inptr[0] = _mm_add_epi16(inptr[0], eight);
inptr[1] = _mm_add_epi16(inptr[1], eight);
inptr[0] = _mm_srai_epi16(inptr[0], 4);
inptr[1] = _mm_srai_epi16(inptr[1], 4);
// Reconstruction and Store
{
__m128i d0 = _mm_loadl_epi64((const __m128i *)dest);
__m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2));
d0 = _mm_unpacklo_epi64(d0,
_mm_loadl_epi64((const __m128i *)(dest + stride)));
d2 = _mm_unpacklo_epi64(d2,
_mm_loadl_epi64((const __m128i *)(dest + stride * 3)));
d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd);
d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd);
// store input0
_mm_storel_epi64((__m128i *)dest, d0);
// store input1
d0 = _mm_srli_si128(d0, 8);
_mm_storel_epi64((__m128i *)(dest + stride), d0);
// store input2
_mm_storel_epi64((__m128i *)(dest + stride * 2), d2);
// store input3
d2 = _mm_srli_si128(d2, 8);
_mm_storel_epi64((__m128i *)(dest + stride * 3), d2);
}
} else {
// Run the un-optimised column transform
tran_low_t temp_in[4], temp_out[4];
// Columns
for (i = 0; i < 4; ++i) {
for (j = 0; j < 4; ++j)
temp_in[j] = out[j * 4 + i];
vp9_highbd_idct4(temp_in, temp_out, bd);
for (j = 0; j < 4; ++j)
dest[j * stride + i] = highbd_clip_pixel_add(dest[j * stride + i],
ROUND_POWER_OF_TWO(temp_out[j], 4),
bd);
}
}
}
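// The 8x8 and 16x16 variants below follow the same three-way pattern: pack
// the coefficients to 16 bits, range-check them, and run the fast SSE2 row
// transform; re-check before the column pass, then either finish in SSE2
// (optimised_cols) or store the rows back sign-extended and let the C
// column transform handle the rest.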
void vp9_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8,
int stride, int bd) {
tran_low_t out[8 * 8];
tran_low_t *outptr = out;
int i, j, test;
__m128i inptr[8];
__m128i min_input, max_input, temp1, temp2, sign_bits;
uint16_t * dest = CONVERT_TO_SHORTPTR(dest8);
const __m128i zero = _mm_set1_epi16(0);
const __m128i sixteen = _mm_set1_epi16(16);
const __m128i max = _mm_set1_epi16(6201);
const __m128i min = _mm_set1_epi16(-6201);
int optimised_cols = 0;
// Load input into __m128i & pack to 16 bits
for (i = 0; i < 8; i++) {
temp1 = _mm_loadu_si128((const __m128i *)(input + 8*i));
temp2 = _mm_loadu_si128((const __m128i *)(input + 8*i + 4));
inptr[i] = _mm_packs_epi32(temp1, temp2);
}
// Find the min & max for the row transform
max_input = _mm_max_epi16(inptr[0], inptr[1]);
min_input = _mm_min_epi16(inptr[0], inptr[1]);
for (i = 2; i < 8; i++) {
max_input = _mm_max_epi16(max_input, inptr[i]);
min_input = _mm_min_epi16(min_input, inptr[i]);
}
max_input = _mm_cmpgt_epi16(max_input, max);
min_input = _mm_cmplt_epi16(min_input, min);
temp1 = _mm_or_si128(max_input, min_input);
test = _mm_movemask_epi8(temp1);
if (!test) {
// Do the row transform
idct8_sse2(inptr);
// Find the min & max for the column transform
max_input = _mm_max_epi16(inptr[0], inptr[1]);
min_input = _mm_min_epi16(inptr[0], inptr[1]);
for (i = 2; i < 8; i++) {
max_input = _mm_max_epi16(max_input, inptr[i]);
min_input = _mm_min_epi16(min_input, inptr[i]);
}
max_input = _mm_cmpgt_epi16(max_input, max);
min_input = _mm_cmplt_epi16(min_input, min);
temp1 = _mm_or_si128(max_input, min_input);
test = _mm_movemask_epi8(temp1);
if (test) {
array_transpose_8x8(inptr, inptr);
for (i = 0; i < 8; i++) {
sign_bits = _mm_cmplt_epi16(inptr[i], zero);
temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
_mm_storeu_si128((__m128i*)(outptr + 4*(2*i+1)), temp1);
_mm_storeu_si128((__m128i*)(outptr + 4*(2*i)), temp2);
}
} else {
// Set to use the optimised transform for the column
optimised_cols = 1;
}
} else {
// Run the un-optimised row transform
for (i = 0; i < 8; ++i) {
vp9_highbd_idct8(input, outptr, bd);
input += 8;
outptr += 8;
}
}
if (optimised_cols) {
idct8_sse2(inptr);
// Final round & shift and Reconstruction and Store
{
__m128i d[8];
for (i = 0; i < 8; i++) {
inptr[i] = _mm_add_epi16(inptr[i], sixteen);
d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
inptr[i] = _mm_srai_epi16(inptr[i], 5);
d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
// Store
_mm_storeu_si128((__m128i *)(dest + stride*i), d[i]);
}
}
} else {
// Run the un-optimised column transform
tran_low_t temp_in[8], temp_out[8];
for (i = 0; i < 8; ++i) {
for (j = 0; j < 8; ++j)
temp_in[j] = out[j * 8 + i];
vp9_highbd_idct8(temp_in, temp_out, bd);
for (j = 0; j < 8; ++j)
dest[j * stride + i] = highbd_clip_pixel_add(dest[j * stride + i],
ROUND_POWER_OF_TWO(temp_out[j], 5),
bd);
}
}
}
void vp9_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
int stride, int bd) {
tran_low_t out[8 * 8] = { 0 };
tran_low_t *outptr = out;
int i, j, test;
__m128i inptr[8];
__m128i min_input, max_input, temp1, temp2, sign_bits;
uint16_t * dest = CONVERT_TO_SHORTPTR(dest8);
const __m128i zero = _mm_set1_epi16(0);
const __m128i sixteen = _mm_set1_epi16(16);
const __m128i max = _mm_set1_epi16(6201);
const __m128i min = _mm_set1_epi16(-6201);
int optimised_cols = 0;
// Load input into __m128i & pack to 16 bits
for (i = 0; i < 8; i++) {
temp1 = _mm_loadu_si128((const __m128i *)(input + 8*i));
temp2 = _mm_loadu_si128((const __m128i *)(input + 8*i + 4));
inptr[i] = _mm_packs_epi32(temp1, temp2);
}
// Find the min & max for the row transform
// only the first 4 rows have non-zero coefs
max_input = _mm_max_epi16(inptr[0], inptr[1]);
min_input = _mm_min_epi16(inptr[0], inptr[1]);
for (i = 2; i < 4; i++) {
max_input = _mm_max_epi16(max_input, inptr[i]);
min_input = _mm_min_epi16(min_input, inptr[i]);
}
max_input = _mm_cmpgt_epi16(max_input, max);
min_input = _mm_cmplt_epi16(min_input, min);
temp1 = _mm_or_si128(max_input, min_input);
test = _mm_movemask_epi8(temp1);
if (!test) {
// Do the row transform
idct8_sse2(inptr);
// Find the min & max for the column transform
// N.B. Only the first 4 cols contain non-zero coeffs
max_input = _mm_max_epi16(inptr[0], inptr[1]);
min_input = _mm_min_epi16(inptr[0], inptr[1]);
for (i = 2; i < 8; i++) {
max_input = _mm_max_epi16(max_input, inptr[i]);
min_input = _mm_min_epi16(min_input, inptr[i]);
}
max_input = _mm_cmpgt_epi16(max_input, max);
min_input = _mm_cmplt_epi16(min_input, min);
temp1 = _mm_or_si128(max_input, min_input);
test = _mm_movemask_epi8(temp1);
if (test) {
// Use the fact that only the first 4 rows contain non-zero coeffs
array_transpose_4X8(inptr, inptr);
for (i = 0; i < 4; i++) {
sign_bits = _mm_cmplt_epi16(inptr[i], zero);
temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
_mm_storeu_si128((__m128i*)(outptr + 4*(2*i+1)), temp1);
_mm_storeu_si128((__m128i*)(outptr + 4*(2*i)), temp2);
}
} else {
// Set to use the optimised transform for the column
optimised_cols = 1;
}
} else {
// Run the un-optimised row transform
for (i = 0; i < 4; ++i) {
vp9_highbd_idct8(input, outptr, bd);
input += 8;
outptr += 8;
}
}
if (optimised_cols) {
idct8_sse2(inptr);
// Final round & shift and Reconstruction and Store
{
__m128i d[8];
for (i = 0; i < 8; i++) {
inptr[i] = _mm_add_epi16(inptr[i], sixteen);
d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
inptr[i] = _mm_srai_epi16(inptr[i], 5);
d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
// Store
_mm_storeu_si128((__m128i *)(dest + stride*i), d[i]);
}
}
} else {
// Run the un-optimised column transform
tran_low_t temp_in[8], temp_out[8];
for (i = 0; i < 8; ++i) {
for (j = 0; j < 8; ++j)
temp_in[j] = out[j * 8 + i];
vp9_highbd_idct8(temp_in, temp_out, bd);
for (j = 0; j < 8; ++j)
dest[j * stride + i] = highbd_clip_pixel_add(dest[j * stride + i],
ROUND_POWER_OF_TWO(temp_out[j], 5),
bd);
}
}
}
void vp9_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8,
int stride, int bd) {
tran_low_t out[16 * 16];
tran_low_t *outptr = out;
int i, j, test;
__m128i inptr[32];
__m128i min_input, max_input, temp1, temp2, sign_bits;
uint16_t * dest = CONVERT_TO_SHORTPTR(dest8);
const __m128i zero = _mm_set1_epi16(0);
const __m128i rounding = _mm_set1_epi16(32);
const __m128i max = _mm_set1_epi16(3155);
const __m128i min = _mm_set1_epi16(-3155);
int optimised_cols = 0;
// Load input into __m128i & pack to 16 bits
for (i = 0; i < 16; i++) {
temp1 = _mm_loadu_si128((const __m128i *)(input + 16*i));
temp2 = _mm_loadu_si128((const __m128i *)(input + 16*i + 4));
inptr[i] = _mm_packs_epi32(temp1, temp2);
temp1 = _mm_loadu_si128((const __m128i *)(input + 16*i + 8));
temp2 = _mm_loadu_si128((const __m128i *)(input + 16*i + 12));
inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
}
// Find the min & max for the row transform
max_input = _mm_max_epi16(inptr[0], inptr[1]);
min_input = _mm_min_epi16(inptr[0], inptr[1]);
for (i = 2; i < 32; i++) {
max_input = _mm_max_epi16(max_input, inptr[i]);
min_input = _mm_min_epi16(min_input, inptr[i]);
}
max_input = _mm_cmpgt_epi16(max_input, max);
min_input = _mm_cmplt_epi16(min_input, min);
temp1 = _mm_or_si128(max_input, min_input);
test = _mm_movemask_epi8(temp1);
if (!test) {
// Do the row transform
idct16_sse2(inptr, inptr + 16);
// Find the min & max for the column transform
max_input = _mm_max_epi16(inptr[0], inptr[1]);
min_input = _mm_min_epi16(inptr[0], inptr[1]);
for (i = 2; i < 32; i++) {
max_input = _mm_max_epi16(max_input, inptr[i]);
min_input = _mm_min_epi16(min_input, inptr[i]);
}
max_input = _mm_cmpgt_epi16(max_input, max);
min_input = _mm_cmplt_epi16(min_input, min);
temp1 = _mm_or_si128(max_input, min_input);
test = _mm_movemask_epi8(temp1);
if (test) {
array_transpose_16x16(inptr, inptr + 16);
for (i = 0; i < 16; i++) {
sign_bits = _mm_cmplt_epi16(inptr[i], zero);
temp1 = _mm_unpacklo_epi16(inptr[i ], sign_bits);
temp2 = _mm_unpackhi_epi16(inptr[i ], sign_bits);
_mm_storeu_si128((__m128i*)(outptr + 4*(i*4)), temp1);
_mm_storeu_si128((__m128i*)(outptr + 4*(i*4+1)), temp2);
sign_bits = _mm_cmplt_epi16(inptr[i+16], zero);
temp1 = _mm_unpacklo_epi16(inptr[i+16], sign_bits);
temp2 = _mm_unpackhi_epi16(inptr[i+16], sign_bits);
_mm_storeu_si128((__m128i*)(outptr + 4*(i*4+2)), temp1);
_mm_storeu_si128((__m128i*)(outptr + 4*(i*4+3)), temp2);
}
} else {
// Set to use the optimised transform for the column
optimised_cols = 1;
}
} else {
// Run the un-optimised row transform
for (i = 0; i < 16; ++i) {
vp9_highbd_idct16(input, outptr, bd);
input += 16;
outptr += 16;
}
}
if (optimised_cols) {
idct16_sse2(inptr, inptr + 16);
// Final round & shift and Reconstruction and Store
{
__m128i d[2];
for (i = 0; i < 16; i++) {
inptr[i ] = _mm_add_epi16(inptr[i ], rounding);
inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding);
d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8));
inptr[i ] = _mm_srai_epi16(inptr[i ], 6);
inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6);
d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i ]), bd);
d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd);
// Store
_mm_storeu_si128((__m128i *)(dest + stride*i), d[0]);
_mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]);
}
}
} else {
// Run the un-optimised column transform
tran_low_t temp_in[16], temp_out[16];
for (i = 0; i < 16; ++i) {
for (j = 0; j < 16; ++j)
temp_in[j] = out[j * 16 + i];
vp9_highbd_idct16(temp_in, temp_out, bd);
for (j = 0; j < 16; ++j)
dest[j * stride + i] = highbd_clip_pixel_add(dest[j * stride + i],
ROUND_POWER_OF_TWO(temp_out[j], 6),
bd);
}
}
}
void vp9_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
int stride, int bd) {
tran_low_t out[16 * 16] = { 0 };
tran_low_t *outptr = out;
int i, j, test;
__m128i inptr[32];
__m128i min_input, max_input, temp1, temp2, sign_bits;
uint16_t * dest = CONVERT_TO_SHORTPTR(dest8);
const __m128i zero = _mm_set1_epi16(0);
const __m128i rounding = _mm_set1_epi16(32);
const __m128i max = _mm_set1_epi16(3155);
const __m128i min = _mm_set1_epi16(-3155);
int optimised_cols = 0;
// Load input into __m128i & pack to 16 bits
for (i = 0; i < 16; i++) {
temp1 = _mm_loadu_si128((const __m128i *)(input + 16*i));
temp2 = _mm_loadu_si128((const __m128i *)(input + 16*i + 4));
inptr[i] = _mm_packs_epi32(temp1, temp2);
temp1 = _mm_loadu_si128((const __m128i *)(input + 16*i + 8));
temp2 = _mm_loadu_si128((const __m128i *)(input + 16*i + 12));
inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
}
// Find the min & max for the row transform
// Since all non-zero dct coefficients are in the upper-left 4x4 area,
// we only need to consider the first 4 rows here.
max_input = _mm_max_epi16(inptr[0], inptr[1]);
min_input = _mm_min_epi16(inptr[0], inptr[1]);
for (i = 2; i < 4; i++) {
max_input = _mm_max_epi16(max_input, inptr[i]);
min_input = _mm_min_epi16(min_input, inptr[i]);
}
max_input = _mm_cmpgt_epi16(max_input, max);
min_input = _mm_cmplt_epi16(min_input, min);
temp1 = _mm_or_si128(max_input, min_input);
test = _mm_movemask_epi8(temp1);
if (!test) {
// Do the row transform (N.B. This transposes inptr)
idct16_sse2(inptr, inptr + 16);
// Find the min & max for the column transform
// N.B. Only the first 4 cols contain non-zero coeffs
max_input = _mm_max_epi16(inptr[0], inptr[1]);
min_input = _mm_min_epi16(inptr[0], inptr[1]);
for (i = 2; i < 16; i++) {
max_input = _mm_max_epi16(max_input, inptr[i]);
min_input = _mm_min_epi16(min_input, inptr[i]);
}
max_input = _mm_cmpgt_epi16(max_input, max);
min_input = _mm_cmplt_epi16(min_input, min);
temp1 = _mm_or_si128(max_input, min_input);
test = _mm_movemask_epi8(temp1);
if (test) {
// Use the fact that only the first 4 rows contain non-zero coeffs
array_transpose_8x8(inptr, inptr);
array_transpose_8x8(inptr + 8, inptr + 16);
for (i = 0; i < 4; i++) {
sign_bits = _mm_cmplt_epi16(inptr[i], zero);
temp1 = _mm_unpacklo_epi16(inptr[i ], sign_bits);
temp2 = _mm_unpackhi_epi16(inptr[i ], sign_bits);
_mm_storeu_si128((__m128i*)(outptr + 4*(i*4)), temp1);
_mm_storeu_si128((__m128i*)(outptr + 4*(i*4+1)), temp2);
sign_bits = _mm_cmplt_epi16(inptr[i+16], zero);
temp1 = _mm_unpacklo_epi16(inptr[i+16], sign_bits);
temp2 = _mm_unpackhi_epi16(inptr[i+16], sign_bits);
_mm_storeu_si128((__m128i*)(outptr + 4*(i*4+2)), temp1);
_mm_storeu_si128((__m128i*)(outptr + 4*(i*4+3)), temp2);
}
} else {
// Set to use the optimised transform for the column
optimised_cols = 1;
}
} else {
// Run the un-optimised row transform
for (i = 0; i < 4; ++i) {
vp9_highbd_idct16(input, outptr, bd);
input += 16;
outptr += 16;
}
}
if (optimised_cols) {
idct16_sse2(inptr, inptr + 16);
// Final round & shift and Reconstruction and Store
{
__m128i d[2];
for (i = 0; i < 16; i++) {
inptr[i ] = _mm_add_epi16(inptr[i ], rounding);
inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding);
d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8));
inptr[i ] = _mm_srai_epi16(inptr[i ], 6);
inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6);
d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i ]), bd);
d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd);
// Store
_mm_storeu_si128((__m128i *)(dest + stride*i), d[0]);
_mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]);
}
}
} else {
// Run the un-optimised column transform
tran_low_t temp_in[16], temp_out[16];
for (i = 0; i < 16; ++i) {
for (j = 0; j < 16; ++j)
temp_in[j] = out[j * 16 + i];
vp9_highbd_idct16(temp_in, temp_out, bd);
for (j = 0; j < 16; ++j)
dest[j * stride + i] = highbd_clip_pixel_add(dest[j * stride + i],
ROUND_POWER_OF_TWO(temp_out[j], 6),
bd);
}
}
}
#endif // CONFIG_VP9_HIGHBITDEPTH


@@ -7,6 +7,7 @@
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vp9/common/vp9_common.h"
#include "vpx_ports/mem.h"
unsigned int vp9_avg_8x8_c(const uint8_t *s, int p) {
@@ -17,3 +18,16 @@ unsigned int vp9_avg_8x8_c(const uint8_t *s, int p) {
return (sum + 32) >> 6;
}
#if CONFIG_VP9_HIGHBITDEPTH
unsigned int vp9_highbd_avg_8x8_c(const uint8_t *s8, int p) {
int i, j;
int sum = 0;
const uint16_t* s = CONVERT_TO_SHORTPTR(s8);
for (i = 0; i < 8; ++i, s+=p)
for (j = 0; j < 8; sum += s[j], ++j) {}
return (sum + 32) >> 6;
}
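// Note: (sum + 32) >> 6 is ROUND_POWER_OF_TWO(sum, 6), i.e. the rounded mean
// of the 64 samples, mirroring the 8-bit vp9_avg_8x8_c above.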
#endif // CONFIG_VP9_HIGHBITDEPTH


@@ -17,6 +17,7 @@
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_idct.h"
#include "vp9/common/vp9_systemdependent.h"
#include "vp9/encoder/vp9_dct.h"
static INLINE tran_high_t fdct_round_shift(tran_high_t input) {
tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
@@ -26,7 +27,7 @@ static INLINE tran_high_t fdct_round_shift(tran_high_t input) {
return rv;
}
static void fdct4(const tran_low_t *input, tran_low_t *output) {
void vp9_fdct4(const tran_low_t *input, tran_low_t *output) {
tran_high_t step[4];
tran_high_t temp1, temp2;
@@ -123,7 +124,7 @@ void vp9_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
}
}
static void fadst4(const tran_low_t *input, tran_low_t *output) {
void vp9_fadst4(const tran_low_t *input, tran_low_t *output) {
tran_high_t x0, x1, x2, x3;
tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
@@ -163,13 +164,6 @@ static void fadst4(const tran_low_t *input, tran_low_t *output) {
output[3] = fdct_round_shift(s3);
}
static const transform_2d FHT_4[] = {
{ fdct4, fdct4 }, // DCT_DCT = 0
{ fadst4, fdct4 }, // ADST_DCT = 1
{ fdct4, fadst4 }, // DCT_ADST = 2
{ fadst4, fadst4 } // ADST_ADST = 3
};
void vp9_fht4x4_c(const int16_t *input, tran_low_t *output,
int stride, int tx_type) {
if (tx_type == DCT_DCT) {
@@ -203,7 +197,7 @@ void vp9_fht4x4_c(const int16_t *input, tran_low_t *output,
}
}
static void fdct8(const tran_low_t *input, tran_low_t *output) {
void vp9_fdct8(const tran_low_t *input, tran_low_t *output) {
tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
tran_high_t t0, t1, t2, t3; // needs32
tran_high_t x0, x1, x2, x3; // canbe16
@@ -331,7 +325,7 @@ void vp9_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
// Rows
for (i = 0; i < 8; ++i) {
fdct8(&intermediate[i * 8], &final_output[i * 8]);
vp9_fdct8(&intermediate[i * 8], &final_output[i * 8]);
for (j = 0; j < 8; ++j)
final_output[j + i * 8] /= 2;
}
@@ -528,7 +522,7 @@ void vp9_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
}
}
static void fadst8(const tran_low_t *input, tran_low_t *output) {
void vp9_fadst8(const tran_low_t *input, tran_low_t *output) {
tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
tran_high_t x0 = input[7];
@@ -599,13 +593,6 @@ static void fadst8(const tran_low_t *input, tran_low_t *output) {
output[7] = - x1;
}
static const transform_2d FHT_8[] = {
{ fdct8, fdct8 }, // DCT_DCT = 0
{ fadst8, fdct8 }, // ADST_DCT = 1
{ fdct8, fadst8 }, // DCT_ADST = 2
{ fadst8, fadst8 } // ADST_ADST = 3
};
void vp9_fht8x8_c(const int16_t *input, tran_low_t *output,
int stride, int tx_type) {
if (tx_type == DCT_DCT) {
@@ -694,7 +681,7 @@ void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {
}
// Rewrote to use same algorithm as others.
static void fdct16(const tran_low_t in[16], tran_low_t out[16]) {
void vp9_fdct16(const tran_low_t in[16], tran_low_t out[16]) {
tran_high_t step1[8]; // canbe16
tran_high_t step2[8]; // canbe16
tran_high_t step3[8]; // canbe16
@@ -835,7 +822,7 @@ static void fdct16(const tran_low_t in[16], tran_low_t out[16]) {
out[15] = fdct_round_shift(temp2);
}
static void fadst16(const tran_low_t *input, tran_low_t *output) {
void vp9_fadst16(const tran_low_t *input, tran_low_t *output) {
tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
tran_high_t s9, s10, s11, s12, s13, s14, s15;
@@ -998,13 +985,6 @@ static void fadst16(const tran_low_t *input, tran_low_t *output) {
output[15] = - x1;
}
static const transform_2d FHT_16[] = {
{ fdct16, fdct16 }, // DCT_DCT = 0
{ fadst16, fdct16 }, // ADST_DCT = 1
{ fdct16, fadst16 }, // DCT_ADST = 2
{ fadst16, fadst16 } // ADST_ADST = 3
};
void vp9_fht16x16_c(const int16_t *input, tran_low_t *output,
int stride, int tx_type) {
if (tx_type == DCT_DCT) {
@@ -1049,7 +1029,7 @@ static INLINE tran_high_t half_round_shift(tran_high_t input) {
return rv;
}
static void fdct32(const tran_high_t *input, tran_high_t *output, int round) {
void vp9_fdct32(const tran_high_t *input, tran_high_t *output, int round) {
tran_high_t step[32];
// Stage 1
step[0] = input[0] + input[(32 - 1)];
@@ -1392,7 +1372,7 @@ void vp9_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
tran_high_t temp_in[32], temp_out[32];
for (j = 0; j < 32; ++j)
temp_in[j] = input[j * stride + i] * 4;
fdct32(temp_in, temp_out, 0);
vp9_fdct32(temp_in, temp_out, 0);
for (j = 0; j < 32; ++j)
output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
}
@@ -1402,7 +1382,7 @@ void vp9_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
tran_high_t temp_in[32], temp_out[32];
for (j = 0; j < 32; ++j)
temp_in[j] = output[j + i * 32];
fdct32(temp_in, temp_out, 0);
vp9_fdct32(temp_in, temp_out, 0);
for (j = 0; j < 32; ++j)
out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
}
@@ -1420,7 +1400,7 @@ void vp9_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) {
tran_high_t temp_in[32], temp_out[32];
for (j = 0; j < 32; ++j)
temp_in[j] = input[j * stride + i] * 4;
fdct32(temp_in, temp_out, 0);
vp9_fdct32(temp_in, temp_out, 0);
for (j = 0; j < 32; ++j)
// TODO(cd): see quality impact of only doing
// output[j * 32 + i] = (temp_out[j] + 1) >> 2;
@@ -1433,7 +1413,7 @@ void vp9_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) {
tran_high_t temp_in[32], temp_out[32];
for (j = 0; j < 32; ++j)
temp_in[j] = output[j + i * 32];
fdct32(temp_in, temp_out, 1);
vp9_fdct32(temp_in, temp_out, 1);
for (j = 0; j < 32; ++j)
out[j + i * 32] = temp_out[j];
}

vp9/encoder/vp9_dct.h (new file, 61 lines)

@@ -0,0 +1,61 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP9_ENCODER_VP9_DCT_H_
#define VP9_ENCODER_VP9_DCT_H_
#include "vp9/common/vp9_idct.h"
#ifdef __cplusplus
extern "C" {
#endif
void vp9_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
void vp9_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
void vp9_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output,
int stride);
void vp9_highbd_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride);
void vp9_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *out,
int stride);
void vp9_fdct4(const tran_low_t *input, tran_low_t *output);
void vp9_fadst4(const tran_low_t *input, tran_low_t *output);
void vp9_fdct8(const tran_low_t *input, tran_low_t *output);
void vp9_fadst8(const tran_low_t *input, tran_low_t *output);
void vp9_fdct16(const tran_low_t in[16], tran_low_t out[16]);
void vp9_fadst16(const tran_low_t *input, tran_low_t *output);
void vp9_fdct32(const tran_high_t *input, tran_high_t *output, int round);
static const transform_2d FHT_4[] = {
{ vp9_fdct4, vp9_fdct4 }, // DCT_DCT = 0
{ vp9_fadst4, vp9_fdct4 }, // ADST_DCT = 1
{ vp9_fdct4, vp9_fadst4 }, // DCT_ADST = 2
{ vp9_fadst4, vp9_fadst4 } // ADST_ADST = 3
};
static const transform_2d FHT_8[] = {
{ vp9_fdct8, vp9_fdct8 }, // DCT_DCT = 0
{ vp9_fadst8, vp9_fdct8 }, // ADST_DCT = 1
{ vp9_fdct8, vp9_fadst8 }, // DCT_ADST = 2
{ vp9_fadst8, vp9_fadst8 } // ADST_ADST = 3
};
static const transform_2d FHT_16[] = {
{ vp9_fdct16, vp9_fdct16 }, // DCT_DCT = 0
{ vp9_fadst16, vp9_fdct16 }, // ADST_DCT = 1
{ vp9_fdct16, vp9_fadst16 }, // DCT_ADST = 2
{ vp9_fadst16, vp9_fadst16 } // ADST_ADST = 3
};
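// Example use (sketch): vp9_fht4x4_c selects its 1-D transforms from FHT_4
// by tx_type and applies columns then rows, roughly:
//   const transform_2d ht = FHT_4[tx_type];
//   ht.cols(temp_in, temp_out);   // vertical pass
//   ht.rows(temp_out, final_out); // horizontal pass
// (the temp_* identifiers are illustrative).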
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP9_ENCODER_VP9_DCT_H_


@@ -515,8 +515,19 @@ static void choose_partitioning(VP9_COMP *cpi,
int sum = 0;
if (x_idx < pixels_wide && y_idx < pixels_high) {
int s_avg = vp9_avg_8x8(s + y_idx * sp + x_idx, sp);
int d_avg = vp9_avg_8x8(d + y_idx * dp + x_idx, dp);
int s_avg, d_avg;
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
s_avg = vp9_highbd_avg_8x8(s + y_idx * sp + x_idx, sp);
d_avg = vp9_highbd_avg_8x8(d + y_idx * dp + x_idx, dp);
} else {
s_avg = vp9_avg_8x8(s + y_idx * sp + x_idx, sp);
d_avg = vp9_avg_8x8(d + y_idx * dp + x_idx, dp);
}
#else
s_avg = vp9_avg_8x8(s + y_idx * sp + x_idx, sp);
d_avg = vp9_avg_8x8(d + y_idx * dp + x_idx, dp);
#endif
sum = s_avg - d_avg;
sse = sum * sum;
}
@@ -3414,9 +3425,9 @@ static void encode_frame_internal(VP9_COMP *cpi) {
#if CONFIG_VP9_HIGHBITDEPTH
if (cm->use_highbitdepth)
x->fwd_txm4x4 = xd->lossless ? vp9_fwht4x4 : vp9_fdct4x4;
else
x->fwd_txm4x4 = xd->lossless ? vp9_highbd_fwht4x4 : vp9_highbd_fdct4x4;
else
x->fwd_txm4x4 = xd->lossless ? vp9_fwht4x4 : vp9_fdct4x4;
x->highbd_itxm_add = xd->lossless ? vp9_highbd_iwht4x4_add :
vp9_highbd_idct4x4_add;
#else

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -62,9 +62,40 @@ cglobal fwht4x4, 3, 4, 8, input, output, stride
psllw m2, 2
psllw m3, 2
%if CONFIG_VP9_HIGHBITDEPTH
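;under CONFIG_VP9_HIGHBITDEPTH the output (tran_low_t) is 32-bit, as the
;wider stores below assume, so each 16-bit result is sign-extended first:
;pcmpgtw against zero builds the sign mask, punpcklwd/punpckhwd interleave it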
pxor m4, m4
pxor m5, m5
pcmpgtw m4, m0
pcmpgtw m5, m1
movq m6, m0
movq m7, m1
punpcklwd m0, m4
punpcklwd m1, m5
punpckhwd m6, m4
punpckhwd m7, m5
movq [outputq], m0
movq [outputq + 8], m6
movq [outputq + 16], m1
movq [outputq + 24], m7
pxor m4, m4
pxor m5, m5
pcmpgtw m4, m2
pcmpgtw m5, m3
movq m6, m2
movq m7, m3
punpcklwd m2, m4
punpcklwd m3, m5
punpckhwd m6, m4
punpckhwd m7, m5
movq [outputq + 32], m2
movq [outputq + 40], m6
movq [outputq + 48], m3
movq [outputq + 56], m7
%else
movq [outputq], m0
movq [outputq + 8], m1
movq [outputq + 16], m2
movq [outputq + 24], m3
%endif
RET

File diff suppressed because it is too large


@@ -0,0 +1,368 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP9_ENCODER_X86_VP9_DCT_SSE2_H_
#define VP9_ENCODER_X86_VP9_DCT_SSE2_H_
#ifdef __cplusplus
extern "C" {
#endif
#define pair_set_epi32(a, b) \
_mm_set_epi32(b, a, b, a)
void vp9_fdct4x4_sse2(const int16_t *input, tran_low_t *output, int stride);
void vp9_fdct8x8_sse2(const int16_t *input, tran_low_t *output, int stride);
void vp9_fdct16x16_sse2(const int16_t *input, tran_low_t *output, int stride);
void vp9_highbd_fdct4x4_sse2(const int16_t *input, tran_low_t *output,
int stride);
void vp9_highbd_fdct8x8_sse2(const int16_t *input, tran_low_t *output,
int stride);
void vp9_highbd_fdct16x16_sse2(const int16_t *input, tran_low_t *output,
int stride);
static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) {
__m128i buf0, buf1;
buf0 = _mm_mul_epu32(a, b);
a = _mm_srli_epi64(a, 32);
b = _mm_srli_epi64(b, 32);
buf1 = _mm_mul_epu32(a, b);
return _mm_add_epi64(buf0, buf1);
}
static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) {
__m128i buf0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0));
__m128i buf1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0));
return _mm_unpacklo_epi64(buf0, buf1);
}
static INLINE int check_epi16_overflow_x2(__m128i reg0, __m128i reg1) {
const __m128i max_overflow = _mm_set1_epi16(0x7fff);
const __m128i min_overflow = _mm_set1_epi16(0x8000);
__m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(reg0, max_overflow),
_mm_cmpeq_epi16(reg0, min_overflow));
__m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(reg1, max_overflow),
_mm_cmpeq_epi16(reg1, min_overflow));
cmp0 = _mm_or_si128(cmp0, cmp1);
return _mm_movemask_epi8(cmp0);
}
static INLINE int check_epi16_overflow_x4(__m128i reg0, __m128i reg1,
__m128i reg2, __m128i reg3) {
const __m128i max_overflow = _mm_set1_epi16(0x7fff);
const __m128i min_overflow = _mm_set1_epi16(0x8000);
__m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(reg0, max_overflow),
_mm_cmpeq_epi16(reg0, min_overflow));
__m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(reg1, max_overflow),
_mm_cmpeq_epi16(reg1, min_overflow));
__m128i cmp2 = _mm_or_si128(_mm_cmpeq_epi16(reg2, max_overflow),
_mm_cmpeq_epi16(reg2, min_overflow));
__m128i cmp3 = _mm_or_si128(_mm_cmpeq_epi16(reg3, max_overflow),
_mm_cmpeq_epi16(reg3, min_overflow));
cmp0 = _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3));
return _mm_movemask_epi8(cmp0);
}
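// Saturating 16-bit arithmetic pins an overflowed lane at 0x7fff or 0x8000,
// so lanes equal to either sentinel flag potential overflow; the byte mask
// of the OR'd comparisons is non-zero iff any lane matched.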
static INLINE int check_epi16_overflow_x8(__m128i reg0, __m128i reg1,
__m128i reg2, __m128i reg3, __m128i reg4,
__m128i reg5, __m128i reg6, __m128i reg7) {
int res0, res1;
res0 = check_epi16_overflow_x4(reg0, reg1, reg2, reg3);
res1 = check_epi16_overflow_x4(reg4, reg5, reg6, reg7);
return res0 + res1;
}
static INLINE int check_epi16_overflow_x12(__m128i reg0, __m128i reg1,
__m128i reg2, __m128i reg3, __m128i reg4,
__m128i reg5, __m128i reg6, __m128i reg7,
__m128i reg8, __m128i reg9, __m128i reg10,
__m128i reg11) {
int res0, res1;
res0 = check_epi16_overflow_x4(reg0, reg1, reg2, reg3);
res1 = check_epi16_overflow_x4(reg4, reg5, reg6, reg7);
if (!res0)
res0 = check_epi16_overflow_x4(reg8, reg9, reg10, reg11);
return res0 + res1;
}
static INLINE int check_epi16_overflow_x16(__m128i reg0, __m128i reg1,
__m128i reg2, __m128i reg3, __m128i reg4,
__m128i reg5, __m128i reg6, __m128i reg7,
__m128i reg8, __m128i reg9, __m128i reg10,
__m128i reg11, __m128i reg12, __m128i reg13,
__m128i reg14, __m128i reg15) {
int res0, res1;
res0 = check_epi16_overflow_x4(reg0, reg1, reg2, reg3);
res1 = check_epi16_overflow_x4(reg4, reg5, reg6, reg7);
if (!res0) {
res0 = check_epi16_overflow_x4(reg8, reg9, reg10, reg11);
if (!res1)
res1 = check_epi16_overflow_x4(reg12, reg13, reg14, reg15);
}
return res0 + res1;
}
static INLINE int check_epi16_overflow_x32(__m128i reg0, __m128i reg1,
__m128i reg2, __m128i reg3, __m128i reg4,
__m128i reg5, __m128i reg6, __m128i reg7,
__m128i reg8, __m128i reg9, __m128i reg10,
__m128i reg11, __m128i reg12, __m128i reg13,
__m128i reg14, __m128i reg15, __m128i reg16,
__m128i reg17, __m128i reg18, __m128i reg19,
__m128i reg20, __m128i reg21, __m128i reg22,
__m128i reg23, __m128i reg24, __m128i reg25,
__m128i reg26, __m128i reg27, __m128i reg28,
__m128i reg29, __m128i reg30, __m128i reg31) {
int res0, res1;
res0 = check_epi16_overflow_x4(reg0, reg1, reg2, reg3);
res1 = check_epi16_overflow_x4(reg4, reg5, reg6, reg7);
if (!res0) {
res0 = check_epi16_overflow_x4(reg8, reg9, reg10, reg11);
if (!res1) {
res1 = check_epi16_overflow_x4(reg12, reg13, reg14, reg15);
if (!res0) {
res0 = check_epi16_overflow_x4(reg16, reg17, reg18, reg19);
if (!res1) {
res1 = check_epi16_overflow_x4(reg20, reg21, reg22, reg23);
if (!res0) {
res0 = check_epi16_overflow_x4(reg24, reg25, reg26, reg27);
if (!res1)
res1 = check_epi16_overflow_x4(reg28, reg29, reg30, reg31);
}
}
}
}
}
return res0 + res1;
}
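// Usage sketch (hypothetical caller, not in the original header): a nonzero
// return from any check_epi16_overflow_* means some lane is pinned at
// INT16_MAX/INT16_MIN, i.e. a saturating pack may have clipped, so the fast
// 16-bit transform path falls back to the 32-bit C implementation:
//
//   if (check_epi16_overflow_x4(x0, x1, x2, x3)) {
//     vp9_highbd_fdct8x8_c(input, output, stride);  // assumed C fallback
//     return;
//   }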
static INLINE int k_check_epi32_overflow_4(__m128i reg0, __m128i reg1,
__m128i reg2, __m128i reg3, const __m128i* zero) {
__m128i minus_one = _mm_set1_epi32(-1);
// Check for overflows
__m128i reg0_shifted = _mm_slli_epi64(reg0, 1);
__m128i reg1_shifted = _mm_slli_epi64(reg1, 1);
__m128i reg2_shifted = _mm_slli_epi64(reg2, 1);
__m128i reg3_shifted = _mm_slli_epi64(reg3, 1);
__m128i reg0_top_dwords = _mm_shuffle_epi32(
reg0_shifted, _MM_SHUFFLE(0, 0, 3, 1));
__m128i reg1_top_dwords = _mm_shuffle_epi32(
reg1_shifted, _MM_SHUFFLE(0, 0, 3, 1));
__m128i reg2_top_dwords = _mm_shuffle_epi32(
reg2_shifted, _MM_SHUFFLE(0, 0, 3, 1));
__m128i reg3_top_dwords = _mm_shuffle_epi32(
reg3_shifted, _MM_SHUFFLE(0, 0, 3, 1));
__m128i top_dwords_01 = _mm_unpacklo_epi64(reg0_top_dwords, reg1_top_dwords);
__m128i top_dwords_23 = _mm_unpacklo_epi64(reg2_top_dwords, reg3_top_dwords);
__m128i valid_positive_01 = _mm_cmpeq_epi32(top_dwords_01, *zero);
__m128i valid_positive_23 = _mm_cmpeq_epi32(top_dwords_23, *zero);
__m128i valid_negative_01 = _mm_cmpeq_epi32(top_dwords_01, minus_one);
__m128i valid_negative_23 = _mm_cmpeq_epi32(top_dwords_23, minus_one);
int overflow_01 = _mm_movemask_epi8(
_mm_cmpeq_epi32(valid_positive_01, valid_negative_01));
int overflow_23 = _mm_movemask_epi8(
_mm_cmpeq_epi32(valid_positive_23, valid_negative_23));
return (overflow_01 + overflow_23);
}
static INLINE int k_check_epi32_overflow_8(__m128i reg0, __m128i reg1,
__m128i reg2, __m128i reg3, __m128i reg4, __m128i reg5,
__m128i reg6, __m128i reg7, const __m128i* zero) {
int overflow = k_check_epi32_overflow_4(reg0, reg1, reg2, reg3, zero);
if (!overflow) {
overflow = k_check_epi32_overflow_4(reg4, reg5, reg6, reg7, zero);
}
return overflow;
}
static INLINE int k_check_epi32_overflow_16(__m128i reg0, __m128i reg1,
__m128i reg2, __m128i reg3, __m128i reg4, __m128i reg5,
__m128i reg6, __m128i reg7, __m128i reg8, __m128i reg9,
__m128i reg10, __m128i reg11, __m128i reg12, __m128i reg13,
__m128i reg14, __m128i reg15, const __m128i* zero) {
int overflow = k_check_epi32_overflow_4(reg0, reg1, reg2, reg3, zero);
if (!overflow) {
overflow = k_check_epi32_overflow_4(reg4, reg5, reg6, reg7, zero);
if (!overflow) {
overflow = k_check_epi32_overflow_4(reg8, reg9, reg10, reg11, zero);
if (!overflow) {
overflow = k_check_epi32_overflow_4(reg12, reg13, reg14, reg15, zero);
}
}
}
return overflow;
}
static INLINE int k_check_epi32_overflow_32(__m128i reg0, __m128i reg1,
__m128i reg2, __m128i reg3, __m128i reg4, __m128i reg5,
__m128i reg6, __m128i reg7, __m128i reg8, __m128i reg9,
__m128i reg10, __m128i reg11, __m128i reg12, __m128i reg13,
__m128i reg14, __m128i reg15, __m128i reg16, __m128i reg17,
__m128i reg18, __m128i reg19, __m128i reg20, __m128i reg21,
__m128i reg22, __m128i reg23, __m128i reg24, __m128i reg25,
__m128i reg26, __m128i reg27, __m128i reg28, __m128i reg29,
__m128i reg30, __m128i reg31, const __m128i* zero) {
int overflow = k_check_epi32_overflow_4(reg0, reg1, reg2, reg3, zero);
if (!overflow) {
overflow = k_check_epi32_overflow_4(reg4, reg5, reg6, reg7, zero);
if (!overflow) {
overflow = k_check_epi32_overflow_4(reg8, reg9, reg10, reg11, zero);
if (!overflow) {
overflow = k_check_epi32_overflow_4(reg12, reg13, reg14, reg15, zero);
if (!overflow) {
overflow = k_check_epi32_overflow_4(reg16, reg17, reg18, reg19, zero);
if (!overflow) {
overflow = k_check_epi32_overflow_4(reg20, reg21,
reg22, reg23, zero);
if (!overflow) {
overflow = k_check_epi32_overflow_4(reg24, reg25,
reg26, reg27, zero);
if (!overflow) {
overflow = k_check_epi32_overflow_4(reg28, reg29,
reg30, reg31, zero);
}
}
}
}
}
}
}
return overflow;
}
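// Scalar sketch (an interpretation, not in the original header): the
// k_check_epi32_overflow_* helpers appear to treat each register as two
// 64-bit lanes (e.g. accumulators from k_madd_epi32) and flag lanes that no
// longer fit in a signed 32-bit value, so that a following k_packs_epi64 is
// lossless.
static INLINE int fits_in_int32_sketch(int64_t x) {
// Shift left once and inspect bits 62..31: all zeros means a small
// positive value, all ones a small negative one (bit 63 is dropped, as in
// the vector code above).
const int32_t top = (int32_t)(((uint64_t)x << 1) >> 32);
return top == 0 || top == -1;
}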
static INLINE void store_output(const __m128i output, tran_low_t* dst_ptr) {
#if CONFIG_VP9_HIGHBITDEPTH
const __m128i zero = _mm_setzero_si128();
const __m128i sign_bits = _mm_cmplt_epi16(output, zero);
__m128i out0 = _mm_unpacklo_epi16(output, sign_bits);
__m128i out1 = _mm_unpackhi_epi16(output, sign_bits);
_mm_store_si128((__m128i *)(dst_ptr), out0);
_mm_store_si128((__m128i *)(dst_ptr + 4), out1);
#else
_mm_store_si128((__m128i *)(dst_ptr), output);
#endif
}
static INLINE void storeu_output(const __m128i output, tran_low_t* dst_ptr) {
#if CONFIG_VP9_HIGHBITDEPTH
const __m128i zero = _mm_setzero_si128();
const __m128i sign_bits = _mm_cmplt_epi16(output, zero);
__m128i out0 = _mm_unpacklo_epi16(output, sign_bits);
__m128i out1 = _mm_unpackhi_epi16(output, sign_bits);
_mm_storeu_si128((__m128i *)(dst_ptr), out0);
_mm_storeu_si128((__m128i *)(dst_ptr + 4), out1);
#else
_mm_storeu_si128((__m128i *)(dst_ptr), output);
#endif
}
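// When CONFIG_VP9_HIGHBITDEPTH is enabled, tran_low_t is 32 bits wide, so
// the two store helpers above sign-extend eight int16 results into eight
// int32 slots by unpacking against the computed sign mask; otherwise a
// single 16-bit store suffices.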
static INLINE __m128i mult_round_shift(const __m128i in0, const __m128i in1,
const __m128i multiplier,
const __m128i rounding,
const int shift) {
const __m128i u0 = _mm_madd_epi16(in0, multiplier);
const __m128i u1 = _mm_madd_epi16(in1, multiplier);
const __m128i v0 = _mm_add_epi32(u0, rounding);
const __m128i v1 = _mm_add_epi32(u1, rounding);
const __m128i w0 = _mm_srai_epi32(v0, shift);
const __m128i w1 = _mm_srai_epi32(v1, shift);
return _mm_packs_epi32(w0, w1);
}
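// Scalar reference for one output lane of mult_round_shift (a sketch, not
// in the original header): _mm_madd_epi16 sums two adjacent int16 products,
// the rounding constant and arithmetic shift follow, and _mm_packs_epi32
// saturates the result back to int16.
static INLINE int16_t mult_round_shift_lane_sketch(int16_t a0, int16_t a1,
int16_t m0, int16_t m1, int32_t rounding, int shift) {
const int32_t v = (a0 * m0 + a1 * m1 + rounding) >> shift;
return (int16_t)(v > 32767 ? 32767 : (v < -32768 ? -32768 : v));
}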
static INLINE void transpose_and_output8x8(
const __m128i in00, const __m128i in01,
const __m128i in02, const __m128i in03,
const __m128i in04, const __m128i in05,
const __m128i in06, const __m128i in07,
const int pass, int16_t* out0_ptr,
tran_low_t* out1_ptr) {
// 00 01 02 03 04 05 06 07
// 10 11 12 13 14 15 16 17
// 20 21 22 23 24 25 26 27
// 30 31 32 33 34 35 36 37
// 40 41 42 43 44 45 46 47
// 50 51 52 53 54 55 56 57
// 60 61 62 63 64 65 66 67
// 70 71 72 73 74 75 76 77
const __m128i tr0_0 = _mm_unpacklo_epi16(in00, in01);
const __m128i tr0_1 = _mm_unpacklo_epi16(in02, in03);
const __m128i tr0_2 = _mm_unpackhi_epi16(in00, in01);
const __m128i tr0_3 = _mm_unpackhi_epi16(in02, in03);
const __m128i tr0_4 = _mm_unpacklo_epi16(in04, in05);
const __m128i tr0_5 = _mm_unpacklo_epi16(in06, in07);
const __m128i tr0_6 = _mm_unpackhi_epi16(in04, in05);
const __m128i tr0_7 = _mm_unpackhi_epi16(in06, in07);
// 00 10 01 11 02 12 03 13
// 20 30 21 31 22 32 23 33
// 04 14 05 15 06 16 07 17
// 24 34 25 35 26 36 27 37
// 40 50 41 51 42 52 43 53
// 60 70 61 71 62 72 63 73
// 44 54 45 55 46 56 47 57
// 64 74 65 75 66 76 67 77
const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
// 00 10 20 30 01 11 21 31
// 40 50 60 70 41 51 61 71
// 02 12 22 32 03 13 23 33
// 42 52 62 72 43 53 63 73
// 04 14 24 34 05 15 25 35
// 44 54 64 74 45 55 65 75
// 06 16 26 36 07 17 27 37
// 46 56 66 76 47 57 67 77
const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
// 00 10 20 30 40 50 60 70
// 01 11 21 31 41 51 61 71
// 02 12 22 32 42 52 62 72
// 03 13 23 33 43 53 63 73
// 04 14 24 34 44 54 64 74
// 05 15 25 35 45 55 65 75
// 06 16 26 36 46 56 66 76
// 07 17 27 37 47 57 67 77
if (pass == 0) {
_mm_storeu_si128((__m128i*)(out0_ptr + 0 * 16), tr2_0);
_mm_storeu_si128((__m128i*)(out0_ptr + 1 * 16), tr2_1);
_mm_storeu_si128((__m128i*)(out0_ptr + 2 * 16), tr2_2);
_mm_storeu_si128((__m128i*)(out0_ptr + 3 * 16), tr2_3);
_mm_storeu_si128((__m128i*)(out0_ptr + 4 * 16), tr2_4);
_mm_storeu_si128((__m128i*)(out0_ptr + 5 * 16), tr2_5);
_mm_storeu_si128((__m128i*)(out0_ptr + 6 * 16), tr2_6);
_mm_storeu_si128((__m128i*)(out0_ptr + 7 * 16), tr2_7);
} else {
storeu_output(tr2_0, (out1_ptr + 0 * 16));
storeu_output(tr2_1, (out1_ptr + 1 * 16));
storeu_output(tr2_2, (out1_ptr + 2 * 16));
storeu_output(tr2_3, (out1_ptr + 3 * 16));
storeu_output(tr2_4, (out1_ptr + 4 * 16));
storeu_output(tr2_5, (out1_ptr + 5 * 16));
storeu_output(tr2_6, (out1_ptr + 6 * 16));
storeu_output(tr2_7, (out1_ptr + 7 * 16));
}
}
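// Pass 0 stores the transposed rows to the int16_t intermediate buffer
// (row stride 16) for the column pass; pass 1 writes the final
// coefficients, widening to tran_low_t via storeu_output when
// CONFIG_VP9_HIGHBITDEPTH is enabled.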
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP9_ENCODER_X86_VP9_DCT_SSE2_H_


@@ -0,0 +1,71 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <assert.h>
#include <emmintrin.h>
#include <stdio.h>
#include "vp9/common/vp9_common.h"
int64_t vp9_highbd_block_error_sse2(tran_low_t *coeff,
tran_low_t *dqcoeff, intptr_t block_size,
int64_t *ssz, int bps) {
int i, j, test;
uint32_t temp[4];
__m128i max, min, cmp0, cmp1, cmp2, cmp3;
int64_t error = 0, sqcoeff = 0;
int shift = 2 * (bps - 8);
int rounding = shift > 0 ? 1 << (shift - 1) : 0;
for (i = 0; i < block_size; i+=8) {
// Load the data into xmm registers
__m128i mm_coeff = _mm_load_si128((__m128i*) (coeff + i));
__m128i mm_coeff2 = _mm_load_si128((__m128i*) (coeff + i + 4));
__m128i mm_dqcoeff = _mm_load_si128((__m128i*) (dqcoeff + i));
__m128i mm_dqcoeff2 = _mm_load_si128((__m128i*) (dqcoeff + i + 4));
// Check if any values require more than 15 bit
max = _mm_set1_epi32(0x3fff);
min = _mm_set1_epi32(0xffffc000);
cmp0 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff, max),
_mm_cmplt_epi32(mm_coeff, min));
cmp1 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff2, max),
_mm_cmplt_epi32(mm_coeff2, min));
cmp2 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff, max),
_mm_cmplt_epi32(mm_dqcoeff, min));
cmp3 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff2, max),
_mm_cmplt_epi32(mm_dqcoeff2, min));
test = _mm_movemask_epi8(_mm_or_si128(_mm_or_si128(cmp0, cmp1),
_mm_or_si128(cmp2, cmp3)));
if (!test) {
__m128i mm_diff, error_sse2, sqcoeff_sse2;
mm_coeff = _mm_packs_epi32(mm_coeff, mm_coeff2);
mm_dqcoeff = _mm_packs_epi32(mm_dqcoeff, mm_dqcoeff2);
mm_diff = _mm_sub_epi16(mm_coeff, mm_dqcoeff);
error_sse2 = _mm_madd_epi16(mm_diff, mm_diff);
sqcoeff_sse2 = _mm_madd_epi16(mm_coeff, mm_coeff);
_mm_storeu_si128((__m128i*)temp, error_sse2);
error = error + temp[0] + temp[1] + temp[2] + temp[3];
_mm_storeu_si128((__m128i*)temp, sqcoeff_sse2);
sqcoeff += temp[0] + temp[1] + temp[2] + temp[3];
} else {
for (j = 0; j < 8; j++) {
const int64_t diff = coeff[i+j] - dqcoeff[i+j];
error += diff * diff;
sqcoeff += (int64_t)coeff[i+j] * (int64_t)coeff[i+j];
}
}
}
assert(error >= 0 && sqcoeff >= 0);
error = (error + rounding) >> shift;
sqcoeff = (sqcoeff + rounding) >> shift;
*ssz = sqcoeff;
return error;
}
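/* Scalar reference sketch (illustrative only; it mirrors the
 * per-coefficient C fallback above): both sums are accumulated at 64-bit
 * precision, then rescaled to 8-bit-equivalent units by
 * shift = 2 * (bps - 8). */
static int64_t highbd_block_error_ref_sketch(const tran_low_t *coeff,
const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bps) {
int i;
int64_t error = 0, sqcoeff = 0;
const int shift = 2 * (bps - 8);
const int rounding = shift > 0 ? 1 << (shift - 1) : 0;
for (i = 0; i < block_size; i++) {
const int64_t diff = coeff[i] - dqcoeff[i];
error += diff * diff;
sqcoeff += (int64_t)coeff[i] * coeff[i];
}
*ssz = (sqcoeff + rounding) >> shift;
return (error + rounding) >> shift;
}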


@@ -0,0 +1,173 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <emmintrin.h> // SSE2
#include "vp9/common/vp9_common.h"
#if CONFIG_VP9_HIGHBITDEPTH
// from vp9_idct.h: typedef int32_t tran_low_t;
void vp9_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
int skip_block,
const int16_t *zbin_ptr, const int16_t *round_ptr,
const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr,
int zbin_oq_value, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
int i, j, non_zero_regs = (int)count / 4, eob_i = -1;
__m128i zbins[2];
__m128i nzbins[2];
zbins[0] = _mm_set_epi32((int)(zbin_ptr[1] + zbin_oq_value),
(int)(zbin_ptr[1] + zbin_oq_value),
(int)(zbin_ptr[1] + zbin_oq_value),
(int)(zbin_ptr[0] + zbin_oq_value));
zbins[1] = _mm_set1_epi32((int)(zbin_ptr[1] + zbin_oq_value));
nzbins[0] = _mm_setzero_si128();
nzbins[1] = _mm_setzero_si128();
nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
(void)scan;
vpx_memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
vpx_memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
if (!skip_block) {
// Pre-scan pass
for (i = ((int)count / 4) - 1; i >= 0; i--) {
__m128i coeffs, cmp1, cmp2;
int test;
coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
cmp1 = _mm_and_si128(cmp1, cmp2);
test = _mm_movemask_epi8(cmp1);
if (test == 0xffff)
non_zero_regs--;
else
break;
}
// Quantization pass:
for (i = 0; i < non_zero_regs; i++) {
__m128i coeffs, coeffs_sign, tmp1, tmp2;
int test;
int abs_coeff[4];
int coeff_sign[4];
coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
coeffs_sign = _mm_srai_epi32(coeffs, 31);
coeffs = _mm_sub_epi32(
_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign);
tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]);
tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]);
tmp1 = _mm_or_si128(tmp1, tmp2);
test = _mm_movemask_epi8(tmp1);
_mm_storeu_si128((__m128i*)abs_coeff, coeffs);
_mm_storeu_si128((__m128i*)coeff_sign, coeffs_sign);
for (j = 0; j < 4; j++) {
if (test & (1 << (4*j))) {
int k = 4 * i + j;
int64_t tmp = clamp(abs_coeff[j] + round_ptr[k != 0],
INT32_MIN, INT32_MAX);
tmp = ((((tmp * quant_ptr[k != 0]) >> 16) + tmp) *
quant_shift_ptr[k != 0]) >> 16; // quantization
qcoeff_ptr[k] = (tmp ^ coeff_sign[j]) - coeff_sign[j];
dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];
if (tmp)
eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;
}
}
}
}
*eob_ptr = eob_i + 1;
}
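// Per-coefficient rule applied above, in scalar form (for illustration):
//   a       = clamp(abs(coeff) + round, INT32_MIN, INT32_MAX)
//   q       = ((((a * quant) >> 16) + a) * quant_shift) >> 16
//   qcoeff  = copysign(q, coeff)
//   dqcoeff = qcoeff * dequant
// where round/quant/quant_shift/dequant use their DC entry for k == 0 and
// the AC entry otherwise, and eob tracks the last nonzero q in scan order.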
void vp9_highbd_quantize_b_32x32_sse2(const tran_low_t *coeff_ptr,
intptr_t n_coeffs, int skip_block,
const int16_t *zbin_ptr,
const int16_t *round_ptr,
const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr,
int zbin_oq_value, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
__m128i zbins[2];
__m128i nzbins[2];
int idx = 0;
int idx_arr[1024];
int i, eob = -1;
const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0] + zbin_oq_value, 1);
const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1] + zbin_oq_value, 1);
(void)scan;
// zbin_oq_value is already folded into zbin{0,1}_tmp above.
zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp);
zbins[1] = _mm_set1_epi32(zbin1_tmp);
nzbins[0] = _mm_setzero_si128();
nzbins[1] = _mm_setzero_si128();
nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
if (!skip_block) {
// Pre-scan pass
for (i = 0; i < n_coeffs / 4; i++) {
__m128i coeffs, cmp1, cmp2;
int test;
coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
cmp1 = _mm_and_si128(cmp1, cmp2);
test = _mm_movemask_epi8(cmp1);
if (!(test & 0xf))
idx_arr[idx++] = i*4;
if (!(test & 0xf0))
idx_arr[idx++] = i*4 + 1;
if (!(test & 0xf00))
idx_arr[idx++] = i*4 + 2;
if (!(test & 0xf000))
idx_arr[idx++] = i*4 + 3;
}
// Quantization pass: only process the coefficients selected in
// pre-scan pass. Note: idx can be zero.
for (i = 0; i < idx; i++) {
const int rc = idx_arr[i];
const int coeff = coeff_ptr[rc];
const int coeff_sign = (coeff >> 31);
int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
int64_t tmp = clamp(abs_coeff +
ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),
INT32_MIN, INT32_MAX);
tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
quant_shift_ptr[rc != 0]) >> 15;
qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
if (tmp)
eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;
}
}
*eob_ptr = eob + 1;
}
#endif


@@ -0,0 +1,284 @@
;
; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "third_party/x86inc/x86inc.asm"
SECTION .text
; HIGH_PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end
%macro HIGH_PROCESS_4x2x4 5-6 0
movh m0, [srcq +%2*2]
%if %1 == 1
movu m4, [ref1q+%3*2]
movu m5, [ref2q+%3*2]
movu m6, [ref3q+%3*2]
movu m7, [ref4q+%3*2]
movhps m0, [srcq +%4*2]
movhps m4, [ref1q+%5*2]
movhps m5, [ref2q+%5*2]
movhps m6, [ref3q+%5*2]
movhps m7, [ref4q+%5*2]
mova m3, m0
mova m2, m0
psubusw m3, m4
psubusw m2, m5
psubusw m4, m0
psubusw m5, m0
por m4, m3
por m5, m2
pmaddwd m4, m1
pmaddwd m5, m1
mova m3, m0
mova m2, m0
psubusw m3, m6
psubusw m2, m7
psubusw m6, m0
psubusw m7, m0
por m6, m3
por m7, m2
pmaddwd m6, m1
pmaddwd m7, m1
%else
movu m2, [ref1q+%3*2]
movhps m0, [srcq +%4*2]
movhps m2, [ref1q+%5*2]
mova m3, m0
psubusw m3, m2
psubusw m2, m0
por m2, m3
pmaddwd m2, m1
paddd m4, m2
movu m2, [ref2q+%3*2]
mova m3, m0
movhps m2, [ref2q+%5*2]
psubusw m3, m2
psubusw m2, m0
por m2, m3
pmaddwd m2, m1
paddd m5, m2
movu m2, [ref3q+%3*2]
mova m3, m0
movhps m2, [ref3q+%5*2]
psubusw m3, m2
psubusw m2, m0
por m2, m3
pmaddwd m2, m1
paddd m6, m2
movu m2, [ref4q+%3*2]
mova m3, m0
movhps m2, [ref4q+%5*2]
psubusw m3, m2
psubusw m2, m0
por m2, m3
pmaddwd m2, m1
paddd m7, m2
%endif
%if %6 == 1
lea srcq, [srcq +src_strideq*4]
lea ref1q, [ref1q+ref_strideq*4]
lea ref2q, [ref2q+ref_strideq*4]
lea ref3q, [ref3q+ref_strideq*4]
lea ref4q, [ref4q+ref_strideq*4]
%endif
%endmacro
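; Note on the idiom above: psubusw saturates negative results to zero, so
; (a -us b) | (b -us a) yields |a - b| for unsigned words without a
; dedicated absolute-difference instruction (psadbw only handles bytes).
; pmaddwd against the all-ones words in m1 then widens and pairwise-adds
; the 16-bit differences into the 32-bit SAD accumulators m4-m7.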
; HIGH_PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end
%macro HIGH_PROCESS_8x2x4 5-6 0
; 1st 8 px
mova m0, [srcq +%2*2]
%if %1 == 1
movu m4, [ref1q+%3*2]
movu m5, [ref2q+%3*2]
movu m6, [ref3q+%3*2]
movu m7, [ref4q+%3*2]
mova m3, m0
mova m2, m0
psubusw m3, m4
psubusw m2, m5
psubusw m4, m0
psubusw m5, m0
por m4, m3
por m5, m2
pmaddwd m4, m1
pmaddwd m5, m1
mova m3, m0
mova m2, m0
psubusw m3, m6
psubusw m2, m7
psubusw m6, m0
psubusw m7, m0
por m6, m3
por m7, m2
pmaddwd m6, m1
pmaddwd m7, m1
%else
mova m3, m0
movu m2, [ref1q+%3*2]
psubusw m3, m2
psubusw m2, m0
por m2, m3
mova m3, m0
pmaddwd m2, m1
paddd m4, m2
movu m2, [ref2q+%3*2]
psubusw m3, m2
psubusw m2, m0
por m2, m3
mova m3, m0
pmaddwd m2, m1
paddd m5, m2
movu m2, [ref3q+%3*2]
psubusw m3, m2
psubusw m2, m0
por m2, m3
mova m3, m0
pmaddwd m2, m1
paddd m6, m2
movu m2, [ref4q+%3*2]
psubusw m3, m2
psubusw m2, m0
por m2, m3
pmaddwd m2, m1
paddd m7, m2
%endif
; 2nd 8 px
mova m0, [srcq +(%4)*2]
mova m3, m0
movu m2, [ref1q+(%5)*2]
psubusw m3, m2
psubusw m2, m0
por m2, m3
mova m3, m0
pmaddwd m2, m1
paddd m4, m2
movu m2, [ref2q+(%5)*2]
psubusw m3, m2
psubusw m2, m0
por m2, m3
mova m3, m0
pmaddwd m2, m1
paddd m5, m2
movu m2, [ref3q+(%5)*2]
psubusw m3, m2
psubusw m2, m0
por m2, m3
mova m3, m0
pmaddwd m2, m1
paddd m6, m2
movu m2, [ref4q+(%5)*2]
psubusw m3, m2
psubusw m2, m0
%if %6 == 1
lea srcq, [srcq +src_strideq*4]
lea ref1q, [ref1q+ref_strideq*4]
lea ref2q, [ref2q+ref_strideq*4]
lea ref3q, [ref3q+ref_strideq*4]
lea ref4q, [ref4q+ref_strideq*4]
%endif
por m2, m3
pmaddwd m2, m1
paddd m7, m2
%endmacro
; HIGH_PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end
%macro HIGH_PROCESS_16x2x4 5-6 0
HIGH_PROCESS_8x2x4 %1, %2, %3, (%2 + 8), (%3 + 8)
HIGH_PROCESS_8x2x4 0, %4, %5, (%4 + 8), (%5 + 8), %6
%endmacro
; HIGH_PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end
%macro HIGH_PROCESS_32x2x4 5-6 0
HIGH_PROCESS_16x2x4 %1, %2, %3, (%2 + 16), (%3 + 16)
HIGH_PROCESS_16x2x4 0, %4, %5, (%4 + 16), (%5 + 16), %6
%endmacro
; HIGH_PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end
%macro HIGH_PROCESS_64x2x4 5-6 0
HIGH_PROCESS_32x2x4 %1, %2, %3, (%2 + 32), (%3 + 32)
HIGH_PROCESS_32x2x4 0, %4, %5, (%4 + 32), (%5 + 32), %6
%endmacro
; void vp9_highbd_sadNxNx4d_sse2(uint8_t *src, int src_stride,
; uint8_t *ref[4], int ref_stride,
; unsigned int res[4]);
; where NxN = 64x64, 64x32, 32x64, 32x32, 32x16, 16x32, 16x16, 16x8,
; 8x16, 8x8, 8x4, 4x8 or 4x4
%macro HIGH_SADNXN4D 2
%if UNIX64
cglobal highbd_sad%1x%2x4d, 5, 9, 8, src, src_stride, ref1, ref_stride, \
res, ref2, ref3, ref4, one
%else
cglobal highbd_sad%1x%2x4d, 4, 8, 8, src, src_stride, ref1, ref_stride, \
ref2, ref3, ref4, one
%endif
movsxdifnidn src_strideq, src_strided
movsxdifnidn ref_strideq, ref_strided
mov ref2q, [ref1q+gprsize*1]
mov ref3q, [ref1q+gprsize*2]
mov ref4q, [ref1q+gprsize*3]
mov ref1q, [ref1q+gprsize*0]
; convert byte pointers to short pointers
shl srcq, 1
shl ref2q, 1
shl ref3q, 1
shl ref4q, 1
shl ref1q, 1
mov oned, 0x00010001
movd m1, oned
pshufd m1, m1, 0x0
HIGH_PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
%rep (%2-4)/2
HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
%endrep
HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0
; N.B. HIGH_PROCESS outputs dwords (32 bits)
; so in high bit depth even the smallest width (4) needs 128bits i.e. XMM
movhlps m0, m4
movhlps m1, m5
movhlps m2, m6
movhlps m3, m7
paddd m4, m0
paddd m5, m1
paddd m6, m2
paddd m7, m3
punpckldq m4, m5
punpckldq m6, m7
movhlps m0, m4
movhlps m1, m6
paddd m4, m0
paddd m6, m1
punpcklqdq m4, m6
movifnidn r4, r4mp
movu [r4], m4
RET
%endmacro
INIT_XMM sse2
HIGH_SADNXN4D 64, 64
HIGH_SADNXN4D 64, 32
HIGH_SADNXN4D 32, 64
HIGH_SADNXN4D 32, 32
HIGH_SADNXN4D 32, 16
HIGH_SADNXN4D 16, 32
HIGH_SADNXN4D 16, 16
HIGH_SADNXN4D 16, 8
HIGH_SADNXN4D 8, 16
HIGH_SADNXN4D 8, 8
HIGH_SADNXN4D 8, 4
HIGH_SADNXN4D 4, 8
HIGH_SADNXN4D 4, 4


@@ -0,0 +1,363 @@
;
; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "third_party/x86inc/x86inc.asm"
SECTION .text
%macro HIGH_SAD_FN 4
%if %4 == 0
%if %3 == 5
cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \
src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7
%else ; avg
%if %3 == 5
cglobal highbd_sad%1x%2_avg, 5, 1 + %3, 7, src, src_stride, ref, ref_stride, \
second_pred, n_rows
%else ; %3 == 7
cglobal highbd_sad%1x%2_avg, 5, ARCH_X86_64 + %3, 7, src, src_stride, \
ref, ref_stride, \
second_pred, \
src_stride3, ref_stride3
%if ARCH_X86_64
%define n_rowsd r7d
%else ; x86-32
%define n_rowsd dword r0m
%endif ; x86-32/64
%endif ; %3 == 5/7
%endif ; avg/sad
movsxdifnidn src_strideq, src_strided
movsxdifnidn ref_strideq, ref_strided
%if %3 == 7
lea src_stride3q, [src_strideq*3]
lea ref_stride3q, [ref_strideq*3]
%endif ; %3 == 7
; convert src, ref & second_pred to short ptrs (from byte ptrs)
shl srcq, 1
shl refq, 1
%if %4 == 1
shl second_predq, 1
%endif
%endmacro
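; Note: the kernels below operate on 16-bit samples, so HIGH_SAD_FN doubles
; the incoming pointers (shl by 1) and the loops advance by strideq*2; the
; literal offsets ([refq+16] etc.) are byte offsets covering 8 samples.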
; unsigned int vp9_highbd_sad64x{32,64}_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
%macro HIGH_SAD64XN 1-2 0
HIGH_SAD_FN 64, %1, 5, %2
mov n_rowsd, %1
pxor m0, m0
pxor m6, m6
.loop:
; first half of each row
movu m1, [refq]
movu m2, [refq+16]
movu m3, [refq+32]
movu m4, [refq+48]
%if %2 == 1
pavgw m1, [second_predq+mmsize*0]
pavgw m2, [second_predq+mmsize*1]
pavgw m3, [second_predq+mmsize*2]
pavgw m4, [second_predq+mmsize*3]
lea second_predq, [second_predq+mmsize*4]
%endif
mova m5, [srcq]
psubusw m5, m1
psubusw m1, [srcq]
por m1, m5
mova m5, [srcq+16]
psubusw m5, m2
psubusw m2, [srcq+16]
por m2, m5
mova m5, [srcq+32]
psubusw m5, m3
psubusw m3, [srcq+32]
por m3, m5
mova m5, [srcq+48]
psubusw m5, m4
psubusw m4, [srcq+48]
por m4, m5
paddw m1, m2
paddw m3, m4
movhlps m2, m1
movhlps m4, m3
paddw m1, m2
paddw m3, m4
punpcklwd m1, m6
punpcklwd m3, m6
paddd m0, m1
paddd m0, m3
; second half of each row
movu m1, [refq+64]
movu m2, [refq+80]
movu m3, [refq+96]
movu m4, [refq+112]
%if %2 == 1
pavgw m1, [second_predq+mmsize*0]
pavgw m2, [second_predq+mmsize*1]
pavgw m3, [second_predq+mmsize*2]
pavgw m4, [second_predq+mmsize*3]
lea second_predq, [second_predq+mmsize*4]
%endif
mova m5, [srcq+64]
psubusw m5, m1
psubusw m1, [srcq+64]
por m1, m5
mova m5, [srcq+80]
psubusw m5, m2
psubusw m2, [srcq+80]
por m2, m5
mova m5, [srcq+96]
psubusw m5, m3
psubusw m3, [srcq+96]
por m3, m5
mova m5, [srcq+112]
psubusw m5, m4
psubusw m4, [srcq+112]
por m4, m5
paddw m1, m2
paddw m3, m4
movhlps m2, m1
movhlps m4, m3
paddw m1, m2
paddw m3, m4
punpcklwd m1, m6
punpcklwd m3, m6
lea refq, [refq+ref_strideq*2]
paddd m0, m1
lea srcq, [srcq+src_strideq*2]
paddd m0, m3
dec n_rowsd
jg .loop
movhlps m1, m0
paddd m0, m1
punpckldq m0, m6
movhlps m1, m0
paddd m0, m1
movd eax, m0
RET
%endmacro
INIT_XMM sse2
HIGH_SAD64XN 64 ; highbd_sad64x64_sse2
HIGH_SAD64XN 32 ; highbd_sad64x32_sse2
HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2
HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2
; unsigned int vp9_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
%macro HIGH_SAD32XN 1-2 0
HIGH_SAD_FN 32, %1, 5, %2
mov n_rowsd, %1
pxor m0, m0
pxor m6, m6
.loop:
movu m1, [refq]
movu m2, [refq+16]
movu m3, [refq+32]
movu m4, [refq+48]
%if %2 == 1
pavgw m1, [second_predq+mmsize*0]
pavgw m2, [second_predq+mmsize*1]
pavgw m3, [second_predq+mmsize*2]
pavgw m4, [second_predq+mmsize*3]
lea second_predq, [second_predq+mmsize*4]
%endif
mova m5, [srcq]
psubusw m5, m1
psubusw m1, [srcq]
por m1, m5
mova m5, [srcq+16]
psubusw m5, m2
psubusw m2, [srcq+16]
por m2, m5
mova m5, [srcq+32]
psubusw m5, m3
psubusw m3, [srcq+32]
por m3, m5
mova m5, [srcq+48]
psubusw m5, m4
psubusw m4, [srcq+48]
por m4, m5
paddw m1, m2
paddw m3, m4
movhlps m2, m1
movhlps m4, m3
paddw m1, m2
paddw m3, m4
punpcklwd m1, m6
punpcklwd m3, m6
lea refq, [refq+ref_strideq*2]
paddd m0, m1
lea srcq, [srcq+src_strideq*2]
paddd m0, m3
dec n_rowsd
jg .loop
movhlps m1, m0
paddd m0, m1
punpckldq m0, m6
movhlps m1, m0
paddd m0, m1
movd eax, m0
RET
%endmacro
INIT_XMM sse2
HIGH_SAD32XN 64 ; highbd_sad32x64_sse2
HIGH_SAD32XN 32 ; highbd_sad32x32_sse2
HIGH_SAD32XN 16 ; highbd_sad32x16_sse2
HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2
HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2
HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2
; unsigned int vp9_highbd_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
%macro HIGH_SAD16XN 1-2 0
HIGH_SAD_FN 16, %1, 5, %2
mov n_rowsd, %1/2
pxor m0, m0
pxor m6, m6
.loop:
movu m1, [refq]
movu m2, [refq+16]
movu m3, [refq+ref_strideq*2]
movu m4, [refq+ref_strideq*2+16]
%if %2 == 1
pavgw m1, [second_predq+mmsize*0]
pavgw m2, [second_predq+16]
pavgw m3, [second_predq+mmsize*2]
pavgw m4, [second_predq+mmsize*2+16]
lea second_predq, [second_predq+mmsize*4]
%endif
mova m5, [srcq]
psubusw m5, m1
psubusw m1, [srcq]
por m1, m5
mova m5, [srcq+16]
psubusw m5, m2
psubusw m2, [srcq+16]
por m2, m5
mova m5, [srcq+src_strideq*2]
psubusw m5, m3
psubusw m3, [srcq+src_strideq*2]
por m3, m5
mova m5, [srcq+src_strideq*2+16]
psubusw m5, m4
psubusw m4, [srcq+src_strideq*2+16]
por m4, m5
paddw m1, m2
paddw m3, m4
movhlps m2, m1
movhlps m4, m3
paddw m1, m2
paddw m3, m4
punpcklwd m1, m6
punpcklwd m3, m6
lea refq, [refq+ref_strideq*4]
paddd m0, m1
lea srcq, [srcq+src_strideq*4]
paddd m0, m3
dec n_rowsd
jg .loop
movhlps m1, m0
paddd m0, m1
punpckldq m0, m6
movhlps m1, m0
paddd m0, m1
movd eax, m0
RET
%endmacro
INIT_XMM sse2
HIGH_SAD16XN 32 ; highbd_sad16x32_sse2
HIGH_SAD16XN 16 ; highbd_sad16x16_sse2
HIGH_SAD16XN 8 ; highbd_sad16x8_sse2
HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2
HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2
HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2
; unsigned int vp9_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
%macro HIGH_SAD8XN 1-2 0
HIGH_SAD_FN 8, %1, 7, %2
mov n_rowsd, %1/4
pxor m0, m0
pxor m6, m6
.loop:
movu m1, [refq]
movu m2, [refq+ref_strideq*2]
movu m3, [refq+ref_strideq*4]
movu m4, [refq+ref_stride3q*2]
%if %2 == 1
pavgw m1, [second_predq+mmsize*0]
pavgw m2, [second_predq+mmsize*1]
pavgw m3, [second_predq+mmsize*2]
pavgw m4, [second_predq+mmsize*3]
lea second_predq, [second_predq+mmsize*4]
%endif
mova m5, [srcq]
psubusw m5, m1
psubusw m1, [srcq]
por m1, m5
mova m5, [srcq+src_strideq*2]
psubusw m5, m2
psubusw m2, [srcq+src_strideq*2]
por m2, m5
mova m5, [srcq+src_strideq*4]
psubusw m5, m3
psubusw m3, [srcq+src_strideq*4]
por m3, m5
mova m5, [srcq+src_stride3q*2]
psubusw m5, m4
psubusw m4, [srcq+src_stride3q*2]
por m4, m5
paddw m1, m2
paddw m3, m4
movhlps m2, m1
movhlps m4, m3
paddw m1, m2
paddw m3, m4
punpcklwd m1, m6
punpcklwd m3, m6
lea refq, [refq+ref_strideq*8]
paddd m0, m1
lea srcq, [srcq+src_strideq*8]
paddd m0, m3
dec n_rowsd
jg .loop
movhlps m1, m0
paddd m0, m1
punpckldq m0, m6
movhlps m1, m0
paddd m0, m1
movd eax, m0
RET
%endmacro
INIT_XMM sse2
HIGH_SAD8XN 16 ; highbd_sad8x16_sse2
HIGH_SAD8XN 8 ; highbd_sad8x8_sse2
HIGH_SAD8XN 4 ; highbd_sad8x4_sse2
HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2
HIGH_SAD8XN 8, 1 ; highbd_sad8x8_avg_sse2
HIGH_SAD8XN 4, 1 ; highbd_sad8x4_avg_sse2

File diff suppressed because it is too large


@@ -0,0 +1,313 @@
;
; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
;unsigned int vp9_highbd_calc16x16var_sse2
;(
; uint16_t *src_ptr,
; int source_stride,
; uint16_t *ref_ptr,
; int recon_stride,
; unsigned int *SSE,
; int *Sum
;)
global sym(vp9_highbd_calc16x16var_sse2) PRIVATE
sym(vp9_highbd_calc16x16var_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
SAVE_XMM 7
push rbx
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;[src_ptr]
mov rdi, arg(2) ;[ref_ptr]
movsxd rax, DWORD PTR arg(1) ;[source_stride]
movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
add rax, rax ; source stride in bytes
add rdx, rdx ; recon stride in bytes
; Prefetch data
prefetcht0 [rsi]
prefetcht0 [rsi+16]
prefetcht0 [rsi+rax]
prefetcht0 [rsi+rax+16]
lea rbx, [rsi+rax*2]
prefetcht0 [rbx]
prefetcht0 [rbx+16]
prefetcht0 [rbx+rax]
prefetcht0 [rbx+rax+16]
prefetcht0 [rdi]
prefetcht0 [rdi+16]
prefetcht0 [rdi+rdx]
prefetcht0 [rdi+rdx+16]
lea rbx, [rdi+rdx*2]
prefetcht0 [rbx]
prefetcht0 [rbx+16]
prefetcht0 [rbx+rdx]
prefetcht0 [rbx+rdx+16]
pxor xmm0, xmm0 ; clear xmm0 for unpack
pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
mov rcx, 16
.var16loop:
movdqu xmm1, XMMWORD PTR [rsi]
movdqu xmm2, XMMWORD PTR [rdi]
lea rbx, [rsi+rax*2]
prefetcht0 [rbx]
prefetcht0 [rbx+16]
prefetcht0 [rbx+rax]
prefetcht0 [rbx+rax+16]
lea rbx, [rdi+rdx*2]
prefetcht0 [rbx]
prefetcht0 [rbx+16]
prefetcht0 [rbx+rdx]
prefetcht0 [rbx+rdx+16]
pxor xmm5, xmm5
psubw xmm1, xmm2
movdqu xmm3, XMMWORD PTR [rsi+16]
paddw xmm5, xmm1
pmaddwd xmm1, xmm1
movdqu xmm2, XMMWORD PTR [rdi+16]
paddd xmm6, xmm1
psubw xmm3, xmm2
movdqu xmm1, XMMWORD PTR [rsi+rax]
paddw xmm5, xmm3
pmaddwd xmm3, xmm3
movdqu xmm2, XMMWORD PTR [rdi+rdx]
paddd xmm6, xmm3
psubw xmm1, xmm2
movdqu xmm3, XMMWORD PTR [rsi+rax+16]
paddw xmm5, xmm1
pmaddwd xmm1, xmm1
movdqu xmm2, XMMWORD PTR [rdi+rdx+16]
paddd xmm6, xmm1
psubw xmm3, xmm2
paddw xmm5, xmm3
pmaddwd xmm3, xmm3
paddd xmm6, xmm3
movdqa xmm1, xmm5
movdqa xmm2, xmm5
pcmpgtw xmm1, xmm0
pcmpeqw xmm2, xmm0
por xmm1, xmm2
pcmpeqw xmm1, xmm0
movdqa xmm2, xmm5
punpcklwd xmm5, xmm1
punpckhwd xmm2, xmm1
paddd xmm7, xmm5
paddd xmm7, xmm2
lea rsi, [rsi + 2*rax]
lea rdi, [rdi + 2*rdx]
sub rcx, 2
jnz .var16loop
movdqa xmm4, xmm6
punpckldq xmm6, xmm0
punpckhdq xmm4, xmm0
movdqa xmm5, xmm7
paddd xmm6, xmm4
punpckldq xmm7, xmm0
punpckhdq xmm5, xmm0
paddd xmm7, xmm5
movdqa xmm4, xmm6
movdqa xmm5, xmm7
psrldq xmm4, 8
psrldq xmm5, 8
paddd xmm6, xmm4
paddd xmm7, xmm5
mov rdi, arg(4) ; [SSE]
mov rax, arg(5) ; [Sum]
movd DWORD PTR [rdi], xmm6
movd DWORD PTR [rax], xmm7
; begin epilog
pop rdi
pop rsi
pop rbx
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;unsigned int vp9_highbd_calc8x8var_sse2
;(
; uint16_t *src_ptr,
; int source_stride,
; uint16_t *ref_ptr,
; int recon_stride,
; unsigned int *SSE,
; int *Sum
;)
global sym(vp9_highbd_calc8x8var_sse2) PRIVATE
sym(vp9_highbd_calc8x8var_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
SAVE_XMM 7
push rbx
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;[src_ptr]
mov rdi, arg(2) ;[ref_ptr]
movsxd rax, DWORD PTR arg(1) ;[source_stride]
movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
add rax, rax ; source stride in bytes
add rdx, rdx ; recon stride in bytes
; Prefetch data
prefetcht0 [rsi]
prefetcht0 [rsi+rax]
lea rbx, [rsi+rax*2]
prefetcht0 [rbx]
prefetcht0 [rbx+rax]
prefetcht0 [rdi]
prefetcht0 [rdi+rdx]
lea rbx, [rdi+rdx*2]
prefetcht0 [rbx]
prefetcht0 [rbx+rdx]
pxor xmm0, xmm0 ; clear xmm0 for unpack
pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
mov rcx, 8
.var8loop:
movdqu xmm1, XMMWORD PTR [rsi]
movdqu xmm2, XMMWORD PTR [rdi]
lea rbx, [rsi+rax*4]
prefetcht0 [rbx]
prefetcht0 [rbx+rax]
lea rbx, [rbx+rax*2]
prefetcht0 [rbx]
prefetcht0 [rbx+rax]
lea rbx, [rdi+rdx*4]
prefetcht0 [rbx]
prefetcht0 [rbx+rdx]
lea rbx, [rbx+rdx*2]
prefetcht0 [rbx]
prefetcht0 [rbx+rdx]
pxor xmm5, xmm5
psubw xmm1, xmm2
movdqu xmm3, XMMWORD PTR [rsi+rax]
paddw xmm5, xmm1
pmaddwd xmm1, xmm1
movdqu xmm2, XMMWORD PTR [rdi+rdx]
paddd xmm6, xmm1
lea rsi, [rsi + 2*rax]
lea rdi, [rdi + 2*rdx]
psubw xmm3, xmm2
movdqu xmm1, XMMWORD PTR [rsi]
paddw xmm5, xmm3
pmaddwd xmm3, xmm3
movdqu xmm2, XMMWORD PTR [rdi]
paddd xmm6, xmm3
psubw xmm1, xmm2
movdqu xmm3, XMMWORD PTR [rsi+rax]
paddw xmm5, xmm1
pmaddwd xmm1, xmm1
movdqu xmm2, XMMWORD PTR [rdi+rdx]
paddd xmm6, xmm1
psubw xmm3, xmm2
paddw xmm5, xmm3
pmaddwd xmm3, xmm3
paddd xmm6, xmm3
movdqa xmm1, xmm5
movdqa xmm2, xmm5
pcmpgtw xmm1, xmm0
pcmpeqw xmm2, xmm0
por xmm1, xmm2
pcmpeqw xmm1, xmm0
movdqa xmm2, xmm5
punpcklwd xmm5, xmm1
punpckhwd xmm2, xmm1
paddd xmm7, xmm5
paddd xmm7, xmm2
lea rsi, [rsi + 2*rax]
lea rdi, [rdi + 2*rdx]
sub rcx, 4
jnz .var8loop
movdqa xmm4, xmm6
punpckldq xmm6, xmm0
punpckhdq xmm4, xmm0
movdqa xmm5, xmm7
paddd xmm6, xmm4
punpckldq xmm7, xmm0
punpckhdq xmm5, xmm0
paddd xmm7, xmm5
movdqa xmm4, xmm6
movdqa xmm5, xmm7
psrldq xmm4, 8
psrldq xmm5, 8
paddd xmm6, xmm4
paddd xmm7, xmm5
mov rdi, arg(4) ; [SSE]
mov rax, arg(5) ; [Sum]
movd DWORD PTR [rdi], xmm6
movd DWORD PTR [rax], xmm7
; begin epilog
pop rdi
pop rsi
pop rbx
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret


@@ -0,0 +1,613 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vpx_config.h"
#include "vp9/common/vp9_common.h"
#include "vp9/encoder/vp9_variance.h"
#include "vpx_ports/mem.h"
typedef unsigned int (*high_variance_fn_t) (const uint16_t *src, int src_stride,
const uint16_t *ref, int ref_stride,
unsigned int *sse, int *sum);
unsigned int vp9_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride,
const uint16_t *ref, int ref_stride,
unsigned int *sse, int *sum);
unsigned int vp9_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride,
const uint16_t *ref, int ref_stride,
unsigned int *sse, int *sum);
static void highbd_variance_sse2(const uint16_t *src, int src_stride,
const uint16_t *ref, int ref_stride,
int w, int h, unsigned int *sse, int *sum,
high_variance_fn_t var_fn, int block_size) {
int i, j;
*sse = 0;
*sum = 0;
for (i = 0; i < h; i += block_size) {
for (j = 0; j < w; j += block_size) {
unsigned int sse0;
int sum0;
var_fn(src + src_stride * i + j, src_stride,
ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
*sse += sse0;
*sum += sum0;
}
}
}
static void highbd_10_variance_sse2(const uint16_t *src, int src_stride,
const uint16_t *ref, int ref_stride,
int w, int h, unsigned int *sse, int *sum,
high_variance_fn_t var_fn, int block_size) {
int i, j;
uint64_t sse_long = 0;
int64_t sum_long = 0;
for (i = 0; i < h; i += block_size) {
for (j = 0; j < w; j += block_size) {
unsigned int sse0;
int sum0;
var_fn(src + src_stride * i + j, src_stride,
ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
sse_long += sse0;
sum_long += sum0;
}
}
*sum = ROUND_POWER_OF_TWO(sum_long, 2);
*sse = ROUND_POWER_OF_TWO(sse_long, 4);
}
static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
const uint16_t *ref, int ref_stride,
int w, int h, unsigned int *sse, int *sum,
high_variance_fn_t var_fn, int block_size) {
int i, j;
uint64_t sse_long = 0;
int64_t sum_long = 0;
for (i = 0; i < h; i += block_size) {
for (j = 0; j < w; j += block_size) {
unsigned int sse0;
int sum0;
var_fn(src + src_stride * i + j, src_stride,
ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
sse_long += sse0;
sum_long += sum0;
}
}
*sum = ROUND_POWER_OF_TWO(sum_long, 4);
*sse = ROUND_POWER_OF_TWO(sse_long, 8);
}
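/* The 10- and 12-bit variants above accumulate sse/sum at 64-bit precision
 * and rescale to 8-bit-equivalent units: sum scales by 2^(bd-8) and sse by
 * 2^(2*(bd-8)), hence rounding shifts of (2, 4) for 10-bit and (4, 8) for
 * 12-bit input. */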
#define HIGH_GET_VAR(S) \
void vp9_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
const uint8_t *ref8, int ref_stride, \
unsigned int *sse, int *sum) { \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
sse, sum); \
} \
\
void vp9_highbd_10_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
const uint8_t *ref8, int ref_stride, \
unsigned int *sse, int *sum) { \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
sse, sum); \
*sum = ROUND_POWER_OF_TWO(*sum, 2); \
*sse = ROUND_POWER_OF_TWO(*sse, 4); \
} \
\
void vp9_highbd_12_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
const uint8_t *ref8, int ref_stride, \
unsigned int *sse, int *sum) { \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
sse, sum); \
*sum = ROUND_POWER_OF_TWO(*sum, 4); \
*sse = ROUND_POWER_OF_TWO(*sse, 8); \
}
HIGH_GET_VAR(16);
HIGH_GET_VAR(8);
#undef HIGH_GET_VAR
#define VAR_FN(w, h, block_size, shift) \
unsigned int vp9_highbd_variance##w##x##h##_sse2( \
const uint8_t *src8, int src_stride, \
const uint8_t *ref8, int ref_stride, \
unsigned int *sse) { \
int sum; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
highbd_variance_sse2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \
vp9_highbd_calc##block_size##x##block_size##var_sse2, \
block_size); \
return *sse - (((int64_t)sum * sum) >> shift); \
} \
unsigned int vp9_highbd_10_variance##w##x##h##_sse2( \
const uint8_t *src8, int src_stride, \
const uint8_t *ref8, int ref_stride, \
unsigned int *sse) { \
int sum; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
highbd_10_variance_sse2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \
vp9_highbd_calc##block_size##x##block_size##var_sse2, \
block_size); \
return *sse - (((int64_t)sum * sum) >> shift); \
} \
unsigned int vp9_highbd_12_variance##w##x##h##_sse2( \
const uint8_t *src8, int src_stride, \
const uint8_t *ref8, int ref_stride, \
unsigned int *sse) { \
int sum; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
highbd_12_variance_sse2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \
vp9_highbd_calc##block_size##x##block_size##var_sse2, \
block_size); \
return *sse - (((int64_t)sum * sum) >> shift); \
}
VAR_FN(64, 64, 16, 12);
VAR_FN(64, 32, 16, 11);
VAR_FN(32, 64, 16, 11);
VAR_FN(32, 32, 16, 10);
VAR_FN(32, 16, 16, 9);
VAR_FN(16, 32, 16, 9);
VAR_FN(16, 16, 16, 8);
VAR_FN(16, 8, 8, 7);
VAR_FN(8, 16, 8, 7);
VAR_FN(8, 8, 8, 6);
#undef VAR_FN
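/* Illustrative expansion: VAR_FN(64, 64, 16, 12) defines
 * vp9_highbd_variance64x64_sse2(), which tiles the 64x64 block with the
 * 16x16 kernel and returns *sse - ((int64_t)sum * sum) >> 12, the usual
 * variance = SSE - mean^2 identity with 64 * 64 = 2^12 samples. */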
unsigned int vp9_highbd_mse16x16_sse2(const uint8_t *src8, int src_stride,
const uint8_t *ref8, int ref_stride,
unsigned int *sse) {
int sum;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
highbd_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
sse, &sum, vp9_highbd_calc16x16var_sse2, 16);
return *sse;
}
unsigned int vp9_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride,
const uint8_t *ref8, int ref_stride,
unsigned int *sse) {
int sum;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
sse, &sum, vp9_highbd_calc16x16var_sse2, 16);
return *sse;
}
unsigned int vp9_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride,
const uint8_t *ref8, int ref_stride,
unsigned int *sse) {
int sum;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
sse, &sum, vp9_highbd_calc16x16var_sse2, 16);
return *sse;
}
unsigned int vp9_highbd_mse8x8_sse2(const uint8_t *src8, int src_stride,
const uint8_t *ref8, int ref_stride,
unsigned int *sse) {
int sum;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
highbd_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
sse, &sum, vp9_highbd_calc8x8var_sse2, 8);
return *sse;
}
unsigned int vp9_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride,
const uint8_t *ref8, int ref_stride,
unsigned int *sse) {
int sum;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
sse, &sum, vp9_highbd_calc8x8var_sse2, 8);
return *sse;
}
unsigned int vp9_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
const uint8_t *ref8, int ref_stride,
unsigned int *sse) {
int sum;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
sse, &sum, vp9_highbd_calc8x8var_sse2, 8);
return *sse;
}
#define DECL(w, opt) \
int vp9_highbd_sub_pixel_variance##w##xh_##opt(const uint16_t *src, \
ptrdiff_t src_stride, \
int x_offset, int y_offset, \
const uint16_t *dst, \
ptrdiff_t dst_stride, \
int height, unsigned int *sse);
#define DECLS(opt1, opt2) \
DECL(8, opt1); \
DECL(16, opt1)
DECLS(sse2, sse);
// DECLS(ssse3, ssse3);
#undef DECLS
#undef DECL
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
unsigned int \
vp9_highbd_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src8, \
int src_stride, \
int x_offset, \
int y_offset, \
const uint8_t *dst8, \
int dst_stride, \
unsigned int *sse_ptr) { \
unsigned int sse; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
int se = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
x_offset, y_offset, \
dst, dst_stride, h, \
&sse); \
if (w > wf) { \
unsigned int sse2; \
int se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
src_stride, \
x_offset, y_offset, \
dst + 16, \
dst_stride, \
h, &sse2); \
se += se2; \
sse += sse2; \
if (w > wf * 2) { \
se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
x_offset, y_offset, \
dst + 32, dst_stride, \
h, &sse2); \
se += se2; \
sse += sse2; \
se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
x_offset, y_offset, \
dst + 48, dst_stride, \
h, &sse2); \
se += se2; \
sse += sse2; \
} \
} \
*sse_ptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
} \
unsigned int vp9_highbd_10_sub_pixel_variance##w##x##h##_##opt( \
const uint8_t *src8, \
int src_stride, \
int x_offset, \
int y_offset, \
const uint8_t *dst8, \
int dst_stride, \
unsigned int *sse_ptr) { \
unsigned int sse; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
int se = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
x_offset, y_offset, \
dst, dst_stride, \
h, &sse); \
if (w > wf) { \
unsigned int sse2; \
int se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
src_stride, \
x_offset, y_offset, \
dst + 16, \
dst_stride, \
h, &sse2); \
se += se2; \
sse += sse2; \
if (w > wf * 2) { \
se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
x_offset, y_offset, \
dst + 32, dst_stride, \
h, &sse2); \
se += se2; \
sse += sse2; \
se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
x_offset, y_offset, \
dst + 48, dst_stride, \
h, &sse2); \
se += se2; \
sse += sse2; \
} \
} \
se = ROUND_POWER_OF_TWO(se, 2); \
sse = ROUND_POWER_OF_TWO(sse, 4); \
*sse_ptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
} \
unsigned int vp9_highbd_12_sub_pixel_variance##w##x##h##_##opt( \
const uint8_t *src8, \
int src_stride, \
int x_offset, \
int y_offset, \
const uint8_t *dst8, \
int dst_stride, \
unsigned int *sse_ptr) { \
int start_row; \
unsigned int sse; \
int se = 0; \
uint64_t long_sse = 0; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
for (start_row = 0; start_row < h; start_row +=16) { \
unsigned int sse2; \
int height = h - start_row < 16 ? h - start_row : 16; \
int se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
src + (start_row * src_stride), src_stride, \
x_offset, y_offset, dst + (start_row * dst_stride), \
dst_stride, height, &sse2); \
se += se2; \
long_sse += sse2; \
if (w > wf) { \
se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
src + 16 + (start_row * src_stride), src_stride, \
x_offset, y_offset, dst + 16 + (start_row * dst_stride), \
dst_stride, height, &sse2); \
se += se2; \
long_sse += sse2; \
if (w > wf * 2) { \
se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
src + 32 + (start_row * src_stride), src_stride, \
x_offset, y_offset, dst + 32 + (start_row * dst_stride), \
dst_stride, height, &sse2); \
se += se2; \
long_sse += sse2; \
se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
src + 48 + (start_row * src_stride), src_stride, \
x_offset, y_offset, dst + 48 + (start_row * dst_stride), \
dst_stride, height, &sse2); \
se += se2; \
long_sse += sse2; \
}\
} \
} \
se = ROUND_POWER_OF_TWO(se, 4); \
sse = ROUND_POWER_OF_TWO(long_sse, 8); \
*sse_ptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}
#define FNS(opt1, opt2) \
FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
FN(8, 16, 8, 3, 4, opt1, (int64_t)); \
FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
FN(8, 4, 8, 3, 2, opt1, (int64_t));
FNS(sse2, sse);
#undef FNS
#undef FN
#define DECL(w, opt) \
int vp9_highbd_sub_pixel_avg_variance##w##xh_##opt(const uint16_t *src, \
ptrdiff_t src_stride, \
int x_offset, int y_offset, \
const uint16_t *dst, \
ptrdiff_t dst_stride, \
const uint16_t *sec, \
ptrdiff_t sec_stride, \
int height, \
unsigned int *sse);
#define DECLS(opt1) \
DECL(16, opt1) \
DECL(8, opt1)
DECLS(sse2);
#undef DECL
#undef DECLS
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
unsigned int vp9_highbd_sub_pixel_avg_variance##w##x##h##_##opt( \
const uint8_t *src8, \
int src_stride, \
int x_offset, \
int y_offset, \
const uint8_t *dst8, \
int dst_stride, \
unsigned int *sse_ptr, \
const uint8_t *sec8) { \
unsigned int sse; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
int se = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src, src_stride, x_offset, \
y_offset, dst, dst_stride, \
sec, w, h, &sse); \
if (w > wf) { \
unsigned int sse2; \
int se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 16, src_stride, \
x_offset, y_offset, \
dst + 16, dst_stride, \
sec + 16, w, h, &sse2); \
se += se2; \
sse += sse2; \
if (w > wf * 2) { \
se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 32, src_stride, \
x_offset, y_offset, \
dst + 32, dst_stride, \
sec + 32, w, h, &sse2); \
se += se2; \
sse += sse2; \
se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 48, src_stride, \
x_offset, y_offset, \
dst + 48, dst_stride, \
sec + 48, w, h, &sse2); \
se += se2; \
sse += sse2; \
} \
} \
*sse_ptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
} \
unsigned int vp9_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \
const uint8_t *src8, \
int src_stride, \
int x_offset, \
int y_offset, \
const uint8_t *dst8, \
int dst_stride, \
unsigned int *sse_ptr, \
const uint8_t *sec8) { \
unsigned int sse; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
int se = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src, src_stride, x_offset, \
y_offset, dst, dst_stride, \
sec, w, h, &sse); \
if (w > wf) { \
unsigned int sse2; \
int se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 16, src_stride, \
x_offset, y_offset, \
dst + 16, dst_stride, \
sec + 16, w, h, &sse2); \
se += se2; \
sse += sse2; \
if (w > wf * 2) { \
se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 32, src_stride, \
x_offset, y_offset, \
dst + 32, dst_stride, \
sec + 32, w, h, &sse2); \
se += se2; \
sse += sse2; \
se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 48, src_stride, \
x_offset, y_offset, \
dst + 48, dst_stride, \
sec + 48, w, h, &sse2); \
se += se2; \
sse += sse2; \
} \
} \
se = ROUND_POWER_OF_TWO(se, 2); \
sse = ROUND_POWER_OF_TWO(sse, 4); \
*sse_ptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
} \
unsigned int vp9_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \
const uint8_t *src8, \
int src_stride, \
int x_offset, \
int y_offset, \
const uint8_t *dst8, \
int dst_stride, \
unsigned int *sse_ptr, \
const uint8_t *sec8) { \
int start_row; \
unsigned int sse; \
int se = 0; \
uint64_t long_sse = 0; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
for (start_row = 0; start_row < h; start_row +=16) { \
unsigned int sse2; \
int height = h - start_row < 16 ? h - start_row : 16; \
int se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + (start_row * src_stride), src_stride, x_offset, \
y_offset, dst + (start_row * dst_stride), dst_stride, \
sec + (start_row * w), w, height, &sse2); \
se += se2; \
long_sse += sse2; \
if (w > wf) { \
se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 16 + (start_row * src_stride), src_stride, \
x_offset, y_offset, \
dst + 16 + (start_row * dst_stride), dst_stride, \
sec + 16 + (start_row * w), w, height, &sse2); \
se += se2; \
long_sse += sse2; \
if (w > wf * 2) { \
se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 32 + (start_row * src_stride), src_stride, \
x_offset, y_offset, \
dst + 32 + (start_row * dst_stride), dst_stride, \
sec + 32 + (start_row * w), w, height, &sse2); \
se += se2; \
long_sse += sse2; \
se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 48 + (start_row * src_stride), src_stride, \
x_offset, y_offset, \
dst + 48 + (start_row * dst_stride), dst_stride, \
sec + 48 + (start_row * w), w, height, &sse2); \
se += se2; \
long_sse += sse2; \
} \
} \
} \
se = ROUND_POWER_OF_TWO(se, 4); \
sse = ROUND_POWER_OF_TWO(long_sse, 8); \
*sse_ptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}
#define FNS(opt1) \
FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
FN(8, 16, 8, 4, 3, opt1, (int64_t)); \
FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
FN(8, 4, 8, 3, 2, opt1, (int64_t));
FNS(sse2);
#undef FNS
#undef FN


@@ -24,6 +24,7 @@ VP9_CX_SRCS-yes += encoder/vp9_context_tree.h
VP9_CX_SRCS-yes += encoder/vp9_cost.h
VP9_CX_SRCS-yes += encoder/vp9_cost.c
VP9_CX_SRCS-yes += encoder/vp9_dct.c
VP9_CX_SRCS-yes += encoder/vp9_dct.h
VP9_CX_SRCS-$(CONFIG_VP9_TEMPORAL_DENOISING) += encoder/vp9_denoiser.c
VP9_CX_SRCS-$(CONFIG_VP9_TEMPORAL_DENOISING) += encoder/vp9_denoiser.h
VP9_CX_SRCS-yes += encoder/vp9_encodeframe.c
@@ -101,6 +102,12 @@ VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_sad4d_intrin_avx2.c
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/vp9_sad_sse3.asm
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_sad4d_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_variance_impl_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_quantize_intrin_sse2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c
endif
ifeq ($(CONFIG_USE_X86INC),yes)
VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm
@@ -109,6 +116,11 @@ VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_sse2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance.asm
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_sad_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_variance_sse2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_subpel_variance.asm
endif
endif
ifeq ($(ARCH_X86_64),yes)
@@ -120,7 +132,9 @@ VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/vp9_sad_sse4.asm
VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/vp9_ssim_opt_x86_64.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.h
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct32x32_sse2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_impl_sse2.c
ifeq ($(CONFIG_VP9_TEMPORAL_DENOISING),yes)
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_denoiser_sse2.c


@@ -18,58 +18,56 @@
#include <stdlib.h>
static void once(void (*func)(void))
{
static CRITICAL_SECTION *lock;
static LONG waiters;
static int done;
void *lock_ptr = &lock;
/* If the initialization is complete, return early. This isn't just an
* optimization, it prevents races on the destruction of the global
* lock.
*/
if (done)
return;
InterlockedIncrement(&waiters);
/* Get a lock. We create one and try to make it the one-true-lock,
* throwing it away if we lost the race.
*/
{
/* Scope to protect access to new_lock */
CRITICAL_SECTION *new_lock = malloc(sizeof(CRITICAL_SECTION));
InitializeCriticalSection(new_lock);
if (InterlockedCompareExchangePointer(lock_ptr, new_lock, NULL) != NULL)
{
DeleteCriticalSection(new_lock);
free(new_lock);
}
}
/* At this point, we have a lock that can be synchronized on. We don't
* care which thread actually performed the allocation.
*/
EnterCriticalSection(lock);
if (!done) {
func();
done = 1;
}
LeaveCriticalSection(lock);
/* Last one out should free resources. The destructed objects are
* protected by checking if(done) above.
*/
if (!InterlockedDecrement(&waiters)) {
DeleteCriticalSection(lock);
free(lock);
lock = NULL;
}
}
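/* Note: this is the classic double-checked locking pattern; the early
 * 'done' test keeps the fast path lock-free, and the waiter count lets the
 * last thread out reclaim the lazily allocated critical section. */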
@@ -78,25 +76,24 @@ static void once(void (*func)(void))
#include <os2.h>
static void once(void (*func)(void))
{
static int done;
/* If the initialization is complete, return early. */
if (done)
return;
/* Causes all other threads in the process to block themselves
* and give up their time slice.
*/
DosEnterCritSec();
if (!done) {
func();
done = 1;
}
/* Restores normal thread dispatching for the current process. */
DosExitCritSec();
}
@@ -104,8 +101,8 @@ static void once(void (*func)(void))
#include <pthread.h>
static void once(void (*func)(void))
{
static pthread_once_t lock = PTHREAD_ONCE_INIT;
pthread_once(&lock, func);
}
@@ -117,13 +114,12 @@ static void once(void (*func)(void))
static void once(void (*func)(void))
{
static int done;
if (!done) {
func();
done = 1;
}
}
#endif