Merge "Added high bitdepth sse2 transform functions"
This commit is contained in:
commit
dcb29c1406
@ -264,6 +264,8 @@ typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
|
||||
|
||||
typedef std::tr1::tuple<FdctFunc, IdctFunc, int, vpx_bit_depth_t> Dct16x16Param;
|
||||
typedef std::tr1::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t> Ht16x16Param;
|
||||
typedef std::tr1::tuple<IdctFunc, IdctFunc, int, vpx_bit_depth_t>
|
||||
Idct16x16Param;
|
||||
|
||||
void fdct16x16_ref(const int16_t *in, tran_low_t *out, int stride,
|
||||
int /*tx_type*/) {
|
||||
@ -311,7 +313,33 @@ void iht16x16_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
|
||||
void iht16x16_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
|
||||
vp9_highbd_iht16x16_256_add_c(in, out, stride, tx_type, 12);
|
||||
}
|
||||
#endif
|
||||
|
||||
void idct16x16_10_add_10_c(const tran_low_t *in, uint8_t *out, int stride) {
|
||||
vp9_highbd_idct16x16_10_add_c(in, out, stride, 10);
|
||||
}
|
||||
|
||||
void idct16x16_10_add_12_c(const tran_low_t *in, uint8_t *out, int stride) {
|
||||
vp9_highbd_idct16x16_10_add_c(in, out, stride, 12);
|
||||
}
|
||||
|
||||
#if HAVE_SSE2
|
||||
void idct16x16_256_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
|
||||
vp9_highbd_idct16x16_256_add_sse2(in, out, stride, 10);
|
||||
}
|
||||
|
||||
void idct16x16_256_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
|
||||
vp9_highbd_idct16x16_256_add_sse2(in, out, stride, 12);
|
||||
}
|
||||
|
||||
void idct16x16_10_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
|
||||
vp9_highbd_idct16x16_10_add_sse2(in, out, stride, 10);
|
||||
}
|
||||
|
||||
void idct16x16_10_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
|
||||
vp9_highbd_idct16x16_10_add_sse2(in, out, stride, 12);
|
||||
}
|
||||
#endif // HAVE_SSE2
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
|
||||
class Trans16x16TestBase {
|
||||
public:
|
||||
@ -518,7 +546,7 @@ class Trans16x16TestBase {
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, kNumCoeffs);
|
||||
DECLARE_ALIGNED_ARRAY(16, uint16_t, src16, kNumCoeffs);
|
||||
#endif
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
|
||||
for (int i = 0; i < count_test_block; ++i) {
|
||||
double out_r[kNumCoeffs];
|
||||
@ -534,13 +562,13 @@ class Trans16x16TestBase {
|
||||
src16[j] = rnd.Rand16() & mask_;
|
||||
dst16[j] = rnd.Rand16() & mask_;
|
||||
in[j] = src16[j] - dst16[j];
|
||||
#endif
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
}
|
||||
}
|
||||
|
||||
reference_16x16_dct_2d(in, out_r);
|
||||
for (int j = 0; j < kNumCoeffs; ++j)
|
||||
coeff[j] = round(out_r[j]);
|
||||
coeff[j] = static_cast<tran_low_t>(round(out_r[j]));
|
||||
|
||||
if (bit_depth_ == VPX_BITS_8) {
|
||||
ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, 16));
|
||||
@ -548,7 +576,7 @@ class Trans16x16TestBase {
|
||||
} else {
|
||||
ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16),
|
||||
16));
|
||||
#endif
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
}
|
||||
|
||||
for (int j = 0; j < kNumCoeffs; ++j) {
|
||||
@ -557,7 +585,7 @@ class Trans16x16TestBase {
|
||||
bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
|
||||
#else
|
||||
const uint32_t diff = dst[j] - src[j];
|
||||
#endif
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
const uint32_t error = diff * diff;
|
||||
EXPECT_GE(1u, error)
|
||||
<< "Error: 16x16 IDCT has error " << error
|
||||
@ -565,6 +593,64 @@ class Trans16x16TestBase {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void CompareInvReference(IdctFunc ref_txfm, int thresh) {
|
||||
ACMRandom rnd(ACMRandom::DeterministicSeed());
|
||||
const int count_test_block = 10000;
|
||||
const int eob = 10;
|
||||
const int16_t *scan = vp9_default_scan_orders[TX_16X16].scan;
|
||||
DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff, kNumCoeffs);
|
||||
DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
|
||||
DECLARE_ALIGNED_ARRAY(16, uint8_t, ref, kNumCoeffs);
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, kNumCoeffs);
|
||||
DECLARE_ALIGNED_ARRAY(16, uint16_t, ref16, kNumCoeffs);
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
|
||||
for (int i = 0; i < count_test_block; ++i) {
|
||||
for (int j = 0; j < kNumCoeffs; ++j) {
|
||||
if (j < eob) {
|
||||
// Random values less than the threshold, either positive or negative
|
||||
coeff[scan[j]] = rnd(thresh) * (1 - 2 * (i % 2));
|
||||
} else {
|
||||
coeff[scan[j]] = 0;
|
||||
}
|
||||
if (bit_depth_ == VPX_BITS_8) {
|
||||
dst[j] = 0;
|
||||
ref[j] = 0;
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
} else {
|
||||
dst16[j] = 0;
|
||||
ref16[j] = 0;
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
}
|
||||
}
|
||||
if (bit_depth_ == VPX_BITS_8) {
|
||||
ref_txfm(coeff, ref, pitch_);
|
||||
ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
|
||||
} else {
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
ref_txfm(coeff, CONVERT_TO_BYTEPTR(ref16), pitch_);
|
||||
ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16),
|
||||
pitch_));
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
}
|
||||
|
||||
for (int j = 0; j < kNumCoeffs; ++j) {
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
const uint32_t diff =
|
||||
bit_depth_ == VPX_BITS_8 ? dst[j] - ref[j] : dst16[j] - ref16[j];
|
||||
#else
|
||||
const uint32_t diff = dst[j] - ref[j];
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
const uint32_t error = diff * diff;
|
||||
EXPECT_EQ(0u, error)
|
||||
<< "Error: 16x16 IDCT Comparison has error " << error
|
||||
<< " at index " << j;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int pitch_;
|
||||
int tx_type_;
|
||||
vpx_bit_depth_t bit_depth_;
|
||||
@ -590,10 +676,10 @@ class Trans16x16DCT
|
||||
mask_ = (1 << bit_depth_) - 1;
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
switch (bit_depth_) {
|
||||
case 10:
|
||||
case VPX_BITS_10:
|
||||
inv_txfm_ref = idct16x16_10_ref;
|
||||
break;
|
||||
case 12:
|
||||
case VPX_BITS_12:
|
||||
inv_txfm_ref = idct16x16_12_ref;
|
||||
break;
|
||||
default:
|
||||
@ -703,6 +789,37 @@ TEST_P(Trans16x16HT, QuantCheck) {
|
||||
RunQuantCheck(429, 729);
|
||||
}
|
||||
|
||||
class InvTrans16x16DCT
|
||||
: public Trans16x16TestBase,
|
||||
public ::testing::TestWithParam<Idct16x16Param> {
|
||||
public:
|
||||
virtual ~InvTrans16x16DCT() {}
|
||||
|
||||
virtual void SetUp() {
|
||||
ref_txfm_ = GET_PARAM(0);
|
||||
inv_txfm_ = GET_PARAM(1);
|
||||
thresh_ = GET_PARAM(2);
|
||||
bit_depth_ = GET_PARAM(3);
|
||||
pitch_ = 16;
|
||||
mask_ = (1 << bit_depth_) - 1;
|
||||
}
|
||||
virtual void TearDown() { libvpx_test::ClearSystemState(); }
|
||||
|
||||
protected:
|
||||
void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {}
|
||||
void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
|
||||
inv_txfm_(out, dst, stride);
|
||||
}
|
||||
|
||||
IdctFunc ref_txfm_;
|
||||
IdctFunc inv_txfm_;
|
||||
int thresh_;
|
||||
};
|
||||
|
||||
TEST_P(InvTrans16x16DCT, CompareReference) {
|
||||
CompareInvReference(ref_txfm_, thresh_);
|
||||
}
|
||||
|
||||
using std::tr1::make_tuple;
|
||||
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
@ -717,7 +834,7 @@ INSTANTIATE_TEST_CASE_P(
|
||||
C, Trans16x16DCT,
|
||||
::testing::Values(
|
||||
make_tuple(&vp9_fdct16x16_c, &vp9_idct16x16_256_add_c, 0, VPX_BITS_8)));
|
||||
#endif
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
@ -743,7 +860,7 @@ INSTANTIATE_TEST_CASE_P(
|
||||
make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8),
|
||||
make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8),
|
||||
make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3, VPX_BITS_8)));
|
||||
#endif
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
|
||||
#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
@ -770,7 +887,52 @@ INSTANTIATE_TEST_CASE_P(
|
||||
VPX_BITS_8),
|
||||
make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 3,
|
||||
VPX_BITS_8)));
|
||||
#endif
|
||||
#endif // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
|
||||
|
||||
#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
SSE2, Trans16x16DCT,
|
||||
::testing::Values(
|
||||
make_tuple(&vp9_highbd_fdct16x16_sse2,
|
||||
&idct16x16_10, 0, VPX_BITS_10),
|
||||
make_tuple(&vp9_highbd_fdct16x16_c,
|
||||
&idct16x16_256_add_10_sse2, 0, VPX_BITS_10),
|
||||
make_tuple(&vp9_highbd_fdct16x16_sse2,
|
||||
&idct16x16_12, 0, VPX_BITS_12),
|
||||
make_tuple(&vp9_highbd_fdct16x16_c,
|
||||
&idct16x16_256_add_12_sse2, 0, VPX_BITS_12),
|
||||
make_tuple(&vp9_fdct16x16_sse2,
|
||||
&vp9_idct16x16_256_add_c, 0, VPX_BITS_8)));
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
SSE2, Trans16x16HT,
|
||||
::testing::Values(
|
||||
make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_10, 0, VPX_BITS_10),
|
||||
make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_10, 1, VPX_BITS_10),
|
||||
make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_10, 2, VPX_BITS_10),
|
||||
make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_10, 3, VPX_BITS_10),
|
||||
make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_12, 0, VPX_BITS_12),
|
||||
make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_12, 1, VPX_BITS_12),
|
||||
make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_12, 2, VPX_BITS_12),
|
||||
make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_12, 3, VPX_BITS_12),
|
||||
make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_c, 0, VPX_BITS_8),
|
||||
make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8),
|
||||
make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8),
|
||||
make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_c, 3,
|
||||
VPX_BITS_8)));
|
||||
// Optimizations take effect at a threshold of 3155, so we use a value close to
|
||||
// that to test both branches.
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
SSE2, InvTrans16x16DCT,
|
||||
::testing::Values(
|
||||
make_tuple(&idct16x16_10_add_10_c,
|
||||
&idct16x16_10_add_10_sse2, 3167, VPX_BITS_10),
|
||||
make_tuple(&idct16x16_10,
|
||||
&idct16x16_256_add_10_sse2, 3167, VPX_BITS_10),
|
||||
make_tuple(&idct16x16_10_add_12_c,
|
||||
&idct16x16_10_add_12_sse2, 3167, VPX_BITS_12),
|
||||
make_tuple(&idct16x16_12,
|
||||
&idct16x16_256_add_12_sse2, 3167, VPX_BITS_12)));
|
||||
#endif // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
|
||||
|
||||
#if HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
@ -778,5 +940,5 @@ INSTANTIATE_TEST_CASE_P(
|
||||
::testing::Values(
|
||||
make_tuple(&vp9_fdct16x16_c, &vp9_idct16x16_256_add_ssse3, 0,
|
||||
VPX_BITS_8)));
|
||||
#endif
|
||||
#endif // HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
|
||||
} // namespace
|
||||
|
@ -79,6 +79,10 @@ typedef std::tr1::tuple<FwdTxfmFunc, InvTxfmFunc, int, vpx_bit_depth_t>
|
||||
Trans32x32Param;
|
||||
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
void idct32x32_8(const tran_low_t *in, uint8_t *out, int stride) {
|
||||
vp9_highbd_idct32x32_1024_add_c(in, out, stride, 8);
|
||||
}
|
||||
|
||||
void idct32x32_10(const tran_low_t *in, uint8_t *out, int stride) {
|
||||
vp9_highbd_idct32x32_1024_add_c(in, out, stride, 10);
|
||||
}
|
||||
@ -86,7 +90,7 @@ void idct32x32_10(const tran_low_t *in, uint8_t *out, int stride) {
|
||||
void idct32x32_12(const tran_low_t *in, uint8_t *out, int stride) {
|
||||
vp9_highbd_idct32x32_1024_add_c(in, out, stride, 12);
|
||||
}
|
||||
#endif
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
|
||||
class Trans32x32Test : public ::testing::TestWithParam<Trans32x32Param> {
|
||||
public:
|
||||
@ -114,7 +118,7 @@ TEST_P(Trans32x32Test, AccuracyCheck) {
|
||||
ACMRandom rnd(ACMRandom::DeterministicSeed());
|
||||
uint32_t max_error = 0;
|
||||
int64_t total_error = 0;
|
||||
const int count_test_block = 1000;
|
||||
const int count_test_block = 10000;
|
||||
DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, kNumCoeffs);
|
||||
DECLARE_ALIGNED_ARRAY(16, tran_low_t, test_temp_block, kNumCoeffs);
|
||||
DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
|
||||
@ -127,7 +131,7 @@ TEST_P(Trans32x32Test, AccuracyCheck) {
|
||||
for (int i = 0; i < count_test_block; ++i) {
|
||||
// Initialize a test block with input range [-mask_, mask_].
|
||||
for (int j = 0; j < kNumCoeffs; ++j) {
|
||||
if (bit_depth_ == 8) {
|
||||
if (bit_depth_ == VPX_BITS_8) {
|
||||
src[j] = rnd.Rand8();
|
||||
dst[j] = rnd.Rand8();
|
||||
test_input_block[j] = src[j] - dst[j];
|
||||
@ -282,7 +286,7 @@ TEST_P(Trans32x32Test, InverseAccuracy) {
|
||||
|
||||
reference_32x32_dct_2d(in, out_r);
|
||||
for (int j = 0; j < kNumCoeffs; ++j)
|
||||
coeff[j] = round(out_r[j]);
|
||||
coeff[j] = static_cast<tran_low_t>(round(out_r[j]));
|
||||
if (bit_depth_ == VPX_BITS_8) {
|
||||
ASM_REGISTER_STATE_CHECK(inv_txfm_(coeff, dst, 32));
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
@ -331,7 +335,7 @@ INSTANTIATE_TEST_CASE_P(
|
||||
&vp9_idct32x32_1024_add_c, 0, VPX_BITS_8),
|
||||
make_tuple(&vp9_fdct32x32_rd_c,
|
||||
&vp9_idct32x32_1024_add_c, 1, VPX_BITS_8)));
|
||||
#endif
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
|
||||
#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
@ -341,7 +345,7 @@ INSTANTIATE_TEST_CASE_P(
|
||||
&vp9_idct32x32_1024_add_neon, 0, VPX_BITS_8),
|
||||
make_tuple(&vp9_fdct32x32_rd_c,
|
||||
&vp9_idct32x32_1024_add_neon, 1, VPX_BITS_8)));
|
||||
#endif
|
||||
#endif // HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
|
||||
|
||||
#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
@ -351,7 +355,23 @@ INSTANTIATE_TEST_CASE_P(
|
||||
&vp9_idct32x32_1024_add_sse2, 0, VPX_BITS_8),
|
||||
make_tuple(&vp9_fdct32x32_rd_sse2,
|
||||
&vp9_idct32x32_1024_add_sse2, 1, VPX_BITS_8)));
|
||||
#endif
|
||||
#endif // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
|
||||
|
||||
#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
SSE2, Trans32x32Test,
|
||||
::testing::Values(
|
||||
make_tuple(&vp9_highbd_fdct32x32_sse2, &idct32x32_10, 0, VPX_BITS_10),
|
||||
make_tuple(&vp9_highbd_fdct32x32_rd_sse2, &idct32x32_10, 1,
|
||||
VPX_BITS_10),
|
||||
make_tuple(&vp9_highbd_fdct32x32_sse2, &idct32x32_12, 0, VPX_BITS_12),
|
||||
make_tuple(&vp9_highbd_fdct32x32_rd_sse2, &idct32x32_12, 1,
|
||||
VPX_BITS_12),
|
||||
make_tuple(&vp9_fdct32x32_sse2, &vp9_idct32x32_1024_add_c, 0,
|
||||
VPX_BITS_8),
|
||||
make_tuple(&vp9_fdct32x32_rd_sse2, &vp9_idct32x32_1024_add_c, 1,
|
||||
VPX_BITS_8)));
|
||||
#endif // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
|
||||
|
||||
#if HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
@ -361,5 +381,5 @@ INSTANTIATE_TEST_CASE_P(
|
||||
&vp9_idct32x32_1024_add_sse2, 0, VPX_BITS_8),
|
||||
make_tuple(&vp9_fdct32x32_rd_avx2,
|
||||
&vp9_idct32x32_1024_add_sse2, 1, VPX_BITS_8)));
|
||||
#endif
|
||||
#endif // HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
|
||||
} // namespace
|
||||
|
@ -75,7 +75,17 @@ void iwht4x4_10(const tran_low_t *in, uint8_t *out, int stride) {
|
||||
void iwht4x4_12(const tran_low_t *in, uint8_t *out, int stride) {
|
||||
vp9_highbd_iwht4x4_16_add_c(in, out, stride, 12);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if HAVE_SSE2
|
||||
void idct4x4_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
|
||||
vp9_highbd_idct4x4_16_add_sse2(in, out, stride, 10);
|
||||
}
|
||||
|
||||
void idct4x4_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
|
||||
vp9_highbd_idct4x4_16_add_sse2(in, out, stride, 12);
|
||||
}
|
||||
#endif // HAVE_SSE2
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
|
||||
class Trans4x4TestBase {
|
||||
public:
|
||||
@ -416,7 +426,7 @@ INSTANTIATE_TEST_CASE_P(
|
||||
C, Trans4x4DCT,
|
||||
::testing::Values(
|
||||
make_tuple(&vp9_fdct4x4_c, &vp9_idct4x4_16_add_c, 0, VPX_BITS_8)));
|
||||
#endif
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
@ -442,7 +452,7 @@ INSTANTIATE_TEST_CASE_P(
|
||||
make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8),
|
||||
make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8),
|
||||
make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8)));
|
||||
#endif
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
@ -456,7 +466,7 @@ INSTANTIATE_TEST_CASE_P(
|
||||
C, Trans4x4WHT,
|
||||
::testing::Values(
|
||||
make_tuple(&vp9_fwht4x4_c, &vp9_iwht4x4_16_add_c, 0, VPX_BITS_8)));
|
||||
#endif
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
|
||||
#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
@ -471,7 +481,7 @@ INSTANTIATE_TEST_CASE_P(
|
||||
make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 1, VPX_BITS_8),
|
||||
make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 2, VPX_BITS_8),
|
||||
make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 3, VPX_BITS_8)));
|
||||
#endif
|
||||
#endif // HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
|
||||
|
||||
#if CONFIG_USE_X86INC && HAVE_MMX && !CONFIG_VP9_HIGHBITDEPTH && \
|
||||
!CONFIG_EMULATE_HARDWARE
|
||||
@ -494,6 +504,33 @@ INSTANTIATE_TEST_CASE_P(
|
||||
make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 1, VPX_BITS_8),
|
||||
make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 2, VPX_BITS_8),
|
||||
make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 3, VPX_BITS_8)));
|
||||
#endif
|
||||
#endif // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
|
||||
|
||||
#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
SSE2, Trans4x4DCT,
|
||||
::testing::Values(
|
||||
make_tuple(&vp9_highbd_fdct4x4_c, &idct4x4_10_sse2, 0, VPX_BITS_10),
|
||||
make_tuple(&vp9_highbd_fdct4x4_sse2, &idct4x4_10_sse2, 0, VPX_BITS_10),
|
||||
make_tuple(&vp9_highbd_fdct4x4_c, &idct4x4_12_sse2, 0, VPX_BITS_12),
|
||||
make_tuple(&vp9_highbd_fdct4x4_sse2, &idct4x4_12_sse2, 0, VPX_BITS_12),
|
||||
make_tuple(&vp9_fdct4x4_sse2, &vp9_idct4x4_16_add_c, 0,
|
||||
VPX_BITS_8)));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
SSE2, Trans4x4HT,
|
||||
::testing::Values(
|
||||
make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_10, 0, VPX_BITS_10),
|
||||
make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_10, 1, VPX_BITS_10),
|
||||
make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_10, 2, VPX_BITS_10),
|
||||
make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_10, 3, VPX_BITS_10),
|
||||
make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_12, 0, VPX_BITS_12),
|
||||
make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_12, 1, VPX_BITS_12),
|
||||
make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_12, 2, VPX_BITS_12),
|
||||
make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_12, 3, VPX_BITS_12),
|
||||
make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8),
|
||||
make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8),
|
||||
make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8),
|
||||
make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8)));
|
||||
#endif // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
|
||||
} // namespace
|
||||
|
@ -71,6 +71,7 @@ typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
|
||||
|
||||
typedef std::tr1::tuple<FdctFunc, IdctFunc, int, vpx_bit_depth_t> Dct8x8Param;
|
||||
typedef std::tr1::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t> Ht8x8Param;
|
||||
typedef std::tr1::tuple<IdctFunc, IdctFunc, int, vpx_bit_depth_t> Idct8x8Param;
|
||||
|
||||
void fdct8x8_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) {
|
||||
vp9_fdct8x8_c(in, out, stride);
|
||||
@ -96,7 +97,33 @@ void iht8x8_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
|
||||
void iht8x8_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
|
||||
vp9_highbd_iht8x8_64_add_c(in, out, stride, tx_type, 12);
|
||||
}
|
||||
#endif
|
||||
|
||||
void idct8x8_10_add_10_c(const tran_low_t *in, uint8_t *out, int stride) {
|
||||
vp9_highbd_idct8x8_10_add_c(in, out, stride, 10);
|
||||
}
|
||||
|
||||
void idct8x8_10_add_12_c(const tran_low_t *in, uint8_t *out, int stride) {
|
||||
vp9_highbd_idct8x8_10_add_c(in, out, stride, 12);
|
||||
}
|
||||
|
||||
#if HAVE_SSE2
|
||||
void idct8x8_10_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
|
||||
vp9_highbd_idct8x8_10_add_sse2(in, out, stride, 10);
|
||||
}
|
||||
|
||||
void idct8x8_10_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
|
||||
vp9_highbd_idct8x8_10_add_sse2(in, out, stride, 12);
|
||||
}
|
||||
|
||||
void idct8x8_64_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
|
||||
vp9_highbd_idct8x8_64_add_sse2(in, out, stride, 10);
|
||||
}
|
||||
|
||||
void idct8x8_64_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
|
||||
vp9_highbd_idct8x8_64_add_sse2(in, out, stride, 12);
|
||||
}
|
||||
#endif // HAVE_SSE2
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
|
||||
class FwdTrans8x8TestBase {
|
||||
public:
|
||||
@ -146,9 +173,10 @@ class FwdTrans8x8TestBase {
|
||||
memset(count_sign_block, 0, sizeof(count_sign_block));
|
||||
|
||||
for (int i = 0; i < count_test_block; ++i) {
|
||||
// Initialize a test block with input range [-15, 15].
|
||||
// Initialize a test block with input range [-mask_ / 16, mask_ / 16].
|
||||
for (int j = 0; j < 64; ++j)
|
||||
test_input_block[j] = (rnd.Rand8() >> 4) - (rnd.Rand8() >> 4);
|
||||
test_input_block[j] = ((rnd.Rand16() & mask_) >> 4) -
|
||||
((rnd.Rand16() & mask_) >> 4);
|
||||
ASM_REGISTER_STATE_CHECK(
|
||||
RunFwdTxfm(test_input_block, test_output_block, pitch_));
|
||||
|
||||
@ -188,7 +216,7 @@ class FwdTrans8x8TestBase {
|
||||
#endif
|
||||
|
||||
for (int i = 0; i < count_test_block; ++i) {
|
||||
// Initialize a test block with input range [-255, 255].
|
||||
// Initialize a test block with input range [-mask_, mask_].
|
||||
for (int j = 0; j < 64; ++j) {
|
||||
if (bit_depth_ == VPX_BITS_8) {
|
||||
src[j] = rnd.Rand8();
|
||||
@ -427,6 +455,63 @@ class FwdTrans8x8TestBase {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void CompareInvReference(IdctFunc ref_txfm, int thresh) {
|
||||
ACMRandom rnd(ACMRandom::DeterministicSeed());
|
||||
const int count_test_block = 10000;
|
||||
const int eob = 12;
|
||||
DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff, kNumCoeffs);
|
||||
DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
|
||||
DECLARE_ALIGNED_ARRAY(16, uint8_t, ref, kNumCoeffs);
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, kNumCoeffs);
|
||||
DECLARE_ALIGNED_ARRAY(16, uint16_t, ref16, kNumCoeffs);
|
||||
#endif
|
||||
const int16_t *scan = vp9_default_scan_orders[TX_8X8].scan;
|
||||
|
||||
for (int i = 0; i < count_test_block; ++i) {
|
||||
for (int j = 0; j < kNumCoeffs; ++j) {
|
||||
if (j < eob) {
|
||||
// Random values less than the threshold, either positive or negative
|
||||
coeff[scan[j]] = rnd(thresh) * (1-2*(i%2));
|
||||
} else {
|
||||
coeff[scan[j]] = 0;
|
||||
}
|
||||
if (bit_depth_ == VPX_BITS_8) {
|
||||
dst[j] = 0;
|
||||
ref[j] = 0;
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
} else {
|
||||
dst16[j] = 0;
|
||||
ref16[j] = 0;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
if (bit_depth_ == VPX_BITS_8) {
|
||||
ref_txfm(coeff, ref, pitch_);
|
||||
ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
} else {
|
||||
ref_txfm(coeff, CONVERT_TO_BYTEPTR(ref16), pitch_);
|
||||
ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16),
|
||||
pitch_));
|
||||
#endif
|
||||
}
|
||||
|
||||
for (int j = 0; j < kNumCoeffs; ++j) {
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
const uint32_t diff =
|
||||
bit_depth_ == VPX_BITS_8 ? dst[j] - ref[j] : dst16[j] - ref16[j];
|
||||
#else
|
||||
const uint32_t diff = dst[j] - ref[j];
|
||||
#endif
|
||||
const uint32_t error = diff * diff;
|
||||
EXPECT_EQ(0u, error)
|
||||
<< "Error: 8x8 IDCT has error " << error
|
||||
<< " at index " << j;
|
||||
}
|
||||
}
|
||||
}
|
||||
int pitch_;
|
||||
int tx_type_;
|
||||
FhtFunc fwd_txfm_ref;
|
||||
@ -526,6 +611,38 @@ TEST_P(FwdTrans8x8HT, ExtremalCheck) {
|
||||
RunExtremalCheck();
|
||||
}
|
||||
|
||||
class InvTrans8x8DCT
|
||||
: public FwdTrans8x8TestBase,
|
||||
public ::testing::TestWithParam<Idct8x8Param> {
|
||||
public:
|
||||
virtual ~InvTrans8x8DCT() {}
|
||||
|
||||
virtual void SetUp() {
|
||||
ref_txfm_ = GET_PARAM(0);
|
||||
inv_txfm_ = GET_PARAM(1);
|
||||
thresh_ = GET_PARAM(2);
|
||||
pitch_ = 8;
|
||||
bit_depth_ = GET_PARAM(3);
|
||||
mask_ = (1 << bit_depth_) - 1;
|
||||
}
|
||||
|
||||
virtual void TearDown() { libvpx_test::ClearSystemState(); }
|
||||
|
||||
protected:
|
||||
void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
|
||||
inv_txfm_(out, dst, stride);
|
||||
}
|
||||
void RunFwdTxfm(int16_t *out, tran_low_t *dst, int stride) {}
|
||||
|
||||
IdctFunc ref_txfm_;
|
||||
IdctFunc inv_txfm_;
|
||||
int thresh_;
|
||||
};
|
||||
|
||||
TEST_P(InvTrans8x8DCT, CompareReference) {
|
||||
CompareInvReference(ref_txfm_, thresh_);
|
||||
}
|
||||
|
||||
using std::tr1::make_tuple;
|
||||
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
@ -540,7 +657,7 @@ INSTANTIATE_TEST_CASE_P(
|
||||
C, FwdTrans8x8DCT,
|
||||
::testing::Values(
|
||||
make_tuple(&vp9_fdct8x8_c, &vp9_idct8x8_64_add_c, 0, VPX_BITS_8)));
|
||||
#endif
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
@ -566,7 +683,7 @@ INSTANTIATE_TEST_CASE_P(
|
||||
make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 1, VPX_BITS_8),
|
||||
make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 2, VPX_BITS_8),
|
||||
make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 3, VPX_BITS_8)));
|
||||
#endif
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
|
||||
#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
@ -581,7 +698,7 @@ INSTANTIATE_TEST_CASE_P(
|
||||
make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 1, VPX_BITS_8),
|
||||
make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 2, VPX_BITS_8),
|
||||
make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 3, VPX_BITS_8)));
|
||||
#endif
|
||||
#endif // HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
|
||||
|
||||
#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
@ -596,7 +713,45 @@ INSTANTIATE_TEST_CASE_P(
|
||||
make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 1, VPX_BITS_8),
|
||||
make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 2, VPX_BITS_8),
|
||||
make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 3, VPX_BITS_8)));
|
||||
#endif
|
||||
#endif // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
|
||||
|
||||
#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
SSE2, FwdTrans8x8DCT,
|
||||
::testing::Values(
|
||||
make_tuple(&vp9_highbd_fdct8x8_c,
|
||||
&idct8x8_64_add_10_sse2, 12, VPX_BITS_10),
|
||||
make_tuple(&vp9_highbd_fdct8x8_sse2,
|
||||
&idct8x8_64_add_10_sse2, 12, VPX_BITS_10),
|
||||
make_tuple(&vp9_highbd_fdct8x8_c,
|
||||
&idct8x8_64_add_12_sse2, 12, VPX_BITS_12),
|
||||
make_tuple(&vp9_highbd_fdct8x8_sse2,
|
||||
&idct8x8_64_add_12_sse2, 12, VPX_BITS_12),
|
||||
make_tuple(&vp9_fdct8x8_sse2, &vp9_idct8x8_64_add_c, 0, VPX_BITS_8)));
|
||||
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
SSE2, FwdTrans8x8HT,
|
||||
::testing::Values(
|
||||
make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 0, VPX_BITS_8),
|
||||
make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 1, VPX_BITS_8),
|
||||
make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 2, VPX_BITS_8),
|
||||
make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 3, VPX_BITS_8)));
|
||||
|
||||
// Optimizations take effect at a threshold of 6201, so we use a value close to
|
||||
// that to test both branches.
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
SSE2, InvTrans8x8DCT,
|
||||
::testing::Values(
|
||||
make_tuple(&idct8x8_10_add_10_c,
|
||||
&idct8x8_10_add_10_sse2, 6225, VPX_BITS_10),
|
||||
make_tuple(&idct8x8_10,
|
||||
&idct8x8_64_add_10_sse2, 6225, VPX_BITS_10),
|
||||
make_tuple(&idct8x8_10_add_12_c,
|
||||
&idct8x8_10_add_12_sse2, 6225, VPX_BITS_12),
|
||||
make_tuple(&idct8x8_12,
|
||||
&idct8x8_64_add_12_sse2, 6225, VPX_BITS_12)));
|
||||
#endif // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
|
||||
|
||||
#if HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_VP9_HIGHBITDEPTH && \
|
||||
!CONFIG_EMULATE_HARDWARE
|
||||
|
@ -112,9 +112,6 @@ typedef struct {
|
||||
// Common for both INTER and INTRA blocks
|
||||
BLOCK_SIZE sb_type;
|
||||
PREDICTION_MODE mode;
|
||||
#if CONFIG_FILTERINTRA
|
||||
int filterbit, uv_filterbit;
|
||||
#endif
|
||||
TX_SIZE tx_size;
|
||||
int8_t skip;
|
||||
int8_t segment_id;
|
||||
@ -130,17 +127,11 @@ typedef struct {
|
||||
uint8_t mode_context[MAX_REF_FRAMES];
|
||||
INTERP_FILTER interp_filter;
|
||||
|
||||
#if CONFIG_EXT_TX
|
||||
EXT_TX_TYPE ext_txfrm;
|
||||
#endif
|
||||
} MB_MODE_INFO;
|
||||
|
||||
typedef struct MODE_INFO {
|
||||
struct MODE_INFO *src_mi;
|
||||
MB_MODE_INFO mbmi;
|
||||
#if CONFIG_FILTERINTRA
|
||||
int b_filter_info[4];
|
||||
#endif
|
||||
b_mode_info bmi[4];
|
||||
} MODE_INFO;
|
||||
|
||||
@ -149,17 +140,6 @@ static INLINE PREDICTION_MODE get_y_mode(const MODE_INFO *mi, int block) {
|
||||
: mi->mbmi.mode;
|
||||
}
|
||||
|
||||
#if CONFIG_FILTERINTRA
|
||||
static INLINE int is_filter_allowed(PREDICTION_MODE mode) {
|
||||
(void)mode;
|
||||
return 1;
|
||||
}
|
||||
|
||||
static INLINE int is_filter_enabled(TX_SIZE txsize) {
|
||||
return (txsize < TX_SIZES);
|
||||
}
|
||||
#endif
|
||||
|
||||
static INLINE int is_inter_block(const MB_MODE_INFO *mbmi) {
|
||||
return mbmi->ref_frame[0] > INTRA_FRAME;
|
||||
}
|
||||
@ -257,33 +237,13 @@ static INLINE BLOCK_SIZE get_subsize(BLOCK_SIZE bsize,
|
||||
|
||||
extern const TX_TYPE intra_mode_to_tx_type_lookup[INTRA_MODES];
|
||||
|
||||
#if CONFIG_EXT_TX
|
||||
static TX_TYPE ext_tx_to_txtype(EXT_TX_TYPE ext_tx) {
|
||||
switch (ext_tx) {
|
||||
case NORM:
|
||||
default:
|
||||
return DCT_DCT;
|
||||
case ALT:
|
||||
return ADST_ADST;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
static INLINE TX_TYPE get_tx_type(PLANE_TYPE plane_type,
|
||||
const MACROBLOCKD *xd) {
|
||||
const MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
|
||||
|
||||
#if CONFIG_EXT_TX
|
||||
if (plane_type != PLANE_TYPE_Y || xd->lossless)
|
||||
return DCT_DCT;
|
||||
|
||||
if (is_inter_block(mbmi)) {
|
||||
return ext_tx_to_txtype(mbmi->ext_txfrm);
|
||||
}
|
||||
#else
|
||||
if (plane_type != PLANE_TYPE_Y || xd->lossless || is_inter_block(mbmi))
|
||||
return DCT_DCT;
|
||||
#endif
|
||||
|
||||
return intra_mode_to_tx_type_lookup[mbmi->mode];
|
||||
}
|
||||
|
||||
@ -291,17 +251,8 @@ static INLINE TX_TYPE get_tx_type_4x4(PLANE_TYPE plane_type,
|
||||
const MACROBLOCKD *xd, int ib) {
|
||||
const MODE_INFO *const mi = xd->mi[0].src_mi;
|
||||
|
||||
#if CONFIG_EXT_TX
|
||||
if (plane_type != PLANE_TYPE_Y || xd->lossless)
|
||||
return DCT_DCT;
|
||||
|
||||
if (is_inter_block(&mi->mbmi)) {
|
||||
return ext_tx_to_txtype(mi->mbmi.ext_txfrm);
|
||||
}
|
||||
#else
|
||||
if (plane_type != PLANE_TYPE_Y || xd->lossless || is_inter_block(&mi->mbmi))
|
||||
return DCT_DCT;
|
||||
#endif
|
||||
|
||||
return intra_mode_to_tx_type_lookup[get_y_mode(mi, ib)];
|
||||
}
|
||||
|
@ -750,27 +750,12 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
|
||||
add_proto qw/void vp9_highbd_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
|
||||
specialize qw/vp9_highbd_idct4x4_1_add/;
|
||||
|
||||
add_proto qw/void vp9_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
|
||||
specialize qw/vp9_highbd_idct4x4_16_add/;
|
||||
|
||||
add_proto qw/void vp9_highbd_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
|
||||
specialize qw/vp9_highbd_idct8x8_1_add/;
|
||||
|
||||
add_proto qw/void vp9_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
|
||||
specialize qw/vp9_highbd_idct8x8_64_add/;
|
||||
|
||||
add_proto qw/void vp9_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
|
||||
specialize qw/vp9_highbd_idct8x8_10_add/;
|
||||
|
||||
add_proto qw/void vp9_highbd_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
|
||||
specialize qw/vp9_highbd_idct16x16_1_add/;
|
||||
|
||||
add_proto qw/void vp9_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
|
||||
specialize qw/vp9_highbd_idct16x16_256_add/;
|
||||
|
||||
add_proto qw/void vp9_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
|
||||
specialize qw/vp9_highbd_idct16x16_10_add/;
|
||||
|
||||
add_proto qw/void vp9_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
|
||||
specialize qw/vp9_highbd_idct32x32_1024_add/;
|
||||
|
||||
@ -796,6 +781,42 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
|
||||
|
||||
add_proto qw/void vp9_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
|
||||
specialize qw/vp9_highbd_iwht4x4_16_add/;
|
||||
|
||||
# Force C versions if CONFIG_EMULATE_HARDWARE is 1
|
||||
if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
|
||||
|
||||
add_proto qw/void vp9_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
|
||||
specialize qw/vp9_highbd_idct4x4_16_add/;
|
||||
|
||||
add_proto qw/void vp9_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
|
||||
specialize qw/vp9_highbd_idct8x8_64_add/;
|
||||
|
||||
add_proto qw/void vp9_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
|
||||
specialize qw/vp9_highbd_idct8x8_10_add/;
|
||||
|
||||
add_proto qw/void vp9_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
|
||||
specialize qw/vp9_highbd_idct16x16_256_add/;
|
||||
|
||||
add_proto qw/void vp9_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
|
||||
specialize qw/vp9_highbd_idct16x16_10_add/;
|
||||
|
||||
} else {
|
||||
|
||||
add_proto qw/void vp9_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
|
||||
specialize qw/vp9_highbd_idct4x4_16_add sse2/;
|
||||
|
||||
add_proto qw/void vp9_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
|
||||
specialize qw/vp9_highbd_idct8x8_64_add sse2/;
|
||||
|
||||
add_proto qw/void vp9_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
|
||||
specialize qw/vp9_highbd_idct8x8_10_add sse2/;
|
||||
|
||||
add_proto qw/void vp9_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
|
||||
specialize qw/vp9_highbd_idct16x16_256_add sse2/;
|
||||
|
||||
add_proto qw/void vp9_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
|
||||
specialize qw/vp9_highbd_idct16x16_10_add sse2/;
|
||||
}
|
||||
}
|
||||
|
||||
#
|
||||
@ -1184,43 +1205,43 @@ if (vpx_config("CONFIG_INTERNAL_STATS") eq "yes") {
|
||||
|
||||
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
|
||||
add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
|
||||
specialize qw/vp9_fht4x4/;
|
||||
specialize qw/vp9_fht4x4 sse2/;
|
||||
|
||||
add_proto qw/void vp9_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
|
||||
specialize qw/vp9_fht8x8/;
|
||||
specialize qw/vp9_fht8x8 sse2/;
|
||||
|
||||
add_proto qw/void vp9_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
|
||||
specialize qw/vp9_fht16x16/;
|
||||
specialize qw/vp9_fht16x16 sse2/;
|
||||
|
||||
add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp9_fwht4x4/;
|
||||
specialize qw/vp9_fwht4x4/, "$mmx_x86inc";
|
||||
|
||||
add_proto qw/void vp9_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp9_fdct4x4_1/;
|
||||
specialize qw/vp9_fdct4x4_1 sse2/;
|
||||
|
||||
add_proto qw/void vp9_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp9_fdct4x4/;
|
||||
specialize qw/vp9_fdct4x4 sse2/;
|
||||
|
||||
add_proto qw/void vp9_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp9_fdct8x8_1/;
|
||||
specialize qw/vp9_fdct8x8_1 sse2/;
|
||||
|
||||
add_proto qw/void vp9_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp9_fdct8x8/;
|
||||
specialize qw/vp9_fdct8x8 sse2/;
|
||||
|
||||
add_proto qw/void vp9_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp9_fdct16x16_1/;
|
||||
specialize qw/vp9_fdct16x16_1 sse2/;
|
||||
|
||||
add_proto qw/void vp9_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp9_fdct16x16/;
|
||||
specialize qw/vp9_fdct16x16 sse2/;
|
||||
|
||||
add_proto qw/void vp9_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp9_fdct32x32_1/;
|
||||
specialize qw/vp9_fdct32x32_1 sse2/;
|
||||
|
||||
add_proto qw/void vp9_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp9_fdct32x32/;
|
||||
specialize qw/vp9_fdct32x32 sse2/;
|
||||
|
||||
add_proto qw/void vp9_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp9_fdct32x32_rd/;
|
||||
specialize qw/vp9_fdct32x32_rd sse2/;
|
||||
} else {
|
||||
add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
|
||||
specialize qw/vp9_fht4x4 sse2/;
|
||||
@ -1882,40 +1903,40 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
|
||||
|
||||
# fdct functions
|
||||
add_proto qw/void vp9_highbd_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
|
||||
specialize qw/vp9_highbd_fht4x4/;
|
||||
specialize qw/vp9_highbd_fht4x4 sse2/;
|
||||
|
||||
add_proto qw/void vp9_highbd_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
|
||||
specialize qw/vp9_highbd_fht8x8/;
|
||||
specialize qw/vp9_highbd_fht8x8 sse2/;
|
||||
|
||||
add_proto qw/void vp9_highbd_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
|
||||
specialize qw/vp9_highbd_fht16x16/;
|
||||
specialize qw/vp9_highbd_fht16x16 sse2/;
|
||||
|
||||
add_proto qw/void vp9_highbd_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp9_highbd_fwht4x4/;
|
||||
|
||||
add_proto qw/void vp9_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp9_highbd_fdct4x4/;
|
||||
specialize qw/vp9_highbd_fdct4x4 sse2/;
|
||||
|
||||
add_proto qw/void vp9_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp9_highbd_fdct8x8_1/;
|
||||
|
||||
add_proto qw/void vp9_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp9_highbd_fdct8x8/;
|
||||
specialize qw/vp9_highbd_fdct8x8 sse2/;
|
||||
|
||||
add_proto qw/void vp9_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp9_highbd_fdct16x16_1/;
|
||||
|
||||
add_proto qw/void vp9_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp9_highbd_fdct16x16/;
|
||||
specialize qw/vp9_highbd_fdct16x16 sse2/;
|
||||
|
||||
add_proto qw/void vp9_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp9_highbd_fdct32x32_1/;
|
||||
|
||||
add_proto qw/void vp9_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp9_highbd_fdct32x32/;
|
||||
specialize qw/vp9_highbd_fdct32x32 sse2/;
|
||||
|
||||
add_proto qw/void vp9_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp9_highbd_fdct32x32_rd/;
|
||||
specialize qw/vp9_highbd_fdct32x32_rd sse2/;
|
||||
|
||||
add_proto qw/void vp9_highbd_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
|
||||
specialize qw/vp9_highbd_temporal_filter_apply/;
|
||||
|
@ -9,6 +9,7 @@
|
||||
*/
|
||||
|
||||
#include "vp9/common/x86/vp9_idct_intrin_sse2.h"
|
||||
#include "vp9/common/vp9_idct.h"
|
||||
|
||||
#define RECON_AND_STORE4X4(dest, in_x) \
|
||||
{ \
|
||||
@ -3985,3 +3986,573 @@ void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
|
||||
dest += 8 - (stride * 32);
|
||||
}
|
||||
}
|
||||
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {
|
||||
__m128i ubounded, retval;
|
||||
const __m128i zero = _mm_set1_epi16(0);
|
||||
const __m128i one = _mm_set1_epi16(1);
|
||||
const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one);
|
||||
ubounded = _mm_cmpgt_epi16(value, max);
|
||||
retval = _mm_andnot_si128(ubounded, value);
|
||||
ubounded = _mm_and_si128(ubounded, max);
|
||||
retval = _mm_or_si128(retval, ubounded);
|
||||
retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero));
|
||||
return retval;
|
||||
}
|
||||
|
||||
void vp9_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8,
|
||||
int stride, int bd) {
|
||||
tran_low_t out[4 * 4];
|
||||
tran_low_t *outptr = out;
|
||||
int i, j;
|
||||
__m128i inptr[4];
|
||||
__m128i sign_bits[2];
|
||||
__m128i temp_mm, min_input, max_input;
|
||||
int test;
|
||||
uint16_t * dest = CONVERT_TO_SHORTPTR(dest8);
|
||||
int optimised_cols = 0;
|
||||
const __m128i zero = _mm_set1_epi16(0);
|
||||
const __m128i eight = _mm_set1_epi16(8);
|
||||
const __m128i max = _mm_set1_epi16(12043);
|
||||
const __m128i min = _mm_set1_epi16(-12043);
|
||||
// Load input into __m128i
|
||||
inptr[0] = _mm_loadu_si128((const __m128i *)input);
|
||||
inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4));
|
||||
inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8));
|
||||
inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12));
|
||||
|
||||
// Pack to 16 bits
|
||||
inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]);
|
||||
inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]);
|
||||
|
||||
max_input = _mm_max_epi16(inptr[0], inptr[1]);
|
||||
min_input = _mm_min_epi16(inptr[0], inptr[1]);
|
||||
max_input = _mm_cmpgt_epi16(max_input, max);
|
||||
min_input = _mm_cmplt_epi16(min_input, min);
|
||||
temp_mm = _mm_or_si128(max_input, min_input);
|
||||
test = _mm_movemask_epi8(temp_mm);
|
||||
|
||||
if (!test) {
|
||||
// Do the row transform
|
||||
idct4_sse2(inptr);
|
||||
|
||||
// Check the min & max values
|
||||
max_input = _mm_max_epi16(inptr[0], inptr[1]);
|
||||
min_input = _mm_min_epi16(inptr[0], inptr[1]);
|
||||
max_input = _mm_cmpgt_epi16(max_input, max);
|
||||
min_input = _mm_cmplt_epi16(min_input, min);
|
||||
temp_mm = _mm_or_si128(max_input, min_input);
|
||||
test = _mm_movemask_epi8(temp_mm);
|
||||
|
||||
if (test) {
|
||||
transpose_4x4(inptr);
|
||||
sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero);
|
||||
sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero);
|
||||
inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]);
|
||||
inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]);
|
||||
inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]);
|
||||
inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]);
|
||||
_mm_storeu_si128((__m128i*)outptr, inptr[0]);
|
||||
_mm_storeu_si128((__m128i*)(outptr + 4), inptr[1]);
|
||||
_mm_storeu_si128((__m128i*)(outptr + 8), inptr[2]);
|
||||
_mm_storeu_si128((__m128i*)(outptr + 12), inptr[3]);
|
||||
} else {
|
||||
// Set to use the optimised transform for the column
|
||||
optimised_cols = 1;
|
||||
}
|
||||
} else {
|
||||
// Run the un-optimised row transform
|
||||
for (i = 0; i < 4; ++i) {
|
||||
vp9_highbd_idct4(input, outptr, bd);
|
||||
input += 4;
|
||||
outptr += 4;
|
||||
}
|
||||
}
|
||||
|
||||
if (optimised_cols) {
|
||||
idct4_sse2(inptr);
|
||||
|
||||
// Final round and shift
|
||||
inptr[0] = _mm_add_epi16(inptr[0], eight);
|
||||
inptr[1] = _mm_add_epi16(inptr[1], eight);
|
||||
|
||||
inptr[0] = _mm_srai_epi16(inptr[0], 4);
|
||||
inptr[1] = _mm_srai_epi16(inptr[1], 4);
|
||||
|
||||
// Reconstruction and Store
|
||||
{
|
||||
__m128i d0 = _mm_loadl_epi64((const __m128i *)dest);
|
||||
__m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2));
|
||||
d0 = _mm_unpacklo_epi64(d0,
|
||||
_mm_loadl_epi64((const __m128i *)(dest + stride)));
|
||||
d2 = _mm_unpacklo_epi64(d2,
|
||||
_mm_loadl_epi64((const __m128i *)(dest + stride * 3)));
|
||||
d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd);
|
||||
d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd);
|
||||
// store input0
|
||||
_mm_storel_epi64((__m128i *)dest, d0);
|
||||
// store input1
|
||||
d0 = _mm_srli_si128(d0, 8);
|
||||
_mm_storel_epi64((__m128i *)(dest + stride), d0);
|
||||
// store input2
|
||||
_mm_storel_epi64((__m128i *)(dest + stride * 2), d2);
|
||||
// store input3
|
||||
d2 = _mm_srli_si128(d2, 8);
|
||||
_mm_storel_epi64((__m128i *)(dest + stride * 3), d2);
|
||||
}
|
||||
} else {
|
||||
// Run the un-optimised column transform
|
||||
tran_low_t temp_in[4], temp_out[4];
|
||||
// Columns
|
||||
for (i = 0; i < 4; ++i) {
|
||||
for (j = 0; j < 4; ++j)
|
||||
temp_in[j] = out[j * 4 + i];
|
||||
vp9_highbd_idct4(temp_in, temp_out, bd);
|
||||
for (j = 0; j < 4; ++j) {
|
||||
dest[j * stride + i] = highbd_clip_pixel_add(
|
||||
dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void vp9_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8,
|
||||
int stride, int bd) {
|
||||
tran_low_t out[8 * 8];
|
||||
tran_low_t *outptr = out;
|
||||
int i, j, test;
|
||||
__m128i inptr[8];
|
||||
__m128i min_input, max_input, temp1, temp2, sign_bits;
|
||||
uint16_t * dest = CONVERT_TO_SHORTPTR(dest8);
|
||||
const __m128i zero = _mm_set1_epi16(0);
|
||||
const __m128i sixteen = _mm_set1_epi16(16);
|
||||
const __m128i max = _mm_set1_epi16(6201);
|
||||
const __m128i min = _mm_set1_epi16(-6201);
|
||||
int optimised_cols = 0;
|
||||
|
||||
// Load input into __m128i & pack to 16 bits
|
||||
for (i = 0; i < 8; i++) {
|
||||
temp1 = _mm_loadu_si128((const __m128i *)(input + 8*i));
|
||||
temp2 = _mm_loadu_si128((const __m128i *)(input + 8*i + 4));
|
||||
inptr[i] = _mm_packs_epi32(temp1, temp2);
|
||||
}
|
||||
|
||||
// Find the min & max for the row transform
|
||||
max_input = _mm_max_epi16(inptr[0], inptr[1]);
|
||||
min_input = _mm_min_epi16(inptr[0], inptr[1]);
|
||||
for (i = 2; i < 8; i++) {
|
||||
max_input = _mm_max_epi16(max_input, inptr[i]);
|
||||
min_input = _mm_min_epi16(min_input, inptr[i]);
|
||||
}
|
||||
max_input = _mm_cmpgt_epi16(max_input, max);
|
||||
min_input = _mm_cmplt_epi16(min_input, min);
|
||||
temp1 = _mm_or_si128(max_input, min_input);
|
||||
test = _mm_movemask_epi8(temp1);
|
||||
|
||||
if (!test) {
|
||||
// Do the row transform
|
||||
idct8_sse2(inptr);
|
||||
|
||||
// Find the min & max for the column transform
|
||||
max_input = _mm_max_epi16(inptr[0], inptr[1]);
|
||||
min_input = _mm_min_epi16(inptr[0], inptr[1]);
|
||||
for (i = 2; i < 8; i++) {
|
||||
max_input = _mm_max_epi16(max_input, inptr[i]);
|
||||
min_input = _mm_min_epi16(min_input, inptr[i]);
|
||||
}
|
||||
max_input = _mm_cmpgt_epi16(max_input, max);
|
||||
min_input = _mm_cmplt_epi16(min_input, min);
|
||||
temp1 = _mm_or_si128(max_input, min_input);
|
||||
test = _mm_movemask_epi8(temp1);
|
||||
|
||||
if (test) {
|
||||
array_transpose_8x8(inptr, inptr);
|
||||
for (i = 0; i < 8; i++) {
|
||||
sign_bits = _mm_cmplt_epi16(inptr[i], zero);
|
||||
temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
|
||||
temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
|
||||
_mm_storeu_si128((__m128i*)(outptr + 4*(2*i+1)), temp1);
|
||||
_mm_storeu_si128((__m128i*)(outptr + 4*(2*i)), temp2);
|
||||
}
|
||||
} else {
|
||||
// Set to use the optimised transform for the column
|
||||
optimised_cols = 1;
|
||||
}
|
||||
} else {
|
||||
// Run the un-optimised row transform
|
||||
for (i = 0; i < 8; ++i) {
|
||||
vp9_highbd_idct8(input, outptr, bd);
|
||||
input += 8;
|
||||
outptr += 8;
|
||||
}
|
||||
}
|
||||
|
||||
if (optimised_cols) {
|
||||
idct8_sse2(inptr);
|
||||
|
||||
// Final round & shift and Reconstruction and Store
|
||||
{
|
||||
__m128i d[8];
|
||||
for (i = 0; i < 8; i++) {
|
||||
inptr[i] = _mm_add_epi16(inptr[i], sixteen);
|
||||
d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
|
||||
inptr[i] = _mm_srai_epi16(inptr[i], 5);
|
||||
d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
|
||||
// Store
|
||||
_mm_storeu_si128((__m128i *)(dest + stride*i), d[i]);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Run the un-optimised column transform
|
||||
tran_low_t temp_in[8], temp_out[8];
|
||||
for (i = 0; i < 8; ++i) {
|
||||
for (j = 0; j < 8; ++j)
|
||||
temp_in[j] = out[j * 8 + i];
|
||||
vp9_highbd_idct8(temp_in, temp_out, bd);
|
||||
for (j = 0; j < 8; ++j) {
|
||||
dest[j * stride + i] = highbd_clip_pixel_add(
|
||||
dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void vp9_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
|
||||
int stride, int bd) {
|
||||
tran_low_t out[8 * 8] = { 0 };
|
||||
tran_low_t *outptr = out;
|
||||
int i, j, test;
|
||||
__m128i inptr[8];
|
||||
__m128i min_input, max_input, temp1, temp2, sign_bits;
|
||||
uint16_t * dest = CONVERT_TO_SHORTPTR(dest8);
|
||||
const __m128i zero = _mm_set1_epi16(0);
|
||||
const __m128i sixteen = _mm_set1_epi16(16);
|
||||
const __m128i max = _mm_set1_epi16(6201);
|
||||
const __m128i min = _mm_set1_epi16(-6201);
|
||||
int optimised_cols = 0;
|
||||
|
||||
// Load input into __m128i & pack to 16 bits
|
||||
for (i = 0; i < 8; i++) {
|
||||
temp1 = _mm_loadu_si128((const __m128i *)(input + 8*i));
|
||||
temp2 = _mm_loadu_si128((const __m128i *)(input + 8*i + 4));
|
||||
inptr[i] = _mm_packs_epi32(temp1, temp2);
|
||||
}
|
||||
|
||||
// Find the min & max for the row transform
|
||||
// only first 4 row has non-zero coefs
|
||||
max_input = _mm_max_epi16(inptr[0], inptr[1]);
|
||||
min_input = _mm_min_epi16(inptr[0], inptr[1]);
|
||||
for (i = 2; i < 4; i++) {
|
||||
max_input = _mm_max_epi16(max_input, inptr[i]);
|
||||
min_input = _mm_min_epi16(min_input, inptr[i]);
|
||||
}
|
||||
max_input = _mm_cmpgt_epi16(max_input, max);
|
||||
min_input = _mm_cmplt_epi16(min_input, min);
|
||||
temp1 = _mm_or_si128(max_input, min_input);
|
||||
test = _mm_movemask_epi8(temp1);
|
||||
|
||||
if (!test) {
|
||||
// Do the row transform
|
||||
idct8_sse2(inptr);
|
||||
|
||||
// Find the min & max for the column transform
|
||||
// N.B. Only first 4 cols contain non-zero coeffs
|
||||
max_input = _mm_max_epi16(inptr[0], inptr[1]);
|
||||
min_input = _mm_min_epi16(inptr[0], inptr[1]);
|
||||
for (i = 2; i < 4; i++) {
|
||||
max_input = _mm_max_epi16(max_input, inptr[i]);
|
||||
min_input = _mm_min_epi16(min_input, inptr[i]);
|
||||
}
|
||||
max_input = _mm_cmpgt_epi16(max_input, max);
|
||||
min_input = _mm_cmplt_epi16(min_input, min);
|
||||
temp1 = _mm_or_si128(max_input, min_input);
|
||||
test = _mm_movemask_epi8(temp1);
|
||||
|
||||
if (test) {
|
||||
// Use fact only first 4 rows contain non-zero coeffs
|
||||
array_transpose_4X8(inptr, inptr);
|
||||
for (i = 0; i < 4; i++) {
|
||||
sign_bits = _mm_cmplt_epi16(inptr[i], zero);
|
||||
temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
|
||||
temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
|
||||
_mm_storeu_si128((__m128i*)(outptr + 4*(2*i+1)), temp1);
|
||||
_mm_storeu_si128((__m128i*)(outptr + 4*(2*i)), temp2);
|
||||
}
|
||||
} else {
|
||||
// Set to use the optimised transform for the column
|
||||
optimised_cols = 1;
|
||||
}
|
||||
} else {
|
||||
// Run the un-optimised row transform
|
||||
for (i = 0; i < 4; ++i) {
|
||||
vp9_highbd_idct8(input, outptr, bd);
|
||||
input += 8;
|
||||
outptr += 8;
|
||||
}
|
||||
}
|
||||
|
||||
if (optimised_cols) {
|
||||
idct8_sse2(inptr);
|
||||
|
||||
// Final round & shift and Reconstruction and Store
|
||||
{
|
||||
__m128i d[8];
|
||||
for (i = 0; i < 8; i++) {
|
||||
inptr[i] = _mm_add_epi16(inptr[i], sixteen);
|
||||
d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
|
||||
inptr[i] = _mm_srai_epi16(inptr[i], 5);
|
||||
d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
|
||||
// Store
|
||||
_mm_storeu_si128((__m128i *)(dest + stride*i), d[i]);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Run the un-optimised column transform
|
||||
tran_low_t temp_in[8], temp_out[8];
|
||||
for (i = 0; i < 8; ++i) {
|
||||
for (j = 0; j < 8; ++j)
|
||||
temp_in[j] = out[j * 8 + i];
|
||||
vp9_highbd_idct8(temp_in, temp_out, bd);
|
||||
for (j = 0; j < 8; ++j) {
|
||||
dest[j * stride + i] = highbd_clip_pixel_add(
|
||||
dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void vp9_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8,
|
||||
int stride, int bd) {
|
||||
tran_low_t out[16 * 16];
|
||||
tran_low_t *outptr = out;
|
||||
int i, j, test;
|
||||
__m128i inptr[32];
|
||||
__m128i min_input, max_input, temp1, temp2, sign_bits;
|
||||
uint16_t * dest = CONVERT_TO_SHORTPTR(dest8);
|
||||
const __m128i zero = _mm_set1_epi16(0);
|
||||
const __m128i rounding = _mm_set1_epi16(32);
|
||||
const __m128i max = _mm_set1_epi16(3155);
|
||||
const __m128i min = _mm_set1_epi16(-3155);
|
||||
int optimised_cols = 0;
|
||||
|
||||
// Load input into __m128i & pack to 16 bits
|
||||
for (i = 0; i < 16; i++) {
|
||||
temp1 = _mm_loadu_si128((const __m128i *)(input + 16*i));
|
||||
temp2 = _mm_loadu_si128((const __m128i *)(input + 16*i + 4));
|
||||
inptr[i] = _mm_packs_epi32(temp1, temp2);
|
||||
temp1 = _mm_loadu_si128((const __m128i *)(input + 16*i + 8));
|
||||
temp2 = _mm_loadu_si128((const __m128i *)(input + 16*i + 12));
|
||||
inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
|
||||
}
|
||||
|
||||
// Find the min & max for the row transform
|
||||
max_input = _mm_max_epi16(inptr[0], inptr[1]);
|
||||
min_input = _mm_min_epi16(inptr[0], inptr[1]);
|
||||
for (i = 2; i < 32; i++) {
|
||||
max_input = _mm_max_epi16(max_input, inptr[i]);
|
||||
min_input = _mm_min_epi16(min_input, inptr[i]);
|
||||
}
|
||||
max_input = _mm_cmpgt_epi16(max_input, max);
|
||||
min_input = _mm_cmplt_epi16(min_input, min);
|
||||
temp1 = _mm_or_si128(max_input, min_input);
|
||||
test = _mm_movemask_epi8(temp1);
|
||||
|
||||
if (!test) {
|
||||
// Do the row transform
|
||||
idct16_sse2(inptr, inptr + 16);
|
||||
|
||||
// Find the min & max for the column transform
|
||||
max_input = _mm_max_epi16(inptr[0], inptr[1]);
|
||||
min_input = _mm_min_epi16(inptr[0], inptr[1]);
|
||||
for (i = 2; i < 32; i++) {
|
||||
max_input = _mm_max_epi16(max_input, inptr[i]);
|
||||
min_input = _mm_min_epi16(min_input, inptr[i]);
|
||||
}
|
||||
max_input = _mm_cmpgt_epi16(max_input, max);
|
||||
min_input = _mm_cmplt_epi16(min_input, min);
|
||||
temp1 = _mm_or_si128(max_input, min_input);
|
||||
test = _mm_movemask_epi8(temp1);
|
||||
|
||||
if (test) {
|
||||
array_transpose_16x16(inptr, inptr + 16);
|
||||
for (i = 0; i < 16; i++) {
|
||||
sign_bits = _mm_cmplt_epi16(inptr[i], zero);
|
||||
temp1 = _mm_unpacklo_epi16(inptr[i ], sign_bits);
|
||||
temp2 = _mm_unpackhi_epi16(inptr[i ], sign_bits);
|
||||
_mm_storeu_si128((__m128i*)(outptr + 4*(i*4)), temp1);
|
||||
_mm_storeu_si128((__m128i*)(outptr + 4*(i*4+1)), temp2);
|
||||
sign_bits = _mm_cmplt_epi16(inptr[i+16], zero);
|
||||
temp1 = _mm_unpacklo_epi16(inptr[i+16], sign_bits);
|
||||
temp2 = _mm_unpackhi_epi16(inptr[i+16], sign_bits);
|
||||
_mm_storeu_si128((__m128i*)(outptr + 4*(i*4+2)), temp1);
|
||||
_mm_storeu_si128((__m128i*)(outptr + 4*(i*4+3)), temp2);
|
||||
}
|
||||
} else {
|
||||
// Set to use the optimised transform for the column
|
||||
optimised_cols = 1;
|
||||
}
|
||||
} else {
|
||||
// Run the un-optimised row transform
|
||||
for (i = 0; i < 16; ++i) {
|
||||
vp9_highbd_idct16(input, outptr, bd);
|
||||
input += 16;
|
||||
outptr += 16;
|
||||
}
|
||||
}
|
||||
|
||||
if (optimised_cols) {
|
||||
idct16_sse2(inptr, inptr + 16);
|
||||
|
||||
// Final round & shift and Reconstruction and Store
|
||||
{
|
||||
__m128i d[2];
|
||||
for (i = 0; i < 16; i++) {
|
||||
inptr[i ] = _mm_add_epi16(inptr[i ], rounding);
|
||||
inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding);
|
||||
d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
|
||||
d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8));
|
||||
inptr[i ] = _mm_srai_epi16(inptr[i ], 6);
|
||||
inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6);
|
||||
d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i ]), bd);
|
||||
d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd);
|
||||
// Store
|
||||
_mm_storeu_si128((__m128i *)(dest + stride*i), d[0]);
|
||||
_mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Run the un-optimised column transform
|
||||
tran_low_t temp_in[16], temp_out[16];
|
||||
for (i = 0; i < 16; ++i) {
|
||||
for (j = 0; j < 16; ++j)
|
||||
temp_in[j] = out[j * 16 + i];
|
||||
vp9_highbd_idct16(temp_in, temp_out, bd);
|
||||
for (j = 0; j < 16; ++j) {
|
||||
dest[j * stride + i] = highbd_clip_pixel_add(
|
||||
dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void vp9_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
|
||||
int stride, int bd) {
|
||||
tran_low_t out[16 * 16] = { 0 };
|
||||
tran_low_t *outptr = out;
|
||||
int i, j, test;
|
||||
__m128i inptr[32];
|
||||
__m128i min_input, max_input, temp1, temp2, sign_bits;
|
||||
uint16_t * dest = CONVERT_TO_SHORTPTR(dest8);
|
||||
const __m128i zero = _mm_set1_epi16(0);
|
||||
const __m128i rounding = _mm_set1_epi16(32);
|
||||
const __m128i max = _mm_set1_epi16(3155);
|
||||
const __m128i min = _mm_set1_epi16(-3155);
|
||||
int optimised_cols = 0;
|
||||
|
||||
// Load input into __m128i & pack to 16 bits
|
||||
for (i = 0; i < 16; i++) {
|
||||
temp1 = _mm_loadu_si128((const __m128i *)(input + 16*i));
|
||||
temp2 = _mm_loadu_si128((const __m128i *)(input + 16*i + 4));
|
||||
inptr[i] = _mm_packs_epi32(temp1, temp2);
|
||||
temp1 = _mm_loadu_si128((const __m128i *)(input + 16*i + 8));
|
||||
temp2 = _mm_loadu_si128((const __m128i *)(input + 16*i + 12));
|
||||
inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
|
||||
}
|
||||
|
||||
// Find the min & max for the row transform
|
||||
// Since all non-zero dct coefficients are in upper-left 4x4 area,
|
||||
// we only need to consider first 4 rows here.
|
||||
max_input = _mm_max_epi16(inptr[0], inptr[1]);
|
||||
min_input = _mm_min_epi16(inptr[0], inptr[1]);
|
||||
for (i = 2; i < 4; i++) {
|
||||
max_input = _mm_max_epi16(max_input, inptr[i]);
|
||||
min_input = _mm_min_epi16(min_input, inptr[i]);
|
||||
}
|
||||
max_input = _mm_cmpgt_epi16(max_input, max);
|
||||
min_input = _mm_cmplt_epi16(min_input, min);
|
||||
temp1 = _mm_or_si128(max_input, min_input);
|
||||
test = _mm_movemask_epi8(temp1);
|
||||
|
||||
if (!test) {
|
||||
// Do the row transform (N.B. This transposes inptr)
|
||||
idct16_sse2(inptr, inptr + 16);
|
||||
|
||||
// Find the min & max for the column transform
|
||||
// N.B. Only first 4 cols contain non-zero coeffs
|
||||
max_input = _mm_max_epi16(inptr[0], inptr[1]);
|
||||
min_input = _mm_min_epi16(inptr[0], inptr[1]);
|
||||
for (i = 2; i < 16; i++) {
|
||||
max_input = _mm_max_epi16(max_input, inptr[i]);
|
||||
min_input = _mm_min_epi16(min_input, inptr[i]);
|
||||
}
|
||||
max_input = _mm_cmpgt_epi16(max_input, max);
|
||||
min_input = _mm_cmplt_epi16(min_input, min);
|
||||
temp1 = _mm_or_si128(max_input, min_input);
|
||||
test = _mm_movemask_epi8(temp1);
|
||||
|
||||
if (test) {
|
||||
// Use fact only first 4 rows contain non-zero coeffs
|
||||
array_transpose_8x8(inptr, inptr);
|
||||
array_transpose_8x8(inptr + 8, inptr + 16);
|
||||
for (i = 0; i < 4; i++) {
|
||||
sign_bits = _mm_cmplt_epi16(inptr[i], zero);
|
||||
temp1 = _mm_unpacklo_epi16(inptr[i ], sign_bits);
|
||||
temp2 = _mm_unpackhi_epi16(inptr[i ], sign_bits);
|
||||
_mm_storeu_si128((__m128i*)(outptr + 4*(i*4)), temp1);
|
||||
_mm_storeu_si128((__m128i*)(outptr + 4*(i*4+1)), temp2);
|
||||
sign_bits = _mm_cmplt_epi16(inptr[i+16], zero);
|
||||
temp1 = _mm_unpacklo_epi16(inptr[i+16], sign_bits);
|
||||
temp2 = _mm_unpackhi_epi16(inptr[i+16], sign_bits);
|
||||
_mm_storeu_si128((__m128i*)(outptr + 4*(i*4+2)), temp1);
|
||||
_mm_storeu_si128((__m128i*)(outptr + 4*(i*4+3)), temp2);
|
||||
}
|
||||
} else {
|
||||
// Set to use the optimised transform for the column
|
||||
optimised_cols = 1;
|
||||
}
|
||||
} else {
|
||||
// Run the un-optimised row transform
|
||||
for (i = 0; i < 4; ++i) {
|
||||
vp9_highbd_idct16(input, outptr, bd);
|
||||
input += 16;
|
||||
outptr += 16;
|
||||
}
|
||||
}
|
||||
|
||||
if (optimised_cols) {
|
||||
idct16_sse2(inptr, inptr + 16);
|
||||
|
||||
// Final round & shift and Reconstruction and Store
|
||||
{
|
||||
__m128i d[2];
|
||||
for (i = 0; i < 16; i++) {
|
||||
inptr[i ] = _mm_add_epi16(inptr[i ], rounding);
|
||||
inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding);
|
||||
d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
|
||||
d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8));
|
||||
inptr[i ] = _mm_srai_epi16(inptr[i ], 6);
|
||||
inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6);
|
||||
d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i ]), bd);
|
||||
d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd);
|
||||
// Store
|
||||
_mm_storeu_si128((__m128i *)(dest + stride*i), d[0]);
|
||||
_mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Run the un-optimised column transform
|
||||
tran_low_t temp_in[16], temp_out[16];
|
||||
for (i = 0; i < 16; ++i) {
|
||||
for (j = 0; j < 16; ++j)
|
||||
temp_in[j] = out[j * 16 + i];
|
||||
vp9_highbd_idct16(temp_in, temp_out, bd);
|
||||
for (j = 0; j < 16; ++j) {
|
||||
dest[j * stride + i] = highbd_clip_pixel_add(
|
||||
dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
|
@ -3535,9 +3535,9 @@ static void encode_frame_internal(VP9_COMP *cpi) {
|
||||
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
if (cm->use_highbitdepth)
|
||||
x->fwd_txm4x4 = xd->lossless ? vp9_fwht4x4 : vp9_fdct4x4;
|
||||
else
|
||||
x->fwd_txm4x4 = xd->lossless ? vp9_highbd_fwht4x4 : vp9_highbd_fdct4x4;
|
||||
else
|
||||
x->fwd_txm4x4 = xd->lossless ? vp9_fwht4x4 : vp9_fdct4x4;
|
||||
x->highbd_itxm_add = xd->lossless ? vp9_highbd_iwht4x4_add :
|
||||
vp9_highbd_idct4x4_add;
|
||||
#else
|
||||
|
File diff suppressed because it is too large
Load Diff
1011
vp9/encoder/x86/vp9_dct_impl_sse2.c
Normal file
1011
vp9/encoder/x86/vp9_dct_impl_sse2.c
Normal file
File diff suppressed because it is too large
Load Diff
@ -62,9 +62,40 @@ cglobal fwht4x4, 3, 4, 8, input, output, stride
|
||||
psllw m2, 2
|
||||
psllw m3, 2
|
||||
|
||||
%if CONFIG_VP9_HIGHBITDEPTH
|
||||
pxor m4, m4
|
||||
pxor m5, m5
|
||||
pcmpgtw m4, m0
|
||||
pcmpgtw m5, m1
|
||||
movq m6, m0
|
||||
movq m7, m1
|
||||
punpcklwd m0, m4
|
||||
punpcklwd m1, m5
|
||||
punpckhwd m6, m4
|
||||
punpckhwd m7, m5
|
||||
movq [outputq], m0
|
||||
movq [outputq + 8], m6
|
||||
movq [outputq + 16], m1
|
||||
movq [outputq + 24], m7
|
||||
pxor m4, m4
|
||||
pxor m5, m5
|
||||
pcmpgtw m4, m2
|
||||
pcmpgtw m5, m3
|
||||
movq m6, m2
|
||||
movq m7, m3
|
||||
punpcklwd m2, m4
|
||||
punpcklwd m3, m5
|
||||
punpckhwd m6, m4
|
||||
punpckhwd m7, m5
|
||||
movq [outputq + 32], m2
|
||||
movq [outputq + 40], m6
|
||||
movq [outputq + 48], m3
|
||||
movq [outputq + 56], m7
|
||||
%else
|
||||
movq [outputq], m0
|
||||
movq [outputq + 8], m1
|
||||
movq [outputq + 16], m2
|
||||
movq [outputq + 24], m3
|
||||
%endif
|
||||
|
||||
RET
|
||||
|
File diff suppressed because it is too large
Load Diff
373
vp9/encoder/x86/vp9_dct_sse2.h
Normal file
373
vp9/encoder/x86/vp9_dct_sse2.h
Normal file
@ -0,0 +1,373 @@
|
||||
/*
|
||||
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#ifndef VP9_ENCODER_X86_VP9_DCT_SSE2_H_
|
||||
#define VP9_ENCODER_X86_VP9_DCT_SSE2_H_
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define pair_set_epi32(a, b) \
|
||||
_mm_set_epi32((int)(b), (int)(a), (int)(b), (int)(a))
|
||||
|
||||
void vp9_fdct4x4_sse2(const int16_t *input, tran_low_t *output, int stride);
|
||||
void vp9_fdct8x8_sse2(const int16_t *input, tran_low_t *output, int stride);
|
||||
void vp9_fdct16x16_sse2(const int16_t *input, tran_low_t *output, int stride);
|
||||
void vp9_highbd_fdct4x4_sse2(const int16_t *input, tran_low_t *output,
|
||||
int stride);
|
||||
void vp9_highbd_fdct8x8_sse2(const int16_t *input, tran_low_t *output,
|
||||
int stride);
|
||||
void vp9_highbd_fdct16x16_sse2(const int16_t *input, tran_low_t *output,
|
||||
int stride);
|
||||
|
||||
// 32x32 -> 64-bit multiply-accumulate: multiplies the even (0, 2) unsigned
// 32-bit lanes of |a| and |b|, then the odd (1, 3) lanes, and adds the two
// 64-bit products within each 64-bit lane.
static inline __m128i k_madd_epi32(__m128i a, __m128i b) {
  const __m128i even_products = _mm_mul_epu32(a, b);
  const __m128i a_odd = _mm_srli_epi64(a, 32);
  const __m128i b_odd = _mm_srli_epi64(b, 32);
  const __m128i odd_products = _mm_mul_epu32(a_odd, b_odd);
  return _mm_add_epi64(even_products, odd_products);
}
|
||||
|
||||
// Narrows two registers of 64-bit lanes to their low 32 bits and packs the
// four results into one register: [a0.lo, a1.lo, b0.lo, b1.lo].
static inline __m128i k_packs_epi64(__m128i a, __m128i b) {
  const __m128i a_lows = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0));
  const __m128i b_lows = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0));
  return _mm_unpacklo_epi64(a_lows, b_lows);
}
|
||||
|
||||
// Returns nonzero when any 16-bit lane of |reg0| or |reg1| equals 0x7fff or
// 0x8000, i.e. may have saturated during a preceding packed-16-bit op.
static inline int check_epi16_overflow_x2(__m128i reg0, __m128i reg1) {
  const __m128i max_overflow = _mm_set1_epi16(0x7fff);
  const __m128i min_overflow = _mm_set1_epi16(0x8000);
  const __m128i hit0 = _mm_or_si128(_mm_cmpeq_epi16(reg0, max_overflow),
                                    _mm_cmpeq_epi16(reg0, min_overflow));
  const __m128i hit1 = _mm_or_si128(_mm_cmpeq_epi16(reg1, max_overflow),
                                    _mm_cmpeq_epi16(reg1, min_overflow));
  return _mm_movemask_epi8(_mm_or_si128(hit0, hit1));
}
|
||||
|
||||
// Returns nonzero when any 16-bit lane of the four registers equals 0x7fff
// or 0x8000 (possible saturation).  All four registers are always checked.
static inline int check_epi16_overflow_x4(__m128i reg0, __m128i reg1,
                                          __m128i reg2, __m128i reg3) {
  const __m128i max_overflow = _mm_set1_epi16(0x7fff);
  const __m128i min_overflow = _mm_set1_epi16(0x8000);
  const __m128i hit0 = _mm_or_si128(_mm_cmpeq_epi16(reg0, max_overflow),
                                    _mm_cmpeq_epi16(reg0, min_overflow));
  const __m128i hit1 = _mm_or_si128(_mm_cmpeq_epi16(reg1, max_overflow),
                                    _mm_cmpeq_epi16(reg1, min_overflow));
  const __m128i hit2 = _mm_or_si128(_mm_cmpeq_epi16(reg2, max_overflow),
                                    _mm_cmpeq_epi16(reg2, min_overflow));
  const __m128i hit3 = _mm_or_si128(_mm_cmpeq_epi16(reg3, max_overflow),
                                    _mm_cmpeq_epi16(reg3, min_overflow));
  const __m128i any_hit = _mm_or_si128(_mm_or_si128(hit0, hit1),
                                       _mm_or_si128(hit2, hit3));
  return _mm_movemask_epi8(any_hit);
}
|
||||
|
||||
static INLINE int check_epi16_overflow_x8(__m128i reg0, __m128i reg1,
|
||||
__m128i reg2, __m128i reg3,
|
||||
__m128i reg4, __m128i reg5,
|
||||
__m128i reg6, __m128i reg7) {
|
||||
int res0, res1;
|
||||
res0 = check_epi16_overflow_x4(reg0, reg1, reg2, reg3);
|
||||
res1 = check_epi16_overflow_x4(reg4, reg5, reg6, reg7);
|
||||
return res0 + res1;
|
||||
}
|
||||
|
||||
static INLINE int check_epi16_overflow_x12(__m128i reg0, __m128i reg1,
|
||||
__m128i reg2, __m128i reg3, __m128i reg4,
|
||||
__m128i reg5, __m128i reg6, __m128i reg7,
|
||||
__m128i reg8, __m128i reg9, __m128i reg10,
|
||||
__m128i reg11) {
|
||||
int res0, res1;
|
||||
res0 = check_epi16_overflow_x4(reg0, reg1, reg2, reg3);
|
||||
res1 = check_epi16_overflow_x4(reg4, reg5, reg6, reg7);
|
||||
if (!res0)
|
||||
res0 = check_epi16_overflow_x4(reg8, reg9, reg10, reg11);
|
||||
return res0 + res1;
|
||||
}
|
||||
|
||||
static INLINE int check_epi16_overflow_x16(__m128i reg0, __m128i reg1,
|
||||
__m128i reg2, __m128i reg3, __m128i reg4,
|
||||
__m128i reg5, __m128i reg6, __m128i reg7,
|
||||
__m128i reg8, __m128i reg9, __m128i reg10,
|
||||
__m128i reg11, __m128i reg12, __m128i reg13,
|
||||
__m128i reg14, __m128i reg15) {
|
||||
int res0, res1;
|
||||
res0 = check_epi16_overflow_x4(reg0, reg1, reg2, reg3);
|
||||
res1 = check_epi16_overflow_x4(reg4, reg5, reg6, reg7);
|
||||
if (!res0) {
|
||||
res0 = check_epi16_overflow_x4(reg8, reg9, reg10, reg11);
|
||||
if (!res1)
|
||||
res1 = check_epi16_overflow_x4(reg12, reg13, reg14, reg15);
|
||||
}
|
||||
return res0 + res1;
|
||||
}
|
||||
|
||||
static INLINE int check_epi16_overflow_x32(__m128i reg0, __m128i reg1,
|
||||
__m128i reg2, __m128i reg3, __m128i reg4,
|
||||
__m128i reg5, __m128i reg6, __m128i reg7,
|
||||
__m128i reg8, __m128i reg9, __m128i reg10,
|
||||
__m128i reg11, __m128i reg12, __m128i reg13,
|
||||
__m128i reg14, __m128i reg15, __m128i reg16,
|
||||
__m128i reg17, __m128i reg18, __m128i reg19,
|
||||
__m128i reg20, __m128i reg21, __m128i reg22,
|
||||
__m128i reg23, __m128i reg24, __m128i reg25,
|
||||
__m128i reg26, __m128i reg27, __m128i reg28,
|
||||
__m128i reg29, __m128i reg30, __m128i reg31) {
|
||||
int res0, res1;
|
||||
res0 = check_epi16_overflow_x4(reg0, reg1, reg2, reg3);
|
||||
res1 = check_epi16_overflow_x4(reg4, reg5, reg6, reg7);
|
||||
if (!res0) {
|
||||
res0 = check_epi16_overflow_x4(reg8, reg9, reg10, reg11);
|
||||
if (!res1) {
|
||||
res1 = check_epi16_overflow_x4(reg12, reg13, reg14, reg15);
|
||||
if (!res0) {
|
||||
res0 = check_epi16_overflow_x4(reg16, reg17, reg18, reg19);
|
||||
if (!res1) {
|
||||
res1 = check_epi16_overflow_x4(reg20, reg21, reg22, reg23);
|
||||
if (!res0) {
|
||||
res0 = check_epi16_overflow_x4(reg24, reg25, reg26, reg27);
|
||||
if (!res1)
|
||||
res1 = check_epi16_overflow_x4(reg28, reg29, reg30, reg31);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return res0 + res1;
|
||||
}
|
||||
|
||||
// Overflow check for values held as 64-bit lanes of four registers: each
// lane is shifted left by one and the resulting high dword must be all
// zeros or all ones (a clean sign-extension run) for the lane to be
// considered in range.  |zero| points to an all-zero register supplied by
// the caller.  Returns nonzero when any lane fails the check.
static inline int k_check_epi32_overflow_4(__m128i reg0, __m128i reg1,
    __m128i reg2, __m128i reg3, const __m128i* zero) {
  const __m128i minus_one = _mm_set1_epi32(-1);
  const __m128i sh0 = _mm_slli_epi64(reg0, 1);
  const __m128i sh1 = _mm_slli_epi64(reg1, 1);
  const __m128i sh2 = _mm_slli_epi64(reg2, 1);
  const __m128i sh3 = _mm_slli_epi64(reg3, 1);
  // Gather the high dword of every shifted 64-bit lane.
  const __m128i top0 = _mm_shuffle_epi32(sh0, _MM_SHUFFLE(0, 0, 3, 1));
  const __m128i top1 = _mm_shuffle_epi32(sh1, _MM_SHUFFLE(0, 0, 3, 1));
  const __m128i top2 = _mm_shuffle_epi32(sh2, _MM_SHUFFLE(0, 0, 3, 1));
  const __m128i top3 = _mm_shuffle_epi32(sh3, _MM_SHUFFLE(0, 0, 3, 1));
  const __m128i tops_01 = _mm_unpacklo_epi64(top0, top1);
  const __m128i tops_23 = _mm_unpacklo_epi64(top2, top3);
  // A top dword is valid when it equals 0 (non-negative) or -1 (negative).
  const __m128i valid_positive_01 = _mm_cmpeq_epi32(tops_01, *zero);
  const __m128i valid_positive_23 = _mm_cmpeq_epi32(tops_23, *zero);
  const __m128i valid_negative_01 = _mm_cmpeq_epi32(tops_01, minus_one);
  const __m128i valid_negative_23 = _mm_cmpeq_epi32(tops_23, minus_one);
  // A dword can match at most one of the two tests; when both comparisons
  // produced "false" the results are equal and the lane overflowed.
  const int overflow_01 = _mm_movemask_epi8(
      _mm_cmpeq_epi32(valid_positive_01, valid_negative_01));
  const int overflow_23 = _mm_movemask_epi8(
      _mm_cmpeq_epi32(valid_positive_23, valid_negative_23));
  return overflow_01 + overflow_23;
}
|
||||
|
||||
static INLINE int k_check_epi32_overflow_8(__m128i reg0, __m128i reg1,
|
||||
__m128i reg2, __m128i reg3,
|
||||
__m128i reg4, __m128i reg5,
|
||||
__m128i reg6, __m128i reg7,
|
||||
const __m128i* zero) {
|
||||
int overflow = k_check_epi32_overflow_4(reg0, reg1, reg2, reg3, zero);
|
||||
if (!overflow) {
|
||||
overflow = k_check_epi32_overflow_4(reg4, reg5, reg6, reg7, zero);
|
||||
}
|
||||
return overflow;
|
||||
}
|
||||
|
||||
static INLINE int k_check_epi32_overflow_16(
|
||||
__m128i reg0, __m128i reg1, __m128i reg2, __m128i reg3,
|
||||
__m128i reg4, __m128i reg5, __m128i reg6, __m128i reg7,
|
||||
__m128i reg8, __m128i reg9, __m128i reg10, __m128i reg11,
|
||||
__m128i reg12, __m128i reg13, __m128i reg14, __m128i reg15,
|
||||
const __m128i* zero) {
|
||||
int overflow = k_check_epi32_overflow_4(reg0, reg1, reg2, reg3, zero);
|
||||
if (!overflow) {
|
||||
overflow = k_check_epi32_overflow_4(reg4, reg5, reg6, reg7, zero);
|
||||
if (!overflow) {
|
||||
overflow = k_check_epi32_overflow_4(reg8, reg9, reg10, reg11, zero);
|
||||
if (!overflow) {
|
||||
overflow = k_check_epi32_overflow_4(reg12, reg13, reg14, reg15, zero);
|
||||
}
|
||||
}
|
||||
}
|
||||
return overflow;
|
||||
}
|
||||
|
||||
static INLINE int k_check_epi32_overflow_32(
|
||||
__m128i reg0, __m128i reg1, __m128i reg2, __m128i reg3,
|
||||
__m128i reg4, __m128i reg5, __m128i reg6, __m128i reg7,
|
||||
__m128i reg8, __m128i reg9, __m128i reg10, __m128i reg11,
|
||||
__m128i reg12, __m128i reg13, __m128i reg14, __m128i reg15,
|
||||
__m128i reg16, __m128i reg17, __m128i reg18, __m128i reg19,
|
||||
__m128i reg20, __m128i reg21, __m128i reg22, __m128i reg23,
|
||||
__m128i reg24, __m128i reg25, __m128i reg26, __m128i reg27,
|
||||
__m128i reg28, __m128i reg29, __m128i reg30, __m128i reg31,
|
||||
const __m128i* zero) {
|
||||
int overflow = k_check_epi32_overflow_4(reg0, reg1, reg2, reg3, zero);
|
||||
if (!overflow) {
|
||||
overflow = k_check_epi32_overflow_4(reg4, reg5, reg6, reg7, zero);
|
||||
if (!overflow) {
|
||||
overflow = k_check_epi32_overflow_4(reg8, reg9, reg10, reg11, zero);
|
||||
if (!overflow) {
|
||||
overflow = k_check_epi32_overflow_4(reg12, reg13, reg14, reg15, zero);
|
||||
if (!overflow) {
|
||||
overflow = k_check_epi32_overflow_4(reg16, reg17, reg18, reg19, zero);
|
||||
if (!overflow) {
|
||||
overflow = k_check_epi32_overflow_4(reg20, reg21,
|
||||
reg22, reg23, zero);
|
||||
if (!overflow) {
|
||||
overflow = k_check_epi32_overflow_4(reg24, reg25,
|
||||
reg26, reg27, zero);
|
||||
if (!overflow) {
|
||||
overflow = k_check_epi32_overflow_4(reg28, reg29,
|
||||
reg30, reg31, zero);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return overflow;
|
||||
}
|
||||
|
||||
static INLINE void store_output(const __m128i output, tran_low_t* dst_ptr) {
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
const __m128i sign_bits = _mm_cmplt_epi16(output, zero);
|
||||
__m128i out0 = _mm_unpacklo_epi16(output, sign_bits);
|
||||
__m128i out1 = _mm_unpackhi_epi16(output, sign_bits);
|
||||
_mm_store_si128((__m128i *)(dst_ptr), out0);
|
||||
_mm_store_si128((__m128i *)(dst_ptr + 4), out1);
|
||||
#else
|
||||
_mm_store_si128((__m128i *)(dst_ptr), output);
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
}
|
||||
|
||||
static INLINE void storeu_output(const __m128i output, tran_low_t* dst_ptr) {
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
const __m128i sign_bits = _mm_cmplt_epi16(output, zero);
|
||||
__m128i out0 = _mm_unpacklo_epi16(output, sign_bits);
|
||||
__m128i out1 = _mm_unpackhi_epi16(output, sign_bits);
|
||||
_mm_storeu_si128((__m128i *)(dst_ptr), out0);
|
||||
_mm_storeu_si128((__m128i *)(dst_ptr + 4), out1);
|
||||
#else
|
||||
_mm_storeu_si128((__m128i *)(dst_ptr), output);
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
}
|
||||
|
||||
|
||||
// Pairwise multiply-accumulates |in0| and |in1| against |multiplier|
// (madd), adds |rounding| and arithmetic-shifts right by |shift|, then
// saturates the eight 32-bit results back to packed 16 bits.
static inline __m128i mult_round_shift(const __m128i in0, const __m128i in1,
                                       const __m128i multiplier,
                                       const __m128i rounding,
                                       const int shift) {
  const __m128i prod0 = _mm_madd_epi16(in0, multiplier);
  const __m128i prod1 = _mm_madd_epi16(in1, multiplier);
  const __m128i shifted0 = _mm_srai_epi32(_mm_add_epi32(prod0, rounding), shift);
  const __m128i shifted1 = _mm_srai_epi32(_mm_add_epi32(prod1, rounding), shift);
  return _mm_packs_epi32(shifted0, shifted1);
}
|
||||
|
||||
// Transposes the 8x8 block of 16-bit values held in rows in00..in07 and
// writes the transposed rows out with a stride of 16 elements.
//   pass == 0: unaligned int16_t stores to out0_ptr (intermediate buffer).
//   pass != 0: rows go through storeu_output() to out1_ptr, which widens
//              each value to tran_low_t when CONFIG_VP9_HIGHBITDEPTH is on.
static INLINE void transpose_and_output8x8(
    const __m128i in00, const __m128i in01,
    const __m128i in02, const __m128i in03,
    const __m128i in04, const __m128i in05,
    const __m128i in06, const __m128i in07,
    const int pass, int16_t* out0_ptr,
    tran_low_t* out1_ptr) {
  // Input rows; element labels are row/column:
  // 00 01 02 03 04 05 06 07
  // 10 11 12 13 14 15 16 17
  // 20 21 22 23 24 25 26 27
  // 30 31 32 33 34 35 36 37
  // 40 41 42 43 44 45 46 47
  // 50 51 52 53 54 55 56 57
  // 60 61 62 63 64 65 66 67
  // 70 71 72 73 74 75 76 77
  const __m128i tr0_0 = _mm_unpacklo_epi16(in00, in01);
  const __m128i tr0_1 = _mm_unpacklo_epi16(in02, in03);
  const __m128i tr0_2 = _mm_unpackhi_epi16(in00, in01);
  const __m128i tr0_3 = _mm_unpackhi_epi16(in02, in03);
  const __m128i tr0_4 = _mm_unpacklo_epi16(in04, in05);
  const __m128i tr0_5 = _mm_unpacklo_epi16(in06, in07);
  const __m128i tr0_6 = _mm_unpackhi_epi16(in04, in05);
  const __m128i tr0_7 = _mm_unpackhi_epi16(in06, in07);
  // After interleaving 16-bit lanes, tr0_0 .. tr0_7 hold:
  // 00 10 01 11 02 12 03 13
  // 20 30 21 31 22 32 23 33
  // 04 14 05 15 06 16 07 17
  // 24 34 25 35 26 36 27 37
  // 40 50 41 51 42 52 43 53
  // 60 70 61 71 62 72 63 73
  // 44 54 45 55 46 56 47 57
  // 64 74 65 75 66 76 67 77
  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
  // After interleaving 32-bit lanes, listed in the order consumed below
  // (tr1_0, tr1_4, tr1_2, tr1_6, tr1_1, tr1_5, tr1_3, tr1_7):
  // 00 10 20 30 01 11 21 31
  // 40 50 60 70 41 51 61 71
  // 02 12 22 32 03 13 23 33
  // 42 52 62 72 43 53 63 73
  // 04 14 24 34 05 15 25 35
  // 44 54 64 74 45 55 65 75
  // 06 16 26 36 07 17 27 37
  // 46 56 66 76 47 57 67 77
  const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
  const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
  const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
  const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
  const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
  const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
  const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
  const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
  // Fully transposed rows (tr2_0 .. tr2_7 are the input's columns):
  // 00 10 20 30 40 50 60 70
  // 01 11 21 31 41 51 61 71
  // 02 12 22 32 42 52 62 72
  // 03 13 23 33 43 53 63 73
  // 04 14 24 34 44 54 64 74
  // 05 15 25 35 45 55 65 75
  // 06 16 26 36 46 56 66 76
  // 07 17 27 37 47 57 67 77
  if (pass == 0) {
    _mm_storeu_si128((__m128i*)(out0_ptr + 0 * 16), tr2_0);
    _mm_storeu_si128((__m128i*)(out0_ptr + 1 * 16), tr2_1);
    _mm_storeu_si128((__m128i*)(out0_ptr + 2 * 16), tr2_2);
    _mm_storeu_si128((__m128i*)(out0_ptr + 3 * 16), tr2_3);
    _mm_storeu_si128((__m128i*)(out0_ptr + 4 * 16), tr2_4);
    _mm_storeu_si128((__m128i*)(out0_ptr + 5 * 16), tr2_5);
    _mm_storeu_si128((__m128i*)(out0_ptr + 6 * 16), tr2_6);
    _mm_storeu_si128((__m128i*)(out0_ptr + 7 * 16), tr2_7);
  } else {
    storeu_output(tr2_0, (out1_ptr + 0 * 16));
    storeu_output(tr2_1, (out1_ptr + 1 * 16));
    storeu_output(tr2_2, (out1_ptr + 2 * 16));
    storeu_output(tr2_3, (out1_ptr + 3 * 16));
    storeu_output(tr2_4, (out1_ptr + 4 * 16));
    storeu_output(tr2_5, (out1_ptr + 5 * 16));
    storeu_output(tr2_6, (out1_ptr + 6 * 16));
    storeu_output(tr2_7, (out1_ptr + 7 * 16));
  }
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
#endif
|
||||
|
||||
#endif // VP9_ENCODER_X86_VP9_DCT_SSE2_H_
|
@ -135,7 +135,9 @@ VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/vp9_ssim_opt_x86_64.asm
|
||||
|
||||
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.c
|
||||
VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_dct_ssse3.c
|
||||
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.h
|
||||
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct32x32_sse2.c
|
||||
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_impl_sse2.c
|
||||
|
||||
ifeq ($(CONFIG_VP9_TEMPORAL_DENOISING),yes)
|
||||
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_denoiser_sse2.c
|
||||
|
Loading…
Reference in New Issue
Block a user