Merge "Implement sse2 and ssse3 versions for all sub_pixel_variance sizes."

Yaowu Xu 2013-06-20 17:42:50 -07:00 committed by Gerrit Code Review
commit e6cd5ed307
11 changed files with 1488 additions and 1860 deletions


@@ -26,12 +26,55 @@ extern "C" {
# include "vp9_rtcd.h"
#endif
}
#include "test/acm_random.h"
namespace {
using ::std::tr1::get;
using ::std::tr1::make_tuple;
using ::std::tr1::tuple;
using libvpx_test::ACMRandom;
static unsigned int variance_ref(const uint8_t *ref, const uint8_t *src,
int l2w, int l2h, unsigned int *sse_ptr) {
int se = 0;
unsigned int sse = 0;
const int w = 1 << l2w, h = 1 << l2h;
for (int y = 0; y < h; y++) {
for (int x = 0; x < w; x++) {
int diff = ref[w * y + x] - src[w * y + x];
se += diff;
sse += diff * diff;
}
}
*sse_ptr = sse;
return sse - (((int64_t) se * se) >> (l2w + l2h));
}
static unsigned int subpel_variance_ref(const uint8_t *ref, const uint8_t *src,
int l2w, int l2h, int xoff, int yoff,
unsigned int *sse_ptr) {
int se = 0;
unsigned int sse = 0;
const int w = 1 << l2w, h = 1 << l2h;
for (int y = 0; y < h; y++) {
for (int x = 0; x < w; x++) {
// bilinear interpolation at 1/16-pel resolution
const int a1 = ref[(w + 1) * (y + 0) + x + 0];
const int a2 = ref[(w + 1) * (y + 0) + x + 1];
const int b1 = ref[(w + 1) * (y + 1) + x + 0];
const int b2 = ref[(w + 1) * (y + 1) + x + 1];
const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
const int r = a + (((b - a) * yoff + 8) >> 4);
int diff = r - src[w * y + x];
se += diff;
sse += diff * diff;
}
}
*sse_ptr = sse;
return sse - (((int64_t) se * se) >> (l2w + l2h));
}
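Both reference helpers use the integer form of the identity Var = E[X^2] - E[X]^2, i.e. variance = sse - se*se / (w*h), with the division reduced to a shift because block dimensions are powers of two. A minimal standalone check of the arithmetic (hypothetical 2x2 block, so the shift count is l2w + l2h = 2):

#include <stdint.h>
#include <stdio.h>

int main(void) {
  /* Hypothetical per-pixel (ref - src) differences for a 2x2 block. */
  const int diffs[4] = { 3, -1, 2, 0 };
  int64_t se = 0;    /* signed sum of differences */
  uint32_t sse = 0;  /* sum of squared differences */
  for (int i = 0; i < 4; ++i) {
    se += diffs[i];
    sse += diffs[i] * diffs[i];
  }
  /* sse = 14, se = 4, so variance = 14 - (16 >> 2) = 10. */
  printf("var = %u\n", (unsigned)(sse - ((se * se) >> 2)));
  return 0;
}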
template<typename VarianceFunctionType>
class VarianceTest :
@@ -39,10 +82,13 @@ class VarianceTest :
public:
virtual void SetUp() {
const tuple<int, int, VarianceFunctionType>& params = this->GetParam();
width_ = get<0>(params);
height_ = get<1>(params);
log2width_ = get<0>(params);
width_ = 1 << log2width_;
log2height_ = get<1>(params);
height_ = 1 << log2height_;
variance_ = get<2>(params);
rnd(ACMRandom::DeterministicSeed());
block_size_ = width_ * height_;
src_ = new uint8_t[block_size_];
ref_ = new uint8_t[block_size_];
@@ -58,15 +104,16 @@ class VarianceTest :
protected:
void ZeroTest();
void RefTest();
void OneQuarterTest();
ACMRandom rnd;
uint8_t* src_;
uint8_t* ref_;
int width_;
int height_;
int width_, log2width_;
int height_, log2height_;
int block_size_;
VarianceFunctionType variance_;
};
template<typename VarianceFunctionType>
@@ -82,6 +129,22 @@ void VarianceTest<VarianceFunctionType>::ZeroTest() {
}
}
template<typename VarianceFunctionType>
void VarianceTest<VarianceFunctionType>::RefTest() {
for (int i = 0; i < 10; ++i) {
for (int j = 0; j < block_size_; j++) {
src_[j] = rnd.Rand8();
ref_[j] = rnd.Rand8();
}
unsigned int sse1, sse2;
const unsigned int var1 = variance_(src_, width_, ref_, width_, &sse1);
const unsigned int var2 = variance_ref(src_, ref_, log2width_,
log2height_, &sse2);
EXPECT_EQ(sse1, sse2);
EXPECT_EQ(var1, var2);
}
}
template<typename VarianceFunctionType>
void VarianceTest<VarianceFunctionType>::OneQuarterTest() {
memset(src_, 255, block_size_);
@@ -94,6 +157,66 @@ void VarianceTest<VarianceFunctionType>::OneQuarterTest() {
EXPECT_EQ(expected, var);
}
template<typename SubpelVarianceFunctionType>
class SubpelVarianceTest :
public ::testing::TestWithParam<tuple<int, int,
SubpelVarianceFunctionType> > {
public:
virtual void SetUp() {
const tuple<int, int, SubpelVarianceFunctionType>& params =
this->GetParam();
log2width_ = get<0>(params);
width_ = 1 << log2width_;
log2height_ = get<1>(params);
height_ = 1 << log2height_;
subpel_variance_ = get<2>(params);
rnd(ACMRandom::DeterministicSeed());
block_size_ = width_ * height_;
src_ = new uint8_t[block_size_];
ref_ = new uint8_t[block_size_ + width_ + height_ + 1];
ASSERT_TRUE(src_ != NULL);
ASSERT_TRUE(ref_ != NULL);
}
virtual void TearDown() {
delete[] src_;
delete[] ref_;
}
protected:
void RefTest();
ACMRandom rnd;
uint8_t* src_;
uint8_t* ref_;
int width_, log2width_;
int height_, log2height_;
int block_size_;
SubpelVarianceFunctionType subpel_variance_;
};
template<typename SubpelVarianceFunctionType>
void SubpelVarianceTest<SubpelVarianceFunctionType>::RefTest() {
for (int x = 0; x < 16; ++x) {
for (int y = 0; y < 16; ++y) {
for (int j = 0; j < block_size_; j++) {
src_[j] = rnd.Rand8();
}
for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
ref_[j] = rnd.Rand8();
}
unsigned int sse1, sse2;
const unsigned int var1 = subpel_variance_(ref_, width_ + 1, x, y,
src_, width_, &sse1);
const unsigned int var2 = subpel_variance_ref(ref_, src_, log2width_,
log2height_, x, y, &sse2);
EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y;
EXPECT_EQ(var1, var2) << "at position " << x << ", " << y;
}
}
}
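A quick bound explains the extra block_size_ + width_ + height_ + 1 bytes allocated for ref_ in SetUp: with stride w + 1, the deepest read in subpel_variance_ref is ref[(w + 1) * h + w] = ref[w * h + w + h] (y = h - 1, x = w - 1, plus the +1 bilinear taps), so w * h + w + h + 1 bytes exactly cover the bottom-right tap.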
// -----------------------------------------------------------------------------
// VP8 test cases.
@@ -103,6 +226,7 @@ namespace vp8 {
typedef VarianceTest<vp8_variance_fn_t> VP8VarianceTest;
TEST_P(VP8VarianceTest, Zero) { ZeroTest(); }
TEST_P(VP8VarianceTest, Ref) { RefTest(); }
TEST_P(VP8VarianceTest, OneQuarter) { OneQuarterTest(); }
const vp8_variance_fn_t variance4x4_c = vp8_variance4x4_c;
@@ -112,11 +236,11 @@ const vp8_variance_fn_t variance16x8_c = vp8_variance16x8_c;
const vp8_variance_fn_t variance16x16_c = vp8_variance16x16_c;
INSTANTIATE_TEST_CASE_P(
C, VP8VarianceTest,
::testing::Values(make_tuple(4, 4, variance4x4_c),
make_tuple(8, 8, variance8x8_c),
make_tuple(8, 16, variance8x16_c),
make_tuple(16, 8, variance16x8_c),
make_tuple(16, 16, variance16x16_c)));
::testing::Values(make_tuple(2, 2, variance4x4_c),
make_tuple(3, 3, variance8x8_c),
make_tuple(3, 4, variance8x16_c),
make_tuple(4, 3, variance16x8_c),
make_tuple(4, 4, variance16x16_c)));
#if HAVE_MMX
const vp8_variance_fn_t variance4x4_mmx = vp8_variance4x4_mmx;
@@ -126,11 +250,11 @@ const vp8_variance_fn_t variance16x8_mmx = vp8_variance16x8_mmx;
const vp8_variance_fn_t variance16x16_mmx = vp8_variance16x16_mmx;
INSTANTIATE_TEST_CASE_P(
MMX, VP8VarianceTest,
::testing::Values(make_tuple(4, 4, variance4x4_mmx),
make_tuple(8, 8, variance8x8_mmx),
make_tuple(8, 16, variance8x16_mmx),
make_tuple(16, 8, variance16x8_mmx),
make_tuple(16, 16, variance16x16_mmx)));
::testing::Values(make_tuple(2, 2, variance4x4_mmx),
make_tuple(3, 3, variance8x8_mmx),
make_tuple(3, 4, variance8x16_mmx),
make_tuple(4, 3, variance16x8_mmx),
make_tuple(4, 4, variance16x16_mmx)));
#endif
#if HAVE_SSE2
@@ -141,11 +265,11 @@ const vp8_variance_fn_t variance16x8_wmt = vp8_variance16x8_wmt;
const vp8_variance_fn_t variance16x16_wmt = vp8_variance16x16_wmt;
INSTANTIATE_TEST_CASE_P(
SSE2, VP8VarianceTest,
::testing::Values(make_tuple(4, 4, variance4x4_wmt),
make_tuple(8, 8, variance8x8_wmt),
make_tuple(8, 16, variance8x16_wmt),
make_tuple(16, 8, variance16x8_wmt),
make_tuple(16, 16, variance16x16_wmt)));
::testing::Values(make_tuple(2, 2, variance4x4_wmt),
make_tuple(3, 3, variance8x8_wmt),
make_tuple(3, 4, variance8x16_wmt),
make_tuple(4, 3, variance16x8_wmt),
make_tuple(4, 4, variance16x16_wmt)));
#endif
#endif // CONFIG_VP8_ENCODER
@@ -158,22 +282,83 @@ namespace vp9 {
#if CONFIG_VP9_ENCODER
typedef VarianceTest<vp9_variance_fn_t> VP9VarianceTest;
typedef SubpelVarianceTest<vp9_subpixvariance_fn_t> VP9SubpelVarianceTest;
TEST_P(VP9VarianceTest, Zero) { ZeroTest(); }
TEST_P(VP9VarianceTest, Ref) { RefTest(); }
TEST_P(VP9SubpelVarianceTest, Ref) { RefTest(); }
TEST_P(VP9VarianceTest, OneQuarter) { OneQuarterTest(); }
const vp9_variance_fn_t variance4x4_c = vp9_variance4x4_c;
const vp9_variance_fn_t variance4x8_c = vp9_variance4x8_c;
const vp9_variance_fn_t variance8x4_c = vp9_variance8x4_c;
const vp9_variance_fn_t variance8x8_c = vp9_variance8x8_c;
const vp9_variance_fn_t variance8x16_c = vp9_variance8x16_c;
const vp9_variance_fn_t variance16x8_c = vp9_variance16x8_c;
const vp9_variance_fn_t variance16x16_c = vp9_variance16x16_c;
const vp9_variance_fn_t variance16x32_c = vp9_variance16x32_c;
const vp9_variance_fn_t variance32x16_c = vp9_variance32x16_c;
const vp9_variance_fn_t variance32x32_c = vp9_variance32x32_c;
const vp9_variance_fn_t variance32x64_c = vp9_variance32x64_c;
const vp9_variance_fn_t variance64x32_c = vp9_variance64x32_c;
const vp9_variance_fn_t variance64x64_c = vp9_variance64x64_c;
INSTANTIATE_TEST_CASE_P(
C, VP9VarianceTest,
::testing::Values(make_tuple(4, 4, variance4x4_c),
make_tuple(8, 8, variance8x8_c),
make_tuple(8, 16, variance8x16_c),
make_tuple(16, 8, variance16x8_c),
make_tuple(16, 16, variance16x16_c)));
::testing::Values(make_tuple(2, 2, variance4x4_c),
make_tuple(2, 3, variance4x8_c),
make_tuple(3, 2, variance8x4_c),
make_tuple(3, 3, variance8x8_c),
make_tuple(3, 4, variance8x16_c),
make_tuple(4, 3, variance16x8_c),
make_tuple(4, 4, variance16x16_c),
make_tuple(4, 5, variance16x32_c),
make_tuple(5, 4, variance32x16_c),
make_tuple(5, 5, variance32x32_c),
make_tuple(5, 6, variance32x64_c),
make_tuple(6, 5, variance64x32_c),
make_tuple(6, 6, variance64x64_c)));
const vp9_subpixvariance_fn_t subpel_variance4x4_c =
vp9_sub_pixel_variance4x4_c;
const vp9_subpixvariance_fn_t subpel_variance4x8_c =
vp9_sub_pixel_variance4x8_c;
const vp9_subpixvariance_fn_t subpel_variance8x4_c =
vp9_sub_pixel_variance8x4_c;
const vp9_subpixvariance_fn_t subpel_variance8x8_c =
vp9_sub_pixel_variance8x8_c;
const vp9_subpixvariance_fn_t subpel_variance8x16_c =
vp9_sub_pixel_variance8x16_c;
const vp9_subpixvariance_fn_t subpel_variance16x8_c =
vp9_sub_pixel_variance16x8_c;
const vp9_subpixvariance_fn_t subpel_variance16x16_c =
vp9_sub_pixel_variance16x16_c;
const vp9_subpixvariance_fn_t subpel_variance16x32_c =
vp9_sub_pixel_variance16x32_c;
const vp9_subpixvariance_fn_t subpel_variance32x16_c =
vp9_sub_pixel_variance32x16_c;
const vp9_subpixvariance_fn_t subpel_variance32x32_c =
vp9_sub_pixel_variance32x32_c;
const vp9_subpixvariance_fn_t subpel_variance32x64_c =
vp9_sub_pixel_variance32x64_c;
const vp9_subpixvariance_fn_t subpel_variance64x32_c =
vp9_sub_pixel_variance64x32_c;
const vp9_subpixvariance_fn_t subpel_variance64x64_c =
vp9_sub_pixel_variance64x64_c;
INSTANTIATE_TEST_CASE_P(
C, VP9SubpelVarianceTest,
::testing::Values(make_tuple(2, 2, subpel_variance4x4_c),
make_tuple(2, 3, subpel_variance4x8_c),
make_tuple(3, 2, subpel_variance8x4_c),
make_tuple(3, 3, subpel_variance8x8_c),
make_tuple(3, 4, subpel_variance8x16_c),
make_tuple(4, 3, subpel_variance16x8_c),
make_tuple(4, 4, subpel_variance16x16_c),
make_tuple(4, 5, subpel_variance16x32_c),
make_tuple(5, 4, subpel_variance32x16_c),
make_tuple(5, 5, subpel_variance32x32_c),
make_tuple(5, 6, subpel_variance32x64_c),
make_tuple(6, 5, subpel_variance64x32_c),
make_tuple(6, 6, subpel_variance64x64_c)));
#if HAVE_MMX
const vp9_variance_fn_t variance4x4_mmx = vp9_variance4x4_mmx;
@@ -183,26 +368,128 @@ const vp9_variance_fn_t variance16x8_mmx = vp9_variance16x8_mmx;
const vp9_variance_fn_t variance16x16_mmx = vp9_variance16x16_mmx;
INSTANTIATE_TEST_CASE_P(
MMX, VP9VarianceTest,
::testing::Values(make_tuple(4, 4, variance4x4_mmx),
make_tuple(8, 8, variance8x8_mmx),
make_tuple(8, 16, variance8x16_mmx),
make_tuple(16, 8, variance16x8_mmx),
make_tuple(16, 16, variance16x16_mmx)));
::testing::Values(make_tuple(2, 2, variance4x4_mmx),
make_tuple(3, 3, variance8x8_mmx),
make_tuple(3, 4, variance8x16_mmx),
make_tuple(4, 3, variance16x8_mmx),
make_tuple(4, 4, variance16x16_mmx)));
#endif
#if HAVE_SSE2
const vp9_variance_fn_t variance4x4_wmt = vp9_variance4x4_sse2;
const vp9_variance_fn_t variance8x8_wmt = vp9_variance8x8_sse2;
const vp9_variance_fn_t variance8x16_wmt = vp9_variance8x16_sse2;
const vp9_variance_fn_t variance16x8_wmt = vp9_variance16x8_sse2;
const vp9_variance_fn_t variance16x16_wmt = vp9_variance16x16_sse2;
const vp9_variance_fn_t variance4x4_sse2 = vp9_variance4x4_sse2;
const vp9_variance_fn_t variance4x8_sse2 = vp9_variance4x8_sse2;
const vp9_variance_fn_t variance8x4_sse2 = vp9_variance8x4_sse2;
const vp9_variance_fn_t variance8x8_sse2 = vp9_variance8x8_sse2;
const vp9_variance_fn_t variance8x16_sse2 = vp9_variance8x16_sse2;
const vp9_variance_fn_t variance16x8_sse2 = vp9_variance16x8_sse2;
const vp9_variance_fn_t variance16x16_sse2 = vp9_variance16x16_sse2;
const vp9_variance_fn_t variance16x32_sse2 = vp9_variance16x32_sse2;
const vp9_variance_fn_t variance32x16_sse2 = vp9_variance32x16_sse2;
const vp9_variance_fn_t variance32x32_sse2 = vp9_variance32x32_sse2;
const vp9_variance_fn_t variance32x64_sse2 = vp9_variance32x64_sse2;
const vp9_variance_fn_t variance64x32_sse2 = vp9_variance64x32_sse2;
const vp9_variance_fn_t variance64x64_sse2 = vp9_variance64x64_sse2;
INSTANTIATE_TEST_CASE_P(
SSE2, VP9VarianceTest,
::testing::Values(make_tuple(4, 4, variance4x4_wmt),
make_tuple(8, 8, variance8x8_wmt),
make_tuple(8, 16, variance8x16_wmt),
make_tuple(16, 8, variance16x8_wmt),
make_tuple(16, 16, variance16x16_wmt)));
::testing::Values(make_tuple(2, 2, variance4x4_sse2),
make_tuple(2, 3, variance4x8_sse2),
make_tuple(3, 2, variance8x4_sse2),
make_tuple(3, 3, variance8x8_sse2),
make_tuple(3, 4, variance8x16_sse2),
make_tuple(4, 3, variance16x8_sse2),
make_tuple(4, 4, variance16x16_sse2),
make_tuple(4, 5, variance16x32_sse2),
make_tuple(5, 4, variance32x16_sse2),
make_tuple(5, 5, variance32x32_sse2),
make_tuple(5, 6, variance32x64_sse2),
make_tuple(6, 5, variance64x32_sse2),
make_tuple(6, 6, variance64x64_sse2)));
const vp9_subpixvariance_fn_t subpel_variance4x4_sse =
vp9_sub_pixel_variance4x4_sse;
const vp9_subpixvariance_fn_t subpel_variance4x8_sse =
vp9_sub_pixel_variance4x8_sse;
const vp9_subpixvariance_fn_t subpel_variance8x4_sse2 =
vp9_sub_pixel_variance8x4_sse2;
const vp9_subpixvariance_fn_t subpel_variance8x8_sse2 =
vp9_sub_pixel_variance8x8_sse2;
const vp9_subpixvariance_fn_t subpel_variance8x16_sse2 =
vp9_sub_pixel_variance8x16_sse2;
const vp9_subpixvariance_fn_t subpel_variance16x8_sse2 =
vp9_sub_pixel_variance16x8_sse2;
const vp9_subpixvariance_fn_t subpel_variance16x16_sse2 =
vp9_sub_pixel_variance16x16_sse2;
const vp9_subpixvariance_fn_t subpel_variance16x32_sse2 =
vp9_sub_pixel_variance16x32_sse2;
const vp9_subpixvariance_fn_t subpel_variance32x16_sse2 =
vp9_sub_pixel_variance32x16_sse2;
const vp9_subpixvariance_fn_t subpel_variance32x32_sse2 =
vp9_sub_pixel_variance32x32_sse2;
const vp9_subpixvariance_fn_t subpel_variance32x64_sse2 =
vp9_sub_pixel_variance32x64_sse2;
const vp9_subpixvariance_fn_t subpel_variance64x32_sse2 =
vp9_sub_pixel_variance64x32_sse2;
const vp9_subpixvariance_fn_t subpel_variance64x64_sse2 =
vp9_sub_pixel_variance64x64_sse2;
INSTANTIATE_TEST_CASE_P(
SSE2, VP9SubpelVarianceTest,
::testing::Values(make_tuple(2, 2, subpel_variance4x4_sse),
make_tuple(2, 3, subpel_variance4x8_sse),
make_tuple(3, 2, subpel_variance8x4_sse2),
make_tuple(3, 3, subpel_variance8x8_sse2),
make_tuple(3, 4, subpel_variance8x16_sse2),
make_tuple(4, 3, subpel_variance16x8_sse2),
make_tuple(4, 4, subpel_variance16x16_sse2),
make_tuple(4, 5, subpel_variance16x32_sse2),
make_tuple(5, 4, subpel_variance32x16_sse2),
make_tuple(5, 5, subpel_variance32x32_sse2),
make_tuple(5, 6, subpel_variance32x64_sse2),
make_tuple(6, 5, subpel_variance64x32_sse2),
make_tuple(6, 6, subpel_variance64x64_sse2)));
#endif
#if HAVE_SSSE3
const vp9_subpixvariance_fn_t subpel_variance4x4_ssse3 =
vp9_sub_pixel_variance4x4_ssse3;
const vp9_subpixvariance_fn_t subpel_variance4x8_ssse3 =
vp9_sub_pixel_variance4x8_ssse3;
const vp9_subpixvariance_fn_t subpel_variance8x4_ssse3 =
vp9_sub_pixel_variance8x4_ssse3;
const vp9_subpixvariance_fn_t subpel_variance8x8_ssse3 =
vp9_sub_pixel_variance8x8_ssse3;
const vp9_subpixvariance_fn_t subpel_variance8x16_ssse3 =
vp9_sub_pixel_variance8x16_ssse3;
const vp9_subpixvariance_fn_t subpel_variance16x8_ssse3 =
vp9_sub_pixel_variance16x8_ssse3;
const vp9_subpixvariance_fn_t subpel_variance16x16_ssse3 =
vp9_sub_pixel_variance16x16_ssse3;
const vp9_subpixvariance_fn_t subpel_variance16x32_ssse3 =
vp9_sub_pixel_variance16x32_ssse3;
const vp9_subpixvariance_fn_t subpel_variance32x16_ssse3 =
vp9_sub_pixel_variance32x16_ssse3;
const vp9_subpixvariance_fn_t subpel_variance32x32_ssse3 =
vp9_sub_pixel_variance32x32_ssse3;
const vp9_subpixvariance_fn_t subpel_variance32x64_ssse3 =
vp9_sub_pixel_variance32x64_ssse3;
const vp9_subpixvariance_fn_t subpel_variance64x32_ssse3 =
vp9_sub_pixel_variance64x32_ssse3;
const vp9_subpixvariance_fn_t subpel_variance64x64_ssse3 =
vp9_sub_pixel_variance64x64_ssse3;
INSTANTIATE_TEST_CASE_P(
SSSE3, VP9SubpelVarianceTest,
::testing::Values(make_tuple(2, 2, subpel_variance4x4_ssse3),
make_tuple(2, 3, subpel_variance4x8_ssse3),
make_tuple(3, 2, subpel_variance8x4_ssse3),
make_tuple(3, 3, subpel_variance8x8_ssse3),
make_tuple(3, 4, subpel_variance8x16_ssse3),
make_tuple(4, 3, subpel_variance16x8_ssse3),
make_tuple(4, 4, subpel_variance16x16_ssse3),
make_tuple(4, 5, subpel_variance16x32_ssse3),
make_tuple(5, 4, subpel_variance32x16_ssse3),
make_tuple(5, 5, subpel_variance32x32_ssse3),
make_tuple(5, 6, subpel_variance32x64_ssse3),
make_tuple(6, 5, subpel_variance64x32_ssse3),
make_tuple(6, 6, subpel_variance64x64_ssse3)));
#endif
#endif // CONFIG_VP9_ENCODER
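With the new instantiations in place, the expanded matrix can be run selectively through gtest's filter flag, e.g. (assuming the usual libvpx test binary name):

./test_libvpx --gtest_filter='*VP9SubpelVarianceTest*'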


@@ -266,85 +266,81 @@ prototype unsigned int vp9_variance4x4 "const uint8_t *src_ptr, int source_strid
specialize vp9_variance4x4 mmx sse2
prototype unsigned int vp9_sub_pixel_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance64x64 sse2
specialize vp9_sub_pixel_variance64x64 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance64x64
prototype unsigned int vp9_sub_pixel_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance32x64
specialize vp9_sub_pixel_variance32x64 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance32x64
prototype unsigned int vp9_sub_pixel_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance64x32
specialize vp9_sub_pixel_variance64x32 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance64x32
prototype unsigned int vp9_sub_pixel_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance32x16
specialize vp9_sub_pixel_variance32x16 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance32x16
prototype unsigned int vp9_sub_pixel_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance16x32
specialize vp9_sub_pixel_variance16x32 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance16x32
prototype unsigned int vp9_sub_pixel_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance32x32 sse2
specialize vp9_sub_pixel_variance32x32 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance32x32
prototype unsigned int vp9_sub_pixel_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance16x16 sse2 mmx ssse3
specialize vp9_sub_pixel_variance16x16 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance16x16
prototype unsigned int vp9_sub_pixel_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance8x16 sse2 mmx
vp9_sub_pixel_variance8x16_sse2=vp9_sub_pixel_variance8x16_wmt
specialize vp9_sub_pixel_variance8x16 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance8x16
prototype unsigned int vp9_sub_pixel_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance16x8 sse2 mmx ssse3
vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_ssse3;
vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_wmt
specialize vp9_sub_pixel_variance16x8 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance16x8
prototype unsigned int vp9_sub_pixel_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance8x8 sse2 mmx
vp9_sub_pixel_variance8x8_sse2=vp9_sub_pixel_variance8x8_wmt
specialize vp9_sub_pixel_variance8x8 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance8x8
# TODO(jingning): need to convert 8x4/4x8 functions into mmx/sse form
prototype unsigned int vp9_sub_pixel_variance8x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance8x4
specialize vp9_sub_pixel_variance8x4 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance8x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance8x4
prototype unsigned int vp9_sub_pixel_variance4x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance4x8
specialize vp9_sub_pixel_variance4x8 sse ssse3
prototype unsigned int vp9_sub_pixel_avg_variance4x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance4x8
prototype unsigned int vp9_sub_pixel_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance4x4 sse2 mmx
vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt
specialize vp9_sub_pixel_variance4x4 sse ssse3
#vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt
prototype unsigned int vp9_sub_pixel_avg_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance4x4
@@ -390,15 +386,15 @@ prototype unsigned int vp9_sad4x4 "const uint8_t *src_ptr, int source_stride, co
specialize vp9_sad4x4 mmx sse
prototype unsigned int vp9_variance_halfpixvar16x16_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_variance_halfpixvar16x16_h mmx sse2
specialize vp9_variance_halfpixvar16x16_h sse2
vp9_variance_halfpixvar16x16_h_sse2=vp9_variance_halfpixvar16x16_h_wmt
prototype unsigned int vp9_variance_halfpixvar16x16_v "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_variance_halfpixvar16x16_v mmx sse2
specialize vp9_variance_halfpixvar16x16_v sse2
vp9_variance_halfpixvar16x16_v_sse2=vp9_variance_halfpixvar16x16_v_wmt
prototype unsigned int vp9_variance_halfpixvar16x16_hv "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_variance_halfpixvar16x16_hv mmx sse2
specialize vp9_variance_halfpixvar16x16_hv sse2
vp9_variance_halfpixvar16x16_hv_sse2=vp9_variance_halfpixvar16x16_hv_wmt
prototype unsigned int vp9_variance_halfpixvar64x64_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
@@ -507,8 +503,8 @@ specialize vp9_sad4x8x4d sse
prototype void vp9_sad4x4x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"
specialize vp9_sad4x4x4d sse
prototype unsigned int vp9_sub_pixel_mse16x16 "const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse"
specialize vp9_sub_pixel_mse16x16 sse2 mmx
#prototype unsigned int vp9_sub_pixel_mse16x16 "const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse"
#specialize vp9_sub_pixel_mse16x16 sse2 mmx
prototype unsigned int vp9_mse16x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"
specialize vp9_mse16x16 mmx sse2
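For context, these prototype/specialize lines drive libvpx's run-time CPU dispatch (rtcd) generator. A hedged sketch of the dispatch derived from "specialize vp9_sub_pixel_variance64x64 sse2 ssse3" (the generated header differs in detail; the per-ISA prototypes are omitted here):

unsigned int (*vp9_sub_pixel_variance64x64)(const uint8_t *src_ptr,
                                            int source_stride,
                                            int xoffset, int yoffset,
                                            const uint8_t *ref_ptr,
                                            int ref_stride,
                                            unsigned int *sse);

static void setup_rtcd_internal(int flags) {
  /* Start from the portable C version, then upgrade to the best
     specialization the running CPU reports. */
  vp9_sub_pixel_variance64x64 = vp9_sub_pixel_variance64x64_c;
  if (flags & HAS_SSE2)
    vp9_sub_pixel_variance64x64 = vp9_sub_pixel_variance64x64_sse2;
  if (flags & HAS_SSSE3)
    vp9_sub_pixel_variance64x64 = vp9_sub_pixel_variance64x64_ssse3;
}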

File diff suppressed because it is too large.


@@ -8,292 +8,8 @@
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
%define xmm_filter_shift 7
;void vp9_filter_block2d_bil_var_sse2
;(
; unsigned char *ref_ptr,
; int ref_pixels_per_line,
; unsigned char *src_ptr,
; int src_pixels_per_line,
; unsigned int Height,
; int xoffset,
; int yoffset,
; int *sum,
; unsigned int *sumsquared;;
;
;)
global sym(vp9_filter_block2d_bil_var_sse2) PRIVATE
sym(vp9_filter_block2d_bil_var_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 9
SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
push rbx
; end prolog
pxor xmm6, xmm6 ;
pxor xmm7, xmm7 ;
lea rsi, [GLOBAL(xmm_bi_rd)] ; rounding
movdqa xmm4, XMMWORD PTR [rsi]
lea rcx, [GLOBAL(bilinear_filters_sse2)]
movsxd rax, dword ptr arg(5) ; xoffset
cmp rax, 0 ; skip first_pass filter if xoffset=0
je filter_block2d_bil_var_sse2_sp_only
shl rax, 5 ; point to filter coeff with xoffset
lea rax, [rax + rcx] ; HFilter
movsxd rdx, dword ptr arg(6) ; yoffset
cmp rdx, 0 ; skip second_pass filter if yoffset=0
je filter_block2d_bil_var_sse2_fp_only
shl rdx, 5
lea rdx, [rdx + rcx] ; VFilter
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
pxor xmm0, xmm0 ;
movq xmm1, QWORD PTR [rsi] ;
movq xmm3, QWORD PTR [rsi+1] ;
punpcklbw xmm1, xmm0 ;
pmullw xmm1, [rax] ;
punpcklbw xmm3, xmm0
pmullw xmm3, [rax+16] ;
paddw xmm1, xmm3 ;
paddw xmm1, xmm4 ;
psraw xmm1, xmm_filter_shift ;
movdqa xmm5, xmm1
movsxd rbx, dword ptr arg(1) ;ref_pixels_per_line
lea rsi, [rsi + rbx]
%if ABI_IS_32BIT=0
movsxd r9, dword ptr arg(3) ;src_pixels_per_line
%endif
filter_block2d_bil_var_sse2_loop:
movq xmm1, QWORD PTR [rsi] ;
movq xmm3, QWORD PTR [rsi+1] ;
punpcklbw xmm1, xmm0 ;
pmullw xmm1, [rax] ;
punpcklbw xmm3, xmm0 ;
pmullw xmm3, [rax+16] ;
paddw xmm1, xmm3 ;
paddw xmm1, xmm4 ;
psraw xmm1, xmm_filter_shift ;
movdqa xmm3, xmm5 ;
movdqa xmm5, xmm1 ;
pmullw xmm3, [rdx] ;
pmullw xmm1, [rdx+16] ;
paddw xmm1, xmm3 ;
paddw xmm1, xmm4 ;
psraw xmm1, xmm_filter_shift ;
movq xmm3, QWORD PTR [rdi] ;
punpcklbw xmm3, xmm0 ;
psubw xmm1, xmm3 ;
paddw xmm6, xmm1 ;
pmaddwd xmm1, xmm1 ;
paddd xmm7, xmm1 ;
lea rsi, [rsi + rbx] ;ref_pixels_per_line
%if ABI_IS_32BIT
add rdi, dword ptr arg(3) ;src_pixels_per_line
%else
lea rdi, [rdi + r9]
%endif
sub rcx, 1 ;
jnz filter_block2d_bil_var_sse2_loop ;
jmp filter_block2d_bil_variance
filter_block2d_bil_var_sse2_sp_only:
movsxd rdx, dword ptr arg(6) ; yoffset
cmp rdx, 0 ; skip all if both xoffset=0 and yoffset=0
je filter_block2d_bil_var_sse2_full_pixel
shl rdx, 5
lea rdx, [rdx + rcx] ; VFilter
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
pxor xmm0, xmm0 ;
movq xmm1, QWORD PTR [rsi] ;
punpcklbw xmm1, xmm0 ;
movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
lea rsi, [rsi + rax]
filter_block2d_bil_sp_only_loop:
movq xmm3, QWORD PTR [rsi] ;
punpcklbw xmm3, xmm0 ;
movdqa xmm5, xmm3
pmullw xmm1, [rdx] ;
pmullw xmm3, [rdx+16] ;
paddw xmm1, xmm3 ;
paddw xmm1, xmm4 ;
psraw xmm1, xmm_filter_shift ;
movq xmm3, QWORD PTR [rdi] ;
punpcklbw xmm3, xmm0 ;
psubw xmm1, xmm3 ;
paddw xmm6, xmm1 ;
pmaddwd xmm1, xmm1 ;
paddd xmm7, xmm1 ;
movdqa xmm1, xmm5 ;
lea rsi, [rsi + rax] ;ref_pixels_per_line
lea rdi, [rdi + rbx] ;src_pixels_per_line
sub rcx, 1 ;
jnz filter_block2d_bil_sp_only_loop ;
jmp filter_block2d_bil_variance
filter_block2d_bil_var_sse2_full_pixel:
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
pxor xmm0, xmm0 ;
filter_block2d_bil_full_pixel_loop:
movq xmm1, QWORD PTR [rsi] ;
punpcklbw xmm1, xmm0 ;
movq xmm2, QWORD PTR [rdi] ;
punpcklbw xmm2, xmm0 ;
psubw xmm1, xmm2 ;
paddw xmm6, xmm1 ;
pmaddwd xmm1, xmm1 ;
paddd xmm7, xmm1 ;
lea rsi, [rsi + rax] ;ref_pixels_per_line
lea rdi, [rdi + rbx] ;src_pixels_per_line
sub rcx, 1 ;
jnz filter_block2d_bil_full_pixel_loop ;
jmp filter_block2d_bil_variance
filter_block2d_bil_var_sse2_fp_only:
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line
pxor xmm0, xmm0 ;
movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
filter_block2d_bil_fp_only_loop:
movq xmm1, QWORD PTR [rsi] ;
movq xmm3, QWORD PTR [rsi+1] ;
punpcklbw xmm1, xmm0 ;
pmullw xmm1, [rax] ;
punpcklbw xmm3, xmm0 ;
pmullw xmm3, [rax+16] ;
paddw xmm1, xmm3 ;
paddw xmm1, xmm4 ;
psraw xmm1, xmm_filter_shift ;
movq xmm3, QWORD PTR [rdi] ;
punpcklbw xmm3, xmm0 ;
psubw xmm1, xmm3 ;
paddw xmm6, xmm1 ;
pmaddwd xmm1, xmm1 ;
paddd xmm7, xmm1 ;
lea rsi, [rsi + rdx]
lea rdi, [rdi + rbx] ;src_pixels_per_line
sub rcx, 1 ;
jnz filter_block2d_bil_fp_only_loop ;
jmp filter_block2d_bil_variance
filter_block2d_bil_variance:
movdq2q mm6, xmm6 ;
movdq2q mm7, xmm7 ;
psrldq xmm6, 8
psrldq xmm7, 8
movdq2q mm2, xmm6
movdq2q mm3, xmm7
paddw mm6, mm2
paddd mm7, mm3
pxor mm3, mm3 ;
pxor mm2, mm2 ;
punpcklwd mm2, mm6 ;
punpckhwd mm3, mm6 ;
paddd mm2, mm3 ;
movq mm6, mm2 ;
psrlq mm6, 32 ;
paddd mm2, mm6 ;
psrad mm2, 16 ;
movq mm4, mm7 ;
psrlq mm4, 32 ;
paddd mm4, mm7 ;
mov rsi, arg(7) ; sum
mov rdi, arg(8) ; sumsquared
movd [rsi], mm2 ; xsum
movd [rdi], mm4 ; xxsum
; begin epilog
pop rbx
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
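In scalar terms, the kernel above computes the following (a hedged C model, not the asm's literal structure: the asm filters 8 pixels per row in parallel, caches the previous filtered row, and branches to the sp_only/fp_only/full_pixel paths when an offset is zero):

static void bil_var_model(const unsigned char *ref, int ref_stride,
                          const unsigned char *src, int src_stride,
                          unsigned int height, int xoffset, int yoffset,
                          int *sum, unsigned int *sumsquared) {
  const int h0 = 128 - 8 * xoffset, h1 = 8 * xoffset;  /* HFilter taps */
  const int v0 = 128 - 8 * yoffset, v1 = 8 * yoffset;  /* VFilter taps */
  int se = 0;
  unsigned int sse = 0;
  for (unsigned int y = 0; y < height; ++y) {
    for (int x = 0; x < 8; ++x) {  /* 8-wide column, like the asm */
      /* first pass: horizontal bilinear filter on rows y and y + 1 */
      const int a = (ref[y * ref_stride + x] * h0 +
                     ref[y * ref_stride + x + 1] * h1 + 64) >> 7;
      const int b = (ref[(y + 1) * ref_stride + x] * h0 +
                     ref[(y + 1) * ref_stride + x + 1] * h1 + 64) >> 7;
      /* second pass: vertical filter, then accumulate the difference */
      const int diff = ((a * v0 + b * v1 + 64) >> 7) - src[y * src_stride + x];
      se += diff;
      sse += diff * diff;
    }
  }
  *sum = se;
  *sumsquared = sse;
}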
;void vp9_half_horiz_vert_variance16x_h_sse2
;(
; unsigned char *ref_ptr,
@@ -619,27 +335,3 @@ sym(vp9_half_horiz_variance16x_h_sse2):
UNSHADOW_ARGS
pop rbp
ret
SECTION_RODATA
; short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64};
align 16
xmm_bi_rd:
times 8 dw 64
align 16
bilinear_filters_sse2:
dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0
dw 120, 120, 120, 120, 120, 120, 120, 120, 8, 8, 8, 8, 8, 8, 8, 8
dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16
dw 104, 104, 104, 104, 104, 104, 104, 104, 24, 24, 24, 24, 24, 24, 24, 24
dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32
dw 88, 88, 88, 88, 88, 88, 88, 88, 40, 40, 40, 40, 40, 40, 40, 40
dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48
dw 72, 72, 72, 72, 72, 72, 72, 72, 56, 56, 56, 56, 56, 56, 56, 56
dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
dw 56, 56, 56, 56, 56, 56, 56, 56, 72, 72, 72, 72, 72, 72, 72, 72
dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80
dw 40, 40, 40, 40, 40, 40, 40, 40, 88, 88, 88, 88, 88, 88, 88, 88
dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96
dw 24, 24, 24, 24, 24, 24, 24, 24, 104, 104, 104, 104, 104, 104, 104, 104
dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112
dw 8, 8, 8, 8, 8, 8, 8, 8, 120, 120, 120, 120, 120, 120, 120, 120
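Each row k = 0..15 of the table holds the tap pair (128 - 8k, 8k), so two adjacent pixels a and b are blended as pred = (a * (128 - 8k) + b * 8k + 64) >> 7, i.e. bilinear interpolation at k/16-pel with round-to-nearest; the +64 is xmm_bi_rd and the shift is xmm_filter_shift = 7.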


@@ -508,344 +508,3 @@ sym(vp9_get4x4sse_cs_mmx):
UNSHADOW_ARGS
pop rbp
ret
%define mmx_filter_shift 7
;void vp9_filter_block2d_bil4x4_var_mmx
;(
; unsigned char *ref_ptr,
; int ref_pixels_per_line,
; unsigned char *src_ptr,
; int src_pixels_per_line,
; unsigned short *HFilter,
; unsigned short *VFilter,
; int *sum,
; unsigned int *sumsquared
;)
global sym(vp9_filter_block2d_bil4x4_var_mmx) PRIVATE
sym(vp9_filter_block2d_bil4x4_var_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 8
GET_GOT rbx
push rsi
push rdi
sub rsp, 16
; end prolog
pxor mm6, mm6 ;
pxor mm7, mm7 ;
mov rax, arg(4) ;HFilter ;
mov rdx, arg(5) ;VFilter ;
mov rsi, arg(0) ;ref_ptr ;
mov rdi, arg(2) ;src_ptr ;
mov rcx, 4 ;
pxor mm0, mm0 ;
movd mm1, [rsi] ;
movd mm3, [rsi+1] ;
punpcklbw mm1, mm0 ;
pmullw mm1, [rax] ;
punpcklbw mm3, mm0 ;
pmullw mm3, [rax+8] ;
paddw mm1, mm3 ;
paddw mm1, [GLOBAL(mmx_bi_rd)] ;
psraw mm1, mmx_filter_shift ;
movq mm5, mm1
%if ABI_IS_32BIT
add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
%else
movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
add rsi, r8
%endif
.filter_block2d_bil4x4_var_mmx_loop:
movd mm1, [rsi] ;
movd mm3, [rsi+1] ;
punpcklbw mm1, mm0 ;
pmullw mm1, [rax] ;
punpcklbw mm3, mm0 ;
pmullw mm3, [rax+8] ;
paddw mm1, mm3 ;
paddw mm1, [GLOBAL(mmx_bi_rd)] ;
psraw mm1, mmx_filter_shift ;
movq mm3, mm5 ;
movq mm5, mm1 ;
pmullw mm3, [rdx] ;
pmullw mm1, [rdx+8] ;
paddw mm1, mm3 ;
paddw mm1, [GLOBAL(mmx_bi_rd)] ;
psraw mm1, mmx_filter_shift ;
movd mm3, [rdi] ;
punpcklbw mm3, mm0 ;
psubw mm1, mm3 ;
paddw mm6, mm1 ;
pmaddwd mm1, mm1 ;
paddd mm7, mm1 ;
%if ABI_IS_32BIT
add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
add rdi, dword ptr arg(3) ;src_pixels_per_line ;
%else
movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
movsxd r9, dword ptr arg(3) ;src_pixels_per_line
add rsi, r8
add rdi, r9
%endif
sub rcx, 1 ;
jnz .filter_block2d_bil4x4_var_mmx_loop ;
pxor mm3, mm3 ;
pxor mm2, mm2 ;
punpcklwd mm2, mm6 ;
punpckhwd mm3, mm6 ;
paddd mm2, mm3 ;
movq mm6, mm2 ;
psrlq mm6, 32 ;
paddd mm2, mm6 ;
psrad mm2, 16 ;
movq mm4, mm7 ;
psrlq mm4, 32 ;
paddd mm4, mm7 ;
mov rdi, arg(6) ;sum
mov rsi, arg(7) ;sumsquared
movd dword ptr [rdi], mm2 ;
movd dword ptr [rsi], mm4 ;
; begin epilog
add rsp, 16
pop rdi
pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
;void vp9_filter_block2d_bil_var_mmx
;(
; unsigned char *ref_ptr,
; int ref_pixels_per_line,
; unsigned char *src_ptr,
; int src_pixels_per_line,
; unsigned int Height,
; unsigned short *HFilter,
; unsigned short *VFilter,
; int *sum,
; unsigned int *sumsquared
;)
global sym(vp9_filter_block2d_bil_var_mmx) PRIVATE
sym(vp9_filter_block2d_bil_var_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 9
GET_GOT rbx
push rsi
push rdi
sub rsp, 16
; end prolog
pxor mm6, mm6 ;
pxor mm7, mm7 ;
mov rax, arg(5) ;HFilter ;
mov rdx, arg(6) ;VFilter ;
mov rsi, arg(0) ;ref_ptr ;
mov rdi, arg(2) ;src_ptr ;
movsxd rcx, dword ptr arg(4) ;Height ;
pxor mm0, mm0 ;
movq mm1, [rsi] ;
movq mm3, [rsi+1] ;
movq mm2, mm1 ;
movq mm4, mm3 ;
punpcklbw mm1, mm0 ;
punpckhbw mm2, mm0 ;
pmullw mm1, [rax] ;
pmullw mm2, [rax] ;
punpcklbw mm3, mm0 ;
punpckhbw mm4, mm0 ;
pmullw mm3, [rax+8] ;
pmullw mm4, [rax+8] ;
paddw mm1, mm3 ;
paddw mm2, mm4 ;
paddw mm1, [GLOBAL(mmx_bi_rd)] ;
psraw mm1, mmx_filter_shift ;
paddw mm2, [GLOBAL(mmx_bi_rd)] ;
psraw mm2, mmx_filter_shift ;
movq mm5, mm1
packuswb mm5, mm2 ;
%if ABI_IS_32BIT
add rsi, dword ptr arg(1) ;ref_pixels_per_line
%else
movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
add rsi, r8
%endif
.filter_block2d_bil_var_mmx_loop:
movq mm1, [rsi] ;
movq mm3, [rsi+1] ;
movq mm2, mm1 ;
movq mm4, mm3 ;
punpcklbw mm1, mm0 ;
punpckhbw mm2, mm0 ;
pmullw mm1, [rax] ;
pmullw mm2, [rax] ;
punpcklbw mm3, mm0 ;
punpckhbw mm4, mm0 ;
pmullw mm3, [rax+8] ;
pmullw mm4, [rax+8] ;
paddw mm1, mm3 ;
paddw mm2, mm4 ;
paddw mm1, [GLOBAL(mmx_bi_rd)] ;
psraw mm1, mmx_filter_shift ;
paddw mm2, [GLOBAL(mmx_bi_rd)] ;
psraw mm2, mmx_filter_shift ;
movq mm3, mm5 ;
movq mm4, mm5 ;
punpcklbw mm3, mm0 ;
punpckhbw mm4, mm0 ;
movq mm5, mm1 ;
packuswb mm5, mm2 ;
pmullw mm3, [rdx] ;
pmullw mm4, [rdx] ;
pmullw mm1, [rdx+8] ;
pmullw mm2, [rdx+8] ;
paddw mm1, mm3 ;
paddw mm2, mm4 ;
paddw mm1, [GLOBAL(mmx_bi_rd)] ;
paddw mm2, [GLOBAL(mmx_bi_rd)] ;
psraw mm1, mmx_filter_shift ;
psraw mm2, mmx_filter_shift ;
movq mm3, [rdi] ;
movq mm4, mm3 ;
punpcklbw mm3, mm0 ;
punpckhbw mm4, mm0 ;
psubw mm1, mm3 ;
psubw mm2, mm4 ;
paddw mm6, mm1 ;
pmaddwd mm1, mm1 ;
paddw mm6, mm2 ;
pmaddwd mm2, mm2 ;
paddd mm7, mm1 ;
paddd mm7, mm2 ;
%if ABI_IS_32BIT
add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
add rdi, dword ptr arg(3) ;src_pixels_per_line ;
%else
movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
movsxd r9, dword ptr arg(3) ;src_pixels_per_line ;
add rsi, r8
add rdi, r9
%endif
sub rcx, 1 ;
jnz .filter_block2d_bil_var_mmx_loop ;
pxor mm3, mm3 ;
pxor mm2, mm2 ;
punpcklwd mm2, mm6 ;
punpckhwd mm3, mm6 ;
paddd mm2, mm3 ;
movq mm6, mm2 ;
psrlq mm6, 32 ;
paddd mm2, mm6 ;
psrad mm2, 16 ;
movq mm4, mm7 ;
psrlq mm4, 32 ;
paddd mm4, mm7 ;
mov rdi, arg(7) ;sum
mov rsi, arg(8) ;sumsquared
movd dword ptr [rdi], mm2 ;
movd dword ptr [rsi], mm4 ;
; begin epilog
add rsp, 16
pop rdi
pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
SECTION_RODATA
;short mmx_bi_rd[4] = { 64, 64, 64, 64};
align 16
mmx_bi_rd:
times 4 dw 64


@@ -11,8 +11,6 @@
%include "vpx_ports/x86_abi_support.asm"
%define xmm_filter_shift 7
;unsigned int vp9_get_mb_ss_sse2
;(
; short *src_ptr
@@ -734,28 +732,3 @@ sym(vp9_half_horiz_variance8x_h_sse2):
UNSHADOW_ARGS
pop rbp
ret
SECTION_RODATA
; short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64};
align 16
xmm_bi_rd:
times 8 dw 64
align 16
bilinear_filters_sse2:
dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0
dw 120, 120, 120, 120, 120, 120, 120, 120, 8, 8, 8, 8, 8, 8, 8, 8
dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16
dw 104, 104, 104, 104, 104, 104, 104, 104, 24, 24, 24, 24, 24, 24, 24, 24
dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32
dw 88, 88, 88, 88, 88, 88, 88, 88, 40, 40, 40, 40, 40, 40, 40, 40
dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48
dw 72, 72, 72, 72, 72, 72, 72, 72, 56, 56, 56, 56, 56, 56, 56, 56
dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
dw 56, 56, 56, 56, 56, 56, 56, 56, 72, 72, 72, 72, 72, 72, 72, 72
dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80
dw 40, 40, 40, 40, 40, 40, 40, 40, 88, 88, 88, 88, 88, 88, 88, 88
dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96
dw 24, 24, 24, 24, 24, 24, 24, 24, 104, 104, 104, 104, 104, 104, 104, 104
dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112
dw 8, 8, 8, 8, 8, 8, 8, 8, 120, 120, 120, 120, 120, 120, 120, 120


@@ -1,372 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
%define xmm_filter_shift 7
;void vp9_filter_block2d_bil_var_ssse3
;(
; unsigned char *ref_ptr,
; int ref_pixels_per_line,
; unsigned char *src_ptr,
; int src_pixels_per_line,
; unsigned int Height,
; int xoffset,
; int yoffset,
; int *sum,
; unsigned int *sumsquared;;
;
;)
;Note: The filter coefficient at offset=0 is 128. Since the second operand
;of pmaddubsw holds signed bytes, the zero offset must be calculated separately.
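In scalar terms, the reason is the following (a sketch of the instruction's per-pair behavior, not the full kernel):

/* pmaddubsw multiplies unsigned bytes from its first operand by signed
   bytes from its second and adds adjacent products into signed words.
   A tap of 128 exceeds the int8 range (db 128 reads back as -128), which
   is why the offset=0 row cannot go through this path. */
static short pmaddubsw_pair(unsigned char a0, unsigned char a1,
                            signed char b0, signed char b1) {
  return (short)(a0 * b0 + a1 * b1);  /* hardware additionally saturates */
}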
global sym(vp9_filter_block2d_bil_var_ssse3) PRIVATE
sym(vp9_filter_block2d_bil_var_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 9
SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
; end prolog
pxor xmm6, xmm6
pxor xmm7, xmm7
lea rcx, [GLOBAL(bilinear_filters_ssse3)]
movsxd rax, dword ptr arg(5) ; xoffset
cmp rax, 0 ; skip first_pass filter if xoffset=0
je .filter_block2d_bil_var_ssse3_sp_only
shl rax, 4 ; point to filter coeff with xoffset
lea rax, [rax + rcx] ; HFilter
movsxd rdx, dword ptr arg(6) ; yoffset
cmp rdx, 0 ; skip second_pass filter if yoffset=0
je .filter_block2d_bil_var_ssse3_fp_only
shl rdx, 4
lea rdx, [rdx + rcx] ; VFilter
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
movdqu xmm0, XMMWORD PTR [rsi]
movdqu xmm1, XMMWORD PTR [rsi+1]
movdqa xmm2, xmm0
punpcklbw xmm0, xmm1
punpckhbw xmm2, xmm1
pmaddubsw xmm0, [rax]
pmaddubsw xmm2, [rax]
paddw xmm0, [GLOBAL(xmm_bi_rd)]
paddw xmm2, [GLOBAL(xmm_bi_rd)]
psraw xmm0, xmm_filter_shift
psraw xmm2, xmm_filter_shift
packuswb xmm0, xmm2
%if ABI_IS_32BIT
add rsi, dword ptr arg(1) ;ref_pixels_per_line
%else
movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
movsxd r9, dword ptr arg(3) ;src_pixels_per_line
lea rsi, [rsi + r8]
%endif
.filter_block2d_bil_var_ssse3_loop:
movdqu xmm1, XMMWORD PTR [rsi]
movdqu xmm2, XMMWORD PTR [rsi+1]
movdqa xmm3, xmm1
punpcklbw xmm1, xmm2
punpckhbw xmm3, xmm2
pmaddubsw xmm1, [rax]
pmaddubsw xmm3, [rax]
paddw xmm1, [GLOBAL(xmm_bi_rd)]
paddw xmm3, [GLOBAL(xmm_bi_rd)]
psraw xmm1, xmm_filter_shift
psraw xmm3, xmm_filter_shift
packuswb xmm1, xmm3
movdqa xmm2, xmm0
movdqa xmm0, xmm1
movdqa xmm3, xmm2
punpcklbw xmm2, xmm1
punpckhbw xmm3, xmm1
pmaddubsw xmm2, [rdx]
pmaddubsw xmm3, [rdx]
paddw xmm2, [GLOBAL(xmm_bi_rd)]
paddw xmm3, [GLOBAL(xmm_bi_rd)]
psraw xmm2, xmm_filter_shift
psraw xmm3, xmm_filter_shift
movq xmm1, QWORD PTR [rdi]
pxor xmm4, xmm4
punpcklbw xmm1, xmm4
movq xmm5, QWORD PTR [rdi+8]
punpcklbw xmm5, xmm4
psubw xmm2, xmm1
psubw xmm3, xmm5
paddw xmm6, xmm2
paddw xmm6, xmm3
pmaddwd xmm2, xmm2
pmaddwd xmm3, xmm3
paddd xmm7, xmm2
paddd xmm7, xmm3
%if ABI_IS_32BIT
add rsi, dword ptr arg(1) ;ref_pixels_per_line
add rdi, dword ptr arg(3) ;src_pixels_per_line
%else
lea rsi, [rsi + r8]
lea rdi, [rdi + r9]
%endif
sub rcx, 1
jnz .filter_block2d_bil_var_ssse3_loop
jmp .filter_block2d_bil_variance
.filter_block2d_bil_var_ssse3_sp_only:
movsxd rdx, dword ptr arg(6) ; yoffset
cmp rdx, 0 ; Both xoffset =0 and yoffset=0
je .filter_block2d_bil_var_ssse3_full_pixel
shl rdx, 4
lea rdx, [rdx + rcx] ; VFilter
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
movdqu xmm1, XMMWORD PTR [rsi]
movdqa xmm0, xmm1
%if ABI_IS_32BIT=0
movsxd r9, dword ptr arg(3) ;src_pixels_per_line
%endif
lea rsi, [rsi + rax]
.filter_block2d_bil_sp_only_loop:
movdqu xmm3, XMMWORD PTR [rsi]
movdqa xmm2, xmm1
movdqa xmm0, xmm3
punpcklbw xmm1, xmm3
punpckhbw xmm2, xmm3
pmaddubsw xmm1, [rdx]
pmaddubsw xmm2, [rdx]
paddw xmm1, [GLOBAL(xmm_bi_rd)]
paddw xmm2, [GLOBAL(xmm_bi_rd)]
psraw xmm1, xmm_filter_shift
psraw xmm2, xmm_filter_shift
movq xmm3, QWORD PTR [rdi]
pxor xmm4, xmm4
punpcklbw xmm3, xmm4
movq xmm5, QWORD PTR [rdi+8]
punpcklbw xmm5, xmm4
psubw xmm1, xmm3
psubw xmm2, xmm5
paddw xmm6, xmm1
paddw xmm6, xmm2
pmaddwd xmm1, xmm1
pmaddwd xmm2, xmm2
paddd xmm7, xmm1
paddd xmm7, xmm2
movdqa xmm1, xmm0
lea rsi, [rsi + rax] ;ref_pixels_per_line
%if ABI_IS_32BIT
add rdi, dword ptr arg(3) ;src_pixels_per_line
%else
lea rdi, [rdi + r9]
%endif
sub rcx, 1
jnz .filter_block2d_bil_sp_only_loop
jmp .filter_block2d_bil_variance
.filter_block2d_bil_var_ssse3_full_pixel:
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
pxor xmm0, xmm0
.filter_block2d_bil_full_pixel_loop:
movq xmm1, QWORD PTR [rsi]
punpcklbw xmm1, xmm0
movq xmm2, QWORD PTR [rsi+8]
punpcklbw xmm2, xmm0
movq xmm3, QWORD PTR [rdi]
punpcklbw xmm3, xmm0
movq xmm4, QWORD PTR [rdi+8]
punpcklbw xmm4, xmm0
psubw xmm1, xmm3
psubw xmm2, xmm4
paddw xmm6, xmm1
paddw xmm6, xmm2
pmaddwd xmm1, xmm1
pmaddwd xmm2, xmm2
paddd xmm7, xmm1
paddd xmm7, xmm2
lea rsi, [rsi + rax] ;ref_pixels_per_line
lea rdi, [rdi + rdx] ;src_pixels_per_line
sub rcx, 1
jnz .filter_block2d_bil_full_pixel_loop
jmp .filter_block2d_bil_variance
.filter_block2d_bil_var_ssse3_fp_only:
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line
pxor xmm0, xmm0
%if ABI_IS_32BIT=0
movsxd r9, dword ptr arg(3) ;src_pixels_per_line
%endif
.filter_block2d_bil_fp_only_loop:
movdqu xmm1, XMMWORD PTR [rsi]
movdqu xmm2, XMMWORD PTR [rsi+1]
movdqa xmm3, xmm1
punpcklbw xmm1, xmm2
punpckhbw xmm3, xmm2
pmaddubsw xmm1, [rax]
pmaddubsw xmm3, [rax]
paddw xmm1, [GLOBAL(xmm_bi_rd)]
paddw xmm3, [GLOBAL(xmm_bi_rd)]
psraw xmm1, xmm_filter_shift
psraw xmm3, xmm_filter_shift
movq xmm2, XMMWORD PTR [rdi]
pxor xmm4, xmm4
punpcklbw xmm2, xmm4
movq xmm5, QWORD PTR [rdi+8]
punpcklbw xmm5, xmm4
psubw xmm1, xmm2
psubw xmm3, xmm5
paddw xmm6, xmm1
paddw xmm6, xmm3
pmaddwd xmm1, xmm1
pmaddwd xmm3, xmm3
paddd xmm7, xmm1
paddd xmm7, xmm3
lea rsi, [rsi + rdx]
%if ABI_IS_32BIT
add rdi, dword ptr arg(3) ;src_pixels_per_line
%else
lea rdi, [rdi + r9]
%endif
sub rcx, 1
jnz .filter_block2d_bil_fp_only_loop
jmp .filter_block2d_bil_variance
.filter_block2d_bil_variance:
pxor xmm0, xmm0
pxor xmm1, xmm1
pxor xmm5, xmm5
punpcklwd xmm0, xmm6
punpckhwd xmm1, xmm6
psrad xmm0, 16
psrad xmm1, 16
paddd xmm0, xmm1
movdqa xmm1, xmm0
movdqa xmm6, xmm7
punpckldq xmm6, xmm5
punpckhdq xmm7, xmm5
paddd xmm6, xmm7
punpckldq xmm0, xmm5
punpckhdq xmm1, xmm5
paddd xmm0, xmm1
movdqa xmm7, xmm6
movdqa xmm1, xmm0
psrldq xmm7, 8
psrldq xmm1, 8
paddd xmm6, xmm7
paddd xmm0, xmm1
mov rsi, arg(7) ;[Sum]
mov rdi, arg(8) ;[SSE]
movd [rsi], xmm0
movd [rdi], xmm6
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
SECTION_RODATA
align 16
xmm_bi_rd:
times 8 dw 64
align 16
bilinear_filters_ssse3:
times 8 db 128, 0
times 8 db 120, 8
times 8 db 112, 16
times 8 db 104, 24
times 8 db 96, 32
times 8 db 88, 40
times 8 db 80, 48
times 8 db 72, 56
times 8 db 64, 64
times 8 db 56, 72
times 8 db 48, 80
times 8 db 40, 88
times 8 db 32, 96
times 8 db 24, 104
times 8 db 16, 112
times 8 db 8, 120


@@ -13,27 +13,6 @@
#include "vp9/common/vp9_pragmas.h"
#include "vpx_ports/mem.h"
extern void filter_block1d_h6_mmx
(
const unsigned char *src_ptr,
unsigned short *output_ptr,
unsigned int src_pixels_per_line,
unsigned int pixel_step,
unsigned int output_height,
unsigned int output_width,
short *vp7_filter
);
extern void filter_block1d_v6_mmx
(
const short *src_ptr,
unsigned char *output_ptr,
unsigned int pixels_per_line,
unsigned int pixel_step,
unsigned int output_height,
unsigned int output_width,
short *vp7_filter
);
extern unsigned int vp9_get_mb_ss_mmx(const short *src_ptr);
extern unsigned int vp9_get8x8var_mmx
(
@@ -53,30 +32,6 @@ extern unsigned int vp9_get4x4var_mmx
unsigned int *SSE,
int *Sum
);
extern void vp9_filter_block2d_bil4x4_var_mmx
(
const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
const short *HFilter,
const short *VFilter,
int *sum,
unsigned int *sumsquared
);
extern void vp9_filter_block2d_bil_var_mmx
(
const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
const short *HFilter,
const short *VFilter,
int *sum,
unsigned int *sumsquared
);
unsigned int vp9_variance4x4_mmx(
const unsigned char *src_ptr,
@@ -190,193 +145,3 @@ unsigned int vp9_variance8x16_mmx(
return (var - (((unsigned int)avg * avg) >> 7));
}
DECLARE_ALIGNED(16, extern const short, vp9_bilinear_filters_mmx[16][8]);
unsigned int vp9_sub_pixel_variance4x4_mmx
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse)
{
int xsum;
unsigned int xxsum;
vp9_filter_block2d_bil4x4_var_mmx(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line,
vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
&xsum, &xxsum
);
*sse = xxsum;
return (xxsum - (((unsigned int)xsum * xsum) >> 4));
}
unsigned int vp9_sub_pixel_variance8x8_mmx
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
) {
int xsum;
unsigned int xxsum;
vp9_filter_block2d_bil_var_mmx(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
&xsum, &xxsum
);
*sse = xxsum;
return (xxsum - (((unsigned int)xsum * xsum) >> 6));
}
unsigned int vp9_sub_pixel_variance16x16_mmx
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
) {
int xsum0, xsum1;
unsigned int xxsum0, xxsum1;
vp9_filter_block2d_bil_var_mmx(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
&xsum0, &xxsum0
);
vp9_filter_block2d_bil_var_mmx(
src_ptr + 8, src_pixels_per_line,
dst_ptr + 8, dst_pixels_per_line, 16,
vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
&xsum1, &xxsum1
);
xsum0 += xsum1;
xxsum0 += xxsum1;
*sse = xxsum0;
return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
}
unsigned int vp9_sub_pixel_mse16x16_mmx(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
) {
vp9_sub_pixel_variance16x16_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
return *sse;
}
unsigned int vp9_sub_pixel_variance16x8_mmx
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
) {
int xsum0, xsum1;
unsigned int xxsum0, xxsum1;
vp9_filter_block2d_bil_var_mmx(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
&xsum0, &xxsum0
);
vp9_filter_block2d_bil_var_mmx(
src_ptr + 8, src_pixels_per_line,
dst_ptr + 8, dst_pixels_per_line, 8,
vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
&xsum1, &xxsum1
);
xsum0 += xsum1;
xxsum0 += xxsum1;
*sse = xxsum0;
return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7));
}
unsigned int vp9_sub_pixel_variance8x16_mmx
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
) {
int xsum;
unsigned int xxsum;
vp9_filter_block2d_bil_var_mmx(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
&xsum, &xxsum
);
*sse = xxsum;
return (xxsum - (((unsigned int)xsum * xsum) >> 7));
}
unsigned int vp9_variance_halfpixvar16x16_h_mmx(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse) {
return vp9_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 8, 0,
ref_ptr, recon_stride, sse);
}
unsigned int vp9_variance_halfpixvar16x16_v_mmx(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse) {
return vp9_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 0, 8,
ref_ptr, recon_stride, sse);
}
unsigned int vp9_variance_halfpixvar16x16_hv_mmx(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse) {
return vp9_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 8, 8,
ref_ptr, recon_stride, sse);
}


@@ -9,29 +9,11 @@
*/
#include "vpx_config.h"
#include "vp9/encoder/vp9_variance.h"
#include "vp9/common/vp9_pragmas.h"
#include "vpx_ports/mem.h"
#define HALFNDX 8
extern void filter_block1d_h6_mmx(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
extern void filter_block1d_v6_mmx(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
extern void filter_block1d8_v6_sse2(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
extern void vp9_filter_block2d_bil4x4_var_mmx
(
const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
const short *HFilter,
const short *VFilter,
int *sum,
unsigned int *sumsquared
);
extern unsigned int vp9_get4x4var_mmx
(
const unsigned char *src_ptr,
@@ -64,18 +46,6 @@ unsigned int vp9_get8x8var_sse2
unsigned int *SSE,
int *Sum
);
void vp9_filter_block2d_bil_var_sse2
(
const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int xoffset,
int yoffset,
int *sum,
unsigned int *sumsquared
);
void vp9_half_horiz_vert_variance8x_h_sse2
(
const unsigned char *ref_ptr,
@@ -137,8 +107,6 @@ void vp9_half_vert_variance16x_h_sse2
unsigned int *sumsquared
);
DECLARE_ALIGNED(16, extern const short, vp9_bilinear_filters_mmx[16][8]);
typedef unsigned int (*get_var_sse2) (
const unsigned char *src_ptr,
int source_stride,
@@ -375,347 +343,89 @@ unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr,
return (var - (((int64_t)avg * avg) >> 11));
}
unsigned int vp9_sub_pixel_variance4x4_wmt
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
) {
int xsum;
unsigned int xxsum;
vp9_filter_block2d_bil4x4_var_mmx(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line,
vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
&xsum, &xxsum
);
*sse = xxsum;
return (xxsum - (((unsigned int)xsum * xsum) >> 4));
#define DECLS(opt1, opt2) \
int vp9_sub_pixel_variance4xh_##opt2(const uint8_t *src, \
ptrdiff_t src_stride, \
int x_offset, int y_offset, \
const uint8_t *dst, \
ptrdiff_t dst_stride, \
int height, unsigned int *sse); \
int vp9_sub_pixel_variance8xh_##opt1(const uint8_t *src, \
ptrdiff_t src_stride, \
int x_offset, int y_offset, \
const uint8_t *dst, \
ptrdiff_t dst_stride, \
int height, unsigned int *sse); \
int vp9_sub_pixel_variance16xh_##opt1(const uint8_t *src, \
ptrdiff_t src_stride, \
int x_offset, int y_offset, \
const uint8_t *dst, \
ptrdiff_t dst_stride, \
int height, unsigned int *sse)
DECLS(sse2, sse);
DECLS(ssse3, ssse3);
#undef DECLS
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
unsigned int vp9_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src, \
int src_stride, \
int x_offset, \
int y_offset, \
const uint8_t *dst, \
int dst_stride, \
unsigned int *sse_ptr) { \
unsigned int sse; \
int se = vp9_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \
y_offset, dst, dst_stride, \
h, &sse); \
if (w > wf) { \
unsigned int sse2; \
int se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 16, src_stride, \
x_offset, y_offset, \
dst + 16, dst_stride, \
h, &sse2); \
se += se2; \
sse += sse2; \
if (w > wf * 2) { \
se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
x_offset, y_offset, \
dst + 32, dst_stride, \
h, &sse2); \
se += se2; \
sse += sse2; \
se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
x_offset, y_offset, \
dst + 48, dst_stride, \
h, &sse2); \
se += se2; \
sse += sse2; \
} \
} \
*sse_ptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}
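The return line is the usual variance identity; because the block holds w * h = 2^(wlog2 + hlog2) pixels, the division by the pixel count reduces to a right shift:

\[
\mathrm{Var} = \mathrm{SSE} - \frac{\left(\sum_i d_i\right)^2}{w\,h}
             = \mathrm{SSE} - \left( se^2 \gg (\mathit{wlog2} + \mathit{hlog2}) \right)
\]

The cast parameter guards the se * se product: from 32x16 upward the square can exceed 32 bits (64x64 allows |se| up to 255 * 4096), hence the (int64_t) casts in FNS below; for 16x16 the square still fits in unsigned int, and for smaller blocks plain int suffices.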
#define FNS(opt1, opt2) \
FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
FN(16, 8, 16, 4, 3, opt1,); \
FN(8, 16, 8, 3, 4, opt1,); \
FN(8, 8, 8, 3, 3, opt1,); \
FN(8, 4, 8, 3, 2, opt1,); \
FN(4, 8, 4, 2, 3, opt2,); \
FN(4, 4, 4, 2, 2, opt2,)
unsigned int vp9_sub_pixel_variance8x8_wmt
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
) {
int xsum;
unsigned int xxsum;
if (xoffset == HALFNDX && yoffset == 0) {
vp9_half_horiz_variance8x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum, &xxsum);
} else if (xoffset == 0 && yoffset == HALFNDX) {
vp9_half_vert_variance8x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum, &xxsum);
} else if (xoffset == HALFNDX && yoffset == HALFNDX) {
vp9_half_horiz_vert_variance8x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum, &xxsum);
} else {
vp9_filter_block2d_bil_var_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
xoffset, yoffset,
&xsum, &xxsum);
}
*sse = xxsum;
return (xxsum - (((unsigned int)xsum * xsum) >> 6));
}
static void sub_pixel_variance16x16_sse2(const uint8_t *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const uint8_t *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse, int *avg) {
int xsum0, xsum1;
unsigned int xxsum0, xxsum1;
// Note: these if statements could be avoided if the calling function
// invoked the appropriate variant directly.
if (xoffset == HALFNDX && yoffset == 0) {
vp9_half_horiz_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0);
} else if (xoffset == 0 && yoffset == HALFNDX) {
vp9_half_vert_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0);
} else if (xoffset == HALFNDX && yoffset == HALFNDX) {
vp9_half_horiz_vert_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0);
} else {
vp9_filter_block2d_bil_var_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
xoffset, yoffset,
&xsum0, &xxsum0
);
vp9_filter_block2d_bil_var_sse2(
src_ptr + 8, src_pixels_per_line,
dst_ptr + 8, dst_pixels_per_line, 16,
xoffset, yoffset,
&xsum1, &xxsum1
);
xsum0 += xsum1;
xxsum0 += xxsum1;
}
*sse = xxsum0;
*avg = xsum0;
}
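The comment above suggests hoisting the half-pel dispatch out of the hot path; a hypothetical sketch of that idea (the typedef and pick_half_variant16 are invented for illustration, the three half-pel helpers are the ones declared in this file):

typedef void (*half_var_fn)(const unsigned char *ref_ptr,
                            int ref_pixels_per_line,
                            const unsigned char *src_ptr,
                            int src_pixels_per_line,
                            unsigned int height,
                            int *sum, unsigned int *sumsquared);

static half_var_fn pick_half_variant16(int xoffset, int yoffset) {
  if (xoffset == HALFNDX && yoffset == 0)
    return vp9_half_horiz_variance16x_h_sse2;
  if (xoffset == 0 && yoffset == HALFNDX)
    return vp9_half_vert_variance16x_h_sse2;
  if (xoffset == HALFNDX && yoffset == HALFNDX)
    return vp9_half_horiz_vert_variance16x_h_sse2;
  return 0;  // caller falls back to vp9_filter_block2d_bil_var_sse2
}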
unsigned int vp9_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const uint8_t *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse_ptr) {
int avg;
unsigned int sse;
sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset,
yoffset, dst_ptr, dst_pixels_per_line,
&sse, &avg);
*sse_ptr = sse;
return (sse - (((unsigned int) avg * avg) >> 8));
}
unsigned int vp9_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const uint8_t *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse_ptr) {
int avg0, avg1, avg2, avg3;
unsigned int sse0, sse1, sse2, sse3;
sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset,
yoffset, dst_ptr, dst_pixels_per_line,
&sse0, &avg0);
sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset,
yoffset, dst_ptr + 16, dst_pixels_per_line,
&sse1, &avg1);
src_ptr += 16 * src_pixels_per_line;
dst_ptr += 16 * dst_pixels_per_line;
sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset,
yoffset, dst_ptr, dst_pixels_per_line,
&sse2, &avg2);
sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset,
yoffset, dst_ptr + 16, dst_pixels_per_line,
&sse3, &avg3);
sse0 += sse1 + sse2 + sse3;
avg0 += avg1 + avg2 + avg3;
*sse_ptr = sse0;
return (sse0 - (((unsigned int) avg0 * avg0) >> 10));
}
unsigned int vp9_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const uint8_t *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse_ptr) {
int avg0, avg1, avg2, avg3, avg4;
unsigned int sse0, sse1, sse2, sse3, sse4;
sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset,
yoffset, dst_ptr, dst_pixels_per_line,
&sse0, &avg0);
sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset,
yoffset, dst_ptr + 16, dst_pixels_per_line,
&sse1, &avg1);
sub_pixel_variance16x16_sse2(src_ptr + 32, src_pixels_per_line, xoffset,
yoffset, dst_ptr + 32, dst_pixels_per_line,
&sse2, &avg2);
sub_pixel_variance16x16_sse2(src_ptr + 48, src_pixels_per_line, xoffset,
yoffset, dst_ptr + 48, dst_pixels_per_line,
&sse3, &avg3);
src_ptr += 16 * src_pixels_per_line;
dst_ptr += 16 * dst_pixels_per_line;
avg0 += avg1 + avg2 + avg3;
sse0 += sse1 + sse2 + sse3;
sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset,
yoffset, dst_ptr, dst_pixels_per_line,
&sse1, &avg1);
sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset,
yoffset, dst_ptr + 16, dst_pixels_per_line,
&sse2, &avg2);
sub_pixel_variance16x16_sse2(src_ptr + 32, src_pixels_per_line, xoffset,
yoffset, dst_ptr + 32, dst_pixels_per_line,
&sse3, &avg3);
sub_pixel_variance16x16_sse2(src_ptr + 48, src_pixels_per_line, xoffset,
yoffset, dst_ptr + 48, dst_pixels_per_line,
&sse4, &avg4);
src_ptr += 16 * src_pixels_per_line;
dst_ptr += 16 * dst_pixels_per_line;
avg0 += avg1 + avg2 + avg3 + avg4;
sse0 += sse1 + sse2 + sse3 + sse4;
sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset,
yoffset, dst_ptr, dst_pixels_per_line,
&sse1, &avg1);
sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset,
yoffset, dst_ptr + 16, dst_pixels_per_line,
&sse2, &avg2);
sub_pixel_variance16x16_sse2(src_ptr + 32, src_pixels_per_line, xoffset,
yoffset, dst_ptr + 32, dst_pixels_per_line,
&sse3, &avg3);
sub_pixel_variance16x16_sse2(src_ptr + 48, src_pixels_per_line, xoffset,
yoffset, dst_ptr + 48, dst_pixels_per_line,
&sse4, &avg4);
src_ptr += 16 * src_pixels_per_line;
dst_ptr += 16 * dst_pixels_per_line;
avg0 += avg1 + avg2 + avg3 + avg4;
sse0 += sse1 + sse2 + sse3 + sse4;
sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset,
yoffset, dst_ptr, dst_pixels_per_line,
&sse1, &avg1);
sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset,
yoffset, dst_ptr + 16, dst_pixels_per_line,
&sse2, &avg2);
sub_pixel_variance16x16_sse2(src_ptr + 32, src_pixels_per_line, xoffset,
yoffset, dst_ptr + 32, dst_pixels_per_line,
&sse3, &avg3);
sub_pixel_variance16x16_sse2(src_ptr + 48, src_pixels_per_line, xoffset,
yoffset, dst_ptr + 48, dst_pixels_per_line,
&sse4, &avg4);
avg0 += avg1 + avg2 + avg3 + avg4;
sse0 += sse1 + sse2 + sse3 + sse4;
*sse_ptr = sse0;
return (sse0 - (((unsigned int) avg0 * avg0) >> 12));
}
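The 64x64 path above is a 4x4 tiling of the 16x16 helper; an equivalent compact form (a sketch with the same accumulation as the unrolled code) makes the pattern explicit. Note that (unsigned int)avg * avg can exceed 32 bits when |avg| approaches its 255 * 4096 bound, the overflow that the (int64_t) casts in the new FN macro avoid.

// Sketch: same accumulation as the unrolled 64x64 code above.
unsigned int sse = 0;
int avg = 0;
for (int row = 0; row < 4; ++row) {
  for (int col = 0; col < 4; ++col) {
    unsigned int s;
    int a;
    sub_pixel_variance16x16_sse2(
        src_ptr + 16 * col + 16 * row * src_pixels_per_line,
        src_pixels_per_line, xoffset, yoffset,
        dst_ptr + 16 * col + 16 * row * dst_pixels_per_line,
        dst_pixels_per_line, &s, &a);
    sse += s;
    avg += a;
  }
}
*sse_ptr = sse;
return sse - (((unsigned int)avg * avg) >> 12);  // 12 = log2(64 * 64)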
unsigned int vp9_sub_pixel_mse16x16_sse2(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
) {
vp9_sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset,
yoffset, dst_ptr, dst_pixels_per_line, sse);
return *sse;
}
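Note that the sub-pixel MSE is simply the filtered SSE with no mean correction, which is why the wrapper returns *sse unchanged:

\[
\mathrm{mse}_{16\times 16} = \sum_i d_i^2 = \mathrm{SSE}
\]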
unsigned int vp9_sub_pixel_variance16x8_wmt
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
) {
int xsum0, xsum1;
unsigned int xxsum0, xxsum1;
if (xoffset == HALFNDX && yoffset == 0) {
vp9_half_horiz_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum0, &xxsum0);
} else if (xoffset == 0 && yoffset == HALFNDX) {
vp9_half_vert_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum0, &xxsum0);
} else if (xoffset == HALFNDX && yoffset == HALFNDX) {
vp9_half_horiz_vert_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum0, &xxsum0);
} else {
vp9_filter_block2d_bil_var_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
xoffset, yoffset,
&xsum0, &xxsum0);
vp9_filter_block2d_bil_var_sse2(
src_ptr + 8, src_pixels_per_line,
dst_ptr + 8, dst_pixels_per_line, 8,
xoffset, yoffset,
&xsum1, &xxsum1);
xsum0 += xsum1;
xxsum0 += xxsum1;
}
*sse = xxsum0;
return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7));
}
unsigned int vp9_sub_pixel_variance8x16_wmt
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
) {
int xsum;
unsigned int xxsum;
if (xoffset == HALFNDX && yoffset == 0) {
vp9_half_horiz_variance8x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum, &xxsum);
} else if (xoffset == 0 && yoffset == HALFNDX) {
vp9_half_vert_variance8x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum, &xxsum);
} else if (xoffset == HALFNDX && yoffset == HALFNDX) {
vp9_half_horiz_vert_variance8x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum, &xxsum);
} else {
vp9_filter_block2d_bil_var_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
xoffset, yoffset,
&xsum, &xxsum);
}
*sse = xxsum;
return (xxsum - (((unsigned int)xsum * xsum) >> 7));
}
FNS(sse2, sse);
FNS(ssse3, ssse3);
#undef FNS
#undef FN
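After instantiation, each generated entry point has the common sub-pixel variance signature; a minimal usage sketch (src/dst buffers and strides assumed to be set up elsewhere; per the reference implementation, the offsets are 1/16-pel positions in [0, 15]):

unsigned int sse;
// Hypothetical call site: 4/16-pel horizontal, 7/16-pel vertical offset.
const unsigned int var =
    vp9_sub_pixel_variance16x16_sse2(src, src_stride, 4, 7,
                                     dst, dst_stride, &sse);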
unsigned int vp9_variance_halfpixvar16x16_h_wmt(
const unsigned char *src_ptr,


@ -1,142 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vpx_config.h"
#include "vp9/encoder/vp9_variance.h"
#include "vp9/common/vp9_pragmas.h"
#include "vpx_ports/mem.h"
#define HALFNDX 8
extern void vp9_half_horiz_vert_variance16x_h_sse2
(
const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int *sum,
unsigned int *sumsquared
);
extern void vp9_half_horiz_variance16x_h_sse2
(
const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int *sum,
unsigned int *sumsquared
);
extern void vp9_half_vert_variance16x_h_sse2
(
const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int *sum,
unsigned int *sumsquared
);
extern void vp9_filter_block2d_bil_var_ssse3
(
const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int xoffset,
int yoffset,
int *sum,
unsigned int *sumsquared
);
unsigned int vp9_sub_pixel_variance16x16_ssse3
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
) {
int xsum0;
unsigned int xxsum0;
// Note: these if statements could be avoided if the calling function
// invoked the appropriate variant directly.
if (xoffset == HALFNDX && yoffset == 0) {
vp9_half_horiz_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0);
} else if (xoffset == 0 && yoffset == HALFNDX) {
vp9_half_vert_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0);
} else if (xoffset == HALFNDX && yoffset == HALFNDX) {
vp9_half_horiz_vert_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0);
} else {
vp9_filter_block2d_bil_var_ssse3(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
xoffset, yoffset,
&xsum0, &xxsum0);
}
*sse = xxsum0;
return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
}
unsigned int vp9_sub_pixel_variance16x8_ssse3
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
) {
int xsum0;
unsigned int xxsum0;
if (xoffset == HALFNDX && yoffset == 0) {
vp9_half_horiz_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum0, &xxsum0);
} else if (xoffset == 0 && yoffset == HALFNDX) {
vp9_half_vert_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum0, &xxsum0);
} else if (xoffset == HALFNDX && yoffset == HALFNDX) {
vp9_half_horiz_vert_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum0, &xxsum0);
} else {
vp9_filter_block2d_bil_var_ssse3(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
xoffset, yoffset,
&xsum0, &xxsum0);
}
*sse = xxsum0;
return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7));
}


@ -85,13 +85,12 @@ VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_impl_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_fwalsh_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance_impl_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/vp9_sad_sse3.asm
VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_sad_ssse3.asm
VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_variance_ssse3.c
VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_variance_impl_ssse3.asm
VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/vp9_sad_sse4.asm
VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_encodeopt.asm
VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/vp9_ssim_opt.asm