HBD convolution filtering (10/12 taps) SSE4.1 optimization

- For experiment EXT_INTERP under high bit depth.
- Add unit test to verify bit-exactness.
- Speed performance improvement: on Xeon E5-2680, park_joy_1080p_12.y4m, 50 frames, encoding time drops from 6682503 ms to 5390270 ms.

Change-Id: Iea4debf5414f3accf1eb5672abeab56a0539ac77
2016-07-08 15:41:59 -07:00
parent 1178f71d99
commit 8cacca73bf
9 changed files with 1928 additions and 532 deletions
--- a/test/vp10_convolve_optimz_test.cc
+++ b/test/vp10_convolve_optimz_test.cc
@@ -24,12 +24,25 @@ using libvpx_test::ACMRandom;
 typedef void (*conv_filter_t)(const uint8_t*, int, uint8_t*, int,
                              int, int, const InterpFilterParams,
                              const int, int, int);
+#if CONFIG_VP9_HIGHBITDEPTH
+typedef void (*hbd_conv_filter_t)(const uint16_t*, int, uint16_t*, int,
+                                  int, int, const InterpFilterParams,
+                                  const int, int, int, int);
+#endif
+
 // Test parameter list:
 //  <convolve_horiz_func, convolve_vert_func,
 //  <width, height>, filter_params, subpel_x_q4, avg>
 typedef tuple<int, int> BlockDimension;
 typedef tuple<conv_filter_t, conv_filter_t, BlockDimension, INTERP_FILTER,
              int, int> ConvParams;
+#if CONFIG_VP9_HIGHBITDEPTH
+// Test parameter list:
+//  <convolve_horiz_func, convolve_vert_func,
+//  <width, height>, filter_params, subpel_x_q4, avg, bit_depth>
+typedef tuple<hbd_conv_filter_t, hbd_conv_filter_t, BlockDimension,
+              INTERP_FILTER, int, int, int> HbdConvParams;
+#endif

 // Note:
 //  src_ and src_ref_ have special boundary requirement
@@ -75,11 +88,8 @@ class VP10ConvolveOptimzTest : public ::testing::TestWithParam<ConvParams> {
  void RunVertFilterBitExactCheck();

 private:
-  void PrepFilterBuffer(uint8_t *src, uint8_t *src_ref,
-                        uint8_t *dst, uint8_t *dst_ref,
-                        int w, int h);
-  void DiffFilterBuffer(const uint8_t *buf, const uint8_t *buf_ref,
-                        int w, int h, int fgroup, int findex);
+  void PrepFilterBuffer(int w, int h);
+  void DiffFilterBuffer();
  conv_filter_t conv_horiz_;
  conv_filter_t conv_vert_;
  uint8_t *alloc_;
@@ -94,18 +104,16 @@ class VP10ConvolveOptimzTest : public ::testing::TestWithParam<ConvParams> {
  int avg_;
 };

-void VP10ConvolveOptimzTest::PrepFilterBuffer(uint8_t *src, uint8_t *src_ref,
-                                              uint8_t *dst, uint8_t *dst_ref,
-                                              int w, int h) {
+void VP10ConvolveOptimzTest::PrepFilterBuffer(int w, int h) {
  int r, c;
  ACMRandom rnd(ACMRandom::DeterministicSeed());

  memset(alloc_, 0, 4 * maxBlockSize * sizeof(alloc_[0]));

-  uint8_t *src_ptr = src;
-  uint8_t *dst_ptr = dst;
-  uint8_t *src_ref_ptr = src_ref;
-  uint8_t *dst_ref_ptr = dst_ref;
+  uint8_t *src_ptr = src_;
+  uint8_t *dst_ptr = dst_;
+  uint8_t *src_ref_ptr = src_ref_;
+  uint8_t *dst_ref_ptr = dst_ref_;

  for (r = 0; r < height_; ++r) {
    for (c = 0; c < width_; ++c) {
@@ -121,21 +129,17 @@ void VP10ConvolveOptimzTest::PrepFilterBuffer(uint8_t *src, uint8_t *src_ref,
  }
 }

-void VP10ConvolveOptimzTest::DiffFilterBuffer(const uint8_t *buf,
-                                              const uint8_t *buf_ref,
-                                              int w, int h,
-                                              int filter_group,
-                                              int filter_index) {
+void VP10ConvolveOptimzTest::DiffFilterBuffer() {
  int r, c;
-  const uint8_t *dst_ptr = buf;
-  const uint8_t *dst_ref_ptr = buf_ref;
-  for (r = 0; r < h; ++r) {
-    for (c = 0; c < w; ++c) {
+  const uint8_t *dst_ptr = dst_;
+  const uint8_t *dst_ref_ptr = dst_ref_;
+  for (r = 0; r < height_; ++r) {
+    for (c = 0; c < width_; ++c) {
      EXPECT_EQ((uint8_t)dst_ref_ptr[c], (uint8_t)dst_ptr[c])
      << "Error at row: " << r << " col: " << c << " "
-      << "w = " << w << " " << "h = " << h << " "
-      << "filter group index = " << filter_group << " "
-      << "filter index = " << filter_index;
+      << "w = " << width_ << " " << "h = " << height_ << " "
+      << "filter group index = " << filter_ << " "
+      << "filter index = " << subpel_;
    }
    dst_ptr += stride;
    dst_ref_ptr += stride;
@@ -143,7 +147,7 @@ void VP10ConvolveOptimzTest::DiffFilterBuffer(const uint8_t *buf,
 }

 void VP10ConvolveOptimzTest::RunHorizFilterBitExactCheck() {
-  PrepFilterBuffer(src_, src_ref_, dst_, dst_ref_, testMaxBlk, testMaxBlk);
+  PrepFilterBuffer(testMaxBlk, testMaxBlk);

  InterpFilterParams filter_params = vp10_get_interp_filter_params(filter_);

@@ -153,14 +157,14 @@ void VP10ConvolveOptimzTest::RunHorizFilterBitExactCheck() {
  conv_horiz_(src_, stride, dst_, stride, width_, height_,
              filter_params, subpel_, x_step_q4, avg_);

-  DiffFilterBuffer(dst_, dst_ref_, width_, height_, filter_, subpel_);
+  DiffFilterBuffer();

  // Note:
  // Here we need calculate a height which is different from the specified one
  // and test again.
  int intermediate_height =
      (((height_ - 1) * 16 + subpel_) >> SUBPEL_BITS) + filter_params.taps;
-  PrepFilterBuffer(src_, src_ref_, dst_, dst_ref_, testMaxBlk, testMaxBlk);
+  PrepFilterBuffer(testMaxBlk, testMaxBlk);

  vp10_convolve_horiz_c(src_ref_, stride, dst_ref_, stride, width_,
                        intermediate_height, filter_params, subpel_, x_step_q4,
@@ -170,12 +174,11 @@ void VP10ConvolveOptimzTest::RunHorizFilterBitExactCheck() {
              intermediate_height, filter_params, subpel_, x_step_q4,
              avg_);

-  DiffFilterBuffer(dst_, dst_ref_, width_, intermediate_height, filter_,
-                   subpel_);
+  DiffFilterBuffer();
 }

 void VP10ConvolveOptimzTest::RunVertFilterBitExactCheck() {
-  PrepFilterBuffer(src_, src_ref_, dst_, dst_ref_, testMaxBlk, testMaxBlk);
+  PrepFilterBuffer(testMaxBlk, testMaxBlk);

  InterpFilterParams filter_params = vp10_get_interp_filter_params(filter_);

@@ -185,7 +188,7 @@ void VP10ConvolveOptimzTest::RunVertFilterBitExactCheck() {
  conv_vert_(src_, stride, dst_, stride, width_, height_,
             filter_params, subpel_, x_step_q4, avg_);

-  DiffFilterBuffer(dst_, dst_ref_, width_, height_, filter_, subpel_);
+  DiffFilterBuffer();
 }

 TEST_P(VP10ConvolveOptimzTest, HorizBitExactCheck) {
@@ -197,7 +200,7 @@ TEST_P(VP10ConvolveOptimzTest, VerticalBitExactCheck) {

 using std::tr1::make_tuple;

-#if HAVE_SSSE3 && CONFIG_EXT_INTERP
+#if (HAVE_SSSE3 || HAVE_SSE4_1) && CONFIG_EXT_INTERP
 const BlockDimension kBlockDim[] = {
  make_tuple(2, 2),
  make_tuple(2, 4),
@@ -225,7 +228,9 @@ const INTERP_FILTER kFilter[] = {6, 4, 2};
 const int kSubpelQ4[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};

 const int kAvg[] = {0, 1};
+#endif

+#if HAVE_SSSE3 && CONFIG_EXT_INTERP
 INSTANTIATE_TEST_CASE_P(
    SSSE3, VP10ConvolveOptimzTest,
    ::testing::Combine(
@@ -236,4 +241,167 @@ INSTANTIATE_TEST_CASE_P(
         ::testing::ValuesIn(kSubpelQ4),
         ::testing::ValuesIn(kAvg)));
 #endif  // HAVE_SSSE3 && CONFIG_EXT_INTERP
+
+#if CONFIG_VP9_HIGHBITDEPTH
+typedef ::testing::TestWithParam<HbdConvParams> TestWithHbdConvParams;
+class VP10HbdConvolveOptimzTest : public TestWithHbdConvParams {
+ public:
+  virtual ~VP10HbdConvolveOptimzTest() {}
+  virtual void SetUp() {
+    conv_horiz_ = GET_PARAM(0);
+    conv_vert_ = GET_PARAM(1);
+    BlockDimension block = GET_PARAM(2);
+    width_ = std::tr1::get<0>(block);
+    height_ = std::tr1::get<1>(block);
+    filter_ = GET_PARAM(3);
+    subpel_ = GET_PARAM(4);
+    avg_ = GET_PARAM(5);
+    bit_depth_ = GET_PARAM(6);
+
+    alloc_ = new uint16_t[maxBlockSize * 4];
+    src_ = alloc_ + (vertiOffset * maxWidth);
+    src_ += horizOffset;
+    src_ref_ = src_ + maxBlockSize;
+
+    dst_ = alloc_ + 2 * maxBlockSize;
+    dst_ref_ = alloc_ + 3 * maxBlockSize;
+  }
+
+  virtual void TearDown() {
+    delete[] alloc_;
+    libvpx_test::ClearSystemState();
+  }
+
+ protected:
+  void RunHorizFilterBitExactCheck();
+  void RunVertFilterBitExactCheck();
+
+ private:
+  void PrepFilterBuffer(int w, int h);
+  void DiffFilterBuffer();
+  hbd_conv_filter_t conv_horiz_;
+  hbd_conv_filter_t conv_vert_;
+  uint16_t *alloc_;
+  uint16_t *src_;
+  uint16_t *dst_;
+  uint16_t *src_ref_;
+  uint16_t *dst_ref_;
+  int width_;
+  int height_;
+  int filter_;
+  int subpel_;
+  int avg_;
+  int bit_depth_;
+};
+
+void VP10HbdConvolveOptimzTest::PrepFilterBuffer(int w, int h) {
+  int r, c;
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+
+  memset(alloc_, 0, 4 * maxBlockSize * sizeof(alloc_[0]));
+
+  uint16_t *src_ptr = src_;
+  uint16_t *dst_ptr = dst_;
+  uint16_t *dst_ref_ptr = dst_ref_;
+  uint16_t hbd_mask = (1 << bit_depth_) - 1;
+
+  for (r = 0; r < height_; ++r) {
+    for (c = 0; c < width_; ++c) {
+      src_ptr[c] = rnd.Rand16() & hbd_mask;
+      dst_ptr[c] = rnd.Rand16() & hbd_mask;
+      dst_ref_ptr[c] = dst_ptr[c];
+    }
+    src_ptr += stride;
+    dst_ptr += stride;
+    dst_ref_ptr += stride;
+  }
+}
+
+void VP10HbdConvolveOptimzTest::DiffFilterBuffer() {
+  int r, c;
+  const uint16_t *dst_ptr = dst_;
+  const uint16_t *dst_ref_ptr = dst_ref_;
+  for (r = 0; r < height_; ++r) {
+    for (c = 0; c < width_; ++c) {
+      EXPECT_EQ((uint16_t)dst_ref_ptr[c], (uint16_t)dst_ptr[c])
+      << "Error at row: " << r << " col: " << c << " "
+      << "w = " << width_ << " " << "h = " << height_ << " "
+      << "filter group index = " << filter_ << " "
+      << "filter index = " << subpel_ << " "
+      << "bit depth = " << bit_depth_;
+    }
+    dst_ptr += stride;
+    dst_ref_ptr += stride;
+  }
+}
+
+void VP10HbdConvolveOptimzTest::RunHorizFilterBitExactCheck() {
+  PrepFilterBuffer(testMaxBlk, testMaxBlk);
+
+  InterpFilterParams filter_params = vp10_get_interp_filter_params(filter_);
+
+  vp10_highbd_convolve_horiz_c(src_, stride, dst_ref_, stride, width_,
+                               height_, filter_params, subpel_, x_step_q4,
+                               avg_, bit_depth_);
+
+  conv_horiz_(src_, stride, dst_, stride, width_, height_,
+              filter_params, subpel_, x_step_q4, avg_, bit_depth_);
+
+  DiffFilterBuffer();
+
+  // Note:
+  // Here we need to calculate a height which is different from the specified
+  // one and test again.
+  int intermediate_height =
+      (((height_ - 1) * 16 + subpel_) >> SUBPEL_BITS) + filter_params.taps;
+  PrepFilterBuffer(testMaxBlk, testMaxBlk);
+
+  vp10_highbd_convolve_horiz_c(src_, stride, dst_ref_, stride, width_,
+                               intermediate_height, filter_params, subpel_,
+                               x_step_q4, avg_, bit_depth_);
+
+  conv_horiz_(src_, stride, dst_, stride, width_, intermediate_height,
+              filter_params, subpel_, x_step_q4, avg_, bit_depth_);
+
+  DiffFilterBuffer();
+}
+
+void VP10HbdConvolveOptimzTest::RunVertFilterBitExactCheck() {
+  PrepFilterBuffer(testMaxBlk, testMaxBlk);
+
+  InterpFilterParams filter_params = vp10_get_interp_filter_params(filter_);
+
+  vp10_highbd_convolve_vert_c(src_, stride, dst_ref_, stride, width_, height_,
+                              filter_params, subpel_, x_step_q4, avg_,
+                              bit_depth_);
+
+  conv_vert_(src_, stride, dst_, stride, width_, height_,
+             filter_params, subpel_, x_step_q4, avg_, bit_depth_);
+
+  DiffFilterBuffer();
+}
+
+TEST_P(VP10HbdConvolveOptimzTest, HorizBitExactCheck) {
+  RunHorizFilterBitExactCheck();
+}
+TEST_P(VP10HbdConvolveOptimzTest, VertBitExactCheck) {
+  RunVertFilterBitExactCheck();
+}
+
+#if HAVE_SSE4_1 && CONFIG_EXT_INTERP
+
+const int kBitdepth[] = {10, 12};
+
+INSTANTIATE_TEST_CASE_P(
+    SSE4_1, VP10HbdConvolveOptimzTest,
+    ::testing::Combine(
+         ::testing::Values(vp10_highbd_convolve_horiz_sse4_1),
+         ::testing::Values(vp10_highbd_convolve_vert_sse4_1),
+         ::testing::ValuesIn(kBlockDim),
+         ::testing::ValuesIn(kFilter),
+         ::testing::ValuesIn(kSubpelQ4),
+         ::testing::ValuesIn(kAvg),
+         ::testing::ValuesIn(kBitdepth)));
+#endif  // HAVE_SSE4_1 && CONFIG_EXT_INTERP
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 }  // namespace
--- a/vp10/common/filter.c
+++ b/vp10/common/filter.c
@@ -342,3 +342,25 @@ SubpelFilterCoeffs vp10_get_subpel_filter_ver_signal_dir(
  (void)index;
  return NULL;
 }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+HbdSubpelFilterCoeffs vp10_hbd_get_subpel_filter_ver_signal_dir(
+    const InterpFilterParams p, int index) {
+#if CONFIG_EXT_INTERP && HAVE_SSE4_1
+  if (p.filter_ptr == (const int16_t *)sub_pel_filters_12sharp) {
+    return &sub_pel_filters_12sharp_highbd_ver_signal_dir[index][0];
+  }
+  if (p.filter_ptr == (const int16_t *)sub_pel_filters_10sharp) {
+    return &sub_pel_filters_10sharp_highbd_ver_signal_dir[index][0];
+  }
+#endif
+#if USE_TEMPORALFILTER_12TAP && HAVE_SSE4_1
+  if (p.filter_ptr == (const int16_t *)sub_pel_filters_temporalfilter_12) {
+    return &sub_pel_filters_temporalfilter_12_highbd_ver_signal_dir[index][0];
+  }
+#endif
+  (void)p;
+  (void)index;
+  return NULL;
+}
+#endif
--- a/vp10/common/filter.h
+++ b/vp10/common/filter.h
@@ -95,6 +95,10 @@ static INLINE int vp10_is_interpolating_filter(
 #if USE_TEMPORALFILTER_12TAP
 extern const int8_t sub_pel_filters_temporalfilter_12_signal_dir[15][2][16];
 extern const int8_t sub_pel_filters_temporalfilter_12_ver_signal_dir[15][6][16];
+#if CONFIG_VP9_HIGHBITDEPTH
+extern const
+int16_t sub_pel_filters_temporalfilter_12_highbd_ver_signal_dir[15][6][8];
+#endif
 #endif

 #if CONFIG_EXT_INTERP
@@ -102,15 +106,26 @@ extern const int8_t sub_pel_filters_12sharp_signal_dir[15][2][16];
 extern const int8_t sub_pel_filters_10sharp_signal_dir[15][2][16];
 extern const int8_t sub_pel_filters_12sharp_ver_signal_dir[15][6][16];
 extern const int8_t sub_pel_filters_10sharp_ver_signal_dir[15][6][16];
+#if CONFIG_VP9_HIGHBITDEPTH
+extern const int16_t sub_pel_filters_12sharp_highbd_ver_signal_dir[15][6][8];
+extern const int16_t sub_pel_filters_10sharp_highbd_ver_signal_dir[15][6][8];
+#endif
 #endif

 typedef const int8_t (*SubpelFilterCoeffs)[16];
+#if CONFIG_VP9_HIGHBITDEPTH
+typedef const int16_t (*HbdSubpelFilterCoeffs)[8];
+#endif

 SubpelFilterCoeffs vp10_get_subpel_filter_signal_dir(
    const InterpFilterParams p, int index);

 SubpelFilterCoeffs vp10_get_subpel_filter_ver_signal_dir(
    const InterpFilterParams p, int index);
+#if CONFIG_VP9_HIGHBITDEPTH
+HbdSubpelFilterCoeffs vp10_hbd_get_subpel_filter_ver_signal_dir(
+    const InterpFilterParams p, int index);
+#endif

 #ifdef __cplusplus
 }  // extern "C"
--- a/vp10/common/vp10_convolve.c
+++ b/vp10/common/vp10_convolve.c
@@ -182,7 +182,7 @@ void vp10_convolve(const uint8_t *src, int src_stride, uint8_t *dst,
 }

 #if CONFIG_VP9_HIGHBITDEPTH
-static void highbd_convolve_horiz(const uint16_t *src, int src_stride,
+void vp10_highbd_convolve_horiz_c(const uint16_t *src, int src_stride,
                                  uint16_t *dst, int dst_stride, int w, int h,
                                  const InterpFilterParams filter_params,
                                  const int subpel_x_q4, int x_step_q4, int avg,
@@ -213,7 +213,7 @@ static void highbd_convolve_horiz(const uint16_t *src, int src_stride,
  }
 }

-static void highbd_convolve_vert(const uint16_t *src, int src_stride,
+void vp10_highbd_convolve_vert_c(const uint16_t *src, int src_stride,
                                 uint16_t *dst, int dst_stride, int w, int h,
                                 const InterpFilterParams filter_params,
                                 const int subpel_y_q4, int y_step_q4, int avg,
@@ -300,8 +300,9 @@ void vp10_highbd_convolve(const uint8_t *src8, int src_stride, uint8_t *dst8,
    InterpFilterParams filter_params =
        vp10_get_interp_filter_params(interp_filter);
 #endif
-    highbd_convolve_horiz(src, src_stride, dst, dst_stride, w, h, filter_params,
-                          subpel_x_q4, x_step_q4, ref_idx, bd);
+    vp10_highbd_convolve_horiz(src, src_stride, dst, dst_stride, w, h,
+                               filter_params, subpel_x_q4, x_step_q4, ref_idx,
+                               bd);
  } else if (ignore_horiz) {
 #if CONFIG_DUAL_FILTER
    InterpFilterParams filter_params =
@@ -310,8 +311,9 @@ void vp10_highbd_convolve(const uint8_t *src8, int src_stride, uint8_t *dst8,
    InterpFilterParams filter_params =
        vp10_get_interp_filter_params(interp_filter);
 #endif
-    highbd_convolve_vert(src, src_stride, dst, dst_stride, w, h, filter_params,
-                         subpel_y_q4, y_step_q4, ref_idx, bd);
+    vp10_highbd_convolve_vert(src, src_stride, dst, dst_stride, w, h,
+                              filter_params, subpel_y_q4, y_step_q4, ref_idx,
+                              bd);
  } else {
    // temp's size is set to (maximum possible intermediate_height) *
    // MAX_BLOCK_WIDTH
@@ -336,9 +338,10 @@ void vp10_highbd_convolve(const uint8_t *src8, int src_stride, uint8_t *dst8,
    int intermediate_height =
        (((h - 1) * y_step_q4 + subpel_y_q4) >> SUBPEL_BITS) + filter_size;

-    highbd_convolve_horiz(src - src_stride * (filter_size / 2 - 1), src_stride,
-                          temp, temp_stride, w, intermediate_height,
-                          filter_params, subpel_x_q4, x_step_q4, 0, bd);
+    vp10_highbd_convolve_horiz(src - src_stride * (filter_size / 2 - 1),
+                               src_stride, temp, temp_stride, w,
+                               intermediate_height, filter_params, subpel_x_q4,
+                               x_step_q4, 0, bd);

 #if CONFIG_DUAL_FILTER
    filter_params = filter_params_y;
@@ -346,9 +349,9 @@ void vp10_highbd_convolve(const uint8_t *src8, int src_stride, uint8_t *dst8,
    filter_size = filter_params.taps;
    assert(filter_params.taps <= MAX_FILTER_TAP);

-    highbd_convolve_vert(temp + temp_stride * (filter_size / 2 - 1),
-                         temp_stride, dst, dst_stride, w, h, filter_params,
-                         subpel_y_q4, y_step_q4, ref_idx, bd);
+    vp10_highbd_convolve_vert(temp + temp_stride * (filter_size / 2 - 1),
+                              temp_stride, dst, dst_stride, w, h, filter_params,
+                              subpel_y_q4, y_step_q4, ref_idx, bd);
  }
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
--- a/vp10/common/vp10_rtcd_defs.pl
+++ b/vp10/common/vp10_rtcd_defs.pl
@@ -93,6 +93,13 @@ specialize qw/vp10_convolve_horiz ssse3/;
 add_proto qw/void vp10_convolve_vert/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, int avg";
 specialize qw/vp10_convolve_vert ssse3/;

+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  add_proto qw/void vp10_highbd_convolve_horiz/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, int avg, int bd";
+  specialize qw/vp10_highbd_convolve_horiz sse4_1/;
+  add_proto qw/void vp10_highbd_convolve_vert/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, int avg, int bd";
+  specialize qw/vp10_highbd_convolve_vert sse4_1/;
+}
+
 #
 # dct
 #
--- a/vp10/common/x86/vp10_convolve_filters_ssse3.c
+++ b/vp10/common/x86/vp10_convolve_filters_ssse3.c
--- a/vp10/common/x86/vp10_highbd_convolve_filters_sse4.c
+++ b/vp10/common/x86/vp10_highbd_convolve_filters_sse4.c
@@ -0,0 +1,393 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "./vpx_config.h"
+#include "vp10/common/filter.h"
+
+#if CONFIG_VP9_HIGHBITDEPTH
+#if CONFIG_EXT_INTERP
+DECLARE_ALIGNED(16, const int16_t,
+  sub_pel_filters_10sharp_highbd_ver_signal_dir[15][6][8]) = {
+  {
+    {  0,   0,   0,   0,   0,   0,   0,   0, },
+    { -1,   3,  -1,   3,  -1,   3,  -1,   3, },
+    { -6, 127,  -6, 127,  -6, 127,  -6, 127, },
+    {  8,  -4,   8,  -4,   8,  -4,   8,  -4, },
+    {  2,  -1,   2,  -1,   2,  -1,   2,  -1, },
+    {  0,   0,   0,   0,   0,   0,   0,   0, },
+  },
+  {
+    {  0,   1,   0,   1,   0,   1,   0,   1, },
+    { -2,   5,  -2,   5,  -2,   5,  -2,   5, },
+    {-12, 124, -12, 124, -12, 124, -12, 124, },
+    { 18,  -7,  18,  -7,  18,  -7,  18,  -7, },
+    {  3,  -2,   3,  -2,   3,  -2,   3,  -2, },
+    {  0,   0,   0,   0,   0,   0,   0,   0, },
+  },
+  {
+    {  0,   1,   0,   1,   0,   1,   0,   1, },
+    { -3,   7,  -3,   7,  -3,   7,  -3,   7, },
+    {-17, 119, -17, 119, -17, 119, -17, 119, },
+    { 28, -11,  28, -11,  28, -11,  28, -11, },
+    {  5,  -2,   5,  -2,   5,  -2,   5,  -2, },
+    {  1,   0,   1,   0,   1,   0,   1,   0, },
+  },
+  {
+    {  0,   1,   0,   1,   0,   1,   0,   1, },
+    { -4,   8,  -4,   8,  -4,   8,  -4,   8, },
+    {-20, 114, -20, 114, -20, 114, -20, 114, },
+    { 38, -14,  38, -14,  38, -14,  38, -14, },
+    {  7,  -3,   7,  -3,   7,  -3,   7,  -3, },
+    {  1,   0,   1,   0,   1,   0,   1,   0, },
+  },
+  {
+    {  0,   1,   0,   1,   0,   1,   0,   1, },
+    { -4,   9,  -4,   9,  -4,   9,  -4,   9, },
+    {-22, 107, -22, 107, -22, 107, -22, 107, },
+    { 49, -17,  49, -17,  49, -17,  49, -17, },
+    {  8,  -4,   8,  -4,   8,  -4,   8,  -4, },
+    {  1,   0,   1,   0,   1,   0,   1,   0, },
+  },
+  {
+    {  0,   2,   0,   2,   0,   2,   0,   2, },
+    { -5,  10,  -5,  10,  -5,  10,  -5,  10, },
+    {-24,  99, -24,  99, -24,  99, -24,  99, },
+    { 59, -20,  59, -20,  59, -20,  59, -20, },
+    {  9,  -4,   9,  -4,   9,  -4,   9,  -4, },
+    {  2,   0,   2,   0,   2,   0,   2,   0, },
+  },
+  {
+    {  0,   2,   0,   2,   0,   2,   0,   2, },
+    { -5,  10,  -5,  10,  -5,  10,  -5,  10, },
+    {-24,  90, -24,  90, -24,  90, -24,  90, },
+    { 70, -22,  70, -22,  70, -22,  70, -22, },
+    { 10,  -5,  10,  -5,  10,  -5,  10,  -5, },
+    {  2,   0,   2,   0,   2,   0,   2,   0, },
+  },
+  {
+    {  0,   2,   0,   2,   0,   2,   0,   2, },
+    { -5,  10,  -5,  10,  -5,  10,  -5,  10, },
+    {-23,  80, -23,  80, -23,  80, -23,  80, },
+    { 80, -23,  80, -23,  80, -23,  80, -23, },
+    { 10,  -5,  10,  -5,  10,  -5,  10,  -5, },
+    {  2,   0,   2,   0,   2,   0,   2,   0, },
+  },
+  {
+    {  0,   2,   0,   2,   0,   2,   0,   2, },
+    { -5,  10,  -5,  10,  -5,  10,  -5,  10, },
+    {-22,  70, -22,  70, -22,  70, -22,  70, },
+    { 90, -24,  90, -24,  90, -24,  90, -24, },
+    { 10,  -5,  10,  -5,  10,  -5,  10,  -5, },
+    {  2,   0,   2,   0,   2,   0,   2,   0, },
+  },
+  {
+    {  0,   2,   0,   2,   0,   2,   0,   2, },
+    { -4,   9,  -4,   9,  -4,   9,  -4,   9, },
+    {-20,  59, -20,  59, -20,  59, -20,  59, },
+    { 99, -24,  99, -24,  99, -24,  99, -24, },
+    { 10,  -5,  10,  -5,  10,  -5,  10,  -5, },
+    {  2,   0,   2,   0,   2,   0,   2,   0, },
+  },
+  {
+    {  0,   1,   0,   1,   0,   1,   0,   1, },
+    { -4,   8,  -4,   8,  -4,   8,  -4,   8, },
+    {-17,  49, -17,  49, -17,  49, -17,  49, },
+    {107, -22, 107, -22, 107, -22, 107, -22, },
+    {  9,  -4,   9,  -4,   9,  -4,   9,  -4, },
+    {  1,   0,   1,   0,   1,   0,   1,   0, },
+  },
+  {
+    {  0,   1,   0,   1,   0,   1,   0,   1, },
+    { -3,   7,  -3,   7,  -3,   7,  -3,   7, },
+    {-14,  38, -14,  38, -14,  38, -14,  38, },
+    {114, -20, 114, -20, 114, -20, 114, -20, },
+    {  8,  -4,   8,  -4,   8,  -4,   8,  -4, },
+    {  1,   0,   1,   0,   1,   0,   1,   0, },
+  },
+  {
+    {  0,   1,   0,   1,   0,   1,   0,   1, },
+    { -2,   5,  -2,   5,  -2,   5,  -2,   5, },
+    {-11,  28, -11,  28, -11,  28, -11,  28, },
+    {119, -17, 119, -17, 119, -17, 119, -17, },
+    {  7,  -3,   7,  -3,   7,  -3,   7,  -3, },
+    {  1,   0,   1,   0,   1,   0,   1,   0, },
+  },
+  {
+    {  0,   0,   0,   0,   0,   0,   0,   0, },
+    { -2,   3,  -2,   3,  -2,   3,  -2,   3, },
+    { -7,  18,  -7,  18,  -7,  18,  -7,  18, },
+    {124, -12, 124, -12, 124, -12, 124, -12, },
+    {  5,  -2,   5,  -2,   5,  -2,   5,  -2, },
+    {  1,   0,   1,   0,   1,   0,   1,   0, },
+  },
+  {
+    {  0,   0,   0,   0,   0,   0,   0,   0, },
+    { -1,   2,  -1,   2,  -1,   2,  -1,   2, },
+    { -4,   8,  -4,   8,  -4,   8,  -4,   8, },
+    {127,  -6, 127,  -6, 127,  -6, 127,  -6, },
+    {  3,  -1,   3,  -1,   3,  -1,   3,  -1, },
+    {  0,   0,   0,   0,   0,   0,   0,   0, },
+  },
+};
+#endif
+#endif
+#if CONFIG_VP9_HIGHBITDEPTH
+#if CONFIG_EXT_INTERP
+DECLARE_ALIGNED(16, const int16_t,
+  sub_pel_filters_12sharp_highbd_ver_signal_dir[15][6][8]) = {
+  {
+    {  0,   1,   0,   1,   0,   1,   0,   1, },
+    { -2,   3,  -2,   3,  -2,   3,  -2,   3, },
+    { -7, 127,  -7, 127,  -7, 127,  -7, 127, },
+    {  8,  -4,   8,  -4,   8,  -4,   8,  -4, },
+    {  2,  -1,   2,  -1,   2,  -1,   2,  -1, },
+    {  1,   0,   1,   0,   1,   0,   1,   0, },
+  },
+  {
+    { -1,   2,  -1,   2,  -1,   2,  -1,   2, },
+    { -3,   6,  -3,   6,  -3,   6,  -3,   6, },
+    {-13, 124, -13, 124, -13, 124, -13, 124, },
+    { 18,  -8,  18,  -8,  18,  -8,  18,  -8, },
+    {  4,  -2,   4,  -2,   4,  -2,   4,  -2, },
+    {  2,  -1,   2,  -1,   2,  -1,   2,  -1, },
+  },
+  {
+    { -1,   3,  -1,   3,  -1,   3,  -1,   3, },
+    { -4,   8,  -4,   8,  -4,   8,  -4,   8, },
+    {-18, 120, -18, 120, -18, 120, -18, 120, },
+    { 28, -12,  28, -12,  28, -12,  28, -12, },
+    {  7,  -4,   7,  -4,   7,  -4,   7,  -4, },
+    {  2,  -1,   2,  -1,   2,  -1,   2,  -1, },
+  },
+  {
+    { -1,   3,  -1,   3,  -1,   3,  -1,   3, },
+    { -6,  10,  -6,  10,  -6,  10,  -6,  10, },
+    {-21, 115, -21, 115, -21, 115, -21, 115, },
+    { 38, -15,  38, -15,  38, -15,  38, -15, },
+    {  8,  -5,   8,  -5,   8,  -5,   8,  -5, },
+    {  3,  -1,   3,  -1,   3,  -1,   3,  -1, },
+  },
+  {
+    { -2,   4,  -2,   4,  -2,   4,  -2,   4, },
+    { -6,  12,  -6,  12,  -6,  12,  -6,  12, },
+    {-24, 108, -24, 108, -24, 108, -24, 108, },
+    { 49, -18,  49, -18,  49, -18,  49, -18, },
+    { 10,  -6,  10,  -6,  10,  -6,  10,  -6, },
+    {  3,  -2,   3,  -2,   3,  -2,   3,  -2, },
+  },
+  {
+    { -2,   4,  -2,   4,  -2,   4,  -2,   4, },
+    { -7,  13,  -7,  13,  -7,  13,  -7,  13, },
+    {-25, 100, -25, 100, -25, 100, -25, 100, },
+    { 60, -21,  60, -21,  60, -21,  60, -21, },
+    { 11,  -7,  11,  -7,  11,  -7,  11,  -7, },
+    {  4,  -2,   4,  -2,   4,  -2,   4,  -2, },
+  },
+  {
+    { -2,   4,  -2,   4,  -2,   4,  -2,   4, },
+    { -7,  13,  -7,  13,  -7,  13,  -7,  13, },
+    {-26,  91, -26,  91, -26,  91, -26,  91, },
+    { 71, -24,  71, -24,  71, -24,  71, -24, },
+    { 13,  -7,  13,  -7,  13,  -7,  13,  -7, },
+    {  4,  -2,   4,  -2,   4,  -2,   4,  -2, },
+  },
+  {
+    { -2,   4,  -2,   4,  -2,   4,  -2,   4, },
+    { -7,  13,  -7,  13,  -7,  13,  -7,  13, },
+    {-25,  81, -25,  81, -25,  81, -25,  81, },
+    { 81, -25,  81, -25,  81, -25,  81, -25, },
+    { 13,  -7,  13,  -7,  13,  -7,  13,  -7, },
+    {  4,  -2,   4,  -2,   4,  -2,   4,  -2, },
+  },
+  {
+    { -2,   4,  -2,   4,  -2,   4,  -2,   4, },
+    { -7,  13,  -7,  13,  -7,  13,  -7,  13, },
+    {-24,  71, -24,  71, -24,  71, -24,  71, },
+    { 91, -26,  91, -26,  91, -26,  91, -26, },
+    { 13,  -7,  13,  -7,  13,  -7,  13,  -7, },
+    {  4,  -2,   4,  -2,   4,  -2,   4,  -2, },
+  },
+  {
+    { -2,   4,  -2,   4,  -2,   4,  -2,   4, },
+    { -7,  11,  -7,  11,  -7,  11,  -7,  11, },
+    {-21,  60, -21,  60, -21,  60, -21,  60, },
+    {100, -25, 100, -25, 100, -25, 100, -25, },
+    { 13,  -7,  13,  -7,  13,  -7,  13,  -7, },
+    {  4,  -2,   4,  -2,   4,  -2,   4,  -2, },
+  },
+  {
+    { -2,   3,  -2,   3,  -2,   3,  -2,   3, },
+    { -6,  10,  -6,  10,  -6,  10,  -6,  10, },
+    {-18,  49, -18,  49, -18,  49, -18,  49, },
+    {108, -24, 108, -24, 108, -24, 108, -24, },
+    { 12,  -6,  12,  -6,  12,  -6,  12,  -6, },
+    {  4,  -2,   4,  -2,   4,  -2,   4,  -2, },
+  },
+  {
+    { -1,   3,  -1,   3,  -1,   3,  -1,   3, },
+    { -5,   8,  -5,   8,  -5,   8,  -5,   8, },
+    {-15,  38, -15,  38, -15,  38, -15,  38, },
+    {115, -21, 115, -21, 115, -21, 115, -21, },
+    { 10,  -6,  10,  -6,  10,  -6,  10,  -6, },
+    {  3,  -1,   3,  -1,   3,  -1,   3,  -1, },
+  },
+  {
+    { -1,   2,  -1,   2,  -1,   2,  -1,   2, },
+    { -4,   7,  -4,   7,  -4,   7,  -4,   7, },
+    {-12,  28, -12,  28, -12,  28, -12,  28, },
+    {120, -18, 120, -18, 120, -18, 120, -18, },
+    {  8,  -4,   8,  -4,   8,  -4,   8,  -4, },
+    {  3,  -1,   3,  -1,   3,  -1,   3,  -1, },
+  },
+  {
+    { -1,   2,  -1,   2,  -1,   2,  -1,   2, },
+    { -2,   4,  -2,   4,  -2,   4,  -2,   4, },
+    { -8,  18,  -8,  18,  -8,  18,  -8,  18, },
+    {124, -13, 124, -13, 124, -13, 124, -13, },
+    {  6,  -3,   6,  -3,   6,  -3,   6,  -3, },
+    {  2,  -1,   2,  -1,   2,  -1,   2,  -1, },
+  },
+  {
+    {  0,   1,   0,   1,   0,   1,   0,   1, },
+    { -1,   2,  -1,   2,  -1,   2,  -1,   2, },
+    { -4,   8,  -4,   8,  -4,   8,  -4,   8, },
+    {127,  -7, 127,  -7, 127,  -7, 127,  -7, },
+    {  3,  -2,   3,  -2,   3,  -2,   3,  -2, },
+    {  1,   0,   1,   0,   1,   0,   1,   0, },
+  },
+};
+#endif
+#endif
+#if CONFIG_VP9_HIGHBITDEPTH
+#if USE_TEMPORALFILTER_12TAP
+DECLARE_ALIGNED(16, const int16_t,
+  sub_pel_filters_temporalfilter_12_highbd_ver_signal_dir[15][6][8]) = {
+  {
+    {  0,   1,   0,   1,   0,   1,   0,   1, },
+    { -1,   3,  -1,   3,  -1,   3,  -1,   3, },
+    { -7, 127,  -7, 127,  -7, 127,  -7, 127, },
+    {  8,  -4,   8,  -4,   8,  -4,   8,  -4, },
+    {  2,  -1,   2,  -1,   2,  -1,   2,  -1, },
+    {  0,   0,   0,   0,   0,   0,   0,   0, },
+  },
+  {
+    {  0,   1,   0,   1,   0,   1,   0,   1, },
+    { -3,   5,  -3,   5,  -3,   5,  -3,   5, },
+    {-12, 124, -12, 124, -12, 124, -12, 124, },
+    { 18,  -8,  18,  -8,  18,  -8,  18,  -8, },
+    {  4,  -2,   4,  -2,   4,  -2,   4,  -2, },
+    {  1,   0,   1,   0,   1,   0,   1,   0, },
+  },
+  {
+    { -1,   2,  -1,   2,  -1,   2,  -1,   2, },
+    { -4,   8,  -4,   8,  -4,   8,  -4,   8, },
+    {-17, 120, -17, 120, -17, 120, -17, 120, },
+    { 28, -11,  28, -11,  28, -11,  28, -11, },
+    {  6,  -3,   6,  -3,   6,  -3,   6,  -3, },
+    {  1,  -1,   1,  -1,   1,  -1,   1,  -1, },
+  },
+  {
+    { -1,   2,  -1,   2,  -1,   2,  -1,   2, },
+    { -4,  10,  -4,  10,  -4,  10,  -4,  10, },
+    {-21, 114, -21, 114, -21, 114, -21, 114, },
+    { 38, -15,  38, -15,  38, -15,  38, -15, },
+    {  8,  -4,   8,  -4,   8,  -4,   8,  -4, },
+    {  2,  -1,   2,  -1,   2,  -1,   2,  -1, },
+  },
+  {
+    { -1,   3,  -1,   3,  -1,   3,  -1,   3, },
+    { -5,  11,  -5,  11,  -5,  11,  -5,  11, },
+    {-23, 107, -23, 107, -23, 107, -23, 107, },
+    { 49, -18,  49, -18,  49, -18,  49, -18, },
+    {  9,  -5,   9,  -5,   9,  -5,   9,  -5, },
+    {  2,  -1,   2,  -1,   2,  -1,   2,  -1, },
+  },
+  {
+    { -1,   3,  -1,   3,  -1,   3,  -1,   3, },
+    { -6,  12,  -6,  12,  -6,  12,  -6,  12, },
+    {-25,  99, -25,  99, -25,  99, -25,  99, },
+    { 60, -21,  60, -21,  60, -21,  60, -21, },
+    { 11,  -6,  11,  -6,  11,  -6,  11,  -6, },
+    {  3,  -1,   3,  -1,   3,  -1,   3,  -1, },
+  },
+  {
+    { -1,   3,  -1,   3,  -1,   3,  -1,   3, },
+    { -6,  12,  -6,  12,  -6,  12,  -6,  12, },
+    {-25,  90, -25,  90, -25,  90, -25,  90, },
+    { 70, -23,  70, -23,  70, -23,  70, -23, },
+    { 12,  -6,  12,  -6,  12,  -6,  12,  -6, },
+    {  3,  -1,   3,  -1,   3,  -1,   3,  -1, },
+  },
+  {
+    { -1,   3,  -1,   3,  -1,   3,  -1,   3, },
+    { -6,  12,  -6,  12,  -6,  12,  -6,  12, },
+    {-24,  80, -24,  80, -24,  80, -24,  80, },
+    { 80, -24,  80, -24,  80, -24,  80, -24, },
+    { 12,  -6,  12,  -6,  12,  -6,  12,  -6, },
+    {  3,  -1,   3,  -1,   3,  -1,   3,  -1, },
+  },
+  {
+    { -1,   3,  -1,   3,  -1,   3,  -1,   3, },
+    { -6,  12,  -6,  12,  -6,  12,  -6,  12, },
+    {-23,  70, -23,  70, -23,  70, -23,  70, },
+    { 90, -25,  90, -25,  90, -25,  90, -25, },
+    { 12,  -6,  12,  -6,  12,  -6,  12,  -6, },
+    {  3,  -1,   3,  -1,   3,  -1,   3,  -1, },
+  },
+  {
+    { -1,   3,  -1,   3,  -1,   3,  -1,   3, },
+    { -6,  11,  -6,  11,  -6,  11,  -6,  11, },
+    {-21,  60, -21,  60, -21,  60, -21,  60, },
+    { 99, -25,  99, -25,  99, -25,  99, -25, },
+    { 12,  -6,  12,  -6,  12,  -6,  12,  -6, },
+    {  3,  -1,   3,  -1,   3,  -1,   3,  -1, },
+  },
+  {
+    { -1,   2,  -1,   2,  -1,   2,  -1,   2, },
+    { -5,   9,  -5,   9,  -5,   9,  -5,   9, },
+    {-18,  49, -18,  49, -18,  49, -18,  49, },
+    {107, -23, 107, -23, 107, -23, 107, -23, },
+    { 11,  -5,  11,  -5,  11,  -5,  11,  -5, },
+    {  3,  -1,   3,  -1,   3,  -1,   3,  -1, },
+  },
+  {
+    { -1,   2,  -1,   2,  -1,   2,  -1,   2, },
+    { -4,   8,  -4,   8,  -4,   8,  -4,   8, },
+    {-15,  38, -15,  38, -15,  38, -15,  38, },
+    {114, -21, 114, -21, 114, -21, 114, -21, },
+    { 10,  -4,  10,  -4,  10,  -4,  10,  -4, },
+    {  2,  -1,   2,  -1,   2,  -1,   2,  -1, },
+  },
+  {
+    { -1,   1,  -1,   1,  -1,   1,  -1,   1, },
+    { -3,   6,  -3,   6,  -3,   6,  -3,   6, },
+    {-11,  28, -11,  28, -11,  28, -11,  28, },
+    {120, -17, 120, -17, 120, -17, 120, -17, },
+    {  8,  -4,   8,  -4,   8,  -4,   8,  -4, },
+    {  2,  -1,   2,  -1,   2,  -1,   2,  -1, },
+  },
+  {
+    {  0,   1,   0,   1,   0,   1,   0,   1, },
+    { -2,   4,  -2,   4,  -2,   4,  -2,   4, },
+    { -8,  18,  -8,  18,  -8,  18,  -8,  18, },
+    {124, -12, 124, -12, 124, -12, 124, -12, },
+    {  5,  -3,   5,  -3,   5,  -3,   5,  -3, },
+    {  1,   0,   1,   0,   1,   0,   1,   0, },
+  },
+  {
+    {  0,   0,   0,   0,   0,   0,   0,   0, },
+    { -1,   2,  -1,   2,  -1,   2,  -1,   2, },
+    { -4,   8,  -4,   8,  -4,   8,  -4,   8, },
+    {127,  -7, 127,  -7, 127,  -7, 127,  -7, },
+    {  3,  -1,   3,  -1,   3,  -1,   3,  -1, },
+    {  1,   0,   1,   0,   1,   0,   1,   0, },
+  },
+};
+#endif
+#endif
--- a/vp10/common/x86/vp10_highbd_convolve_sse4.c
+++ b/vp10/common/x86/vp10_highbd_convolve_sse4.c
@@ -0,0 +1,474 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+
+#include "./vp10_rtcd.h"
+#include "vp10/common/filter.h"
+
+// Writer for one 4x4 tile of 32-bit filter sums (see transSaveTab).
+typedef void (*TransposeSave)(const int width, int pixelsNum, uint32_t *src,
+                              int src_stride, uint16_t *dst, int dst_stride,
+                              int bd);
+
+// Stores up to four rows of 16-bit pixels held in u[0..3].
+//
+// Row r is taken from the low lanes of u[r] and written at
+// dst + r * dst_stride. The vectors are stored as given: no rounding or
+// clipping happens here.
+//
+// width:
+//   2         : each row stores the low 32 bits of u[r] (2 pixels)
+//   otherwise : each row stores the low 64 bits of u[r] (4 pixels)
+// pixelsNum:
+//   0         : all 4 rows are stored
+//   1/2/3     : only the first 1/2/3 rows are stored (residual rows)
+//   otherwise : nothing is stored
+static void writePixel(__m128i *u, int width, int pixelsNum,
+                       uint16_t *dst, int dst_stride) {
+  int rowCount;
+  int r;
+
+  // Callers in this file pass 0 or (h & 3); any other value stores nothing.
+  if (pixelsNum < 0 || pixelsNum > 3) {
+    return;
+  }
+  rowCount = (0 == pixelsNum) ? 4 : pixelsNum;
+
+  if (2 == width) {
+    // Narrow block: a 32-bit store covers the two pixels of the row.
+    for (r = 0; r < rowCount; ++r) {
+      uint16_t *const rowPtr = dst + r * dst_stride;
+      *(int *)rowPtr = _mm_cvtsi128_si32(u[r]);
+    }
+  } else {
+    // A 64-bit store covers the four pixels of the row.
+    for (r = 0; r < rowCount; ++r) {
+      uint16_t *const rowPtr = dst + r * dst_stride;
+      _mm_storel_epi64((__m128i *)rowPtr, u[r]);
+    }
+  }
+}
+
+// Clamps the eight unsigned 16-bit lanes of p[0..numVecs-1], in place, to the
+// largest pixel value of bit depth bd (10/12), i.e. (1 << bd) - 1.
+//
+// Every caller in this file passes _mm_packus_epi32 output, which is already
+// non-negative, so only the upper bound is enforced. The clamp is an unsigned
+// min built from SSE2 saturating subtraction: p - max(p - limit, 0). A signed
+// compare would treat lanes >= 0x8000 as negative and zero them instead.
+static void highbd_clip(__m128i *p, int numVecs, int bd) {
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i limit = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
+  int i;
+
+  for (i = 0; i < numVecs; i++) {
+    const __m128i excess = _mm_subs_epu16(p[i], limit);  // 0 when p <= limit
+    p[i] = _mm_sub_epi16(p[i], excess);
+  }
+}
+
+// Converts a 4x4 tile of 32-bit filter sums into clipped 16-bit pixels and
+// transposes it.
+//
+// src holds four groups of four sums, src_stride elements apart. Each sum is
+// rounded and shifted right by FILTER_BITS, saturated to unsigned 16 bits and
+// clipped to bit depth bd. On return u[c] carries, in its low 64 bits, the
+// c-th sum of every group: u[c] = { g0[c], g1[c], g2[c], g3[c] }.
+static void transClipPixel(uint32_t *src, int src_stride, __m128i *u, int bd) {
+  const __m128i half = _mm_set1_epi32(1 << (FILTER_BITS - 1));
+  __m128i lo, hi;
+  int k;
+
+  // Round to nearest: (sum + half) >> FILTER_BITS, using an arithmetic shift.
+  for (k = 0; k < 4; ++k) {
+    u[k] = _mm_loadu_si128((__m128i const *)(src + k * src_stride));
+    u[k] = _mm_srai_epi32(_mm_add_epi32(u[k], half), FILTER_BITS);
+  }
+
+  // Narrow to 16 bits: u[0] = groups 0|1, u[1] = groups 2|3.
+  u[0] = _mm_packus_epi32(u[0], u[1]);
+  u[1] = _mm_packus_epi32(u[2], u[3]);
+  highbd_clip(u, 2, bd);
+
+  // Two rounds of 16-bit interleaving perform the 4x4 transpose.
+  lo = _mm_unpacklo_epi16(u[0], u[1]);
+  hi = _mm_unpackhi_epi16(u[0], u[1]);
+  u[0] = _mm_unpacklo_epi16(lo, hi);
+  u[2] = _mm_unpackhi_epi16(lo, hi);
+
+  // Expose the upper 64 bits of each result as its own vector.
+  u[1] = _mm_srli_si128(u[0], 8);
+  u[3] = _mm_srli_si128(u[2], 8);
+}
+
+// Plain-store writer: rounds, clips and transposes the 4x4 tile of sums at
+// src, then stores it to dst without reading the destination.
+// pixelsNum = 0     : all 4 rows of pixels will be saved.
+// pixelsNum = 1/2/3 : residual 1/2/3 rows of pixels will be saved.
+void trans_save_4x4(const int width, int pixelsNum, uint32_t *src,
+                    int src_stride, uint16_t *dst, int dst_stride, int bd) {
+  __m128i rows[4];
+  transClipPixel(src, src_stride, rows, bd);
+  writePixel(rows, width, pixelsNum, dst, dst_stride);
+}
+
+// Averaging writer: rounds, clips and transposes the 4x4 tile of sums at src,
+// then stores the rounded mean of each new pixel and the pixel already in
+// dst, (dst + pixel + 1) >> 1.
+//
+// pixelsNum = 0     : all 4 rows are averaged and saved.
+// pixelsNum = 1/2/3 : only the first 1/2/3 (residual) rows are.
+//
+// Only the destination pixels that are about to be rewritten are read, so a
+// residual tile or a 2-pixel-wide block never loads dst beyond the block.
+void trans_accum_save_4x4(const int width, int pixelsNum,
+                          uint32_t *src, int src_stride,
+                          uint16_t *dst, int dst_stride,
+                          int bd) {
+  __m128i u[4];
+  // writePixel() stores nothing for other pixelsNum values, so the number of
+  // rows prepared here only matters for 0..3.
+  const int rows = (pixelsNum >= 1 && pixelsNum <= 3) ? pixelsNum : 4;
+  int r;
+
+  transClipPixel(src, src_stride, u, bd);
+
+  for (r = 0; r < rows; ++r) {
+    const uint16_t *const cur = dst + r * dst_stride;
+    const __m128i old = (2 == width)
+                            ? _mm_cvtsi32_si128(*(const int *)cur)
+                            : _mm_loadl_epi64((__m128i const *)cur);
+    // Unsigned rounding average, (a + b + 1) >> 1, with no 16-bit overflow.
+    u[r] = _mm_avg_epu16(u[r], old);
+  }
+  writePixel(u, width, pixelsNum, dst, dst_stride);
+}
+
+// Tile writers indexed by the avg flag: [0] plain store, [1] average with dst.
+static TransposeSave transSaveTab[2] = {trans_save_4x4, trans_accum_save_4x4};
+
+// Regroups 4 rows x 12 pixels into pixel pairs for highbd_filter_horiz.
+// in[0..3] hold pixels 0..7 of rows 0..3 and in[4..7] pixels 8..15 of the
+// same rows. On return out[k] (k = 0..5) holds, as four 32-bit lanes, pixels
+// 2k and 2k+1 of rows 0, 1, 2 and 3. Pixels 12..15 are not used.
+static INLINE void transpose_pair(__m128i *in, __m128i *out) {
+  // Interleave 32-bit lanes (pixel pairs) of neighbouring rows.
+  const __m128i r01Lo = _mm_unpacklo_epi32(in[0], in[1]);
+  const __m128i r23Lo = _mm_unpacklo_epi32(in[2], in[3]);
+  const __m128i r01Hi = _mm_unpackhi_epi32(in[0], in[1]);
+  const __m128i r23Hi = _mm_unpackhi_epi32(in[2], in[3]);
+  const __m128i r01Next = _mm_unpacklo_epi32(in[4], in[5]);
+  const __m128i r23Next = _mm_unpacklo_epi32(in[6], in[7]);
+
+  // Join the 64-bit halves so each output has one pair from every row.
+  out[0] = _mm_unpacklo_epi64(r01Lo, r23Lo);
+  out[1] = _mm_unpackhi_epi64(r01Lo, r23Lo);
+  out[2] = _mm_unpacklo_epi64(r01Hi, r23Hi);
+  out[3] = _mm_unpackhi_epi64(r01Hi, r23Hi);
+  out[4] = _mm_unpacklo_epi64(r01Next, r23Next);
+  out[5] = _mm_unpackhi_epi64(r01Next, r23Next);
+}
+
+// Filters one output column for four consecutive rows.
+//
+// For each of rows 0..3 (src_stride apart) 16 pixels starting at src are
+// loaded; the first 12 are multiplied pairwise by the six coefficient vectors
+// f[0..5] (_mm_madd_epi16) and summed. The four unrounded 32-bit sums, one per
+// row, are stored to buf[0..3] with an unaligned store.
+//
+// tapsNum == 10 moves the window one pixel to the left first.
+// NOTE(review): presumably the 10-tap kernels are laid out inside the 12-tap
+// coefficient slots with a leading zero - confirm against the filter tables.
+static void highbd_filter_horiz(const uint16_t *src, int src_stride,
+                                __m128i *f, int tapsNum, uint32_t *buf) {
+  __m128i rowPix[8], pairs[6];
+  __m128i acc;
+  int k;
+
+  if (tapsNum == 10) {
+    src -= 1;
+  }
+
+  // rowPix[r] = pixels 0..7 of row r, rowPix[4 + r] = pixels 8..15 of row r.
+  for (k = 0; k < 4; ++k) {
+    const uint16_t *const row = src + k * src_stride;
+    rowPix[k] = _mm_loadu_si128((__m128i const *)row);
+    rowPix[4 + k] = _mm_loadu_si128((__m128i const *)(row + 8));
+  }
+
+  transpose_pair(rowPix, pairs);
+
+  // 32-bit wrapping adds are order independent, so the six partial products
+  // are simply accumulated in index order.
+  acc = _mm_madd_epi16(pairs[0], f[0]);
+  for (k = 1; k < 6; ++k) {
+    acc = _mm_add_epi32(acc, _mm_madd_epi16(pairs[k], f[k]));
+  }
+
+  _mm_storeu_si128((__m128i *)buf, acc);
+}
+
+// Horizontal high-bit-depth convolution (SSE4.1).
+//
+// Filters a w x h block of 16-bit pixels from src into dst with the kernel
+// selected by filter_params and subpel_x_q4. avg = 1 averages the result
+// with the pixels already in dst; bd is the bit depth used for clipping.
+//
+// vp10_highbd_convolve_horiz_c() handles the call instead when subpel_x_q4 is
+// 0, when x_step_q4 is not 16, or when the coefficient lookup returns null.
+//
+// The block is processed in tiles of 4 columns x 4 rows; the last tile row
+// covers the h & 3 residual rows, if any. Source rows are always read four
+// at a time, so up to three source rows below the block may be read for the
+// residual tiles.
+void vp10_highbd_convolve_horiz_sse4_1(const uint16_t *src, int src_stride,
+                                       uint16_t *dst, int dst_stride,
+                                       int w, int h,
+                                       const InterpFilterParams filter_params,
+                                       const int subpel_x_q4, int x_step_q4,
+                                       int avg, int bd) {
+  DECLARE_ALIGNED(16, uint32_t, temp[4 * 4]);
+  __m128i verf[6];
+  HbdSubpelFilterCoeffs vCoeffs = 0;
+  const int tapsNum = filter_params.taps;
+  const TransposeSave transSave = transSaveTab[avg];
+  int i, col, row;
+
+  // The lookup is indexed by subpel_x_q4 - 1, so it is only attempted for a
+  // non-zero sub-pixel position (and only for a step of 16).
+  if (0 != subpel_x_q4 && 16 == x_step_q4) {
+    vCoeffs = vp10_hbd_get_subpel_filter_ver_signal_dir(
+        filter_params, subpel_x_q4 - 1);
+  }
+  if (!vCoeffs) {
+    vp10_highbd_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h,
+                                 filter_params, subpel_x_q4, x_step_q4, avg,
+                                 bd);
+    return;
+  }
+
+  for (i = 0; i < 6; ++i) {
+    verf[i] = *((const __m128i *)(vCoeffs + i));
+  }
+
+  // Step back to the first pixel covered by the filter window.
+  src -= (tapsNum >> 1) - 1;
+
+  for (row = 0; row < h; row += 4) {
+    const uint16_t *const srcRow = src + row * src_stride;
+    uint16_t *const dstRow = dst + row * dst_stride;
+    // pixelsNum 0 means "store all four rows", so it must only be used for
+    // a full tile. The residual tile passes its 1..3 remaining rows, and no
+    // tile at all is processed past the last row when h is a multiple of 4;
+    // otherwise four rows below the block would be overwritten.
+    const int rowsLeft = h - row;
+    const int pixelsNum = (rowsLeft >= 4) ? 0 : rowsLeft;
+
+    for (col = 0; col < w; col += 4) {
+      // Each call yields one output column (four row sums), so temp holds
+      // the tile column by column; transSave transposes it while storing.
+      for (i = 0; i < 4; ++i) {
+        highbd_filter_horiz(srcRow + col + i, src_stride, verf, tapsNum,
+                            temp + (i * 4));
+      }
+      transSave(w, pixelsNum, temp, 4, dstRow + col, dst_stride, bd);
+    }
+  }
+}
+
+// Vertical convolutional filter
+// WritePixels: finishes the 32-bit sums in u[0] and writes one row at dst.
+typedef void (*WritePixels)(__m128i *u, int bd, uint16_t *dst);
+
+// Rounds u[0] by FILTER_BITS and packs it to unsigned 16 bits (both halves).
+static void highbdRndingPacks(__m128i *u) {
+  const __m128i half = _mm_set1_epi32(1 << (FILTER_BITS - 1));
+  const __m128i px = _mm_srai_epi32(_mm_add_epi32(u[0], half), FILTER_BITS);
+  u[0] = _mm_packus_epi32(px, px);
+}
+
+// Writers for 2-pixel-wide rows: round, pack and clip the sums in u[0], then
+// store the two lowest pixels (32 bits) at dst.
+static void write2pixelsOnly(__m128i *u, int bd, uint16_t *dst) {
+  highbdRndingPacks(u);
+  highbd_clip(u, 1, bd);
+  *(uint32_t *)dst = _mm_cvtsi128_si32(u[0]);
+}
+
+// As write2pixelsOnly, but stores (dst + pixel + 1) >> 1. Only the two
+// destination pixels being replaced are loaded.
+static void write2pixelsAccum(__m128i *u, int bd, uint16_t *dst) {
+  const __m128i old = _mm_cvtsi32_si128((int)*(const uint32_t *)dst);
+
+  highbdRndingPacks(u);
+  highbd_clip(u, 1, bd);
+  *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_avg_epu16(old, u[0]));
+}
+
+// Indexed by the avg flag.
+WritePixels write2pixelsTab[2] = {write2pixelsOnly, write2pixelsAccum};
+
+// Writers for 4-pixel rows: round, pack and clip the sums in u[0], then store
+// the four lowest pixels (64 bits) at dst.
+static void write4pixelsOnly(__m128i *u, int bd, uint16_t *dst) {
+  highbdRndingPacks(u);
+  highbd_clip(u, 1, bd);
+  _mm_storel_epi64((__m128i *)dst, u[0]);
+}
+
+// As write4pixelsOnly, but stores (dst + pixel + 1) >> 1 computed with the
+// unsigned rounding average, which cannot overflow 16 bits.
+static void write4pixelsAccum(__m128i *u, int bd, uint16_t *dst) {
+  const __m128i old = _mm_loadl_epi64((__m128i const *)dst);
+
+  highbdRndingPacks(u);
+  highbd_clip(u, 1, bd);
+  _mm_storel_epi64((__m128i *)dst, _mm_avg_epu16(old, u[0]));
+}
+
+// Indexed by the avg flag.
+WritePixels write4pixelsTab[2] = {write4pixelsOnly, write4pixelsAccum};
+
+// Vertical filter for one row of four output pixels.
+//
+// Consecutive source rows, src_stride apart and starting at src, fill a
+// 12-entry window: 12 rows normally, or a zero vector followed by 11 rows
+// when taps is 10. Vertically adjacent window entries are interleaved so
+// that one _mm_madd_epi16 with f[k] applies two taps at once. Only the low
+// four pixels of every loaded row take part in the result.
+//
+// The four unrounded 32-bit sums are placed in s[0] and handed to saveFunc
+// (an entry of write2pixelsTab / write4pixelsTab), which rounds, clips and
+// stores them at dst.
+//
+// NOTE(review): every row load is 16 bytes although only 8 are used -
+// confirm the source buffers are padded accordingly.
+static void filter_vert_horiz_parallel(const uint16_t *src, int src_stride,
+                                       const __m128i *f, int taps,
+                                       uint16_t *dst, WritePixels saveFunc,
+                                       int bd) {
+  __m128i s[12];
+  __m128i acc;
+  int slot = 0;
+  int k;
+
+  // TODO(luoyi) treat s[12] as a circular buffer in width = 2 case
+  if (10 == taps) {
+    // The 10-tap window starts one slot late; slot 0 contributes zeros.
+    s[slot] = _mm_setzero_si128();
+    slot += 1;
+  }
+  for (k = 0; slot < 12; ++slot, ++k) {
+    s[slot] = _mm_loadu_si128((__m128i const *)(src + k * src_stride));
+  }
+
+  // 32-bit wrapping adds are order independent, so the six tap-pair products
+  // are accumulated in index order.
+  acc = _mm_madd_epi16(_mm_unpacklo_epi16(s[0], s[1]), f[0]);
+  for (k = 1; k < 6; ++k) {
+    const __m128i rowPair = _mm_unpacklo_epi16(s[2 * k], s[2 * k + 1]);
+    acc = _mm_add_epi32(acc, _mm_madd_epi16(rowPair, f[k]));
+  }
+
+  // The writers in this file only use element 0 of the array they receive.
+  s[0] = acc;
+  saveFunc(s, bd, dst);
+}
+
+// Vertical filter driver for blocks wider than two pixels.
+//
+// Walks the block row by row and, within a row, in steps of four pixels,
+// using the 4-pixel writer selected by avg. At least one row is processed.
+static void highbd_filter_vert_compute_large(const uint16_t *src,
+                                             int src_stride,
+                                             const __m128i *f, int taps,
+                                             int w, int h,
+                                             uint16_t *dst, int dst_stride,
+                                             int avg, int bd) {
+  const WritePixels store4 = write4pixelsTab[avg];
+  int row = 0;
+  int x;
+
+  do {
+    const uint16_t *const srcRow = src + row * src_stride;
+    uint16_t *const dstRow = dst + row * dst_stride;
+
+    for (x = 0; x < w; x += 4) {
+      filter_vert_horiz_parallel(srcRow + x, src_stride, f, taps, dstRow + x,
+                                 store4, bd);
+    }
+    ++row;
+  } while (row < h);
+}
+
+// Vertical filter driver for blocks at most two pixels wide: one call per
+// row, each writing two pixels. w is unused. At least one row is processed.
+static void highbd_filter_vert_compute_small(const uint16_t *src,
+                                             int src_stride,
+                                             const __m128i *f, int taps,
+                                             int w, int h,
+                                             uint16_t *dst, int dst_stride,
+                                             int avg, int bd) {
+  const WritePixels store2 = write2pixelsTab[avg];
+  int row = 0;
+  (void)w;
+
+  do {
+    filter_vert_horiz_parallel(src + row * src_stride, src_stride, f, taps,
+                               dst + row * dst_stride, store2, bd);
+    ++row;
+  } while (row < h);
+}
+
+// Vertical high-bit-depth convolution (SSE4.1).
+// Filters a w x h block of 16-bit pixels from src into dst with the kernel
+// selected by filter_params and subpel_y_q4; avg selects averaging with the
+// existing dst pixels and bd is the bit depth used for clipping.
+// vp10_highbd_convolve_vert_c() handles the call instead when subpel_y_q4 is
+// 0, when y_step_q4 is not 16, or when the coefficient lookup returns null.
+void vp10_highbd_convolve_vert_sse4_1(const uint16_t *src, int src_stride,
+                                      uint16_t *dst, int dst_stride,
+                                      int w, int h,
+                                      const InterpFilterParams filter_params,
+                                      const int subpel_y_q4, int y_step_q4,
+                                      int avg, int bd) {
+  __m128i verf[6];
+  HbdSubpelFilterCoeffs vCoeffs = 0;
+  const int tapsNum = filter_params.taps;
+  int k;
+
+  if (0 != subpel_y_q4 && 16 == y_step_q4) {
+    vCoeffs = vp10_hbd_get_subpel_filter_ver_signal_dir(
+        filter_params, subpel_y_q4 - 1);
+  }
+  if (!vCoeffs) {
+    vp10_highbd_convolve_vert_c(src, src_stride, dst, dst_stride, w, h,
+                                filter_params, subpel_y_q4, y_step_q4, avg,
+                                bd);
+    return;
+  }
+
+  for (k = 0; k < 6; ++k) {
+    verf[k] = *((const __m128i *)(vCoeffs + k));
+  }
+
+  // Start at the first row covered by the filter window.
+  src -= src_stride * ((tapsNum >> 1) - 1);
+
+  if (w > 2) {
+    highbd_filter_vert_compute_large(src, src_stride, verf, tapsNum, w, h,
+                                     dst, dst_stride, avg, bd);
+  } else {
+    highbd_filter_vert_compute_small(src, src_stride, verf, tapsNum, w, h,
+                                     dst, dst_stride, avg, bd);
+  }
+}
--- a/vp10/vp10_common.mk
+++ b/vp10/vp10_common.mk
@@ -74,6 +74,10 @@ VP10_COMMON_SRCS-yes += common/vp10_inv_txfm2d.c
 VP10_COMMON_SRCS-yes += common/vp10_inv_txfm2d_cfg.h
 VP10_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp10_convolve_ssse3.c
 VP10_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp10_convolve_filters_ssse3.c
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+VP10_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp10_highbd_convolve_sse4.c
+VP10_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp10_highbd_convolve_filters_sse4.c
+endif
 VP10_COMMON_SRCS-yes += common/vp10_convolve.c
 VP10_COMMON_SRCS-yes += common/vp10_convolve.h
 VP10_COMMON_SRCS-$(CONFIG_ANS) += common/ans.h