Merge "Add SSE4.1 code for deringing functions." into nextgenv2
av1/av1_common.mk:
@@ -98,6 +98,8 @@ endif
 ifeq ($(CONFIG_DERING),yes)
 AV1_COMMON_SRCS-yes += common/od_dering.c
 AV1_COMMON_SRCS-yes += common/od_dering.h
+AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/od_dering_sse4.c
+AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/od_dering_sse4.h
 AV1_COMMON_SRCS-yes += common/dering.c
 AV1_COMMON_SRCS-yes += common/dering.h
 endif
av1/common/av1_rtcd_defs.pl:
@@ -20,6 +20,7 @@ struct search_site_config;
 struct mv;
 union int_mv;
 struct yv12_buffer_config;
+typedef int16_t od_dering_in;
 EOF
 }
 forward_decls qw/av1_common_forward_decls/;
@@ -840,4 +841,24 @@ if (aom_config("CONFIG_EXT_INTER") eq "yes") {
 
 }
 # end encoder functions
+
+# Deringing Functions
+
+if (aom_config("CONFIG_DERING") eq "yes") {
+  add_proto qw/int od_dir_find8/, "const od_dering_in *img, int stride, int32_t *var, int coeff_shift";
+  specialize qw/od_dir_find8 sse4_1/;
+
+  add_proto qw/int od_filter_dering_direction_4x4/, "int16_t *y, int ystride, const int16_t *in, int threshold, int dir";
+  specialize qw/od_filter_dering_direction_4x4 sse4_1/;
+
+  add_proto qw/int od_filter_dering_direction_8x8/, "int16_t *y, int ystride, const int16_t *in, int threshold, int dir";
+  specialize qw/od_filter_dering_direction_8x8 sse4_1/;
+
+  add_proto qw/void od_filter_dering_orthogonal_4x4/, "int16_t *y, int ystride, const int16_t *in, int threshold, int dir";
+  specialize qw/od_filter_dering_orthogonal_4x4 sse4_1/;
+
+  add_proto qw/void od_filter_dering_orthogonal_8x8/, "int16_t *y, int ystride, const int16_t *in, int threshold, int dir";
+  specialize qw/od_filter_dering_orthogonal_8x8 sse4_1/;
+}
+
 1;
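For readers unfamiliar with libaom's run-time CPU detection: each add_proto/specialize pair above makes the build generate a dispatched symbol in av1_rtcd.h. A minimal sketch of what the generated header provides for one proto, assuming the usual RTCD output shape (the real code is produced by the rtcd scripts at build time, so the details here are illustrative):

int od_dir_find8_c(const od_dering_in *img, int stride, int32_t *var,
                   int coeff_shift);
int od_dir_find8_sse4_1(const od_dering_in *img, int stride, int32_t *var,
                        int coeff_shift);
RTCD_EXTERN int (*od_dir_find8)(const od_dering_in *img, int stride,
                                int32_t *var, int coeff_shift);

/* ...and in setup_rtcd_internal(), roughly: */
od_dir_find8 = od_dir_find8_c;
if (flags & HAS_SSE4_1) od_dir_find8 = od_dir_find8_sse4_1;

The typedef added in the hunk before this one goes into the generated header's prologue so these prototypes can use od_dering_in. This is also why the C implementations below gain a _c suffix and the OD_DERING_VTBL_C indirection can be removed.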
av1/common/dering.c:
@@ -111,7 +111,7 @@ void av1_dering_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
       if (pli) level = (level * 5 + 4) >> 3;
       if (sb_all_skip(cm, sbr * MAX_MIB_SIZE, sbc * MAX_MIB_SIZE)) continue;
       threshold = level << coeff_shift;
-      od_dering(&OD_DERING_VTBL_C, dst, MAX_MIB_SIZE * bsize[pli],
+      od_dering(dst, MAX_MIB_SIZE * bsize[pli],
                 &src[pli][sbr * stride * bsize[pli] * MAX_MIB_SIZE +
                           sbc * bsize[pli] * MAX_MIB_SIZE],
                 stride, nhb, nvb, sbc, sbr, nhsb, nvsb, dec[pli], dir, pli,
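A quick worked example of the two threshold lines above (the values are illustrative, and the relationship between coeff_shift and bit depth is an assumption about code outside this hunk):

/* Illustrative arithmetic only: a luma dering level of 24 maps to a
   chroma level of (24 * 5 + 4) >> 3 = 15, roughly 5/8 of the luma
   level.  With coeff_shift = 2 (e.g. 10-bit input, assuming
   coeff_shift = bit_depth - 8) the filter threshold is 15 << 2 = 60. */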
av1/common/od_dering.c:
@@ -15,11 +15,7 @@
 #include <stdlib.h>
 #include <math.h>
 #include "dering.h"
-
-const od_dering_opt_vtbl OD_DERING_VTBL_C = {
-  { od_filter_dering_direction_4x4_c, od_filter_dering_direction_8x8_c },
-  { od_filter_dering_orthogonal_4x4_c, od_filter_dering_orthogonal_8x8_c }
-};
+#include "./av1_rtcd.h"
 
 /* Generated from gen_filter_tables.c. */
 const int OD_DIRECTION_OFFSETS_TABLE[8][3] = {
@@ -42,8 +38,8 @@ const int OD_DIRECTION_OFFSETS_TABLE[8][3] = {
    in a particular direction. Since each direction has the same sum(x^2) term,
    that term is never computed. See Section 2, step 2, of:
    http://jmvalin.ca/notes/intra_paint.pdf */
-static int od_dir_find8(const od_dering_in *img, int stride, int32_t *var,
-                        int coeff_shift) {
+int od_dir_find8_c(const od_dering_in *img, int stride, int32_t *var,
+                   int coeff_shift) {
   int i;
   int32_t cost[8] = { 0 };
   int partial[8][15] = { { 0 } };
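To make the comment above concrete, here is a scalar sketch of the cost being computed (illustrative only: `nlines` and `len` are hypothetical helper arrays; the real _c code unrolls this with fixed constants):

/* For each direction d, sum the pixels along each of its lines, then
   accumulate the squared line sums, each normalized by its line length.
   The sum(x^2) term common to all directions is omitted, as noted. */
int d, k;
for (d = 0; d < 8; d++) {
  for (k = 0; k < nlines[d]; k++) {
    cost[d] += partial[d][k] * partial[d][k] / len[d][k];
  }
}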
@@ -273,9 +269,8 @@ static void od_compute_thresh(int thresh[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS],
   }
 }
 
-void od_dering(const od_dering_opt_vtbl *vtbl, int16_t *y, int ystride,
-               const od_dering_in *x, int xstride, int nhb, int nvb, int sbx,
-               int sby, int nhsb, int nvsb, int xdec,
+void od_dering(int16_t *y, int ystride, const od_dering_in *x, int xstride,
+               int nhb, int nvb, int sbx, int sby, int nhsb, int nvsb, int xdec,
                int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli,
                unsigned char *bskip, int skip_stride, int threshold,
                int coeff_shift) {
@@ -289,6 +284,12 @@ void od_dering(const od_dering_opt_vtbl *vtbl, int16_t *y, int ystride,
   int32_t var[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS];
   int thresh[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS];
   int thresh2[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS];
+  od_filter_dering_direction_func filter_dering_direction[OD_DERINGSIZES] = {
+    od_filter_dering_direction_4x4, od_filter_dering_direction_8x8
+  };
+  od_filter_dering_orthogonal_func filter_dering_orthogonal[OD_DERINGSIZES] = {
+    od_filter_dering_orthogonal_4x4, od_filter_dering_orthogonal_8x8
+  };
   bsize = 3 - xdec;
   in = inbuf + OD_FILT_BORDER * OD_FILT_BSTRIDE + OD_FILT_BORDER;
   /* We avoid filtering the pixels for which some of the pixels to average
@@ -340,7 +341,7 @@ void od_dering(const od_dering_opt_vtbl *vtbl, int16_t *y, int ystride,
          to be a little bit more aggressive on pure horizontal/vertical
          since the ringing there tends to be directional, so it doesn't
          get removed by the directional filtering. */
-      thresh2[by][bx] = (vtbl->filter_dering_direction[bsize - OD_LOG_BSIZE0])(
+      thresh2[by][bx] = (filter_dering_direction[bsize - OD_LOG_BSIZE0])(
           &y[(by * ystride << bsize) + (bx << bsize)], ystride,
           &in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)], thresh[by][bx],
           dir[by][bx]);
@@ -354,7 +355,7 @@ void od_dering(const od_dering_opt_vtbl *vtbl, int16_t *y, int ystride,
   for (by = 0; by < nvb; by++) {
     for (bx = 0; bx < nhb; bx++) {
       if (thresh[by][bx] == 0) continue;
-      (vtbl->filter_dering_orthogonal[bsize - OD_LOG_BSIZE0])(
+      (filter_dering_orthogonal[bsize - OD_LOG_BSIZE0])(
           &y[(by * ystride << bsize) + (bx << bsize)], ystride,
           &in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)], thresh2[by][bx],
           dir[by][bx]);
av1/common/od_dering.h:
@@ -34,27 +34,11 @@ typedef int (*od_filter_dering_direction_func)(int16_t *y, int ystride,
 typedef void (*od_filter_dering_orthogonal_func)(int16_t *y, int ystride,
                                                  const int16_t *in,
                                                  int threshold, int dir);
-
-struct od_dering_opt_vtbl {
-  od_filter_dering_direction_func filter_dering_direction[OD_DERINGSIZES];
-  od_filter_dering_orthogonal_func filter_dering_orthogonal[OD_DERINGSIZES];
-};
-typedef struct od_dering_opt_vtbl od_dering_opt_vtbl;
-
-void od_dering(const od_dering_opt_vtbl *vtbl, int16_t *y, int ystride,
-               const od_dering_in *x, int xstride, int nvb, int nhb, int sbx,
-               int sby, int nhsb, int nvsb, int xdec,
+void od_dering(int16_t *y, int ystride, const od_dering_in *x, int xstride,
+               int nvb, int nhb, int sbx, int sby, int nhsb, int nvsb, int xdec,
                int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli,
                unsigned char *bskip, int skip_stride, int threshold,
                int coeff_shift);
-void od_filter_dering_direction_c(int16_t *y, int ystride, const int16_t *in,
-                                  int ln, int threshold, int dir);
-void od_filter_dering_orthogonal_c(int16_t *y, int ystride, const int16_t *in,
-                                   const od_dering_in *x, int xstride, int ln,
-                                   int threshold, int dir);
-
-extern const od_dering_opt_vtbl OD_DERING_VTBL_C;
-
 int od_filter_dering_direction_4x4_c(int16_t *y, int ystride, const int16_t *in,
                                      int threshold, int dir);
 int od_filter_dering_direction_8x8_c(int16_t *y, int ystride, const int16_t *in,
@@ -65,5 +49,4 @@ void od_filter_dering_orthogonal_4x4_c(int16_t *y, int ystride,
 void od_filter_dering_orthogonal_8x8_c(int16_t *y, int ystride,
                                        const int16_t *in, int threshold,
                                        int dir);
-
 #endif
av1/common/x86/od_dering_sse4.c (new file, 499 lines):
@@ -0,0 +1,499 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <smmintrin.h>
#include <emmintrin.h>
#include <tmmintrin.h>

#include "./av1_rtcd.h"
#include "av1/common/x86/od_dering_sse4.h"

/* partial A is a 16-bit vector of the form:
   [x8 x7 x6 x5 x4 x3 x2 x1] and partial B has the form:
   [0 y1 y2 y3 y4 y5 y6 y7].
   This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ...
   (x7^2+y7^2)*C7 + (x8^2+0^2)*C8 where the C1..C8 constants are in const1
   and const2. */
static INLINE __m128i fold_mul_and_sum(__m128i partiala, __m128i partialb,
                                       __m128i const1, __m128i const2) {
  __m128i tmp;
  /* Reverse partial B. */
  partialb = _mm_shuffle_epi8(
      partialb,
      _mm_set_epi8(15, 14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12));
  /* Interleave the x and y values of identical indices and pair x8 with 0. */
  tmp = partiala;
  partiala = _mm_unpacklo_epi16(partiala, partialb);
  partialb = _mm_unpackhi_epi16(tmp, partialb);
  /* Square and add the corresponding x and y values. */
  partiala = _mm_madd_epi16(partiala, partiala);
  partialb = _mm_madd_epi16(partialb, partialb);
  /* Multiply by constant. */
  partiala = _mm_mullo_epi32(partiala, const1);
  partialb = _mm_mullo_epi32(partialb, const2);
  /* Sum all results. */
  partiala = _mm_add_epi32(partiala, partialb);
  return partiala;
}

static INLINE __m128i hsum4(__m128i x0, __m128i x1, __m128i x2, __m128i x3) {
  __m128i t0, t1, t2, t3;
  t0 = _mm_unpacklo_epi32(x0, x1);
  t1 = _mm_unpacklo_epi32(x2, x3);
  t2 = _mm_unpackhi_epi32(x0, x1);
  t3 = _mm_unpackhi_epi32(x2, x3);
  x0 = _mm_unpacklo_epi64(t0, t1);
  x1 = _mm_unpackhi_epi64(t0, t1);
  x2 = _mm_unpacklo_epi64(t2, t3);
  x3 = _mm_unpackhi_epi64(t2, t3);
  return _mm_add_epi32(_mm_add_epi32(x0, x1), _mm_add_epi32(x2, x3));
}

/* Horizontal sum of 8x16-bit unsigned values. */
static INLINE int32_t hsum_epi16(__m128i a) {
  a = _mm_madd_epi16(a, _mm_set1_epi16(1));
  a = _mm_hadd_epi32(a, a);
  a = _mm_hadd_epi32(a, a);
  return _mm_cvtsi128_si32(a);
}

/* Computes cost for directions 0, 5, 6 and 7. We can call this function again
   to compute the remaining directions. */
static INLINE __m128i compute_directions(__m128i lines[8],
                                         int32_t tmp_cost1[4]) {
  __m128i partial4a, partial4b, partial5a, partial5b, partial7a, partial7b;
  __m128i partial6;
  __m128i tmp;
  /* Partial sums for lines 0 and 1. */
  partial4a = _mm_slli_si128(lines[0], 14);
  partial4b = _mm_srli_si128(lines[0], 2);
  partial4a = _mm_add_epi16(partial4a, _mm_slli_si128(lines[1], 12));
  partial4b = _mm_add_epi16(partial4b, _mm_srli_si128(lines[1], 4));
  tmp = _mm_add_epi16(lines[0], lines[1]);
  partial5a = _mm_slli_si128(tmp, 10);
  partial5b = _mm_srli_si128(tmp, 6);
  partial7a = _mm_slli_si128(tmp, 4);
  partial7b = _mm_srli_si128(tmp, 12);
  partial6 = tmp;

  /* Partial sums for lines 2 and 3. */
  partial4a = _mm_add_epi16(partial4a, _mm_slli_si128(lines[2], 10));
  partial4b = _mm_add_epi16(partial4b, _mm_srli_si128(lines[2], 6));
  partial4a = _mm_add_epi16(partial4a, _mm_slli_si128(lines[3], 8));
  partial4b = _mm_add_epi16(partial4b, _mm_srli_si128(lines[3], 8));
  tmp = _mm_add_epi16(lines[2], lines[3]);
  partial5a = _mm_add_epi16(partial5a, _mm_slli_si128(tmp, 8));
  partial5b = _mm_add_epi16(partial5b, _mm_srli_si128(tmp, 8));
  partial7a = _mm_add_epi16(partial7a, _mm_slli_si128(tmp, 6));
  partial7b = _mm_add_epi16(partial7b, _mm_srli_si128(tmp, 10));
  partial6 = _mm_add_epi16(partial6, tmp);

  /* Partial sums for lines 4 and 5. */
  partial4a = _mm_add_epi16(partial4a, _mm_slli_si128(lines[4], 6));
  partial4b = _mm_add_epi16(partial4b, _mm_srli_si128(lines[4], 10));
  partial4a = _mm_add_epi16(partial4a, _mm_slli_si128(lines[5], 4));
  partial4b = _mm_add_epi16(partial4b, _mm_srli_si128(lines[5], 12));
  tmp = _mm_add_epi16(lines[4], lines[5]);
  partial5a = _mm_add_epi16(partial5a, _mm_slli_si128(tmp, 6));
  partial5b = _mm_add_epi16(partial5b, _mm_srli_si128(tmp, 10));
  partial7a = _mm_add_epi16(partial7a, _mm_slli_si128(tmp, 8));
  partial7b = _mm_add_epi16(partial7b, _mm_srli_si128(tmp, 8));
  partial6 = _mm_add_epi16(partial6, tmp);

  /* Partial sums for lines 6 and 7. */
  partial4a = _mm_add_epi16(partial4a, _mm_slli_si128(lines[6], 2));
  partial4b = _mm_add_epi16(partial4b, _mm_srli_si128(lines[6], 14));
  partial4a = _mm_add_epi16(partial4a, lines[7]);
  tmp = _mm_add_epi16(lines[6], lines[7]);
  partial5a = _mm_add_epi16(partial5a, _mm_slli_si128(tmp, 4));
  partial5b = _mm_add_epi16(partial5b, _mm_srli_si128(tmp, 12));
  partial7a = _mm_add_epi16(partial7a, _mm_slli_si128(tmp, 10));
  partial7b = _mm_add_epi16(partial7b, _mm_srli_si128(tmp, 6));
  partial6 = _mm_add_epi16(partial6, tmp);

  /* Compute costs in terms of partial sums. */
  partial4a =
      fold_mul_and_sum(partial4a, partial4b, _mm_set_epi32(210, 280, 420, 840),
                       _mm_set_epi32(105, 120, 140, 168));
  partial7a =
      fold_mul_and_sum(partial7a, partial7b, _mm_set_epi32(210, 420, 0, 0),
                       _mm_set_epi32(105, 105, 105, 140));
  partial5a =
      fold_mul_and_sum(partial5a, partial5b, _mm_set_epi32(210, 420, 0, 0),
                       _mm_set_epi32(105, 105, 105, 140));
  partial6 = _mm_madd_epi16(partial6, partial6);
  partial6 = _mm_mullo_epi32(partial6, _mm_set1_epi32(105));

  partial4a = hsum4(partial4a, partial5a, partial6, partial7a);
  _mm_storeu_si128((__m128i *)tmp_cost1, partial4a);
  return partial4a;
}
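A note on the constants used just above: every nonzero value passed to fold_mul_and_sum is of the form 840/k, where k is the number of pixels in the partial sum that lane scales, and 840 = LCM(1..8) keeps all weights integral (unused lanes get 0). A reference table (not part of the commit):

/* 840/k for k = 1..8: the values appearing in the _mm_set_epi32 calls. */
static const int dir_cost_weights[8] = { 840, 420, 280, 210,
                                         168, 140, 120, 105 };

partial6 needs only the single weight 105 = 840/8 because along that direction every line spans all eight pixels.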
/* transpose and reverse the order of the lines -- equivalent to a 90-degree
   counter-clockwise rotation of the pixels. */
static INLINE void array_reverse_transpose_8x8(__m128i *in, __m128i *res) {
  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
  const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
  const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
  const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
  const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);

  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);

  res[7] = _mm_unpacklo_epi64(tr1_0, tr1_1);
  res[6] = _mm_unpackhi_epi64(tr1_0, tr1_1);
  res[5] = _mm_unpacklo_epi64(tr1_2, tr1_3);
  res[4] = _mm_unpackhi_epi64(tr1_2, tr1_3);
  res[3] = _mm_unpacklo_epi64(tr1_4, tr1_5);
  res[2] = _mm_unpackhi_epi64(tr1_4, tr1_5);
  res[1] = _mm_unpacklo_epi64(tr1_6, tr1_7);
  res[0] = _mm_unpackhi_epi64(tr1_6, tr1_7);
}

int od_dir_find8_sse4_1(const od_dering_in *img, int stride, int32_t *var,
                        int coeff_shift) {
  int i;
  int32_t cost[8];
  int32_t best_cost = 0;
  int best_dir = 0;
  __m128i lines[8];
  __m128i dir03, dir47;
  __m128i max;
  for (i = 0; i < 8; i++) {
    lines[i] = _mm_loadu_si128((__m128i *)&img[i * stride]);
    lines[i] = _mm_sub_epi16(_mm_srai_epi16(lines[i], coeff_shift),
                             _mm_set1_epi16(128));
  }

  /* Compute "mostly vertical" directions. */
  dir47 = compute_directions(lines, cost + 4);

  array_reverse_transpose_8x8(lines, lines);

  /* Compute "mostly horizontal" directions. */
  dir03 = compute_directions(lines, cost);

#if 1
  max = _mm_max_epi32(dir03, dir47);
  max = _mm_max_epi32(max, _mm_shuffle_epi32(max, _MM_SHUFFLE(1, 0, 3, 2)));
  max = _mm_max_epi32(max, _mm_shuffle_epi32(max, _MM_SHUFFLE(2, 3, 0, 1)));
  dir03 = _mm_and_si128(_mm_cmpeq_epi32(max, dir03),
                        _mm_setr_epi32(-1, -2, -3, -4));
  dir47 = _mm_and_si128(_mm_cmpeq_epi32(max, dir47),
                        _mm_setr_epi32(-5, -6, -7, -8));
  dir03 = _mm_max_epu32(dir03, dir47);
  dir03 = _mm_max_epu32(dir03, _mm_unpackhi_epi64(dir03, dir03));
  dir03 =
      _mm_max_epu32(dir03, _mm_shufflelo_epi16(dir03, _MM_SHUFFLE(1, 0, 3, 2)));
  dir03 = _mm_xor_si128(dir03, _mm_set1_epi32(0xFFFFFFFF));

  best_dir = _mm_cvtsi128_si32(dir03);
  best_cost = _mm_cvtsi128_si32(max);
#else
  for (i = 0; i < 8; i++) {
    if (cost[i] > best_cost) {
      best_cost = cost[i];
      best_dir = i;
    }
  }
#endif
  /* Difference between the optimal variance and the variance along the
     orthogonal direction. Again, the sum(x^2) terms cancel out. */
  *var = best_cost - cost[(best_dir + 4) & 7];
  /* We'd normally divide by 840, but dividing by 1024 is close enough
     for what we're going to do with this. */
  *var >>= 10;
  return best_dir;
}
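The #if 1 block above resolves the argmax entirely in SIMD registers; the disabled #else path is the plain scalar search. A scalar sketch of the SIMD trick itself (illustrative, not part of the commit):

/* Each lane equal to the maximum is tagged with ~i (i.e. -(i+1), the
   constants in _mm_setr_epi32 above); every other lane becomes 0.  An
   unsigned max over the tags then prefers the smallest index, and a
   final bitwise NOT (the xor with all ones) recovers it. */
uint32_t tag = 0;
int32_t max_cost = cost[0];
for (i = 1; i < 8; i++) {
  if (cost[i] > max_cost) max_cost = cost[i];
}
for (i = 0; i < 8; i++) {
  uint32_t cand = (cost[i] == max_cost) ? ~(uint32_t)i : 0;
  if (cand > tag) tag = cand;
}
best_dir = (int)~tag; /* index of the best direction */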
static INLINE __m128i od_cmplt_abs_epi16(__m128i in, __m128i threshold) {
  return _mm_cmplt_epi16(_mm_abs_epi16(in), threshold);
}

int od_filter_dering_direction_4x4_sse4_1(int16_t *y, int ystride,
                                          const int16_t *in, int threshold,
                                          int dir) {
  int i;
  __m128i sum;
  __m128i p;
  __m128i cmp;
  __m128i row;
  __m128i res;
  __m128i tmp;
  __m128i thresh;
  __m128i total_abs;
  int off1, off2;
  off1 = OD_DIRECTION_OFFSETS_TABLE[dir][0];
  off2 = OD_DIRECTION_OFFSETS_TABLE[dir][1];
  total_abs = _mm_setzero_si128();
  thresh = _mm_set1_epi16(threshold);
  for (i = 0; i < 4; i += 2) {
    sum = _mm_set1_epi16(0);
    row = _mm_unpacklo_epi64(
        _mm_loadl_epi64((__m128i *)&in[i * OD_FILT_BSTRIDE]),
        _mm_loadl_epi64((__m128i *)&in[(i + 1) * OD_FILT_BSTRIDE]));

    /*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
    tmp = _mm_unpacklo_epi64(
        _mm_loadl_epi64((__m128i *)&in[i * OD_FILT_BSTRIDE + off1]),
        _mm_loadl_epi64((__m128i *)&in[(i + 1) * OD_FILT_BSTRIDE + off1]));
    p = _mm_sub_epi16(tmp, row);
    /*if (abs(p) < thresh) sum += taps[k]*p*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = _mm_slli_epi16(p, 2);
    p = _mm_and_si128(p, cmp);
    sum = _mm_add_epi16(sum, p);
    /*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
    tmp = _mm_unpacklo_epi64(
        _mm_loadl_epi64((__m128i *)&in[i * OD_FILT_BSTRIDE - off1]),
        _mm_loadl_epi64((__m128i *)&in[(i + 1) * OD_FILT_BSTRIDE - off1]));
    p = _mm_sub_epi16(tmp, row);
    /*if (abs(p) < thresh) sum += taps[k]*p1*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = _mm_slli_epi16(p, 2);
    p = _mm_and_si128(p, cmp);
    sum = _mm_add_epi16(sum, p);

    /*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
    tmp = _mm_unpacklo_epi64(
        _mm_loadl_epi64((__m128i *)&in[i * OD_FILT_BSTRIDE + off2]),
        _mm_loadl_epi64((__m128i *)&in[(i + 1) * OD_FILT_BSTRIDE + off2]));
    p = _mm_sub_epi16(tmp, row);
    /*if (abs(p) < thresh) sum += taps[k]*p*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = _mm_and_si128(p, cmp);
    sum = _mm_add_epi16(sum, p);
    /*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
    tmp = _mm_unpacklo_epi64(
        _mm_loadl_epi64((__m128i *)&in[i * OD_FILT_BSTRIDE - off2]),
        _mm_loadl_epi64((__m128i *)&in[(i + 1) * OD_FILT_BSTRIDE - off2]));
    p = _mm_sub_epi16(tmp, row);
    /*if (abs(p) < thresh) sum += taps[k]*p1*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = _mm_and_si128(p, cmp);
    sum = _mm_add_epi16(sum, p);

    /*res = row + ((sum + 8) >> 4)*/
    res = _mm_add_epi16(sum, _mm_set1_epi16(8));
    res = _mm_srai_epi16(res, 4);
    total_abs = _mm_add_epi16(total_abs, _mm_abs_epi16(res));
    res = _mm_add_epi16(row, res);
    _mm_storel_epi64((__m128i *)&y[i * ystride], res);
    _mm_storel_epi64((__m128i *)&y[(i + 1) * ystride],
                     _mm_unpackhi_epi64(res, res));
  }
  return (hsum_epi16(total_abs) + 2) >> 2;
}

int od_filter_dering_direction_8x8_sse4_1(int16_t *y, int ystride,
                                          const int16_t *in, int threshold,
                                          int dir) {
  int i;
  __m128i sum;
  __m128i p;
  __m128i cmp;
  __m128i row;
  __m128i res;
  __m128i thresh;
  __m128i total_abs;
  int off1, off2, off3;
  off1 = OD_DIRECTION_OFFSETS_TABLE[dir][0];
  off2 = OD_DIRECTION_OFFSETS_TABLE[dir][1];
  off3 = OD_DIRECTION_OFFSETS_TABLE[dir][2];
  total_abs = _mm_setzero_si128();
  thresh = _mm_set1_epi16(threshold);
  for (i = 0; i < 8; i++) {
    sum = _mm_set1_epi16(0);
    row = _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE]);

    /*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
    p = _mm_sub_epi16(
        _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE + off1]), row);
    /*if (abs(p) < thresh) sum += taps[k]*p*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = _mm_add_epi16(p, _mm_slli_epi16(p, 1));
    p = _mm_and_si128(p, cmp);
    sum = _mm_add_epi16(sum, p);

    /*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
    p = _mm_sub_epi16(
        _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE - off1]), row);
    /*if (abs(p) < thresh) sum += taps[k]*p1*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = _mm_add_epi16(p, _mm_slli_epi16(p, 1));
    p = _mm_and_si128(p, cmp);
    sum = _mm_add_epi16(sum, p);

    /*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
    p = _mm_sub_epi16(
        _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE + off2]), row);
    /*if (abs(p) < thresh) sum += taps[k]*p*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = _mm_slli_epi16(p, 1);
    p = _mm_and_si128(p, cmp);
    sum = _mm_add_epi16(sum, p);

    /*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
    p = _mm_sub_epi16(
        _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE - off2]), row);
    /*if (abs(p) < thresh) sum += taps[k]*p1*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = _mm_slli_epi16(p, 1);
    p = _mm_and_si128(p, cmp);
    sum = _mm_add_epi16(sum, p);

    /*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
    p = _mm_sub_epi16(
        _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE + off3]), row);
    /*if (abs(p) < thresh) sum += taps[k]*p*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = _mm_and_si128(p, cmp);
    sum = _mm_add_epi16(sum, p);

    /*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
    p = _mm_sub_epi16(
        _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE - off3]), row);
    /*if (abs(p) < thresh) sum += taps[k]*p1*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = _mm_and_si128(p, cmp);
    sum = _mm_add_epi16(sum, p);

    /*res = row + ((sum + 8) >> 4)*/
    res = _mm_add_epi16(sum, _mm_set1_epi16(8));
    res = _mm_srai_epi16(res, 4);
    total_abs = _mm_add_epi16(total_abs, _mm_abs_epi16(res));
    res = _mm_add_epi16(row, res);
    _mm_storeu_si128((__m128i *)&y[i * ystride], res);
  }
  return (hsum_epi16(total_abs) + 8) >> 4;
}
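Both direction filters above implement the same conditional-replacement filter along the detected direction; only the taps differ: {4, 1} for the 4x4 path (the _mm_slli_epi16(p, 2) multiply) and {3, 2, 1} for the 8x8 path (the shift-and-add multiplies). A scalar sketch for a single 8x8 pixel (illustrative; off[] stands for the OD_DIRECTION_OFFSETS_TABLE entries):

int k, sum = 0;
static const int taps[3] = { 3, 2, 1 };
for (k = 0; k < 3; k++) {
  int p0 = in[off[k]] - x; /* x is the center pixel, in[0] */
  int p1 = in[-off[k]] - x;
  if (abs(p0) < threshold) sum += taps[k] * p0;
  if (abs(p1) < threshold) sum += taps[k] * p1;
}
y = x + ((sum + 8) >> 4);

The value these functions return is a scaled sum of the absolute adjustments, which od_dering stores in thresh2[][] to drive the orthogonal stage below.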
void od_filter_dering_orthogonal_4x4_sse4_1(int16_t *y, int ystride,
                                            const int16_t *in, int threshold,
                                            int dir) {
  int i;
  int offset;
  __m128i res;
  __m128i p;
  __m128i cmp;
  __m128i row;
  __m128i sum;
  __m128i tmp;
  __m128i thresh;
  thresh = _mm_set1_epi16(threshold);
  if (dir > 0 && dir < 4)
    offset = OD_FILT_BSTRIDE;
  else
    offset = 1;
  for (i = 0; i < 4; i += 2) {
    sum = _mm_set1_epi16(0);
    row = _mm_unpacklo_epi64(
        _mm_loadl_epi64((__m128i *)&in[i * OD_FILT_BSTRIDE]),
        _mm_loadl_epi64((__m128i *)&in[(i + 1) * OD_FILT_BSTRIDE]));

    /*p = in[i*OD_FILT_BSTRIDE + k*offset] - row*/
    tmp = _mm_unpacklo_epi64(
        _mm_loadl_epi64((__m128i *)&in[i * OD_FILT_BSTRIDE + offset]),
        _mm_loadl_epi64((__m128i *)&in[(i + 1) * OD_FILT_BSTRIDE + offset]));
    p = _mm_sub_epi16(tmp, row);
    /*if (abs(p) < threshold) sum += p*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = _mm_and_si128(p, cmp);
    sum = _mm_add_epi16(sum, p);
    /*p = in[i*OD_FILT_BSTRIDE - k*offset] - row*/
    tmp = _mm_unpacklo_epi64(
        _mm_loadl_epi64((__m128i *)&in[i * OD_FILT_BSTRIDE - offset]),
        _mm_loadl_epi64((__m128i *)&in[(i + 1) * OD_FILT_BSTRIDE - offset]));
    p = _mm_sub_epi16(tmp, row);
    /*if (abs(p) < threshold) sum += p*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = _mm_and_si128(p, cmp);
    sum = _mm_add_epi16(sum, p);

    /*row + ((5*sum + 8) >> 4)*/
    res = _mm_mullo_epi16(sum, _mm_set1_epi16(5));
    res = _mm_add_epi16(res, _mm_set1_epi16(8));
    res = _mm_srai_epi16(res, 4);
    res = _mm_add_epi16(res, row);
    _mm_storel_epi64((__m128i *)&y[i * ystride], res);
    _mm_storel_epi64((__m128i *)&y[(i + 1) * ystride],
                     _mm_unpackhi_epi64(res, res));
  }
}

void od_filter_dering_orthogonal_8x8_sse4_1(int16_t *y, int ystride,
                                            const int16_t *in, int threshold,
                                            int dir) {
  int i;
  int offset;
  __m128i res;
  __m128i p;
  __m128i cmp;
  __m128i row;
  __m128i sum;
  __m128i thresh;
  thresh = _mm_set1_epi16(threshold);
  if (dir > 0 && dir < 4)
    offset = OD_FILT_BSTRIDE;
  else
    offset = 1;
  for (i = 0; i < 8; i++) {
    sum = _mm_set1_epi16(0);
    row = _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE]);

    /*p = in[i*OD_FILT_BSTRIDE + k*offset] - row*/
    p = _mm_sub_epi16(
        _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE + 1 * offset]), row);
    /*if (abs(p) < thresh) sum += p*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = _mm_and_si128(p, cmp);
    sum = _mm_add_epi16(sum, p);
    /*p = in[i*OD_FILT_BSTRIDE - k*offset] - row*/
    p = _mm_sub_epi16(
        _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE - 1 * offset]), row);
    /*if (abs(p) < threshold) sum += p*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = _mm_and_si128(p, cmp);
    sum = _mm_add_epi16(sum, p);

    /*p = in[i*OD_FILT_BSTRIDE + k*offset] - row*/
    p = _mm_sub_epi16(
        _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE + 2 * offset]), row);
    /*if (abs(p) < threshold) sum += p*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = _mm_and_si128(p, cmp);
    sum = _mm_add_epi16(sum, p);
    /*p = in[i*OD_FILT_BSTRIDE - k*offset] - row*/
    p = _mm_sub_epi16(
        _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE - 2 * offset]), row);
    /*if (abs(p) < threshold) sum += p*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = _mm_and_si128(p, cmp);
    sum = _mm_add_epi16(sum, p);

    /*row + ((3*sum + 8) >> 4)*/
    res = _mm_mullo_epi16(sum, _mm_set1_epi16(3));
    res = _mm_add_epi16(res, _mm_set1_epi16(8));
    res = _mm_srai_epi16(res, 4);
    res = _mm_add_epi16(res, row);
    _mm_storeu_si128((__m128i *)&y[i * ystride], res);
  }
}
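The orthogonal stage runs the same conditional accumulation across the detected direction: the offset is one row (OD_FILT_BSTRIDE) for dir 1..3 and one pixel otherwise, the taps are all ones, and the gain is smaller. A scalar sketch of the 8x8 variant (illustrative only):

int k, sum = 0;
for (k = 1; k <= 2; k++) {
  int p0 = in[k * offset] - x;
  int p1 = in[-k * offset] - x;
  if (abs(p0) < threshold) sum += p0;
  if (abs(p1) < threshold) sum += p1;
}
y = x + ((3 * sum + 8) >> 4); /* the 4x4 variant uses one offset pair and 5 * sum */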
av1/common/x86/od_dering_sse4.h (new file, 14 lines):
@@ -0,0 +1,14 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
#include "av1/common/od_dering.h"
#ifndef AOM_COMMON_OD_DERING_X86_SSE4_H_
#define AOM_COMMON_OD_DERING_X86_SSE4_H_
#endif  // AOM_COMMON_OD_DERING_X86_SSE4_H_
av1/encoder/pickdering.c:
@@ -108,7 +108,7 @@ int av1_dering_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
       int threshold;
       level = compute_level_from_index(best_level, gi);
       threshold = level << coeff_shift;
-      od_dering(&OD_DERING_VTBL_C, dst, MAX_MIB_SIZE * bsize[0],
+      od_dering(dst, MAX_MIB_SIZE * bsize[0],
                 &src[sbr * stride * bsize[0] * MAX_MIB_SIZE +
                      sbc * bsize[0] * MAX_MIB_SIZE],
                 cm->mi_cols * bsize[0], nhb, nvb, sbc, sbr, nhsb, nvsb, 0,