Hadamard transform based coding mode decision process

This commit uses a Hadamard transform based rate-distortion cost
estimate for RTC coding mode decision. It improves the compression
performance of speed -6 on many hard clips at lower bit rates, e.g.,
5.5% for jimredvga, 6.7% for mmmoving, and 6.1% for niklas720p. It
introduces extra encoding cycle cost at this point.

Change-Id: Iaf70634fa2417a705ee29f2456175b981db3d375
Author: Jingning Han
Date:   2015-03-23 10:02:42 -07:00
Commit: 8c411f74e0 (parent ba13ff8501)
4 changed files with 400 additions and 22 deletions
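
The idea in brief: instead of running a full forward DCT for every candidate
mode, the mode decision loop applies a Hadamard transform (adds and subtracts
only) to the residual and uses the sum of absolute transformed differences
(SATD) as a cheap rate proxy. The following is a minimal, self-contained
sketch of that cost estimate, not the library code below; the 4x4 size and
the helper names are illustrative only.

#include <stdio.h>
#include <stdlib.h>

// Illustrative 1-D 4-point Hadamard butterfly.
static void hadamard4(const int in[4], int out[4]) {
  const int b0 = in[0] + in[1];
  const int b1 = in[0] - in[1];
  const int b2 = in[2] + in[3];
  const int b3 = in[2] - in[3];
  out[0] = b0 + b2;
  out[1] = b1 + b3;
  out[2] = b0 - b2;
  out[3] = b1 - b3;
}

// 2-D transform (rows, then columns); SATD = sum of |coefficients|.
static int satd4x4(const int diff[4][4]) {
  int tmp[4][4], v[4], w[4], i, j, satd = 0;
  for (i = 0; i < 4; ++i) hadamard4(diff[i], tmp[i]);
  for (j = 0; j < 4; ++j) {
    for (i = 0; i < 4; ++i) v[i] = tmp[i][j];
    hadamard4(v, w);
    for (i = 0; i < 4; ++i) satd += abs(w[i]);
  }
  return satd;
}

int main(void) {
  const int diff[4][4] = { { 3, -1,  0,  2 },
                           { 0,  4, -2,  1 },
                           { 1,  0,  0, -3 },
                           { 2, -2,  1,  0 } };
  printf("SATD rate proxy: %d\n", satd4x4(diff));
  return 0;
}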

@@ -1109,6 +1109,15 @@ specialize qw/vp9_avg_8x8 sse2 neon/;
add_proto qw/unsigned int vp9_avg_4x4/, "const uint8_t *, int p";
specialize qw/vp9_avg_4x4 sse2/;

add_proto qw/void vp9_hadamard_8x8/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
specialize qw/vp9_hadamard_8x8 sse2/;

add_proto qw/void vp9_hadamard_16x16/, "int16_t *coeff";
specialize qw/vp9_hadamard_16x16/;

add_proto qw/int16_t vp9_satd/, "const int16_t *coeff, int length";
specialize qw/vp9_satd sse2/;

add_proto qw/void vp9_int_pro_row/, "int16_t *hbuf, uint8_t const *ref, const int ref_stride, const int height";
specialize qw/vp9_int_pro_row sse2/;
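
These add_proto/specialize entries feed libvpx's run-time CPU dispatch: for
each prototype the build generates a function pointer that is set to the C
fallback and then overridden by the SSE2 variant when the CPU supports it. A
hedged sketch of that generated pattern follows; the flag value and pointer
name are illustrative stand-ins, the real code lives in the generated
vp9_rtcd.h.

#include <stdint.h>

// Variants introduced by this commit.
void vp9_hadamard_8x8_c(int16_t const *src_diff, int src_stride,
                        int16_t *coeff);
void vp9_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride,
                           int16_t *coeff);

// Illustrative stand-ins for the generated dispatch machinery.
typedef void (*vp9_hadamard_8x8_fn)(int16_t const *, int, int16_t *);
static vp9_hadamard_8x8_fn vp9_hadamard_8x8_ptr;
#define HAS_SSE2 0x10  // illustrative flag value, not the real constant

static void setup_rtcd(int cpu_flags) {
  vp9_hadamard_8x8_ptr = vp9_hadamard_8x8_c;       // portable fallback
  if (cpu_flags & HAS_SSE2)
    vp9_hadamard_8x8_ptr = vp9_hadamard_8x8_sse2;  // SSE2 specialization
}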

@@ -28,6 +28,87 @@ unsigned int vp9_avg_4x4_c(const uint8_t *s, int p) {
  return (sum + 8) >> 4;
}

// 1-D 8-point Hadamard transform over one column; src_stride is the
// distance between successive input elements. The output permutation
// matches the order produced by the SSE2 version.
static void hadamard_col8(const int16_t *src_diff, int src_stride,
                          int16_t *coeff) {
  int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride];
  int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride];
  int16_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride];
  int16_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride];
  int16_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride];
  int16_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride];
  int16_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride];
  int16_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride];

  int16_t c0 = b0 + b2;
  int16_t c1 = b1 + b3;
  int16_t c2 = b0 - b2;
  int16_t c3 = b1 - b3;
  int16_t c4 = b4 + b6;
  int16_t c5 = b5 + b7;
  int16_t c6 = b4 - b6;
  int16_t c7 = b5 - b7;

  coeff[0] = c0 + c4;
  coeff[7] = c1 + c5;
  coeff[3] = c2 + c6;
  coeff[4] = c3 + c7;
  coeff[2] = c0 - c4;
  coeff[6] = c1 - c5;
  coeff[1] = c2 - c6;
  coeff[5] = c3 - c7;
}

// 2-D 8x8 Hadamard: transform the 8 columns of the residual block, then
// the 8 rows of the intermediate result.
void vp9_hadamard_8x8_c(int16_t const *src_diff, int src_stride,
                        int16_t *coeff) {
  int idx;
  int16_t buffer[64];
  int16_t *tmp_buf = &buffer[0];
  for (idx = 0; idx < 8; ++idx) {
    hadamard_col8(src_diff, src_stride, tmp_buf);  // column transform
    tmp_buf += 8;
    ++src_diff;
  }

  tmp_buf = &buffer[0];
  for (idx = 0; idx < 8; ++idx) {
    hadamard_col8(tmp_buf, 8, coeff);  // row transform
    coeff += 8;
    ++tmp_buf;
  }
}

// In place 16x16 2D Hadamard transform. Expects the four 8x8 Hadamard
// outputs of the block's quadrants stored consecutively in coeff (64
// coefficients each); applies the remaining cross-quadrant butterfly,
// halving the results to preserve 16-bit dynamic range.
void vp9_hadamard_16x16_c(int16_t *coeff) {
  int idx;
  for (idx = 0; idx < 64; ++idx) {
    int16_t a0 = coeff[0];
    int16_t a1 = coeff[64];
    int16_t a2 = coeff[128];
    int16_t a3 = coeff[192];

    int16_t b0 = a0 + a1;
    int16_t b1 = a0 - a1;
    int16_t b2 = a2 + a3;
    int16_t b3 = a2 - a3;

    coeff[0] = (b0 + b2) >> 1;
    coeff[64] = (b1 + b3) >> 1;
    coeff[128] = (b0 - b2) >> 1;
    coeff[192] = (b1 - b3) >> 1;

    ++coeff;
  }
}
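
Reading off the indexing above: vp9_hadamard_16x16_c wants the four quadrant
transforms packed back to back in coeff before it combines them in place. A
sketch of how a caller might assemble the full 16x16 transform; the raster
quadrant order and the helper itself are assumptions for illustration, not
code from this commit.

#include <stdint.h>

void vp9_hadamard_8x8_c(int16_t const *src_diff, int src_stride,
                        int16_t *coeff);
void vp9_hadamard_16x16_c(int16_t *coeff);

// Hypothetical helper: full 16x16 Hadamard of a residual block built from
// four 8x8 transforms plus the in-place combine above.
static void hadamard_16x16_full(const int16_t *src_diff, int src_stride,
                                int16_t coeff[256]) {
  int idx;
  for (idx = 0; idx < 4; ++idx) {
    // Quadrants in raster order; each result occupies 64 coefficients.
    const int16_t *src =
        src_diff + (idx >> 1) * 8 * src_stride + (idx & 1) * 8;
    vp9_hadamard_8x8_c(src, src_stride, coeff + idx * 64);
  }
  vp9_hadamard_16x16_c(coeff);
}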

// Sum of absolute transformed differences. Note the sum is accumulated in
// an int and truncated to int16_t on return.
int16_t vp9_satd_c(const int16_t *coeff, int length) {
  int i;
  int satd = 0;
  for (i = 0; i < length; ++i)
    satd += abs(coeff[i]);

  return (int16_t)satd;
}

// Integer projection onto row vectors.
void vp9_int_pro_row_c(int16_t *hbuf, uint8_t const *ref,
                       const int ref_stride, const int height) {

@@ -20,9 +20,11 @@
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_mvref_common.h"
#include "vp9/common/vp9_pred_common.h"
#include "vp9/common/vp9_reconinter.h"
#include "vp9/common/vp9_reconintra.h"
#include "vp9/encoder/vp9_cost.h"
#include "vp9/encoder/vp9_encoder.h"
#include "vp9/encoder/vp9_pickmode.h"
#include "vp9/encoder/vp9_ratectrl.h"

@@ -188,6 +190,8 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
                                 cond_cost_list(cpi, cost_list),
                                 x->nmvjointcost, x->mvcost,
                                 &dis, &x->pred_sse[ref], NULL, 0, 0);

    *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv,
                               x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
  }

  if (scaled_ref_frame) {

@@ -198,7 +202,6 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
  return rv;
}

static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize,
                              MACROBLOCK *x, MACROBLOCKD *xd,
                              int *out_rate_sum, int64_t *out_dist_sum,

@@ -312,6 +315,105 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize,
  *out_dist_sum += dist << 4;
}

static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
                      int *skippable, int64_t *sse, int plane,
                      BLOCK_SIZE bsize, TX_SIZE tx_size) {
  MACROBLOCKD *xd = &x->e_mbd;
  const struct macroblockd_plane *pd = &xd->plane[plane];
  const struct macroblock_plane *const p = &x->plane[plane];
  const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
  const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
  const int step = 1 << (tx_size << 1);
  const int block_step = (1 << tx_size);
  int block = 0, r, c;
  int shift = tx_size == TX_32X32 ? 0 : 2;
  const int max_blocks_wide = num_4x4_w + (xd->mb_to_right_edge >= 0 ? 0 :
      xd->mb_to_right_edge >> (5 + pd->subsampling_x));
  const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 0 :
      xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));

#if CONFIG_VP9_HIGHBITDEPTH
  // The transform/quantization path below assumes 8-bit coefficients; fall
  // back to the model-based estimate for high bit-depth builds.
  unsigned int var_y, sse_y;
  model_rd_for_sb_y(cpi, bsize, x, xd, rate, dist, &var_y, &sse_y);
  *sse = INT_MAX;
  *skippable = 0;
  return;
#else
  (void)cpi;
#endif

  vp9_subtract_plane(x, bsize, plane);
  *skippable = 1;
  *rate = 0;
  *dist = 0;
  *sse = 0;

  // Keep track of the row and column of the blocks we use so that we know
  // if we are in the unrestricted motion border.
  for (r = 0; r < max_blocks_high; r += block_step) {
    for (c = 0; c < num_4x4_w; c += block_step) {
      if (c < max_blocks_wide) {
        const scan_order *const scan_order = &vp9_default_scan_orders[tx_size];
        tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
        tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
        tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
        uint16_t *const eob = &p->eobs[block];
        const int diff_stride = 4 * num_4x4_blocks_wide_lookup[bsize];
        int i, j;
        const int16_t *src_diff;
        int64_t this_sse;
        txfrm_block_to_raster_xy(bsize, tx_size, block, &i, &j);
        src_diff = &p->src_diff[4 * (j * diff_stride + i)];

        switch (tx_size) {
          case TX_32X32:
            vp9_fdct32x32_rd(src_diff, coeff, diff_stride);
            vp9_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin,
                                  p->round_fp, p->quant_fp, p->quant_shift,
                                  qcoeff, dqcoeff, pd->dequant, eob,
                                  scan_order->scan, scan_order->iscan);
            break;
          case TX_16X16:
            vp9_fdct16x16(src_diff, coeff, diff_stride);
            vp9_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
                            p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
                            pd->dequant, eob,
                            scan_order->scan, scan_order->iscan);
            break;
          case TX_8X8:
            // Use the cheap Hadamard transform in place of the 8x8 DCT for
            // the rate-distortion estimate.
            vp9_hadamard_8x8(src_diff, diff_stride, (int16_t *)coeff);
            vp9_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp,
                            p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
                            pd->dequant, eob,
                            scan_order->scan, scan_order->iscan);
            break;
          case TX_4X4:
            x->fwd_txm4x4(src_diff, coeff, diff_stride);
            vp9_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp,
                            p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
                            pd->dequant, eob,
                            scan_order->scan, scan_order->iscan);
            break;
          default:
            assert(0);
            break;
        }

        // Distortion is measured on the dequantized coefficients; rate is
        // approximated by the sum of absolute quantized coefficients (SATD).
        *dist += vp9_block_error(coeff, dqcoeff, step << 4, &this_sse) >> shift;
        *rate += (int)vp9_satd((const int16_t *)qcoeff, step << 4);
        *sse += (this_sse >> shift);
        *skippable &= (*eob == 0);
      }
      block += step;
    }
  }

  // Scale the coefficient-magnitude sum into the bit-cost domain used by
  // the RDCOST comparison.
  *rate <<= 8;
  *rate *= 6;
}
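
block_yrd hands its rate and distortion to the RDCOST comparison in the mode
loop further down, which decides whether coding the coefficients beats
forcing a skip. Here is that decision in isolation as a worked illustration;
the RDCOST definition is copied on the assumption that it matches the macro
VP9 used at the time (see vp9/encoder/vp9_rd.h), so treat this as a sketch.

#include <stdint.h>

// Assumed form of VP9's rate-distortion cost macro at the time of this
// commit; rate is weighted by rdmult and distortion is shifted by rddiv.
#define RDCOST(RM, DM, R, D) \
  (((128 + (R) * (RM)) >> 8) + ((D) << (DM)))

// Pick the cheaper of coding the coefficients vs. forcing a skip, mirroring
// the logic in vp9_pick_inter_mode below.
static int64_t skip_or_code(int rdmult, int rddiv, int rate, int64_t dist,
                            int64_t sse, int *force_skip) {
  const int64_t rd_coded = RDCOST(rdmult, rddiv, rate, dist);
  const int64_t rd_skip = RDCOST(rdmult, rddiv, 0, sse);
  *force_skip = rd_skip <= rd_coded;
  return *force_skip ? rd_skip : rd_coded;
}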

static void model_rd_for_sb_uv(VP9_COMP *cpi, BLOCK_SIZE bsize,
                               MACROBLOCK *x, MACROBLOCKD *xd,
                               int *out_rate_sum, int64_t *out_dist_sum,

@@ -518,7 +620,9 @@ static void estimate_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
  int i, j;
  int rate;
  int64_t dist;
  unsigned int var_y, sse_y;
  int64_t this_sse;
  int is_skippable;

  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
  assert(plane == 0);
  (void) plane;

@@ -533,8 +637,16 @@ static void estimate_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
                          x->skip_encode ? src_stride : dst_stride,
                          pd->dst.buf, dst_stride,
                          i, j, 0);
  // This procedure assumes zero offset from p->src.buf and pd->dst.buf.
  model_rd_for_sb_y(cpi, bsize_tx, x, xd, &rate, &dist, &var_y, &sse_y);
  // TODO(jingning): This needs further refactoring.
  block_yrd(cpi, x, &rate, &dist, &is_skippable, &this_sse, 0,
            bsize_tx, tx_size);
  x->skip_txfm[0] = is_skippable;
  if (is_skippable)
    rate = vp9_cost_bit(vp9_get_skip_prob(&cpi->common, xd), 1);
  else
    rate += vp9_cost_bit(vp9_get_skip_prob(&cpi->common, xd), 0);

  p->src.buf = src_buf_base;
  pd->dst.buf = dst_buf_base;
  args->rate += rate;

@@ -602,10 +714,6 @@ void vp9_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *rd_cost,
  *rd_cost = best_rdc;
}

static const int ref_frame_cost[MAX_REF_FRAMES] = {
  1235, 229, 530, 615,
};

typedef struct {
  MV_REFERENCE_FRAME ref_frame;
  PREDICTION_MODE pred_mode;

@@ -682,6 +790,20 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
  int ref_frame_skip_mask = 0;
  int idx;
  int best_pred_sad = INT_MAX;
  int ref_frame_cost[MAX_REF_FRAMES];
  vp9_prob intra_inter_p = vp9_get_intra_inter_prob(cm, xd);
  vp9_prob ref_single_p1 = vp9_get_pred_prob_single_ref_p1(cm, xd);
  vp9_prob ref_single_p2 = vp9_get_pred_prob_single_ref_p2(cm, xd);

  // Per-block reference frame costs derived from the current entropy
  // context, replacing the fixed table removed above.
  ref_frame_cost[INTRA_FRAME] = vp9_cost_bit(intra_inter_p, 0);
  ref_frame_cost[LAST_FRAME] = ref_frame_cost[GOLDEN_FRAME] =
      ref_frame_cost[ALTREF_FRAME] = vp9_cost_bit(intra_inter_p, 1);

  ref_frame_cost[LAST_FRAME] += vp9_cost_bit(ref_single_p1, 0);
  ref_frame_cost[GOLDEN_FRAME] += vp9_cost_bit(ref_single_p1, 1);
  ref_frame_cost[ALTREF_FRAME] += vp9_cost_bit(ref_single_p1, 1);
  ref_frame_cost[GOLDEN_FRAME] += vp9_cost_bit(ref_single_p2, 0);
  ref_frame_cost[ALTREF_FRAME] += vp9_cost_bit(ref_single_p2, 1);

  if (reuse_inter_pred) {
    int i;

@@ -773,6 +895,9 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
    int mode_index;
    int i;
    PREDICTION_MODE this_mode = ref_mode_set[idx].pred_mode;
    int64_t this_sse;
    int is_skippable;

    if (!(cpi->sf.inter_mode_mask[bsize] & (1 << this_mode)))
      continue;

@@ -935,17 +1060,40 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
              vp9_get_switchable_rate(cpi, xd) : 0;
    }

    // TODO(jingning): disable color operations temporarily.
    // chroma component rate-distortion cost modeling
    if (x->color_sensitivity[0] || x->color_sensitivity[1]) {
      int uv_rate = 0;
      int64_t uv_dist = 0;
      if (x->color_sensitivity[0])
        vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 1);
      if (x->color_sensitivity[1])
        vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 2);
      model_rd_for_sb_uv(cpi, bsize, x, xd, &uv_rate, &uv_dist, &var_y, &sse_y);
      this_rdc.rate += uv_rate;
      this_rdc.dist += uv_dist;
    // if (x->color_sensitivity[0] || x->color_sensitivity[1]) {
    //   int uv_rate = 0;
    //   int64_t uv_dist = 0;
    //   if (x->color_sensitivity[0])
    //     vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 1);
    //   if (x->color_sensitivity[1])
    //     vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 2);
    //   model_rd_for_sb_uv(cpi, bsize, x, xd, &uv_rate, &uv_dist,
    //                      &var_y, &sse_y);
    //   this_rdc.rate += uv_rate;
    //   this_rdc.dist += uv_dist;
    // }

    vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);

    // Luma rate-distortion cost from the transform-domain estimate.
    block_yrd(cpi, x, &this_rdc.rate, &this_rdc.dist, &is_skippable,
              &this_sse, 0, bsize, mbmi->tx_size);
    x->skip_txfm[0] = is_skippable;
    if (is_skippable) {
      // Every transform block quantized to all zeros: only the skip flag
      // needs to be coded.
      this_rdc.rate = vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
    } else {
      if (RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist) <
          RDCOST(x->rdmult, x->rddiv, 0, this_sse)) {
        this_rdc.rate += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
      } else {
        // Forcing skip is cheaper in rate-distortion terms than coding the
        // coefficients, so take the prediction error as the distortion.
        this_rdc.rate = vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
        this_rdc.dist = this_sse;
      }
    }

    if (cm->interp_filter == SWITCHABLE) {
      if ((mbmi->mv[0].as_mv.row | mbmi->mv[0].as_mv.col) & 0x07)
        this_rdc.rate += vp9_get_switchable_rate(cpi, xd);
    }

    this_rdc.rate += rate_mv;

@@ -1042,6 +1190,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
    const PREDICTION_MODE this_mode = intra_mode_list[i];
    if (!((1 << this_mode) & cpi->sf.intra_y_mode_mask[intra_tx_size]))
      continue;

    mbmi->mode = this_mode;
    mbmi->ref_frame[0] = INTRA_FRAME;
    args.mode = this_mode;
    args.rate = 0;
    args.dist = 0;

@@ -1058,17 +1208,17 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
    if (this_rdc.rdcost < best_rdc.rdcost) {
      best_rdc = this_rdc;
      mbmi->mode = this_mode;
      best_mode = this_mode;
      best_intra_tx_size = mbmi->tx_size;
      mbmi->ref_frame[0] = INTRA_FRAME;
      best_ref_frame = INTRA_FRAME;
      mbmi->uv_mode = this_mode;
      mbmi->mv[0].as_int = INVALID_MV;
      best_mode_skip_txfm = x->skip_txfm[0];
    }
  }

  // Reset mb_mode_info to the best inter mode.
  if (mbmi->ref_frame[0] != INTRA_FRAME) {
  x->skip_txfm[0] = best_mode_skip_txfm;
  if (best_ref_frame != INTRA_FRAME) {
    mbmi->tx_size = best_tx_size;
  } else {
    mbmi->tx_size = best_intra_tx_size;

@@ -1076,6 +1226,9 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
  }

  pd->dst = orig_dst;
  mbmi->mode = best_mode;
  mbmi->ref_frame[0] = best_ref_frame;
  x->skip_txfm[0] = best_mode_skip_txfm;

  if (reuse_inter_pred && best_pred != NULL) {
    if (best_pred->data != orig_dst.buf && is_inter_mode(mbmi->mode)) {

@@ -57,6 +57,141 @@ unsigned int vp9_avg_4x4_sse2(const uint8_t *s, int p) {
  return (avg + 8) >> 4;
}

// 8-point Hadamard over eight 16-bit lanes at once. The first pass
// (iter == 0) transforms columns and ends with a 16-bit transpose built
// from the unpack ladder, so the second pass (iter == 1) transforms rows.
// Both passes produce the same output permutation as the C hadamard_col8.
static void hadamard_col8_sse2(__m128i *in, int iter) {
  __m128i a0 = in[0];
  __m128i a1 = in[1];
  __m128i a2 = in[2];
  __m128i a3 = in[3];
  __m128i a4 = in[4];
  __m128i a5 = in[5];
  __m128i a6 = in[6];
  __m128i a7 = in[7];

  __m128i b0 = _mm_add_epi16(a0, a1);
  __m128i b1 = _mm_sub_epi16(a0, a1);
  __m128i b2 = _mm_add_epi16(a2, a3);
  __m128i b3 = _mm_sub_epi16(a2, a3);
  __m128i b4 = _mm_add_epi16(a4, a5);
  __m128i b5 = _mm_sub_epi16(a4, a5);
  __m128i b6 = _mm_add_epi16(a6, a7);
  __m128i b7 = _mm_sub_epi16(a6, a7);

  a0 = _mm_add_epi16(b0, b2);
  a1 = _mm_add_epi16(b1, b3);
  a2 = _mm_sub_epi16(b0, b2);
  a3 = _mm_sub_epi16(b1, b3);
  a4 = _mm_add_epi16(b4, b6);
  a5 = _mm_add_epi16(b5, b7);
  a6 = _mm_sub_epi16(b4, b6);
  a7 = _mm_sub_epi16(b5, b7);

  if (iter == 0) {
    b0 = _mm_add_epi16(a0, a4);
    b1 = _mm_add_epi16(a1, a5);
    b2 = _mm_add_epi16(a2, a6);
    b3 = _mm_add_epi16(a3, a7);
    b4 = _mm_sub_epi16(a0, a4);
    b5 = _mm_sub_epi16(a1, a5);
    b6 = _mm_sub_epi16(a2, a6);
    b7 = _mm_sub_epi16(a3, a7);

    // 8x8 transpose of 16-bit elements: interleave 16-, 32-, then 64-bit
    // units.
    a0 = _mm_unpacklo_epi16(b0, b1);
    a1 = _mm_unpacklo_epi16(b2, b3);
    a2 = _mm_unpackhi_epi16(b0, b1);
    a3 = _mm_unpackhi_epi16(b2, b3);
    a4 = _mm_unpacklo_epi16(b4, b5);
    a5 = _mm_unpacklo_epi16(b6, b7);
    a6 = _mm_unpackhi_epi16(b4, b5);
    a7 = _mm_unpackhi_epi16(b6, b7);

    b0 = _mm_unpacklo_epi32(a0, a1);
    b1 = _mm_unpacklo_epi32(a4, a5);
    b2 = _mm_unpackhi_epi32(a0, a1);
    b3 = _mm_unpackhi_epi32(a4, a5);
    b4 = _mm_unpacklo_epi32(a2, a3);
    b5 = _mm_unpacklo_epi32(a6, a7);
    b6 = _mm_unpackhi_epi32(a2, a3);
    b7 = _mm_unpackhi_epi32(a6, a7);

    in[0] = _mm_unpacklo_epi64(b0, b1);
    in[7] = _mm_unpackhi_epi64(b0, b1);
    in[3] = _mm_unpacklo_epi64(b2, b3);
    in[4] = _mm_unpackhi_epi64(b2, b3);
    in[2] = _mm_unpacklo_epi64(b4, b5);
    in[6] = _mm_unpackhi_epi64(b4, b5);
    in[1] = _mm_unpacklo_epi64(b6, b7);
    in[5] = _mm_unpackhi_epi64(b6, b7);
  } else {
    in[0] = _mm_add_epi16(a0, a4);
    in[7] = _mm_add_epi16(a1, a5);
    in[3] = _mm_add_epi16(a2, a6);
    in[4] = _mm_add_epi16(a3, a7);
    in[2] = _mm_sub_epi16(a0, a4);
    in[6] = _mm_sub_epi16(a1, a5);
    in[1] = _mm_sub_epi16(a2, a6);
    in[5] = _mm_sub_epi16(a3, a7);
  }
}

void vp9_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride,
                           int16_t *coeff) {
  __m128i src[8];
  // Aligned loads: assumes src_diff is 16-byte aligned and src_stride
  // keeps each row aligned.
  src[0] = _mm_load_si128((const __m128i *)src_diff);
  src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[7] = _mm_load_si128((const __m128i *)(src_diff += src_stride));

  hadamard_col8_sse2(src, 0);  // columns, then transpose
  hadamard_col8_sse2(src, 1);  // rows

  _mm_storeu_si128((__m128i *)coeff, src[0]);
  coeff += 8;
  _mm_storeu_si128((__m128i *)coeff, src[1]);
  coeff += 8;
  _mm_storeu_si128((__m128i *)coeff, src[2]);
  coeff += 8;
  _mm_storeu_si128((__m128i *)coeff, src[3]);
  coeff += 8;
  _mm_storeu_si128((__m128i *)coeff, src[4]);
  coeff += 8;
  _mm_storeu_si128((__m128i *)coeff, src[5]);
  coeff += 8;
  _mm_storeu_si128((__m128i *)coeff, src[6]);
  coeff += 8;
  _mm_storeu_si128((__m128i *)coeff, src[7]);
}

int16_t vp9_satd_sse2(const int16_t *coeff, int length) {
  int i;
  __m128i sum = _mm_load_si128((const __m128i *)coeff);
  __m128i sign = _mm_srai_epi16(sum, 15);
  __m128i val = _mm_xor_si128(sum, sign);
  sum = _mm_sub_epi16(val, sign);  // abs() via the sign-mask trick
  coeff += 8;

  for (i = 8; i < length; i += 8) {
    __m128i src_line = _mm_load_si128((const __m128i *)coeff);
    sign = _mm_srai_epi16(src_line, 15);
    val = _mm_xor_si128(src_line, sign);
    val = _mm_sub_epi16(val, sign);
    sum = _mm_add_epi16(sum, val);
    coeff += 8;
  }

  // Horizontal reduction of the eight 16-bit partial sums.
  val = _mm_srli_si128(sum, 8);
  sum = _mm_add_epi16(sum, val);
  val = _mm_srli_epi64(sum, 32);
  sum = _mm_add_epi16(sum, val);
  val = _mm_srli_epi32(sum, 16);
  sum = _mm_add_epi16(sum, val);

  return _mm_extract_epi16(sum, 0);
}
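
The SSE2 paths are expected to match the C reference bit for bit. Below is a
hypothetical spot check (not part of the commit) exercising both; the static
buffers are 16-byte aligned because the SSE2 code uses _mm_load_si128, and
the alignment attribute shown is GCC/Clang syntax.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

void vp9_hadamard_8x8_c(int16_t const *src_diff, int src_stride,
                        int16_t *coeff);
void vp9_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride,
                           int16_t *coeff);
int16_t vp9_satd_c(const int16_t *coeff, int length);
int16_t vp9_satd_sse2(const int16_t *coeff, int length);

int main(void) {
  static int16_t src[64] __attribute__((aligned(16)));
  static int16_t coeff_c[64] __attribute__((aligned(16)));
  static int16_t coeff_sse2[64] __attribute__((aligned(16)));
  int i, trial;

  srand(0);
  for (trial = 0; trial < 1000; ++trial) {
    // Residual values in the 8-bit pixel-difference range.
    for (i = 0; i < 64; ++i) src[i] = (int16_t)(rand() % 511 - 255);
    vp9_hadamard_8x8_c(src, 8, coeff_c);
    vp9_hadamard_8x8_sse2(src, 8, coeff_sse2);
    if (memcmp(coeff_c, coeff_sse2, sizeof(coeff_c)) != 0 ||
        vp9_satd_c(coeff_c, 64) != vp9_satd_sse2(coeff_sse2, 64)) {
      printf("mismatch on trial %d\n", trial);
      return 1;
    }
  }
  printf("C and SSE2 paths match\n");
  return 0;
}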

void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const *ref,
                          const int ref_stride, const int height) {
  int idx;