From 432cd4bfb795534c8ba479fd735cc11dc8562469 Mon Sep 17 00:00:00 2001
From: Jingning Han
Date: Mon, 6 Jul 2015 09:33:27 -0700
Subject: [PATCH] Move subtract functions from vp9 to vpx_dsp

Factor out the subtraction operator as a common function.

Change-Id: I526e703477c6a290e0e3e3c8898f8bb1ca82779b
---
 test/vp9_subtract_test.cc    |   7 +-
 vp8/encoder/encodemb.c       |   2 +
 vp9/common/vp9_rtcd_defs.pl  |   6 -
 vp9/encoder/vp9_encodemb.c   |  60 +-----
 vp9/encoder/vp9_rdopt.c      |  11 +-
 vp9/vp9cx.mk                 |   3 -
 .../arm/subtract_neon.c      |   5 +-
 vpx_dsp/mips/macros_msa.h    | 193 ++++++++++++++++++
 .../mips/subtract_msa.c      |  10 +-
 vpx_dsp/subtract.c           |  56 +++++
 vpx_dsp/vpx_dsp.mk           |   4 +
 vpx_dsp/vpx_dsp_rtcd_defs.pl |  12 ++
 .../x86/subtract_sse2.asm    |   3 +-
 13 files changed, 297 insertions(+), 75 deletions(-)
 rename vp9/encoder/arm/neon/vp9_subtract_neon.c => vpx_dsp/arm/subtract_neon.c (97%)
 rename vp9/encoder/mips/msa/vp9_subtract_msa.c => vpx_dsp/mips/subtract_msa.c (97%)
 create mode 100644 vpx_dsp/subtract.c
 rename vp9/encoder/x86/vp9_subtract_sse2.asm => vpx_dsp/x86/subtract_sse2.asm (98%)

diff --git a/test/vp9_subtract_test.cc b/test/vp9_subtract_test.cc
index fabb43824..a1798d706 100644
--- a/test/vp9_subtract_test.cc
+++ b/test/vp9_subtract_test.cc
@@ -14,6 +14,7 @@
 #include "test/register_state_check.h"
 #include "./vpx_config.h"
 #include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vp9/common/vp9_blockd.h"
 #include "vpx_mem/vpx_mem.h"

@@ -89,15 +90,15 @@ TEST_P(VP9SubtractBlockTest, SimpleSubtract) {
 }

 INSTANTIATE_TEST_CASE_P(C, VP9SubtractBlockTest,
-                        ::testing::Values(vp9_subtract_block_c));
+                        ::testing::Values(vpx_subtract_block_c));

 #if HAVE_SSE2 && CONFIG_USE_X86INC
 INSTANTIATE_TEST_CASE_P(SSE2, VP9SubtractBlockTest,
-                        ::testing::Values(vp9_subtract_block_sse2));
+                        ::testing::Values(vpx_subtract_block_sse2));
 #endif
 #if HAVE_NEON
 INSTANTIATE_TEST_CASE_P(NEON, VP9SubtractBlockTest,
-                        ::testing::Values(vp9_subtract_block_neon));
+                        ::testing::Values(vpx_subtract_block_neon));
 #endif

 }  // namespace vp9
diff --git a/vp8/encoder/encodemb.c b/vp8/encoder/encodemb.c
index dfd0a237a..820b1376b 100644
--- a/vp8/encoder/encodemb.c
+++ b/vp8/encoder/encodemb.c
@@ -19,6 +19,8 @@
 #include "vpx_mem/vpx_mem.h"
 #include "rdopt.h"

+// TODO(jingning,johannkoenig): use vpx_subtract_block to replace
+// codec specific vp9_subtract_ functions.
void vp8_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch) { unsigned char *src_ptr = (*(be->base_src) + be->src); diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index d8c7e1447..e3ab06d93 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -922,9 +922,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # ENCODEMB INVOKE -add_proto qw/void vp9_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"; -specialize qw/vp9_subtract_block neon msa/, "$sse2_x86inc"; - # # Denoiser # @@ -1328,9 +1325,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd"; specialize qw/vp9_highbd_block_error sse2/; - add_proto qw/void vp9_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd"; - specialize qw/vp9_highbd_subtract_block/; - add_proto qw/void vp9_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vp9_highbd_quantize_fp/; diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 2829365e5..313094140 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -11,6 +11,7 @@ #include "./vp9_rtcd.h" #include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" @@ -31,45 +32,6 @@ struct optimize_ctx { ENTROPY_CONTEXT tl[MAX_MB_PLANE][16]; }; -void vp9_subtract_block_c(int rows, int cols, - int16_t *diff, ptrdiff_t diff_stride, - const uint8_t *src, ptrdiff_t src_stride, - const uint8_t *pred, ptrdiff_t pred_stride) { - int r, c; - - for (r = 0; r < rows; r++) { - for (c = 0; c < cols; c++) - diff[c] = src[c] - pred[c]; - - diff += diff_stride; - pred += pred_stride; - src += src_stride; - } -} - -#if CONFIG_VP9_HIGHBITDEPTH -void vp9_highbd_subtract_block_c(int rows, int cols, - int16_t *diff, ptrdiff_t diff_stride, - const uint8_t *src8, ptrdiff_t src_stride, - const uint8_t *pred8, ptrdiff_t pred_stride, - int bd) { - int r, c; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); - (void) bd; - - for (r = 0; r < rows; r++) { - for (c = 0; c < cols; c++) { - diff[c] = src[c] - pred[c]; - } - - diff += diff_stride; - pred += pred_stride; - src += src_stride; - } -} -#endif // CONFIG_VP9_HIGHBITDEPTH - void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) { struct macroblock_plane *const p = &x->plane[plane]; const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane]; @@ -79,13 +41,13 @@ void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) { #if CONFIG_VP9_HIGHBITDEPTH if (x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - vp9_highbd_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, + vpx_highbd_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, x->e_mbd.bd); return; } #endif // CONFIG_VP9_HIGHBITDEPTH - vp9_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, 
p->src.stride, + vpx_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride); } @@ -838,7 +800,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, x->skip_encode ? src_stride : dst_stride, dst, dst_stride, i, j, plane); if (!x->skip_recode) { - vp9_highbd_subtract_block(32, 32, src_diff, diff_stride, + vpx_highbd_subtract_block(32, 32, src_diff, diff_stride, src, src_stride, dst, dst_stride, xd->bd); highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); vp9_highbd_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, @@ -859,7 +821,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, x->skip_encode ? src_stride : dst_stride, dst, dst_stride, i, j, plane); if (!x->skip_recode) { - vp9_highbd_subtract_block(16, 16, src_diff, diff_stride, + vpx_highbd_subtract_block(16, 16, src_diff, diff_stride, src, src_stride, dst, dst_stride, xd->bd); vp9_highbd_fht16x16(src_diff, coeff, diff_stride, tx_type); vp9_highbd_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, @@ -881,7 +843,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, x->skip_encode ? src_stride : dst_stride, dst, dst_stride, i, j, plane); if (!x->skip_recode) { - vp9_highbd_subtract_block(8, 8, src_diff, diff_stride, + vpx_highbd_subtract_block(8, 8, src_diff, diff_stride, src, src_stride, dst, dst_stride, xd->bd); vp9_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type); vp9_highbd_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, @@ -904,7 +866,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, dst, dst_stride, i, j, plane); if (!x->skip_recode) { - vp9_highbd_subtract_block(4, 4, src_diff, diff_stride, + vpx_highbd_subtract_block(4, 4, src_diff, diff_stride, src, src_stride, dst, dst_stride, xd->bd); if (tx_type != DCT_DCT) vp9_highbd_fht4x4(src_diff, coeff, diff_stride, tx_type); @@ -946,7 +908,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, x->skip_encode ? src_stride : dst_stride, dst, dst_stride, i, j, plane); if (!x->skip_recode) { - vp9_subtract_block(32, 32, src_diff, diff_stride, + vpx_subtract_block(32, 32, src_diff, diff_stride, src, src_stride, dst, dst_stride); fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round, @@ -966,7 +928,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, x->skip_encode ? src_stride : dst_stride, dst, dst_stride, i, j, plane); if (!x->skip_recode) { - vp9_subtract_block(16, 16, src_diff, diff_stride, + vpx_subtract_block(16, 16, src_diff, diff_stride, src, src_stride, dst, dst_stride); vp9_fht16x16(src_diff, coeff, diff_stride, tx_type); vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, @@ -986,7 +948,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, x->skip_encode ? 
src_stride : dst_stride, dst, dst_stride, i, j, plane); if (!x->skip_recode) { - vp9_subtract_block(8, 8, src_diff, diff_stride, + vpx_subtract_block(8, 8, src_diff, diff_stride, src, src_stride, dst, dst_stride); vp9_fht8x8(src_diff, coeff, diff_stride, tx_type); vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant, @@ -1007,7 +969,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, dst, dst_stride, i, j, plane); if (!x->skip_recode) { - vp9_subtract_block(4, 4, src_diff, diff_stride, + vpx_subtract_block(4, 4, src_diff, diff_stride, src, src_stride, dst, dst_stride); if (tx_type != DCT_DCT) vp9_fht4x4(src_diff, coeff, diff_stride, tx_type); diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index e6b7f193a..dc054f0c1 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -12,6 +12,7 @@ #include #include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" @@ -832,7 +833,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, x->skip_encode ? src : dst, x->skip_encode ? src_stride : dst_stride, dst, dst_stride, idx, idy, 0); - vp9_highbd_subtract_block(4, 4, src_diff, 8, src, src_stride, + vpx_highbd_subtract_block(4, 4, src_diff, 8, src, src_stride, dst, dst_stride, xd->bd); if (xd->lossless) { const scan_order *so = &vp9_default_scan_orders[TX_4X4]; @@ -932,7 +933,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, x->skip_encode ? src : dst, x->skip_encode ? src_stride : dst_stride, dst, dst_stride, idx, idy, 0); - vp9_subtract_block(4, 4, src_diff, 8, src, src_stride, dst, dst_stride); + vpx_subtract_block(4, 4, src_diff, 8, src, src_stride, dst, dst_stride); if (xd->lossless) { const scan_order *so = &vp9_default_scan_orders[TX_4X4]; @@ -1394,16 +1395,16 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - vp9_highbd_subtract_block( + vpx_highbd_subtract_block( height, width, vp9_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8, src, p->src.stride, dst, pd->dst.stride, xd->bd); } else { - vp9_subtract_block( + vpx_subtract_block( height, width, vp9_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8, src, p->src.stride, dst, pd->dst.stride); } #else - vp9_subtract_block(height, width, + vpx_subtract_block(height, width, vp9_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8, src, p->src.stride, dst, pd->dst.stride); #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index 5a039accc..f670d82e7 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -114,7 +114,6 @@ endif ifeq ($(CONFIG_USE_X86INC),yes) VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm -VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_sse2.c VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance.asm ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) @@ -151,7 +150,6 @@ VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c endif VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_avg_neon.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c -VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_subtract_neon.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_variance_neon.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_avg_msa.c @@ -161,7 +159,6 @@ VP9_CX_SRCS-$(HAVE_MSA) += 
encoder/mips/msa/vp9_fdct8x8_msa.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct16x16_msa.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct32x32_msa.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct_msa.h -VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_subtract_msa.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_temporal_filter_msa.c VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes)) diff --git a/vp9/encoder/arm/neon/vp9_subtract_neon.c b/vpx_dsp/arm/subtract_neon.c similarity index 97% rename from vp9/encoder/arm/neon/vp9_subtract_neon.c rename to vpx_dsp/arm/subtract_neon.c index b4bf567db..7b146095e 100644 --- a/vp9/encoder/arm/neon/vp9_subtract_neon.c +++ b/vpx_dsp/arm/subtract_neon.c @@ -9,12 +9,11 @@ */ #include -#include "./vp9_rtcd.h" -#include "./vpx_config.h" +#include "./vpx_config.h" #include "vpx/vpx_integer.h" -void vp9_subtract_block_neon(int rows, int cols, +void vpx_subtract_block_neon(int rows, int cols, int16_t *diff, ptrdiff_t diff_stride, const uint8_t *src, ptrdiff_t src_stride, const uint8_t *pred, ptrdiff_t pred_stride) { diff --git a/vpx_dsp/mips/macros_msa.h b/vpx_dsp/mips/macros_msa.h index 30e152bc1..8410f4bbd 100644 --- a/vpx_dsp/mips/macros_msa.h +++ b/vpx_dsp/mips/macros_msa.h @@ -24,6 +24,9 @@ #define LD_UH(...) LD_H(v8u16, __VA_ARGS__) #define LD_SH(...) LD_H(v8i16, __VA_ARGS__) +#define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) +#define ST_SH(...) ST_H(v8i16, __VA_ARGS__) + #if (__mips_isa_rev >= 6) #define LW(psrc) ({ \ const uint8_t *psrc_m = (const uint8_t *)(psrc); \ @@ -38,6 +41,61 @@ \ val_m; \ }) + +#if (__mips == 64) +#define LD(psrc) ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint64_t val_m = 0; \ + \ + __asm__ __volatile__ ( \ + "ld %[val_m], %[psrc_m] \n\t" \ + \ + : [val_m] "=r" (val_m) \ + : [psrc_m] "m" (*psrc_m) \ + ); \ + \ + val_m; \ +}) +#else // !(__mips == 64) +#define LD(psrc) ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint32_t val0_m, val1_m; \ + uint64_t val_m = 0; \ + \ + val0_m = LW(psrc_m); \ + val1_m = LW(psrc_m + 4); \ + \ + val_m = (uint64_t)(val1_m); \ + val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \ + val_m = (uint64_t)(val_m | (uint64_t)val0_m); \ + \ + val_m; \ +}) +#endif // (__mips == 64) + +#define SW(val, pdst) { \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + const uint32_t val_m = (val); \ + \ + __asm__ __volatile__ ( \ + "sw %[val_m], %[pdst_m] \n\t" \ + \ + : [pdst_m] "=m" (*pdst_m) \ + : [val_m] "r" (val_m) \ + ); \ +} + +#define SD(val, pdst) { \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + const uint64_t val_m = (val); \ + \ + __asm__ __volatile__ ( \ + "sd %[val_m], %[pdst_m] \n\t" \ + \ + : [pdst_m] "=m" (*pdst_m) \ + : [val_m] "r" (val_m) \ + ); \ +} #else // !(__mips_isa_rev >= 6) #define LW(psrc) ({ \ const uint8_t *psrc_m = (const uint8_t *)(psrc); \ @@ -52,6 +110,60 @@ \ val_m; \ }) + +#define SW(val, pdst) { \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + const uint32_t val_m = (val); \ + \ + __asm__ __volatile__ ( \ + "usw %[val_m], %[pdst_m] \n\t" \ + \ + : [pdst_m] "=m" (*pdst_m) \ + : [val_m] "r" (val_m) \ + ); \ +} + +#if (__mips == 64) +#define LD(psrc) ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint64_t val_m = 0; \ + \ + __asm__ __volatile__ ( \ + "uld %[val_m], %[psrc_m] \n\t" \ + \ + : [val_m] "=r" (val_m) \ + : [psrc_m] "m" (*psrc_m) \ + ); \ + \ + val_m; \ +}) +#else // !(__mips == 64) +#define LD(psrc) ({ \ + const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \ + uint32_t val0_m, val1_m; 
\ + uint64_t val_m = 0; \ + \ + val0_m = LW(psrc_m1); \ + val1_m = LW(psrc_m1 + 4); \ + \ + val_m = (uint64_t)(val1_m); \ + val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \ + val_m = (uint64_t)(val_m | (uint64_t)val0_m); \ + \ + val_m; \ +}) +#endif // (__mips == 64) + +#define SD(val, pdst) { \ + uint8_t *pdst_m1 = (uint8_t *)(pdst); \ + uint32_t val0_m, val1_m; \ + \ + val0_m = (uint32_t)((val) & 0x00000000FFFFFFFF); \ + val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ + \ + SW(val0_m, pdst_m1); \ + SW(val1_m, pdst_m1 + 4); \ +} #endif // (__mips_isa_rev >= 6) /* Description : Load 4 words with stride @@ -69,6 +181,21 @@ out3 = LW((psrc) + 3 * stride); \ } +/* Description : Load double words with stride + Arguments : Inputs - psrc, stride + Outputs - out0, out1 + Details : Load double word in 'out0' from (psrc) + Load double word in 'out1' from (psrc + stride) +*/ +#define LD2(psrc, stride, out0, out1) { \ + out0 = LD((psrc)); \ + out1 = LD((psrc) + stride); \ +} +#define LD4(psrc, stride, out0, out1, out2, out3) { \ + LD2((psrc), stride, out0, out1); \ + LD2((psrc) + 2 * stride, stride, out2, out3); \ +} + /* Description : Load vectors with 16 byte elements with stride Arguments : Inputs - psrc, stride Outputs - out0, out1 @@ -81,6 +208,7 @@ out1 = LD_B(RTYPE, (psrc) + stride); \ } #define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__) +#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__) #define LD_B3(RTYPE, psrc, stride, out0, out1, out2) { \ LD_B2(RTYPE, (psrc), stride, out0, out1); \ @@ -93,6 +221,7 @@ LD_B2(RTYPE, (psrc) + 2 * stride , stride, out2, out3); \ } #define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__) +#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__) #define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) { \ LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ @@ -100,6 +229,14 @@ } #define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__) +#define LD_B8(RTYPE, psrc, stride, \ + out0, out1, out2, out3, out4, out5, out6, out7) { \ + LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ + LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ +} +#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__) +#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__) + /* Description : Load vectors with 8 halfword elements with stride Arguments : Inputs - psrc, stride Outputs - out0, out1 @@ -271,6 +408,13 @@ #define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__) #define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__) +#define INSERT_D2(RTYPE, in0, in1, out) { \ + out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0); \ + out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1); \ +} +#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__) +#define INSERT_D2_SB(...) 
INSERT_D2(v16i8, __VA_ARGS__) + /* Description : Interleave both left and right half of input vectors Arguments : Inputs - in0, in1 Outputs - out0, out1 @@ -328,4 +472,53 @@ tmp_m = __msa_clti_s_h((v8i16)in, 0); \ ILVRL_H2_SW(tmp_m, in, out0, out1); \ } + +/* Description : Store 4 double words with stride + Arguments : Inputs - in0, in1, in2, in3, pdst, stride + Details : Store double word from 'in0' to (pdst) + Store double word from 'in1' to (pdst + stride) + Store double word from 'in2' to (pdst + 2 * stride) + Store double word from 'in3' to (pdst + 3 * stride) +*/ +#define SD4(in0, in1, in2, in3, pdst, stride) { \ + SD(in0, (pdst)) \ + SD(in1, (pdst) + stride); \ + SD(in2, (pdst) + 2 * stride); \ + SD(in3, (pdst) + 3 * stride); \ +} + +/* Description : Store vectors of 8 halfword elements with stride + Arguments : Inputs - in0, in1, pdst, stride + Details : Store 8 halfword elements from 'in0' to (pdst) + Store 8 halfword elements from 'in1' to (pdst + stride) +*/ +#define ST_H2(RTYPE, in0, in1, pdst, stride) { \ + ST_H(RTYPE, in0, (pdst)); \ + ST_H(RTYPE, in1, (pdst) + stride); \ +} +#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__) + +/* Description : Store 8x4 byte block to destination memory from input + vectors + Arguments : Inputs - in0, in1, pdst, stride + Details : Index 0 double word element from 'in0' vector is copied to the + GP register and stored to (pdst) + Index 1 double word element from 'in0' vector is copied to the + GP register and stored to (pdst + stride) + Index 0 double word element from 'in1' vector is copied to the + GP register and stored to (pdst + 2 * stride) + Index 1 double word element from 'in1' vector is copied to the + GP register and stored to (pdst + 3 * stride) +*/ +#define ST8x4_UB(in0, in1, pdst, stride) { \ + uint64_t out0_m, out1_m, out2_m, out3_m; \ + uint8_t *pblk_8x4_m = (uint8_t *)(pdst); \ + \ + out0_m = __msa_copy_u_d((v2i64)in0, 0); \ + out1_m = __msa_copy_u_d((v2i64)in0, 1); \ + out2_m = __msa_copy_u_d((v2i64)in1, 0); \ + out3_m = __msa_copy_u_d((v2i64)in1, 1); \ + \ + SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride); \ +} #endif /* VPX_DSP_MIPS_MACROS_MSA_H_ */ diff --git a/vp9/encoder/mips/msa/vp9_subtract_msa.c b/vpx_dsp/mips/subtract_msa.c similarity index 97% rename from vp9/encoder/mips/msa/vp9_subtract_msa.c rename to vpx_dsp/mips/subtract_msa.c index 1b8b694ce..9ac43c5cd 100644 --- a/vp9/encoder/mips/msa/vp9_subtract_msa.c +++ b/vpx_dsp/mips/subtract_msa.c @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
 */

-#include "./vp9_rtcd.h"
-#include "vp9/common/mips/msa/vp9_macros_msa.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/macros_msa.h"

 static void sub_blk_4x4_msa(const uint8_t *src_ptr, int32_t src_stride,
                             const uint8_t *pred_ptr, int32_t pred_stride,
@@ -226,7 +226,7 @@ static void sub_blk_64x64_msa(const uint8_t *src, int32_t src_stride,
   }
 }

-void vp9_subtract_block_msa(int32_t rows, int32_t cols,
+void vpx_subtract_block_msa(int32_t rows, int32_t cols,
                             int16_t *diff_ptr, ptrdiff_t diff_stride,
                             const uint8_t *src_ptr, ptrdiff_t src_stride,
                             const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
@@ -253,12 +253,12 @@ void vp9_subtract_block_msa(int32_t rows, int32_t cols,
                           diff_ptr, diff_stride);
         break;
       default:
-        vp9_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr,
+        vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr,
                              src_stride, pred_ptr, pred_stride);
         break;
     }
   } else {
-    vp9_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, src_stride,
+    vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, src_stride,
                          pred_ptr, pred_stride);
   }
 }
diff --git a/vpx_dsp/subtract.c b/vpx_dsp/subtract.c
new file mode 100644
index 000000000..556e0134f
--- /dev/null
+++ b/vpx_dsp/subtract.c
@@ -0,0 +1,56 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+void vpx_subtract_block_c(int rows, int cols,
+                          int16_t *diff, ptrdiff_t diff_stride,
+                          const uint8_t *src, ptrdiff_t src_stride,
+                          const uint8_t *pred, ptrdiff_t pred_stride) {
+  int r, c;
+
+  for (r = 0; r < rows; r++) {
+    for (c = 0; c < cols; c++)
+      diff[c] = src[c] - pred[c];
+
+    diff += diff_stride;
+    pred += pred_stride;
+    src += src_stride;
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_subtract_block_c(int rows, int cols,
+                                 int16_t *diff, ptrdiff_t diff_stride,
+                                 const uint8_t *src8, ptrdiff_t src_stride,
+                                 const uint8_t *pred8, ptrdiff_t pred_stride,
+                                 int bd) {
+  int r, c;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  (void) bd;
+
+  for (r = 0; r < rows; r++) {
+    for (c = 0; c < cols; c++) {
+      diff[c] = src[c] - pred[c];
+    }
+
+    diff += diff_stride;
+    pred += pred_stride;
+    src += src_stride;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index d8ab10822..ecfaf4e01 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -12,13 +12,16 @@ DSP_SRCS-yes += vpx_dsp.mk

 ifeq ($(CONFIG_ENCODERS),yes)
 DSP_SRCS-yes += sad.c
+DSP_SRCS-yes += subtract.c

 DSP_SRCS-$(HAVE_MEDIA) += arm/sad_media$(ASM)
 DSP_SRCS-$(HAVE_NEON) += arm/sad4d_neon.c
 DSP_SRCS-$(HAVE_NEON) += arm/sad_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/subtract_neon.c

 DSP_SRCS-$(HAVE_MSA) += mips/macros_msa.h
 DSP_SRCS-$(HAVE_MSA) += mips/sad_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/subtract_msa.c

 DSP_SRCS-$(HAVE_MMX) += x86/sad_mmx.asm
 DSP_SRCS-$(HAVE_SSE3) += x86/sad_sse3.asm
@@ -30,6 +33,7 @@ DSP_SRCS-$(HAVE_AVX2) += x86/sad_avx2.c
 ifeq ($(CONFIG_USE_X86INC),yes)
 DSP_SRCS-$(HAVE_SSE2) += x86/sad4d_sse2.asm
 DSP_SRCS-$(HAVE_SSE2) += x86/sad_sse2.asm
+DSP_SRCS-$(HAVE_SSE2) += x86/subtract_sse2.asm

 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index f12270c1c..8130eb782 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -36,6 +36,12 @@ if ($opts{arch} eq "x86_64") {
 }

 if (vpx_config("CONFIG_ENCODERS") eq "yes") {
+#
+# Block subtraction
+#
+add_proto qw/void vpx_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
+specialize qw/vpx_subtract_block neon msa/, "$sse2_x86inc";
+
 #
 # Single block SAD
 #
@@ -210,6 +216,12 @@ add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const
 specialize qw/vpx_sad4x4x4d msa/, "$sse_x86inc";

 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  #
+  # Block subtraction
+  #
+  add_proto qw/void vpx_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
+  specialize qw/vpx_highbd_subtract_block/;
+
   #
   # Single block SAD
   #
diff --git a/vp9/encoder/x86/vp9_subtract_sse2.asm b/vpx_dsp/x86/subtract_sse2.asm
similarity index 98%
rename from vp9/encoder/x86/vp9_subtract_sse2.asm
rename to vpx_dsp/x86/subtract_sse2.asm
index 982408083..a18645112 100644
--- a/vp9/encoder/x86/vp9_subtract_sse2.asm
+++ b/vpx_dsp/x86/subtract_sse2.asm
@@ -7,12 +7,13 @@
 ; in the file PATENTS.  All contributing project authors may
 ; be found in the AUTHORS file in the root of the source tree.
 ;

+%define program_name vpx
 %include "third_party/x86inc/x86inc.asm"

 SECTION .text

-; void vp9_subtract_block(int rows, int cols,
+; void vpx_subtract_block(int rows, int cols,
 ;                         int16_t *diff, ptrdiff_t diff_stride,
 ;                         const uint8_t *src, ptrdiff_t src_stride,
 ;                         const uint8_t *pred, ptrdiff_t pred_stride)
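
Usage sketch (illustrative, not part of the patch): after this change an encoder-side caller reaches the factored-out routine through the vpx_dsp RTCD dispatch, which resolves vpx_subtract_block to the C, SSE2, NEON, or MSA specialization registered in vpx_dsp_rtcd_defs.pl above. The helper name and the fixed 8x8 block size below are hypothetical; the vpx_subtract_block prototype is the one declared in the patch.

#include <stddef.h>
#include <stdint.h>

#include "./vpx_dsp_rtcd.h"  /* declares vpx_subtract_block() */

/* Hypothetical helper: compute the residual diff = src - pred for one
 * 8x8 block.  Strides are in elements of the respective buffers, and the
 * diff buffer is assumed to be a contiguous 8x8 block (stride of 8). */
static void example_residual_8x8(const uint8_t *src, ptrdiff_t src_stride,
                                 const uint8_t *pred, ptrdiff_t pred_stride,
                                 int16_t *diff) {
  vpx_subtract_block(8, 8, diff, 8, src, src_stride, pred, pred_stride);
}

The high-bit-depth variant vpx_highbd_subtract_block takes the same arguments plus the bit depth, with src and pred passed as uint8_t handles wrapping uint16_t sample data (unwrapped internally via CONVERT_TO_SHORTPTR, as shown in vpx_dsp/subtract.c).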