Merge "Unify subtract function used in VP8/9"

2015-07-07 20:42:18 +00:00
parent ea5450b280 0ede9f52b7
commit 9d251f9510
12 changed files with 24 additions and 847 deletions
--- a/test/subtract_test.cc
+++ b/test/subtract_test.cc
@@ -1,123 +0,0 @@
 /*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include "third_party/googletest/src/include/gtest/gtest.h"
 #include "test/acm_random.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "./vpx_config.h"
 #include "./vp8_rtcd.h"
 #include "vp8/common/blockd.h"
 #include "vp8/encoder/block.h"
 #include "vpx_mem/vpx_mem.h"
 // Signature shared by every 4x4 block-subtract implementation that the
 // parameterized test below is instantiated with.
 typedef void (*SubtractBlockFunc)(BLOCK *be, BLOCKD *bd, int pitch);
 namespace {
 // Value-parameterized fixture: the test parameter is the SubtractBlockFunc
 // implementation to exercise.
 class SubtractBlockTest : public ::testing::TestWithParam<SubtractBlockFunc> {
  public:
   // Invokes the libvpx test helper after each test case (presumably to reset
   // SIMD/FPU state left behind by assembly implementations -- confirm).
   virtual void TearDown() { libvpx_test::ClearSystemState(); }
 };
 using libvpx_test::ACMRandom;
 // Checks that the subtract function under test produces
 // src_diff = source - predictor for a 4x4 block, for each source stride
 // listed in kSrcStride.
 TEST_P(SubtractBlockTest, SimpleSubtract) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  BLOCK be;
  BLOCKD bd;
  // in libvpx, this stride is always 16
  const int kDiffPredStride = 16;
  // Source strides to exercise; the trailing 0 terminates the loop below.
  const int kSrcStride[] = {32, 16, 8, 4, 0};
  const int kBlockWidth = 4;
  const int kBlockHeight = 4;
  // Allocate... align to 16 for mmx/sse tests
  // The source buffer is sized for the largest stride (kSrcStride[0]).
  uint8_t *source = reinterpret_cast<uint8_t*>(
      vpx_memalign(16, kBlockHeight * kSrcStride[0] * sizeof(*source)));
  be.src_diff = reinterpret_cast<int16_t*>(
      vpx_memalign(16, kBlockHeight * kDiffPredStride * sizeof(*be.src_diff)));
  bd.predictor = reinterpret_cast<unsigned char*>(
      vpx_memalign(16, kBlockHeight * kDiffPredStride * sizeof(*bd.predictor)));
  // Fail the test cleanly instead of dereferencing a null buffer below.
  ASSERT_TRUE(source != NULL);
  ASSERT_TRUE(be.src_diff != NULL);
  ASSERT_TRUE(bd.predictor != NULL);
  for (int i = 0; kSrcStride[i] > 0; ++i) {
    // start at block0
    be.src = 0;
    be.base_src = &source;
    be.src_stride = kSrcStride[i];
    // set difference: pre-fill with a sentinel so stale output is detectable
    int16_t *src_diff = be.src_diff;
    for (int r = 0; r < kBlockHeight; ++r) {
      for (int c = 0; c < kBlockWidth; ++c) {
        src_diff[c] = static_cast<int16_t>(0xa5a5u);
      }
      src_diff += kDiffPredStride;
    }
    // set source
    uint8_t *base_src = *be.base_src;
    for (int r = 0; r < kBlockHeight; ++r) {
      for (int c = 0; c < kBlockWidth; ++c) {
        base_src[c] = rnd.Rand8();
      }
      base_src += be.src_stride;
    }
    // set predictor
    uint8_t *predictor = bd.predictor;
    for (int r = 0; r < kBlockHeight; ++r) {
      for (int c = 0; c < kBlockWidth; ++c) {
        predictor[c] = rnd.Rand8();
      }
      predictor += kDiffPredStride;
    }
    ASM_REGISTER_STATE_CHECK(GetParam()(&be, &bd, kDiffPredStride));
    // Verify: source must equal residual + predictor at every position.
    base_src = *be.base_src;
    src_diff = be.src_diff;
    predictor = bd.predictor;
    for (int r = 0; r < kBlockHeight; ++r) {
      for (int c = 0; c < kBlockWidth; ++c) {
        EXPECT_EQ(base_src[c], (src_diff[c] + predictor[c])) << "r = " << r
                                                             << ", c = " << c;
      }
      src_diff += kDiffPredStride;
      predictor += kDiffPredStride;
      base_src += be.src_stride;
    }
  }
  vpx_free(be.src_diff);
  vpx_free(source);
  vpx_free(bd.predictor);
 }
 // The C implementation is always registered.
 INSTANTIATE_TEST_CASE_P(C, SubtractBlockTest,
                        ::testing::Values(vp8_subtract_b_c));
 // Each SIMD implementation is registered only when its HAVE_* build flag
 // is set.
 #if HAVE_NEON
 INSTANTIATE_TEST_CASE_P(NEON, SubtractBlockTest,
                        ::testing::Values(vp8_subtract_b_neon));
 #endif
 #if HAVE_MMX
 INSTANTIATE_TEST_CASE_P(MMX, SubtractBlockTest,
                        ::testing::Values(vp8_subtract_b_mmx));
 #endif
 #if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(SSE2, SubtractBlockTest,
                        ::testing::Values(vp8_subtract_b_sse2));
 #endif
 }  // namespace
--- a/test/test.mk
+++ b/test/test.mk
@@ -104,7 +104,6 @@ endif
 LIBVPX_TEST_SRCS-$(CONFIG_POSTPROC)    += pp_filter_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += vp8_decrypt_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += set_roi.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += subtract_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += variance_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_fdct4x4_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += quantize_test.cc
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl
@@ -343,15 +343,6 @@ add_proto qw/int vp8_mbuverror/, "struct macroblock *mb";
 specialize qw/vp8_mbuverror mmx sse2/;
 $vp8_mbuverror_sse2=vp8_mbuverror_xmm;
 add_proto qw/void vp8_subtract_b/, "struct block *be, struct blockd *bd, int pitch";
 specialize qw/vp8_subtract_b mmx sse2 neon/;
 add_proto qw/void vp8_subtract_mby/, "short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride";
 specialize qw/vp8_subtract_mby mmx sse2 neon/;
 add_proto qw/void vp8_subtract_mbuv/, "short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride";
 specialize qw/vp8_subtract_mbuv mmx sse2 neon/;
 #
 # Motion search
 #
--- a/vp8/encoder/arm/neon/subtract_neon.c
+++ b/vp8/encoder/arm/neon/subtract_neon.c
@@ -1,154 +0,0 @@
 /*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include <arm_neon.h>
 #include "vp8/encoder/block.h"
 /*
  * Writes the 4x4 residual (source - predictor) of one block to be->src_diff.
  *
  *   be    - supplies the source pointer (*base_src + src), the source stride
  *           and the destination src_diff buffer.
  *   bd    - supplies the predictor buffer.
  *   pitch - row stride, in elements, of both the predictor and src_diff.
  */
 void vp8_subtract_b_neon(
        BLOCK *be,
        BLOCKD *bd,
        int pitch) {
    const unsigned char *src_row = *be->base_src + be->src;
    const unsigned char *pred_row = bd->predictor;
    int16_t *diff_row = be->src_diff;
    const int src_stride = be->src_stride;
    int row;

    for (row = 0; row < 4; ++row) {
        /* Each vld1_u8 fetches 8 bytes; only the low 4 widened lanes of the
         * difference are stored. */
        const uint16x8_t delta = vsubl_u8(vld1_u8(src_row),
                                          vld1_u8(pred_row));
        vst1_u16((uint16_t *)diff_row, vget_low_u16(delta));

        src_row += src_stride;
        pred_row += pitch;
        diff_row += pitch;
    }
 }
 /*
  * Writes the 16x16 residual (src - pred) to diff.
  *
  *   diff        - output, 16 rows of 16 values stored back to back.
  *   src         - source pixels, rows src_stride bytes apart.
  *   pred        - prediction pixels, rows pred_stride bytes apart.
  */
 void vp8_subtract_mby_neon(
        int16_t *diff,
        unsigned char *src,
        int src_stride,
        unsigned char *pred,
        int pred_stride) {
    int row;

    for (row = 0; row < 16; ++row) {
        const uint8x16_t src_px = vld1q_u8(src);
        const uint8x16_t pred_px = vld1q_u8(pred);

        /* Widen to 16 bits while subtracting: low 8 pixels, then high 8. */
        vst1q_u16((uint16_t *)diff,
                  vsubl_u8(vget_low_u8(src_px), vget_low_u8(pred_px)));
        vst1q_u16((uint16_t *)(diff + 8),
                  vsubl_u8(vget_high_u8(src_px), vget_high_u8(pred_px)));

        diff += 16;
        src += src_stride;
        pred += pred_stride;
    }
 }
 /*
  * Writes the two 8x8 chroma residuals (src - pred) into diff.
  *
  * Output starts at diff + 256: the 64 U values come first, immediately
  * followed by the 64 V values, each plane stored as 8 rows of 8.
  * usrc/vsrc rows are src_stride bytes apart; upred/vpred rows are
  * pred_stride bytes apart.
  */
 void vp8_subtract_mbuv_neon(
        int16_t *diff,
        unsigned char *usrc,
        unsigned char *vsrc,
        int src_stride,
        unsigned char *upred,
        unsigned char *vpred,
        int pred_stride) {
    int plane, row;
    int16_t *out = diff + 256;

    for (plane = 0; plane < 2; ++plane) {
        /* plane 0 is U, plane 1 is V */
        const unsigned char *src_row = (plane == 0) ? usrc : vsrc;
        const unsigned char *pred_row = (plane == 0) ? upred : vpred;

        for (row = 0; row < 8; ++row) {
            vst1q_u16((uint16_t *)out,
                      vsubl_u8(vld1_u8(src_row), vld1_u8(pred_row)));
            out += 8;
            src_row += src_stride;
            pred_row += pred_stride;
        }
    }
 }
--- a/vp8/encoder/encodemb.c
+++ b/vp8/encoder/encodemb.c
@@ -8,6 +8,7 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_config.h"
 #include "vp8_rtcd.h"
@@ -19,80 +20,29 @@
 #include "vpx_mem/vpx_mem.h"
 #include "rdopt.h"
-// TODO(jingning,johannkoenig): use vpx_subtract_block to replace
+void vp8_subtract_b(BLOCK *be, BLOCKD *bd, int pitch) {
-// codec specified vp9_subtract_ functions.
+  unsigned char *src_ptr = (*(be->base_src) + be->src);
-void vp8_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch)
+  short *diff_ptr = be->src_diff;
-{
+  unsigned char *pred_ptr = bd->predictor;
-    unsigned char *src_ptr = (*(be->base_src) + be->src);
+  int src_stride = be->src_stride;
    short *diff_ptr = be->src_diff;
    unsigned char *pred_ptr = bd->predictor;
    int src_stride = be->src_stride;
-    int r, c;
+  vpx_subtract_block(4, 4, diff_ptr, pitch, src_ptr, src_stride,
-
+                     pred_ptr, pitch);
    for (r = 0; r < 4; r++)
    {
        for (c = 0; c < 4; c++)
        {
            diff_ptr[c] = src_ptr[c] - pred_ptr[c];
        }
        diff_ptr += pitch;
        pred_ptr += pitch;
        src_ptr  += src_stride;
    }
 }
-void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc,
+void vp8_subtract_mbuv(short *diff, unsigned char *usrc, unsigned char *vsrc,
                         int src_stride, unsigned char *upred,
-                         unsigned char *vpred, int pred_stride)
+                         unsigned char *vpred, int pred_stride) {
-{
+  short *udiff = diff + 256;
-    short *udiff = diff + 256;
+  short *vdiff = diff + 320;
    short *vdiff = diff + 320;
-    int r, c;
+  vpx_subtract_block(8, 8, udiff, 8, usrc, src_stride, upred, pred_stride);
-
+  vpx_subtract_block(8, 8, vdiff, 8, vsrc, src_stride, vpred, pred_stride);
    for (r = 0; r < 8; r++)
    {
        for (c = 0; c < 8; c++)
        {
            udiff[c] = usrc[c] - upred[c];
        }
        udiff += 8;
        upred += pred_stride;
        usrc  += src_stride;
    }
    for (r = 0; r < 8; r++)
    {
        for (c = 0; c < 8; c++)
        {
            vdiff[c] = vsrc[c] - vpred[c];
        }
        vdiff += 8;
        vpred += pred_stride;
        vsrc  += src_stride;
    }
 }
-void vp8_subtract_mby_c(short *diff, unsigned char *src, int src_stride,
+void vp8_subtract_mby(short *diff, unsigned char *src, int src_stride,
-                        unsigned char *pred, int pred_stride)
+                      unsigned char *pred, int pred_stride) {
-{
+  vpx_subtract_block(16, 16, diff, 16, src, src_stride, pred, pred_stride);
    int r, c;
    for (r = 0; r < 16; r++)
    {
        for (c = 0; c < 16; c++)
        {
            diff[c] = src[c] - pred[c];
        }
        diff += 16;
        pred += pred_stride;
        src  += src_stride;
    }
 }
 static void vp8_subtract_mb(MACROBLOCK *x)
--- a/vp8/encoder/encodemb.h
+++ b/vp8/encoder/encodemb.h
@@ -19,6 +19,13 @@ extern "C" {
 #endif
 void vp8_encode_inter16x16(MACROBLOCK *x);
 /* 4x4 residual for one block: be->src_diff = source - bd->predictor.
  * pitch is the row stride of both src_diff and the predictor. */
 void vp8_subtract_b(BLOCK *be, BLOCKD *bd, int pitch);
 /* 8x8 residuals for both chroma planes. The U output is written at
  * diff + 256 and the V output at diff + 320, each with a row stride of 8. */
 void vp8_subtract_mbuv(short *diff, unsigned char *usrc, unsigned char *vsrc,
                       int src_stride, unsigned char *upred,
                       unsigned char *vpred, int pred_stride);
 /* 16x16 luma residual written to diff with a row stride of 16. */
 void vp8_subtract_mby(short *diff, unsigned char *src, int src_stride,
                      unsigned char *pred, int pred_stride);
 void vp8_build_dcblock(MACROBLOCK *b);
 void vp8_transform_mb(MACROBLOCK *mb);
 void vp8_transform_mbuv(MACROBLOCK *x);
--- a/vp8/encoder/x86/subtract_mmx.asm
+++ b/vp8/encoder/x86/subtract_mmx.asm
@@ -1,223 +0,0 @@
 ;
 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 ;
 ;  Use of this source code is governed by a BSD-style license
 ;  that can be found in the LICENSE file in the root of the source
 ;  tree. An additional intellectual property rights grant can be found
 ;  in the file PATENTS.  All contributing project authors may
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 %include "vpx_ports/x86_abi_support.asm"
 ;void vp8_subtract_b_mmx_impl(unsigned char *z,  int src_stride,
 ;                            short *diff, unsigned char *Predictor,
 ;                            int pitch);
 ;
 ; Computes a 4x4 block of 16-bit differences: diff = z - Predictor.
 ; z rows are src_stride bytes apart. pitch is the row stride, in elements,
 ; of both Predictor (bytes) and diff (shorts), which is why diff addresses
 ; below scale the pitch by two.
 ; NOTE(review): no emms is executed before returning; presumably the caller
 ; is expected to clear MMX state -- confirm.
 global sym(vp8_subtract_b_mmx_impl) PRIVATE
 sym(vp8_subtract_b_mmx_impl):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push rsi
    push rdi
    ; end prolog
        mov     rdi,        arg(2) ;diff
        mov     rax,        arg(3) ;Predictor
        mov     rsi,        arg(0) ;z
        movsxd  rdx,        dword ptr arg(1);src_stride;
        movsxd  rcx,        dword ptr arg(4);pitch
        ; mm7 = 0, interleaved with pixel bytes to zero-extend them to words
        pxor    mm7,        mm7
        ; row 0
        movd    mm0,        [rsi]
        movd    mm1,        [rax]
        punpcklbw   mm0,    mm7
        punpcklbw   mm1,    mm7
        psubw   mm0,        mm1
        movq    [rdi],      mm0
        ; row 1
        movd    mm0,        [rsi+rdx]
        movd    mm1,        [rax+rcx]
        punpcklbw   mm0,    mm7
        punpcklbw   mm1,    mm7
        psubw   mm0,        mm1
        movq    [rdi+rcx*2],mm0
        ; row 2
        movd    mm0,        [rsi+rdx*2]
        movd    mm1,        [rax+rcx*2]
        punpcklbw   mm0,    mm7
        punpcklbw   mm1,    mm7
        psubw   mm0,        mm1
        movq    [rdi+rcx*4],        mm0
        ; row 3: advance rsi by two source rows and make rcx = 3 * pitch so
        ; the same addressing forms reach the fourth row
        lea     rsi,        [rsi+rdx*2]
        lea     rcx,        [rcx+rcx*2]
        movd    mm0,        [rsi+rdx]
        movd    mm1,        [rax+rcx]
        punpcklbw   mm0,    mm7
        punpcklbw   mm1,    mm7
        psubw   mm0,        mm1
        movq    [rdi+rcx*2],        mm0
    ; begin epilog
    pop rdi
    pop rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
 ;void vp8_subtract_mby_mmx(short *diff, unsigned char *src, int src_stride,
 ;unsigned char *pred, int pred_stride)
 ;
 ; Computes the 16x16 block of 16-bit differences: diff = src - pred.
 ; One row of 16 pixels is handled per loop iteration, as two 8-pixel halves;
 ; diff is written contiguously, 32 bytes per row.
 global sym(vp8_subtract_mby_mmx) PRIVATE
 sym(vp8_subtract_mby_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push rsi
    push rdi
    ; end prolog
    mov         rdi,        arg(0)          ;diff
    mov         rsi,        arg(1)          ;src
    movsxd      rdx,        dword ptr arg(2);src_stride
    mov         rax,        arg(3)          ;pred
    ; rbx is saved here because it is used below to hold pred_stride
    push        rbx
    movsxd      rbx,        dword ptr arg(4);pred_stride
    ; mm0 = 0, used to zero-extend bytes to words
    pxor        mm0,        mm0
    ; rcx = row counter (16 rows)
    mov         rcx,        16
 .submby_loop:
    ; pixels 0-7 of the row
    movq        mm1,        [rsi]
    movq        mm3,        [rax]
    movq        mm2,        mm1
    movq        mm4,        mm3
    punpcklbw   mm1,        mm0
    punpcklbw   mm3,        mm0
    punpckhbw   mm2,        mm0
    punpckhbw   mm4,        mm0
    psubw       mm1,        mm3
    psubw       mm2,        mm4
    movq        [rdi],      mm1
    movq        [rdi+8],    mm2
    ; pixels 8-15 of the row
    movq        mm1,        [rsi+8]
    movq        mm3,        [rax+8]
    movq        mm2,        mm1
    movq        mm4,        mm3
    punpcklbw   mm1,        mm0
    punpcklbw   mm3,        mm0
    punpckhbw   mm2,        mm0
    punpckhbw   mm4,        mm0
    psubw       mm1,        mm3
    psubw       mm2,        mm4
    movq        [rdi+16],   mm1
    movq        [rdi+24],   mm2
    ; advance to the next row of diff, pred and src
    add         rdi,        32
    lea         rax,        [rax+rbx]
    lea         rsi,        [rsi+rdx]
    dec         rcx
    jnz         .submby_loop
    pop rbx
    pop rdi
    pop rsi
    ; begin epilog
    UNSHADOW_ARGS
    pop         rbp
    ret
 ;vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc,
 ;                         int src_stride, unsigned char *upred,
 ;                         unsigned char *vpred, int pred_stride)
 ;
 ; Computes the two 8x8 chroma difference blocks. Output starts at
 ; diff + 256 shorts: 8 rows of U differences, then, because rdi simply keeps
 ; advancing, 8 rows of V differences directly after them.
 global sym(vp8_subtract_mbuv_mmx) PRIVATE
 sym(vp8_subtract_mbuv_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    push rsi
    push rdi
    ; end prolog
    mov         rdi,        arg(0)          ;diff
    mov         rsi,        arg(1)          ;usrc
    movsxd      rdx,        dword ptr arg(3);src_stride;
    mov         rax,        arg(4)          ;upred
    add         rdi,        256*2           ;diff = diff + 256 (shorts)
    ; rcx = row counter for the U plane (8 rows)
    mov         rcx,        8
    ; rbx is saved here because it is used below to hold pred_stride
    push        rbx
    movsxd      rbx,        dword ptr arg(6);pred_stride
    ; mm7 = 0, used to zero-extend bytes to words
    pxor        mm7,        mm7
 .submbu_loop:
    ; one row of 8 U pixels: widen both inputs, subtract, store 8 shorts
    movq        mm0,        [rsi]
    movq        mm1,        [rax]
    movq        mm3,        mm0
    movq        mm4,        mm1
    punpcklbw   mm0,        mm7
    punpcklbw   mm1,        mm7
    punpckhbw   mm3,        mm7
    punpckhbw   mm4,        mm7
    psubw       mm0,        mm1
    psubw       mm3,        mm4
    movq        [rdi],      mm0
    movq        [rdi+8],    mm3
    add         rdi, 16
    add         rsi, rdx
    add         rax, rbx
    dec         rcx
    jnz         .submbu_loop
    ; switch to the V plane; rdi, rdx and rbx carry over unchanged
    mov         rsi,        arg(2)          ;vsrc
    mov         rax,        arg(5)          ;vpred
    mov         rcx,        8
 .submbv_loop:
    ; one row of 8 V pixels, same sequence as the U loop
    movq        mm0,        [rsi]
    movq        mm1,        [rax]
    movq        mm3,        mm0
    movq        mm4,        mm1
    punpcklbw   mm0,        mm7
    punpcklbw   mm1,        mm7
    punpckhbw   mm3,        mm7
    punpckhbw   mm4,        mm7
    psubw       mm0,        mm1
    psubw       mm3,        mm4
    movq        [rdi],      mm0
    movq        [rdi+8],    mm3
    add         rdi, 16
    add         rsi, rdx
    add         rax, rbx
    dec         rcx
    jnz         .submbv_loop
    pop         rbx
    ; begin epilog
    pop rdi
    pop rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
--- a/vp8/encoder/x86/subtract_sse2.asm
+++ b/vp8/encoder/x86/subtract_sse2.asm
@@ -1,245 +0,0 @@
 ;
 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 ;
 ;  Use of this source code is governed by a BSD-style license
 ;  that can be found in the LICENSE file in the root of the source
 ;  tree. An additional intellectual property rights grant can be found
 ;  in the file PATENTS.  All contributing project authors may
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 %include "vpx_ports/x86_abi_support.asm"
 ;void vp8_subtract_b_sse2_impl(unsigned char *z,  int src_stride,
 ;                            short *diff, unsigned char *Predictor,
 ;                            int pitch);
 ;
 ; Computes a 4x4 block of 16-bit differences: diff = z - Predictor.
 ; Despite the sse2 name, the body uses only the MMX registers mm0/mm1/mm7.
 ; pitch is the row stride, in elements, of both Predictor (bytes) and diff
 ; (shorts), which is why diff addresses below scale the pitch by two.
 ; NOTE(review): no emms is executed before returning; presumably the caller
 ; is expected to clear MMX state -- confirm.
 global sym(vp8_subtract_b_sse2_impl) PRIVATE
 sym(vp8_subtract_b_sse2_impl):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push rsi
    push rdi
    ; end prolog
        mov     rdi,        arg(2) ;diff
        mov     rax,        arg(3) ;Predictor
        mov     rsi,        arg(0) ;z
        movsxd  rdx,        dword ptr arg(1);src_stride;
        movsxd  rcx,        dword ptr arg(4);pitch
        ; mm7 = 0, interleaved with pixel bytes to zero-extend them to words
        pxor    mm7,        mm7
        ; row 0
        movd    mm0,        [rsi]
        movd    mm1,        [rax]
        punpcklbw   mm0,    mm7
        punpcklbw   mm1,    mm7
        psubw   mm0,        mm1
        movq    MMWORD PTR [rdi],      mm0
        ; row 1
        movd    mm0,        [rsi+rdx]
        movd    mm1,        [rax+rcx]
        punpcklbw   mm0,    mm7
        punpcklbw   mm1,    mm7
        psubw   mm0,        mm1
        movq    MMWORD PTR [rdi+rcx*2], mm0
        ; row 2
        movd    mm0,        [rsi+rdx*2]
        movd    mm1,        [rax+rcx*2]
        punpcklbw   mm0,    mm7
        punpcklbw   mm1,    mm7
        psubw   mm0,        mm1
        movq    MMWORD PTR [rdi+rcx*4], mm0
        ; row 3: advance rsi by two source rows and make rcx = 3 * pitch so
        ; the same addressing forms reach the fourth row
        lea     rsi,        [rsi+rdx*2]
        lea     rcx,        [rcx+rcx*2]
        movd    mm0,        [rsi+rdx]
        movd    mm1,        [rax+rcx]
        punpcklbw   mm0,    mm7
        punpcklbw   mm1,    mm7
        psubw   mm0,        mm1
        movq    MMWORD PTR [rdi+rcx*2], mm0
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
 ;void vp8_subtract_mby_sse2(short *diff, unsigned char *src, int src_stride,
 ;unsigned char *pred, int pred_stride)
 ;
 ; Computes the 16x16 block of 16-bit differences: diff = src - pred, two rows
 ; per loop iteration. diff is written contiguously, 32 bytes per row.
 ;
 ; Technique: psubb yields the low byte of each difference. The high byte is
 ; 0xFF when pred > src (unsigned) and 0x00 otherwise; it is obtained by
 ; XOR-ing both inputs with 0x80 so the signed compare pcmpgtb orders them as
 ; unsigned values. Interleaving low byte and mask gives the signed word.
 ;
 ; All row accesses use movdqa, so every src, pred and diff row address must
 ; be 16-byte aligned.
 global sym(vp8_subtract_mby_sse2) PRIVATE
 sym(vp8_subtract_mby_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push rsi
    push rdi
    ; end prolog
    mov         rdi,        arg(0)          ;diff
    mov         rsi,        arg(1)          ;src
    movsxd      rdx,        dword ptr arg(2);src_stride
    mov         rax,        arg(3)          ;pred
    ; xmm4 = 16 bytes of 0x80 (the t80 constant)
    movdqa      xmm4,       [GLOBAL(t80)]
    ; rbx is saved here because it is overwritten below with pred_stride
    push        rbx
    mov         rcx,        8               ; do two lines at one time
    movsxd      rbx,        dword ptr arg(4);pred_stride
 .submby_loop:
    ; first row of the pair
    movdqa      xmm0,       [rsi]           ; src
    movdqa      xmm1,       [rax]           ; pred
    movdqa      xmm2,       xmm0
    psubb       xmm0,       xmm1
    pxor        xmm1,       xmm4            ;convert to signed values
    pxor        xmm2,       xmm4
    pcmpgtb     xmm1,       xmm2            ; obtain sign information
    movdqa      xmm2,       xmm0
    punpcklbw   xmm0,       xmm1            ; put sign back to subtraction
    punpckhbw   xmm2,       xmm1            ; put sign back to subtraction
    ; load the second row of the pair before advancing the row pointers
    movdqa      xmm3,       [rsi + rdx]
    movdqa      xmm5,       [rax + rbx]
    lea         rsi,        [rsi+rdx*2]
    lea         rax,        [rax+rbx*2]
    movdqa      [rdi],      xmm0
    movdqa      [rdi +16],  xmm2
    ; second row of the pair, same sequence
    movdqa      xmm1,       xmm3
    psubb       xmm3,       xmm5
    pxor        xmm5,       xmm4            ;convert to signed values
    pxor        xmm1,       xmm4
    pcmpgtb     xmm5,       xmm1            ; obtain sign information
    movdqa      xmm1,       xmm3
    punpcklbw   xmm3,       xmm5            ; put sign back to subtraction
    punpckhbw   xmm1,       xmm5            ; put sign back to subtraction
    movdqa      [rdi +32],  xmm3
    movdqa      [rdi +48],  xmm1
    ; two rows of 16 shorts were written
    add         rdi,        64
    dec         rcx
    jnz         .submby_loop
    pop rbx
    pop rdi
    pop rsi
    ; begin epilog
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
 ;vp8_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc,
 ;                         int src_stride, unsigned char *upred,
 ;                         unsigned char *vpred, int pred_stride)
 ;
 ; Computes the two 8x8 chroma difference blocks. Output starts at
 ; diff + 256 shorts: the U differences first, then, because rdi simply keeps
 ; advancing, the V differences directly after them.
 ; Each iteration packs two 8-pixel rows into one xmm register and uses the
 ; psubb / pxor 0x80 / pcmpgtb sequence to rebuild signed 16-bit differences.
 ; Source and predictor rows are read with movq; diff is stored with movdqa,
 ; so the diff output addresses must be 16-byte aligned.
 global sym(vp8_subtract_mbuv_sse2) PRIVATE
 sym(vp8_subtract_mbuv_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push rsi
    push rdi
    ; end prolog
    ; xmm4 = 16 bytes of 0x80 (the t80 constant)
    movdqa      xmm4,       [GLOBAL(t80)]
    mov         rdi,        arg(0)          ;diff
    mov         rsi,        arg(1)          ;usrc
    movsxd      rdx,        dword ptr arg(3);src_stride;
    mov         rax,        arg(4)          ;upred
    add         rdi,        256*2           ;diff = diff + 256 (shorts)
    ; rcx = iteration counter: 4 iterations of 2 rows each
    mov         rcx,        4
    ; rbx is saved here because it is overwritten below with pred_stride
    push        rbx
    movsxd      rbx,        dword ptr arg(6);pred_stride
    ;u
 .submbu_loop:
    movq        xmm0,       [rsi]           ; src
    movq        xmm2,       [rsi+rdx]       ; src -- next line
    movq        xmm1,       [rax]           ; pred
    movq        xmm3,       [rax+rbx]       ; pred -- next line
    lea         rsi,        [rsi + rdx*2]
    lea         rax,        [rax + rbx*2]
    ; combine the two rows: low qword = first row, high qword = next row
    punpcklqdq  xmm0,       xmm2
    punpcklqdq  xmm1,       xmm3
    movdqa      xmm2,       xmm0
    psubb       xmm0,       xmm1            ; subtraction with sign missed
    pxor        xmm1,       xmm4            ;convert to signed values
    pxor        xmm2,       xmm4
    pcmpgtb     xmm1,       xmm2            ; obtain sign information
    movdqa      xmm2,       xmm0
    movdqa      xmm3,       xmm1
    punpcklbw   xmm0,       xmm1            ; put sign back to subtraction
    punpckhbw   xmm2,       xmm3            ; put sign back to subtraction
    movdqa      [rdi],      xmm0            ; store difference
    movdqa      [rdi +16],  xmm2            ; store difference
    add         rdi,        32
    sub         rcx, 1
    jnz         .submbu_loop
    ; switch to the V plane; rdi, rdx and rbx carry over unchanged
    mov         rsi,        arg(2)          ;vsrc
    mov         rax,        arg(5)          ;vpred
    mov         rcx,        4
    ;v
 .submbv_loop:
    movq        xmm0,       [rsi]           ; src
    movq        xmm2,       [rsi+rdx]       ; src -- next line
    movq        xmm1,       [rax]           ; pred
    movq        xmm3,       [rax+rbx]       ; pred -- next line
    lea         rsi,        [rsi + rdx*2]
    lea         rax,        [rax + rbx*2]
    ; combine the two rows: low qword = first row, high qword = next row
    punpcklqdq  xmm0,       xmm2
    punpcklqdq  xmm1,       xmm3
    movdqa      xmm2,       xmm0
    psubb       xmm0,       xmm1            ; subtraction with sign missed
    pxor        xmm1,       xmm4            ;convert to signed values
    pxor        xmm2,       xmm4
    pcmpgtb     xmm1,       xmm2            ; obtain sign information
    movdqa      xmm2,       xmm0
    movdqa      xmm3,       xmm1
    punpcklbw   xmm0,       xmm1            ; put sign back to subtraction
    punpckhbw   xmm2,       xmm3            ; put sign back to subtraction
    movdqa      [rdi],      xmm0            ; store difference
    movdqa      [rdi +16],  xmm2            ; store difference
    add         rdi,        32
    sub         rcx, 1
    jnz         .submbv_loop
    pop         rbx
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
 SECTION_RODATA
 ; 16 bytes of 0x80, 16-byte aligned so it can be loaded with movdqa.
 ; XOR-ing bytes with this flips their top bit, letting the signed byte
 ; compare in the routines above order unsigned pixel values.
 align 16
 t80:
    times 16 db 0x80
--- a/vp8/encoder/x86/vp8_enc_stubs_mmx.c
+++ b/vp8/encoder/x86/vp8_enc_stubs_mmx.c
@@ -65,14 +65,3 @@ int vp8_mbuverror_mmx(MACROBLOCK *mb)
    return vp8_mbuverror_mmx_impl(s_ptr, d_ptr);
 }
 /* Assembly routine that performs the 4x4 subtraction on raw pointers. */
 void vp8_subtract_b_mmx_impl(unsigned char *z, int src_stride,
                              short *diff, unsigned char *predictor,
                              int pitch);

 /*
  * Unpacks the source pointer, source stride, residual buffer and predictor
  * from the block structures and forwards them, with pitch, to the MMX
  * assembly implementation.
  */
 void vp8_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch)
 {
    unsigned char *const pred = bd->predictor;
    short *const residual = be->src_diff;
    const unsigned int stride = be->src_stride;
    unsigned char *const src = *be->base_src + be->src;

    vp8_subtract_b_mmx_impl(src, stride, residual, pred, pitch);
 }
--- a/vp8/encoder/x86/vp8_enc_stubs_sse2.c
+++ b/vp8/encoder/x86/vp8_enc_stubs_sse2.c
@@ -30,14 +30,3 @@ int vp8_mbuverror_xmm(MACROBLOCK *mb)
    return vp8_mbuverror_xmm_impl(s_ptr, d_ptr);
 }
 /* Assembly routine that performs the 4x4 subtraction on raw pointers. */
 void vp8_subtract_b_sse2_impl(unsigned char *z, int src_stride,
                               short *diff, unsigned char *predictor,
                               int pitch);

 /*
  * Unpacks the source pointer, source stride, residual buffer and predictor
  * from the block structures and forwards them, with pitch, to the SSE2
  * assembly implementation.
  */
 void vp8_subtract_b_sse2(BLOCK *be, BLOCKD *bd, int pitch)
 {
    unsigned char *const pred = bd->predictor;
    short *const residual = be->src_diff;
    const unsigned int stride = be->src_stride;
    unsigned char *const src = *be->base_src + be->src;

    vp8_subtract_b_sse2_impl(src, stride, residual, pred, pitch);
 }
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -82,7 +82,6 @@ VP8_CX_SRCS_REMOVE-yes += encoder/temporal_filter.c
 endif
 VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/dct_mmx.asm
 VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/subtract_mmx.asm
 VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp8_enc_stubs_mmx.c
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
@@ -94,7 +93,6 @@ ifeq ($(CONFIG_TEMPORAL_DENOISING),yes)
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/denoising_sse2.c
 endif
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/subtract_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp8_enc_stubs_sse2.c
 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm
--- a/vp8/vp8cx_arm.mk
+++ b/vp8/vp8cx_arm.mk
@@ -25,5 +25,4 @@ VP8_CX_SRCS-$(HAVE_MEDIA)  += encoder/arm/armv6/walsh_v6$(ASM)
 VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/denoising_neon.c
 VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/fastquantizeb_neon.c
 VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/shortfdct_neon.c
 VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/subtract_neon.c
 VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/vp8_shortwalsh4x4_neon.c