Merge "Unify subtract function used in VP8/9"
This commit is contained in:
		@@ -1,123 +0,0 @@
 | 
			
		||||
/*
 | 
			
		||||
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 | 
			
		||||
 *
 | 
			
		||||
 *  Use of this source code is governed by a BSD-style license
 | 
			
		||||
 *  that can be found in the LICENSE file in the root of the source
 | 
			
		||||
 *  tree. An additional intellectual property rights grant can be found
 | 
			
		||||
 *  in the file PATENTS.  All contributing project authors may
 | 
			
		||||
 *  be found in the AUTHORS file in the root of the source tree.
 | 
			
		||||
 */
 | 
			
		||||
 | 
			
		||||
#include "third_party/googletest/src/include/gtest/gtest.h"
 | 
			
		||||
#include "test/acm_random.h"
 | 
			
		||||
#include "test/clear_system_state.h"
 | 
			
		||||
#include "test/register_state_check.h"
 | 
			
		||||
#include "./vpx_config.h"
 | 
			
		||||
#include "./vp8_rtcd.h"
 | 
			
		||||
#include "vp8/common/blockd.h"
 | 
			
		||||
#include "vp8/encoder/block.h"
 | 
			
		||||
#include "vpx_mem/vpx_mem.h"
 | 
			
		||||
 | 
			
		||||
typedef void (*SubtractBlockFunc)(BLOCK *be, BLOCKD *bd, int pitch);
 | 
			
		||||
 | 
			
		||||
namespace {
 | 
			
		||||
 | 
			
		||||
// Value-parameterized fixture; the parameter is the subtract function
// (a SubtractBlockFunc) that a given instantiation exercises.
class SubtractBlockTest : public ::testing::TestWithParam<SubtractBlockFunc> {
 public:
  // Calls libvpx_test::ClearSystemState() once each test body has finished.
  virtual void TearDown() { libvpx_test::ClearSystemState(); }
};
 | 
			
		||||
 | 
			
		||||
using libvpx_test::ACMRandom;
 | 
			
		||||
 | 
			
		||||
// Runs the subtract function under test on a 4x4 block for every source
// stride listed in kSrcStride and checks, element by element, that
// source == src_diff + predictor afterwards.
TEST_P(SubtractBlockTest, SimpleSubtract) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  BLOCK be;
  BLOCKD bd;
  // in libvpx, this stride is always 16
  const int kDiffPredStride = 16;
  // Source strides to exercise; the trailing 0 terminates the loop below.
  const int kSrcStride[] = {32, 16, 8, 4, 0};
  const int kBlockWidth = 4;
  const int kBlockHeight = 4;

  // Allocate... align to 16 for mmx/sse tests
  // The source buffer is sized for the largest stride (kSrcStride[0]).
  uint8_t *source = reinterpret_cast<uint8_t*>(
      vpx_memalign(16, kBlockHeight * kSrcStride[0] * sizeof(*source)));
  be.src_diff = reinterpret_cast<int16_t*>(
      vpx_memalign(16, kBlockHeight * kDiffPredStride * sizeof(*be.src_diff)));
  bd.predictor = reinterpret_cast<unsigned char*>(
      vpx_memalign(16, kBlockHeight * kDiffPredStride * sizeof(*bd.predictor)));

  // Fail the test cleanly if any allocation failed instead of writing
  // through a NULL pointer in the fill loops below.
  // NOTE(review): relies on vpx_free() accepting NULL - confirm.
  if (source == NULL || be.src_diff == NULL || bd.predictor == NULL) {
    vpx_free(be.src_diff);
    vpx_free(source);
    vpx_free(bd.predictor);
    FAIL() << "vpx_memalign() returned NULL";
  }

  for (int i = 0; kSrcStride[i] > 0; ++i) {
    // start at block0
    be.src = 0;
    be.base_src = &source;
    be.src_stride = kSrcStride[i];

    // set difference: pre-fill the 4x4 diff block with a fixed pattern
    int16_t *src_diff = be.src_diff;
    for (int r = 0; r < kBlockHeight; ++r) {
      for (int c = 0; c < kBlockWidth; ++c) {
        src_diff[c] = static_cast<int16_t>(0xa5a5u);
      }
      src_diff += kDiffPredStride;
    }

    // set source: random pixels laid out with the stride under test
    uint8_t *base_src = *be.base_src;
    for (int r = 0; r < kBlockHeight; ++r) {
      for (int c = 0; c < kBlockWidth; ++c) {
        base_src[c] = rnd.Rand8();
      }
      base_src += be.src_stride;
    }

    // set predictor
    uint8_t *predictor = bd.predictor;
    for (int r = 0; r < kBlockHeight; ++r) {
      for (int c = 0; c < kBlockWidth; ++c) {
        predictor[c] = rnd.Rand8();
      }
      predictor += kDiffPredStride;
    }

    ASM_REGISTER_STATE_CHECK(GetParam()(&be, &bd, kDiffPredStride));

    // Rewind all three cursors and verify source == diff + predictor.
    base_src = *be.base_src;
    src_diff = be.src_diff;
    predictor = bd.predictor;
    for (int r = 0; r < kBlockHeight; ++r) {
      for (int c = 0; c < kBlockWidth; ++c) {
        EXPECT_EQ(base_src[c], (src_diff[c] + predictor[c])) << "r = " << r
                                                             << ", c = " << c;
      }
      src_diff += kDiffPredStride;
      predictor += kDiffPredStride;
      base_src += be.src_stride;
    }
  }
  vpx_free(be.src_diff);
  vpx_free(source);
  vpx_free(bd.predictor);
}
 | 
			
		||||
 | 
			
		||||
// Instantiate SubtractBlockTest once per implementation. The C version is
// registered unconditionally; each SIMD version only when its HAVE_* flag
// is non-zero.
INSTANTIATE_TEST_CASE_P(
    C, SubtractBlockTest, ::testing::Values(vp8_subtract_b_c));

#if HAVE_NEON
INSTANTIATE_TEST_CASE_P(
    NEON, SubtractBlockTest, ::testing::Values(vp8_subtract_b_neon));
#endif

#if HAVE_MMX
INSTANTIATE_TEST_CASE_P(
    MMX, SubtractBlockTest, ::testing::Values(vp8_subtract_b_mmx));
#endif

#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P(
    SSE2, SubtractBlockTest, ::testing::Values(vp8_subtract_b_sse2));
#endif
 | 
			
		||||
 | 
			
		||||
}  // namespace
 | 
			
		||||
@@ -104,7 +104,6 @@ endif
 | 
			
		||||
LIBVPX_TEST_SRCS-$(CONFIG_POSTPROC)    += pp_filter_test.cc
 | 
			
		||||
LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += vp8_decrypt_test.cc
 | 
			
		||||
LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += set_roi.cc
 | 
			
		||||
LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += subtract_test.cc
 | 
			
		||||
LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += variance_test.cc
 | 
			
		||||
LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_fdct4x4_test.cc
 | 
			
		||||
LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += quantize_test.cc
 | 
			
		||||
 
 | 
			
		||||
@@ -343,15 +343,6 @@ add_proto qw/int vp8_mbuverror/, "struct macroblock *mb";
 | 
			
		||||
specialize qw/vp8_mbuverror mmx sse2/;
 | 
			
		||||
$vp8_mbuverror_sse2=vp8_mbuverror_xmm;
 | 
			
		||||
 | 
			
		||||
add_proto qw/void vp8_subtract_b/, "struct block *be, struct blockd *bd, int pitch";
 | 
			
		||||
specialize qw/vp8_subtract_b mmx sse2 neon/;
 | 
			
		||||
 | 
			
		||||
add_proto qw/void vp8_subtract_mby/, "short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride";
 | 
			
		||||
specialize qw/vp8_subtract_mby mmx sse2 neon/;
 | 
			
		||||
 | 
			
		||||
add_proto qw/void vp8_subtract_mbuv/, "short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride";
 | 
			
		||||
specialize qw/vp8_subtract_mbuv mmx sse2 neon/;
 | 
			
		||||
 | 
			
		||||
#
 | 
			
		||||
# Motion search
 | 
			
		||||
#
 | 
			
		||||
 
 | 
			
		||||
@@ -1,154 +0,0 @@
 | 
			
		||||
/*
 | 
			
		||||
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 | 
			
		||||
 *
 | 
			
		||||
 *  Use of this source code is governed by a BSD-style license
 | 
			
		||||
 *  that can be found in the LICENSE file in the root of the source
 | 
			
		||||
 *  tree. An additional intellectual property rights grant can be found
 | 
			
		||||
 *  in the file PATENTS.  All contributing project authors may
 | 
			
		||||
 *  be found in the AUTHORS file in the root of the source tree.
 | 
			
		||||
 */
 | 
			
		||||
 | 
			
		||||
#include <arm_neon.h>
 | 
			
		||||
#include "vp8/encoder/block.h"
 | 
			
		||||
 | 
			
		||||
/* Writes source - predictor for a block of 4 rows x 4 pixels.
 *
 * be:    supplies the source (*be->base_src + be->src, rows be->src_stride
 *        bytes apart) and the destination be->src_diff.
 * bd:    supplies the predictor (bd->predictor).
 * pitch: row step for both the predictor (in bytes) and src_diff (in
 *        int16_t elements).
 *
 * Each row is loaded with an 8-byte vld1_u8, but only the low four 16-bit
 * differences of the row are stored.
 */
void vp8_subtract_b_neon(
        BLOCK *be,
        BLOCKD *bd,
        int pitch) {
    const unsigned char *src_row = *be->base_src + be->src;
    const unsigned char *pred_row = bd->predictor;
    const int src_stride = be->src_stride;
    int16_t *diff_row = be->src_diff;
    uint16x8_t row_diff[4];
    int r;

    /* Load and subtract all four rows before anything is written. */
    for (r = 0; r < 4; ++r) {
        const uint8x8_t s = vld1_u8(src_row);
        const uint8x8_t p = vld1_u8(pred_row);

        row_diff[r] = vsubl_u8(s, p);
        if (r < 3) {
            src_row += src_stride;
            pred_row += pitch;
        }
    }

    /* Store the low half (4 lanes) of each widened difference. */
    for (r = 0; r < 4; ++r) {
        vst1_u16((uint16_t *)diff_row, vget_low_u16(row_diff[r]));
        if (r < 3) {
            diff_row += pitch;
        }
    }
}
 | 
			
		||||
 | 
			
		||||
/* Writes src - pred for a 16x16 block into diff.
 *
 * diff:        destination; rows are written back to back, 16 int16_t each.
 * src:         source pixels, rows src_stride bytes apart.
 * pred:        predictor pixels, rows pred_stride bytes apart.
 *
 * Two rows are processed per pass, so 8 passes cover the 16 rows.
 */
void vp8_subtract_mby_neon(
        int16_t *diff,
        unsigned char *src,
        int src_stride,
        unsigned char *pred,
        int pred_stride) {
    int16_t *out = diff;
    int pass;

    for (pass = 0; pass < 8; ++pass) {
        uint8x16_t src_row[2];
        uint8x16_t pred_row[2];
        int k;

        /* Fetch both source rows, then both predictor rows. */
        src_row[0] = vld1q_u8(src);
        src += src_stride;
        src_row[1] = vld1q_u8(src);
        src += src_stride;
        pred_row[0] = vld1q_u8(pred);
        pred += pred_stride;
        pred_row[1] = vld1q_u8(pred);
        pred += pred_stride;

        /* Widen to 16 bits while subtracting; low 8 pixels first. */
        for (k = 0; k < 2; ++k) {
            const uint16x8_t lo = vsubl_u8(vget_low_u8(src_row[k]),
                                           vget_low_u8(pred_row[k]));
            const uint16x8_t hi = vsubl_u8(vget_high_u8(src_row[k]),
                                           vget_high_u8(pred_row[k]));

            vst1q_u16((uint16_t *)out, lo);
            out += 8;
            vst1q_u16((uint16_t *)out, hi);
            out += 8;
        }
    }
}
 | 
			
		||||
 | 
			
		||||
/* Writes the 8x8 U and V differences (src - pred) into diff.
 *
 * diff:          output starts at diff + 256; the 64 U values come first,
 *                immediately followed by the 64 V values, 8 per row.
 * usrc, vsrc:    source planes, rows src_stride bytes apart.
 * upred, vpred:  predictor planes, rows pred_stride bytes apart.
 *
 * Each plane is handled as two groups of four rows; a group is fully
 * loaded and subtracted before its results are stored.
 */
void vp8_subtract_mbuv_neon(
        int16_t *diff,
        unsigned char *usrc,
        unsigned char *vsrc,
        int src_stride,
        unsigned char *upred,
        unsigned char *vpred,
        int pred_stride) {
    int16_t *out = diff + 256;
    int plane;

    for (plane = 0; plane < 2; ++plane) {
        /* plane 0 is U, plane 1 is V. */
        const unsigned char *src_row = (plane == 0) ? usrc : vsrc;
        const unsigned char *pred_row = (plane == 0) ? upred : vpred;
        int group;

        for (group = 0; group < 2; ++group) {
            uint16x8_t row_diff[4];
            int r;

            for (r = 0; r < 4; ++r) {
                const uint8x8_t s = vld1_u8(src_row);
                const uint8x8_t p = vld1_u8(pred_row);

                src_row += src_stride;
                pred_row += pred_stride;
                row_diff[r] = vsubl_u8(s, p);
            }

            for (r = 0; r < 4; ++r) {
                vst1q_u16((uint16_t *)out, row_diff[r]);
                out += 8;
            }
        }
    }
}
 | 
			
		||||
@@ -8,6 +8,7 @@
 | 
			
		||||
 *  be found in the AUTHORS file in the root of the source tree.
 | 
			
		||||
 */
 | 
			
		||||
 | 
			
		||||
#include "./vpx_dsp_rtcd.h"
 | 
			
		||||
 | 
			
		||||
#include "vpx_config.h"
 | 
			
		||||
#include "vp8_rtcd.h"
 | 
			
		||||
@@ -19,80 +20,29 @@
 | 
			
		||||
#include "vpx_mem/vpx_mem.h"
 | 
			
		||||
#include "rdopt.h"
 | 
			
		||||
 | 
			
		||||
// TODO(jingning,johannkoenig): use vpx_subtract_block to replace
 | 
			
		||||
// codec specified vp9_subtract_ functions.
 | 
			
		||||
void vp8_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch)
 | 
			
		||||
{
 | 
			
		||||
    unsigned char *src_ptr = (*(be->base_src) + be->src);
 | 
			
		||||
    short *diff_ptr = be->src_diff;
 | 
			
		||||
    unsigned char *pred_ptr = bd->predictor;
 | 
			
		||||
    int src_stride = be->src_stride;
 | 
			
		||||
void vp8_subtract_b(BLOCK *be, BLOCKD *bd, int pitch) {
 | 
			
		||||
  unsigned char *src_ptr = (*(be->base_src) + be->src);
 | 
			
		||||
  short *diff_ptr = be->src_diff;
 | 
			
		||||
  unsigned char *pred_ptr = bd->predictor;
 | 
			
		||||
  int src_stride = be->src_stride;
 | 
			
		||||
 | 
			
		||||
    int r, c;
 | 
			
		||||
 | 
			
		||||
    for (r = 0; r < 4; r++)
 | 
			
		||||
    {
 | 
			
		||||
        for (c = 0; c < 4; c++)
 | 
			
		||||
        {
 | 
			
		||||
            diff_ptr[c] = src_ptr[c] - pred_ptr[c];
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        diff_ptr += pitch;
 | 
			
		||||
        pred_ptr += pitch;
 | 
			
		||||
        src_ptr  += src_stride;
 | 
			
		||||
    }
 | 
			
		||||
  vpx_subtract_block(4, 4, diff_ptr, pitch, src_ptr, src_stride,
 | 
			
		||||
                     pred_ptr, pitch);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc,
 | 
			
		||||
void vp8_subtract_mbuv(short *diff, unsigned char *usrc, unsigned char *vsrc,
 | 
			
		||||
                         int src_stride, unsigned char *upred,
 | 
			
		||||
                         unsigned char *vpred, int pred_stride)
 | 
			
		||||
{
 | 
			
		||||
    short *udiff = diff + 256;
 | 
			
		||||
    short *vdiff = diff + 320;
 | 
			
		||||
                         unsigned char *vpred, int pred_stride) {
 | 
			
		||||
  short *udiff = diff + 256;
 | 
			
		||||
  short *vdiff = diff + 320;
 | 
			
		||||
 | 
			
		||||
    int r, c;
 | 
			
		||||
 | 
			
		||||
    for (r = 0; r < 8; r++)
 | 
			
		||||
    {
 | 
			
		||||
        for (c = 0; c < 8; c++)
 | 
			
		||||
        {
 | 
			
		||||
            udiff[c] = usrc[c] - upred[c];
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        udiff += 8;
 | 
			
		||||
        upred += pred_stride;
 | 
			
		||||
        usrc  += src_stride;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    for (r = 0; r < 8; r++)
 | 
			
		||||
    {
 | 
			
		||||
        for (c = 0; c < 8; c++)
 | 
			
		||||
        {
 | 
			
		||||
            vdiff[c] = vsrc[c] - vpred[c];
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        vdiff += 8;
 | 
			
		||||
        vpred += pred_stride;
 | 
			
		||||
        vsrc  += src_stride;
 | 
			
		||||
    }
 | 
			
		||||
  vpx_subtract_block(8, 8, udiff, 8, usrc, src_stride, upred, pred_stride);
 | 
			
		||||
  vpx_subtract_block(8, 8, vdiff, 8, vsrc, src_stride, vpred, pred_stride);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void vp8_subtract_mby_c(short *diff, unsigned char *src, int src_stride,
 | 
			
		||||
                        unsigned char *pred, int pred_stride)
 | 
			
		||||
{
 | 
			
		||||
    int r, c;
 | 
			
		||||
 | 
			
		||||
    for (r = 0; r < 16; r++)
 | 
			
		||||
    {
 | 
			
		||||
        for (c = 0; c < 16; c++)
 | 
			
		||||
        {
 | 
			
		||||
            diff[c] = src[c] - pred[c];
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        diff += 16;
 | 
			
		||||
        pred += pred_stride;
 | 
			
		||||
        src  += src_stride;
 | 
			
		||||
    }
 | 
			
		||||
void vp8_subtract_mby(short *diff, unsigned char *src, int src_stride,
 | 
			
		||||
                      unsigned char *pred, int pred_stride) {
 | 
			
		||||
  vpx_subtract_block(16, 16, diff, 16, src, src_stride, pred, pred_stride);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static void vp8_subtract_mb(MACROBLOCK *x)
 | 
			
		||||
 
 | 
			
		||||
@@ -19,6 +19,13 @@ extern "C" {
 | 
			
		||||
#endif
 | 
			
		||||
void vp8_encode_inter16x16(MACROBLOCK *x);
 | 
			
		||||
 | 
			
		||||
void vp8_subtract_b(BLOCK *be, BLOCKD *bd, int pitch);
 | 
			
		||||
void vp8_subtract_mbuv(short *diff, unsigned char *usrc, unsigned char *vsrc,
 | 
			
		||||
                       int src_stride, unsigned char *upred,
 | 
			
		||||
                       unsigned char *vpred, int pred_stride);
 | 
			
		||||
void vp8_subtract_mby(short *diff, unsigned char *src, int src_stride,
 | 
			
		||||
                      unsigned char *pred, int pred_stride);
 | 
			
		||||
 | 
			
		||||
void vp8_build_dcblock(MACROBLOCK *b);
 | 
			
		||||
void vp8_transform_mb(MACROBLOCK *mb);
 | 
			
		||||
void vp8_transform_mbuv(MACROBLOCK *x);
 | 
			
		||||
 
 | 
			
		||||
@@ -1,223 +0,0 @@
 | 
			
		||||
;
 | 
			
		||||
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 | 
			
		||||
;
 | 
			
		||||
;  Use of this source code is governed by a BSD-style license
 | 
			
		||||
;  that can be found in the LICENSE file in the root of the source
 | 
			
		||||
;  tree. An additional intellectual property rights grant can be found
 | 
			
		||||
;  in the file PATENTS.  All contributing project authors may
 | 
			
		||||
;  be found in the AUTHORS file in the root of the source tree.
 | 
			
		||||
;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
%include "vpx_ports/x86_abi_support.asm"
 | 
			
		||||
 | 
			
		||||
;void vp8_subtract_b_mmx_impl(unsigned char *z, int src_stride,
;                             short *diff, unsigned char *Predictor,
;                             int pitch);
;
; Stores z - Predictor for 4 rows of 4 pixels into diff as 16-bit words.
; Source rows are src_stride bytes apart; predictor rows are pitch bytes
; apart; diff rows are pitch words (pitch*2 bytes) apart.
; Uses mm0, mm1 and mm7; no emms is executed in this routine.
global sym(vp8_subtract_b_mmx_impl) PRIVATE
sym(vp8_subtract_b_mmx_impl):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    mov         rdi, arg(2)             ; rdi = diff
    mov         rax, arg(3)             ; rax = Predictor
    mov         rsi, arg(0)             ; rsi = z (source)
    movsxd      rdx, dword ptr arg(1)   ; rdx = src_stride
    movsxd      rcx, dword ptr arg(4)   ; rcx = pitch
    pxor        mm7, mm7                ; zero, used to widen bytes to words

    ; row 0
    movd        mm0, [rsi]
    movd        mm1, [rax]
    punpcklbw   mm0, mm7
    punpcklbw   mm1, mm7
    psubw       mm0, mm1
    movq        [rdi], mm0

    ; row 1: diff offset is pitch words = pitch*2 bytes
    movd        mm0, [rsi+rdx]
    movd        mm1, [rax+rcx]
    punpcklbw   mm0, mm7
    punpcklbw   mm1, mm7
    psubw       mm0, mm1
    movq        [rdi+rcx*2], mm0

    ; row 2
    movd        mm0, [rsi+rdx*2]
    movd        mm1, [rax+rcx*2]
    punpcklbw   mm0, mm7
    punpcklbw   mm1, mm7
    psubw       mm0, mm1
    movq        [rdi+rcx*4], mm0

    ; set up row 3: rsi = z + 2*src_stride, rcx = 3*pitch
    lea         rsi, [rsi+rdx*2]
    lea         rcx, [rcx+rcx*2]

    ; row 3: source at z + 3*src_stride, predictor at +3*pitch bytes,
    ; diff at +6*pitch bytes
    movd        mm0, [rsi+rdx]
    movd        mm1, [rax+rcx]
    punpcklbw   mm0, mm7
    punpcklbw   mm1, mm7
    psubw       mm0, mm1
    movq        [rdi+rcx*2], mm0

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
 | 
			
		||||
 | 
			
		||||
;void vp8_subtract_mby_mmx(short *diff, unsigned char *src, int src_stride,
;                          unsigned char *pred, int pred_stride)
;
; Stores src - pred for 16 rows of 16 pixels into diff as 16-bit words.
; diff rows are written contiguously (32 bytes per row); src and pred
; advance by their own strides. Uses mm0-mm4 and saves/restores rbx.
global sym(vp8_subtract_mby_mmx) PRIVATE
sym(vp8_subtract_mby_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    mov         rdi, arg(0)             ; rdi = diff
    mov         rsi, arg(1)             ; rsi = src
    movsxd      rdx, dword ptr arg(2)   ; rdx = src_stride
    mov         rax, arg(3)             ; rax = pred
    push        rbx
    movsxd      rbx, dword ptr arg(4)   ; rbx = pred_stride

    pxor        mm0, mm0                ; zero, used to widen bytes to words
    mov         rcx, 16                 ; row counter

.next_row:
    ; pixels 0-7 of this row
    movq        mm1, [rsi]
    movq        mm3, [rax]

    movq        mm2, mm1
    movq        mm4, mm3

    punpcklbw   mm1, mm0                ; src  pixels 0-3 as words
    punpcklbw   mm3, mm0                ; pred pixels 0-3 as words

    punpckhbw   mm2, mm0                ; src  pixels 4-7 as words
    punpckhbw   mm4, mm0                ; pred pixels 4-7 as words

    psubw       mm1, mm3
    psubw       mm2, mm4

    movq        [rdi], mm1
    movq        [rdi+8], mm2

    ; pixels 8-15 of this row
    movq        mm1, [rsi+8]
    movq        mm3, [rax+8]

    movq        mm2, mm1
    movq        mm4, mm3

    punpcklbw   mm1, mm0
    punpcklbw   mm3, mm0

    punpckhbw   mm2, mm0
    punpckhbw   mm4, mm0

    psubw       mm1, mm3
    psubw       mm2, mm4

    movq        [rdi+16], mm1
    movq        [rdi+24], mm2

    ; advance: diff by one 16-word row, pred and src by their strides
    add         rdi, 32
    lea         rax, [rax+rbx]
    lea         rsi, [rsi+rdx]
    dec         rcx
    jnz         .next_row

    ; begin epilog: registers are restored in reverse push order
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
;void vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc,
;                           unsigned char *vsrc, int src_stride,
;                           unsigned char *upred, unsigned char *vpred,
;                           int pred_stride)
;
; Stores usrc - upred, then vsrc - vpred, each 8 rows of 8 pixels, as
; 16-bit words starting at diff + 256 words. Output rows are contiguous
; (16 bytes per row), so the V block follows the U block directly.
; Uses mm0, mm1, mm3, mm4, mm7 and saves/restores rbx.

global sym(vp8_subtract_mbuv_mmx) PRIVATE
sym(vp8_subtract_mbuv_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    push        rsi
    push        rdi
    ; end prolog

    mov         rdi, arg(0)             ; rdi = diff
    mov         rsi, arg(1)             ; rsi = usrc
    movsxd      rdx, dword ptr arg(3)   ; rdx = src_stride
    mov         rax, arg(4)             ; rax = upred
    add         rdi, 256*2              ; diff = diff + 256 (shorts)
    mov         rcx, 8                  ; row counter for the U block
    push        rbx
    movsxd      rbx, dword ptr arg(6)   ; rbx = pred_stride

    pxor        mm7, mm7                ; zero, used to widen bytes to words

.u_row:
    movq        mm0, [rsi]
    movq        mm1, [rax]
    movq        mm3, mm0
    movq        mm4, mm1
    punpcklbw   mm0, mm7                ; src  pixels 0-3 as words
    punpcklbw   mm1, mm7                ; pred pixels 0-3 as words
    punpckhbw   mm3, mm7                ; src  pixels 4-7 as words
    punpckhbw   mm4, mm7                ; pred pixels 4-7 as words
    psubw       mm0, mm1
    psubw       mm3, mm4
    movq        [rdi], mm0
    movq        [rdi+8], mm3
    add         rdi, 16
    add         rsi, rdx
    add         rax, rbx

    dec         rcx
    jnz         .u_row

    ; rdi keeps advancing, so the V output lands right after the U output
    mov         rsi, arg(2)             ; rsi = vsrc
    mov         rax, arg(5)             ; rax = vpred
    mov         rcx, 8                  ; row counter for the V block

.v_row:
    movq        mm0, [rsi]
    movq        mm1, [rax]
    movq        mm3, mm0
    movq        mm4, mm1
    punpcklbw   mm0, mm7
    punpcklbw   mm1, mm7
    punpckhbw   mm3, mm7
    punpckhbw   mm4, mm7
    psubw       mm0, mm1
    psubw       mm3, mm4
    movq        [rdi], mm0
    movq        [rdi+8], mm3
    add         rdi, 16
    add         rsi, rdx
    add         rax, rbx

    dec         rcx
    jnz         .v_row

    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
 | 
			
		||||
@@ -1,245 +0,0 @@
 | 
			
		||||
;
 | 
			
		||||
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 | 
			
		||||
;
 | 
			
		||||
;  Use of this source code is governed by a BSD-style license
 | 
			
		||||
;  that can be found in the LICENSE file in the root of the source
 | 
			
		||||
;  tree. An additional intellectual property rights grant can be found
 | 
			
		||||
;  in the file PATENTS.  All contributing project authors may
 | 
			
		||||
;  be found in the AUTHORS file in the root of the source tree.
 | 
			
		||||
;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
%include "vpx_ports/x86_abi_support.asm"
 | 
			
		||||
 | 
			
		||||
; Computes a 4x4 block of 16-bit residuals: diff = z - Predictor.
; For each of the four rows, 4 bytes are loaded from the source and from the
; predictor, zero-extended to words against mm7 (zero) and subtracted.
; Source rows are src_stride bytes apart, predictor rows are pitch bytes apart
; and diff rows are pitch words (pitch*2 bytes) apart.
; NOTE(review): despite the _sse2 suffix only MMX registers are used here, and
; no emms is executed before returning.
;void vp8_subtract_b_sse2_impl(unsigned char *z,  int src_stride,
 | 
			
		||||
;                            short *diff, unsigned char *Predictor,
 | 
			
		||||
;                            int pitch);
 | 
			
		||||
global sym(vp8_subtract_b_sse2_impl) PRIVATE
 | 
			
		||||
sym(vp8_subtract_b_sse2_impl):
 | 
			
		||||
    push        rbp
 | 
			
		||||
    mov         rbp, rsp
 | 
			
		||||
    SHADOW_ARGS_TO_STACK 5
 | 
			
		||||
    GET_GOT     rbx
 | 
			
		||||
    push rsi
 | 
			
		||||
    push rdi
 | 
			
		||||
    ; end prolog
 | 
			
		||||
 | 
			
		||||
        mov     rdi,        arg(2) ;diff
 | 
			
		||||
        mov     rax,        arg(3) ;Predictor
 | 
			
		||||
        mov     rsi,        arg(0) ;z
 | 
			
		||||
        movsxd  rdx,        dword ptr arg(1);src_stride;
 | 
			
		||||
        movsxd  rcx,        dword ptr arg(4);pitch
 | 
			
		||||
        pxor    mm7,        mm7 ; mm7 = 0, used to zero-extend bytes to words
 | 
			
		||||
 | 
			
		||||
        movd    mm0,        [rsi] ; row 0: 4 source bytes
 | 
			
		||||
        movd    mm1,        [rax] ; row 0: 4 predictor bytes
 | 
			
		||||
        punpcklbw   mm0,    mm7
 | 
			
		||||
        punpcklbw   mm1,    mm7
 | 
			
		||||
        psubw   mm0,        mm1 ; 4 word differences
 | 
			
		||||
        movq    MMWORD PTR [rdi],      mm0
 | 
			
		||||
 | 
			
		||||
        movd    mm0,        [rsi+rdx] ; row 1 source
 | 
			
		||||
        movd    mm1,        [rax+rcx] ; row 1 predictor
 | 
			
		||||
        punpcklbw   mm0,    mm7
 | 
			
		||||
        punpcklbw   mm1,    mm7
 | 
			
		||||
        psubw   mm0,        mm1
 | 
			
		||||
        movq    MMWORD PTR [rdi+rcx*2], mm0 ; diff + pitch words
 | 
			
		||||
 | 
			
		||||
        movd    mm0,        [rsi+rdx*2] ; row 2 source
 | 
			
		||||
        movd    mm1,        [rax+rcx*2] ; row 2 predictor
 | 
			
		||||
        punpcklbw   mm0,    mm7
 | 
			
		||||
        punpcklbw   mm1,    mm7
 | 
			
		||||
        psubw   mm0,        mm1
 | 
			
		||||
        movq    MMWORD PTR [rdi+rcx*4], mm0 ; diff + 2*pitch words
 | 
			
		||||
 | 
			
		||||
        lea     rsi,        [rsi+rdx*2] ; rsi = z + 2*src_stride
 | 
			
		||||
        lea     rcx,        [rcx+rcx*2] ; rcx = 3*pitch
 | 
			
		||||
 | 
			
		||||
        movd    mm0,        [rsi+rdx] ; row 3 source (z + 3*src_stride)
 | 
			
		||||
        movd    mm1,        [rax+rcx] ; row 3 predictor
 | 
			
		||||
        punpcklbw   mm0,    mm7
 | 
			
		||||
        punpcklbw   mm1,    mm7
 | 
			
		||||
        psubw   mm0,        mm1
 | 
			
		||||
        movq    MMWORD PTR [rdi+rcx*2], mm0 ; diff + 3*pitch words
 | 
			
		||||
 | 
			
		||||
    ; begin epilog
 | 
			
		||||
    pop rdi
 | 
			
		||||
    pop rsi
 | 
			
		||||
    RESTORE_GOT
 | 
			
		||||
    UNSHADOW_ARGS
 | 
			
		||||
    pop         rbp
 | 
			
		||||
    ret
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
; Computes 16 rows of 16 residuals (src - pred) as 16-bit words.
; The loop runs 8 times and handles two rows per iteration. Output is written
; contiguously to diff: 32 bytes per row, 64 bytes per iteration.
; Each byte difference is widened to 16 bits by interleaving it with a
; 0x00/0xFF sign byte obtained by comparing pred against src.
; Every load and store uses movdqa, so each src row, each pred row and the
; diff buffer must be 16-byte aligned.
;void vp8_subtract_mby_sse2(short *diff, unsigned char *src, int src_stride,
 | 
			
		||||
;unsigned char *pred, int pred_stride)
 | 
			
		||||
global sym(vp8_subtract_mby_sse2) PRIVATE
 | 
			
		||||
sym(vp8_subtract_mby_sse2):
 | 
			
		||||
    push        rbp
 | 
			
		||||
    mov         rbp, rsp
 | 
			
		||||
    SHADOW_ARGS_TO_STACK 5
 | 
			
		||||
    GET_GOT     rbx
 | 
			
		||||
    push rsi
 | 
			
		||||
    push rdi
 | 
			
		||||
    ; end prolog
 | 
			
		||||
 | 
			
		||||
    mov         rdi,        arg(0)          ;diff
 | 
			
		||||
    mov         rsi,        arg(1)          ;src
 | 
			
		||||
    movsxd      rdx,        dword ptr arg(2);src_stride
 | 
			
		||||
    mov         rax,        arg(3)          ;pred
 | 
			
		||||
    movdqa      xmm4,       [GLOBAL(t80)] ; xmm4 = 16 x 0x80 (top-bit flip mask)
 | 
			
		||||
    push        rbx ; rbx is reused below for pred_stride
 | 
			
		||||
    mov         rcx,        8               ; do two lines at one time
 | 
			
		||||
    movsxd      rbx,        dword ptr arg(4);pred_stride
 | 
			
		||||
 | 
			
		||||
.submby_loop:
 | 
			
		||||
    movdqa      xmm0,       [rsi]           ; src
 | 
			
		||||
    movdqa      xmm1,       [rax]           ; pred
 | 
			
		||||
 | 
			
		||||
    movdqa      xmm2,       xmm0 ; keep a copy of src for the compare
 | 
			
		||||
    psubb       xmm0,       xmm1 ; low 8 bits of src - pred
 | 
			
		||||
 | 
			
		||||
    pxor        xmm1,       xmm4            ;convert to signed values
 | 
			
		||||
    pxor        xmm2,       xmm4
 | 
			
		||||
    pcmpgtb     xmm1,       xmm2            ; obtain sign information
 | 
			
		||||
 | 
			
		||||
    movdqa      xmm2,       xmm0 ; copy the difference for the high half
 | 
			
		||||
    punpcklbw   xmm0,       xmm1            ; put sign back to subtraction
 | 
			
		||||
    punpckhbw   xmm2,       xmm1            ; put sign back to subtraction
 | 
			
		||||
 | 
			
		||||
    movdqa      xmm3,       [rsi + rdx] ; src -- next line
 | 
			
		||||
    movdqa      xmm5,       [rax + rbx] ; pred -- next line
 | 
			
		||||
 | 
			
		||||
    lea         rsi,        [rsi+rdx*2] ; advance src by two rows
 | 
			
		||||
    lea         rax,        [rax+rbx*2] ; advance pred by two rows
 | 
			
		||||
 | 
			
		||||
    movdqa      [rdi],      xmm0 ; first row, words 0-7
 | 
			
		||||
    movdqa      [rdi +16],  xmm2 ; first row, words 8-15
 | 
			
		||||
 | 
			
		||||
    movdqa      xmm1,       xmm3 ; keep a copy of src for the compare
 | 
			
		||||
    psubb       xmm3,       xmm5 ; low 8 bits of src - pred
 | 
			
		||||
 | 
			
		||||
    pxor        xmm5,       xmm4            ;convert to signed values
 | 
			
		||||
    pxor        xmm1,       xmm4
 | 
			
		||||
    pcmpgtb     xmm5,       xmm1            ; obtain sign information
 | 
			
		||||
 | 
			
		||||
    movdqa      xmm1,       xmm3 ; copy the difference for the high half
 | 
			
		||||
    punpcklbw   xmm3,       xmm5            ; put sign back to subtraction
 | 
			
		||||
    punpckhbw   xmm1,       xmm5            ; put sign back to subtraction
 | 
			
		||||
 | 
			
		||||
    movdqa      [rdi +32],  xmm3 ; second row, words 0-7
 | 
			
		||||
    movdqa      [rdi +48],  xmm1 ; second row, words 8-15
 | 
			
		||||
 | 
			
		||||
    add         rdi,        64 ; two rows of 16 words
 | 
			
		||||
    dec         rcx
 | 
			
		||||
    jnz         .submby_loop
 | 
			
		||||
 | 
			
		||||
    pop rbx ; matches the push rbx before the loop
 | 
			
		||||
    pop rdi
 | 
			
		||||
    pop rsi
 | 
			
		||||
    ; begin epilog
 | 
			
		||||
    RESTORE_GOT
 | 
			
		||||
    UNSHADOW_ARGS
 | 
			
		||||
    pop         rbp
 | 
			
		||||
    ret
 | 
			
		||||
 | 
			
		||||
; Computes residuals for two planes of 8 rows x 8 pixels (U, then V) as
; 16-bit words. Output starts 256 words into diff and is contiguous: the U
; loop writes 4 x 32 bytes, and the V loop continues from where U stopped.
; Each loop iteration packs two 8-byte rows into one xmm register, subtracts
; bytewise, and widens the result with a 0x00/0xFF sign byte derived from an
; unsigned compare of pred against src.
; Both planes share src_stride and pred_stride. Stores use movdqa, so diff
; must be 16-byte aligned; the 8-byte row loads use movq.
;void vp8_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc,
 | 
			
		||||
;                         int src_stride, unsigned char *upred,
 | 
			
		||||
;                         unsigned char *vpred, int pred_stride)
 | 
			
		||||
global sym(vp8_subtract_mbuv_sse2) PRIVATE
 | 
			
		||||
sym(vp8_subtract_mbuv_sse2):
 | 
			
		||||
    push        rbp
 | 
			
		||||
    mov         rbp, rsp
 | 
			
		||||
    SHADOW_ARGS_TO_STACK 7
 | 
			
		||||
    GET_GOT     rbx
 | 
			
		||||
    push rsi
 | 
			
		||||
    push rdi
 | 
			
		||||
    ; end prolog
 | 
			
		||||
 | 
			
		||||
    movdqa      xmm4,       [GLOBAL(t80)] ; xmm4 = 16 x 0x80 (top-bit flip mask)
 | 
			
		||||
    mov         rdi,        arg(0)          ;diff
 | 
			
		||||
    mov         rsi,        arg(1)          ;usrc
 | 
			
		||||
    movsxd      rdx,        dword ptr arg(3);src_stride;
 | 
			
		||||
    mov         rax,        arg(4)          ;upred
 | 
			
		||||
    add         rdi,        256*2           ;diff = diff + 256 (shorts)
 | 
			
		||||
    mov         rcx,        4 ; 4 iterations, two rows each
 | 
			
		||||
    push        rbx ; rbx is reused below for pred_stride
 | 
			
		||||
    movsxd      rbx,        dword ptr arg(6);pred_stride
 | 
			
		||||
 | 
			
		||||
    ;u
 | 
			
		||||
.submbu_loop:
 | 
			
		||||
    movq        xmm0,       [rsi]           ; src
 | 
			
		||||
    movq        xmm2,       [rsi+rdx]       ; src -- next line
 | 
			
		||||
    movq        xmm1,       [rax]           ; pred
 | 
			
		||||
    movq        xmm3,       [rax+rbx]       ; pred -- next line
 | 
			
		||||
    lea         rsi,        [rsi + rdx*2] ; advance src by two rows
 | 
			
		||||
    lea         rax,        [rax + rbx*2] ; advance pred by two rows
 | 
			
		||||
 | 
			
		||||
    punpcklqdq  xmm0,       xmm2 ; xmm0 = two src rows (16 bytes)
 | 
			
		||||
    punpcklqdq  xmm1,       xmm3 ; xmm1 = two pred rows (16 bytes)
 | 
			
		||||
 | 
			
		||||
    movdqa      xmm2,       xmm0 ; keep a copy of src for the compare
 | 
			
		||||
    psubb       xmm0,       xmm1            ; subtraction with sign missed
 | 
			
		||||
 | 
			
		||||
    pxor        xmm1,       xmm4            ;convert to signed values
 | 
			
		||||
    pxor        xmm2,       xmm4
 | 
			
		||||
    pcmpgtb     xmm1,       xmm2            ; obtain sign information
 | 
			
		||||
 | 
			
		||||
    movdqa      xmm2,       xmm0 ; copy the difference for the high half
 | 
			
		||||
    movdqa      xmm3,       xmm1 ; NOTE(review): xmm1 is only read below, so this copy looks unnecessary
 | 
			
		||||
    punpcklbw   xmm0,       xmm1            ; put sign back to subtraction
 | 
			
		||||
    punpckhbw   xmm2,       xmm3            ; put sign back to subtraction
 | 
			
		||||
 | 
			
		||||
    movdqa      [rdi],      xmm0            ; store difference
 | 
			
		||||
    movdqa      [rdi +16],  xmm2            ; store difference
 | 
			
		||||
    add         rdi,        32 ; two rows of 8 words
 | 
			
		||||
    sub         rcx, 1
 | 
			
		||||
    jnz         .submbu_loop
 | 
			
		||||
 | 
			
		||||
    mov         rsi,        arg(2)          ;vsrc
 | 
			
		||||
    mov         rax,        arg(5)          ;vpred
 | 
			
		||||
    mov         rcx,        4 ; 4 iterations, two rows each
 | 
			
		||||
 | 
			
		||||
    ;v
 | 
			
		||||
.submbv_loop:
 | 
			
		||||
    movq        xmm0,       [rsi]           ; src
 | 
			
		||||
    movq        xmm2,       [rsi+rdx]       ; src -- next line
 | 
			
		||||
    movq        xmm1,       [rax]           ; pred
 | 
			
		||||
    movq        xmm3,       [rax+rbx]       ; pred -- next line
 | 
			
		||||
    lea         rsi,        [rsi + rdx*2] ; advance src by two rows
 | 
			
		||||
    lea         rax,        [rax + rbx*2] ; advance pred by two rows
 | 
			
		||||
 | 
			
		||||
    punpcklqdq  xmm0,       xmm2 ; xmm0 = two src rows (16 bytes)
 | 
			
		||||
    punpcklqdq  xmm1,       xmm3 ; xmm1 = two pred rows (16 bytes)
 | 
			
		||||
 | 
			
		||||
    movdqa      xmm2,       xmm0 ; keep a copy of src for the compare
 | 
			
		||||
    psubb       xmm0,       xmm1            ; subtraction with sign missed
 | 
			
		||||
 | 
			
		||||
    pxor        xmm1,       xmm4            ;convert to signed values
 | 
			
		||||
    pxor        xmm2,       xmm4
 | 
			
		||||
    pcmpgtb     xmm1,       xmm2            ; obtain sign information
 | 
			
		||||
 | 
			
		||||
    movdqa      xmm2,       xmm0 ; copy the difference for the high half
 | 
			
		||||
    movdqa      xmm3,       xmm1 ; NOTE(review): xmm1 is only read below, so this copy looks unnecessary
 | 
			
		||||
    punpcklbw   xmm0,       xmm1            ; put sign back to subtraction
 | 
			
		||||
    punpckhbw   xmm2,       xmm3            ; put sign back to subtraction
 | 
			
		||||
 | 
			
		||||
    movdqa      [rdi],      xmm0            ; store difference
 | 
			
		||||
    movdqa      [rdi +16],  xmm2            ; store difference
 | 
			
		||||
    add         rdi,        32 ; two rows of 8 words
 | 
			
		||||
    sub         rcx, 1
 | 
			
		||||
    jnz         .submbv_loop
 | 
			
		||||
 | 
			
		||||
    pop         rbx ; matches the push rbx before the U loop
 | 
			
		||||
    ; begin epilog
 | 
			
		||||
    pop rdi
 | 
			
		||||
    pop rsi
 | 
			
		||||
    RESTORE_GOT
 | 
			
		||||
    UNSHADOW_ARGS
 | 
			
		||||
    pop         rbp
 | 
			
		||||
    ret
 | 
			
		||||
 | 
			
		||||
; Read-only constant loaded into xmm4 by the routines above: 16 bytes of 0x80.
; XORing unsigned bytes with it flips the top bit, so the signed pcmpgtb
; compare orders them as unsigned values. Kept 16-byte aligned because it is
; loaded with movdqa.
SECTION_RODATA
 | 
			
		||||
align 16
 | 
			
		||||
t80:
 | 
			
		||||
    times 16 db 0x80
 | 
			
		||||
@@ -65,14 +65,3 @@ int vp8_mbuverror_mmx(MACROBLOCK *mb)
 | 
			
		||||
    return vp8_mbuverror_mmx_impl(s_ptr, d_ptr);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void vp8_subtract_b_mmx_impl(unsigned char *z,  int src_stride,
 | 
			
		||||
                             short *diff, unsigned char *predictor,
 | 
			
		||||
                             int pitch);
 | 
			
		||||
void vp8_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch)
 | 
			
		||||
{
 | 
			
		||||
    unsigned char *z = *(be->base_src) + be->src;
 | 
			
		||||
    unsigned int  src_stride = be->src_stride;
 | 
			
		||||
    short *diff = &be->src_diff[0];
 | 
			
		||||
    unsigned char *predictor = &bd->predictor[0];
 | 
			
		||||
    vp8_subtract_b_mmx_impl(z, src_stride, diff, predictor, pitch);
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
@@ -30,14 +30,3 @@ int vp8_mbuverror_xmm(MACROBLOCK *mb)
 | 
			
		||||
    return vp8_mbuverror_xmm_impl(s_ptr, d_ptr);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void vp8_subtract_b_sse2_impl(unsigned char *z,  int src_stride,
 | 
			
		||||
                             short *diff, unsigned char *predictor,
 | 
			
		||||
                             int pitch);
 | 
			
		||||
void vp8_subtract_b_sse2(BLOCK *be, BLOCKD *bd, int pitch)
 | 
			
		||||
{
 | 
			
		||||
    unsigned char *z = *(be->base_src) + be->src;
 | 
			
		||||
    unsigned int  src_stride = be->src_stride;
 | 
			
		||||
    short *diff = &be->src_diff[0];
 | 
			
		||||
    unsigned char *predictor = &bd->predictor[0];
 | 
			
		||||
    vp8_subtract_b_sse2_impl(z, src_stride, diff, predictor, pitch);
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
@@ -82,7 +82,6 @@ VP8_CX_SRCS_REMOVE-yes += encoder/temporal_filter.c
 | 
			
		||||
endif
 | 
			
		||||
 | 
			
		||||
VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/dct_mmx.asm
 | 
			
		||||
VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/subtract_mmx.asm
 | 
			
		||||
VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp8_enc_stubs_mmx.c
 | 
			
		||||
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
 | 
			
		||||
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
 | 
			
		||||
@@ -94,7 +93,6 @@ ifeq ($(CONFIG_TEMPORAL_DENOISING),yes)
 | 
			
		||||
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/denoising_sse2.c
 | 
			
		||||
endif
 | 
			
		||||
 | 
			
		||||
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/subtract_sse2.asm
 | 
			
		||||
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
 | 
			
		||||
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp8_enc_stubs_sse2.c
 | 
			
		||||
VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm
 | 
			
		||||
 
 | 
			
		||||
@@ -25,5 +25,4 @@ VP8_CX_SRCS-$(HAVE_MEDIA)  += encoder/arm/armv6/walsh_v6$(ASM)
 | 
			
		||||
VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/denoising_neon.c
 | 
			
		||||
VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/fastquantizeb_neon.c
 | 
			
		||||
VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/shortfdct_neon.c
 | 
			
		||||
VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/subtract_neon.c
 | 
			
		||||
VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/vp8_shortwalsh4x4_neon.c
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user