Merge "Implement sse2 and ssse3 versions for all sub_pixel_variance sizes."

Yaowu Xu 2013-06-20 17:42:50 -07:00 committed by Gerrit Code Review
commit e6cd5ed307
11 changed files with 1488 additions and 1860 deletions


@@ -26,12 +26,55 @@ extern "C" {
# include "vp9_rtcd.h"
#endif
}
#include "test/acm_random.h"
namespace {
using ::std::tr1::get;
using ::std::tr1::make_tuple;
using ::std::tr1::tuple;
using libvpx_test::ACMRandom;
static unsigned int variance_ref(const uint8_t *ref, const uint8_t *src,
int l2w, int l2h, unsigned int *sse_ptr) {
int se = 0;
unsigned int sse = 0;
const int w = 1 << l2w, h = 1 << l2h;
for (int y = 0; y < h; y++) {
for (int x = 0; x < w; x++) {
int diff = ref[w * y + x] - src[w * y + x];
se += diff;
sse += diff * diff;
}
}
*sse_ptr = sse;
return sse - (((int64_t) se * se) >> (l2w + l2h));
}
static unsigned int subpel_variance_ref(const uint8_t *ref, const uint8_t *src,
int l2w, int l2h, int xoff, int yoff,
unsigned int *sse_ptr) {
int se = 0;
unsigned int sse = 0;
const int w = 1 << l2w, h = 1 << l2h;
for (int y = 0; y < h; y++) {
for (int x = 0; x < w; x++) {
// bilinear interpolation at 1/16-pel resolution
const int a1 = ref[(w + 1) * (y + 0) + x + 0];
const int a2 = ref[(w + 1) * (y + 0) + x + 1];
const int b1 = ref[(w + 1) * (y + 1) + x + 0];
const int b2 = ref[(w + 1) * (y + 1) + x + 1];
const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
const int r = a + (((b - a) * yoff + 8) >> 4);
int diff = r - src[w * y + x];
se += diff;
sse += diff * diff;
}
}
*sse_ptr = sse;
return sse - (((int64_t) se * se) >> (l2w + l2h));
}
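Both reference helpers use the integer form of the identity Var = E[X^2] - E[X]^2, i.e. variance = sse - se*se / (w*h), with the division reduced to a shift because block dimensions are powers of two. A minimal standalone check of the arithmetic (hypothetical 2x2 block, so the shift count is l2w + l2h = 2):

#include <stdint.h>
#include <stdio.h>

int main(void) {
  /* Hypothetical per-pixel (ref - src) differences for a 2x2 block. */
  const int diffs[4] = { 3, -1, 2, 0 };
  int64_t se = 0;    /* signed sum of differences */
  uint32_t sse = 0;  /* sum of squared differences */
  for (int i = 0; i < 4; ++i) {
    se += diffs[i];
    sse += diffs[i] * diffs[i];
  }
  /* sse = 14, se = 4, so variance = 14 - (16 >> 2) = 10. */
  printf("var = %u\n", (unsigned)(sse - ((se * se) >> 2)));
  return 0;
}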
template<typename VarianceFunctionType>
class VarianceTest :
@@ -39,10 +82,13 @@ class VarianceTest :
public:
virtual void SetUp() {
const tuple<int, int, VarianceFunctionType>& params = this->GetParam();
width_ = get<0>(params);
height_ = get<1>(params);
log2width_ = get<0>(params);
width_ = 1 << log2width_;
log2height_ = get<1>(params);
height_ = 1 << log2height_;
variance_ = get<2>(params);
rnd(ACMRandom::DeterministicSeed());
block_size_ = width_ * height_;
src_ = new uint8_t[block_size_];
ref_ = new uint8_t[block_size_];
@@ -58,15 +104,16 @@ class VarianceTest :
protected:
void ZeroTest();
void RefTest();
void OneQuarterTest();
ACMRandom rnd;
uint8_t* src_;
uint8_t* ref_;
int width_;
int height_;
int width_, log2width_;
int height_, log2height_;
int block_size_;
VarianceFunctionType variance_;
};
template<typename VarianceFunctionType>
@@ -82,6 +129,22 @@ void VarianceTest<VarianceFunctionType>::ZeroTest() {
}
}
template<typename VarianceFunctionType>
void VarianceTest<VarianceFunctionType>::RefTest() {
for (int i = 0; i < 10; ++i) {
for (int j = 0; j < block_size_; j++) {
src_[j] = rnd.Rand8();
ref_[j] = rnd.Rand8();
}
unsigned int sse1, sse2;
const unsigned int var1 = variance_(src_, width_, ref_, width_, &sse1);
const unsigned int var2 = variance_ref(src_, ref_, log2width_,
log2height_, &sse2);
EXPECT_EQ(sse1, sse2);
EXPECT_EQ(var1, var2);
}
}
template<typename VarianceFunctionType>
void VarianceTest<VarianceFunctionType>::OneQuarterTest() {
memset(src_, 255, block_size_);
@@ -94,6 +157,66 @@ void VarianceTest<VarianceFunctionType>::OneQuarterTest() {
EXPECT_EQ(expected, var);
}
template<typename SubpelVarianceFunctionType>
class SubpelVarianceTest :
public ::testing::TestWithParam<tuple<int, int,
SubpelVarianceFunctionType> > {
public:
virtual void SetUp() {
const tuple<int, int, SubpelVarianceFunctionType>& params =
this->GetParam();
log2width_ = get<0>(params);
width_ = 1 << log2width_;
log2height_ = get<1>(params);
height_ = 1 << log2height_;
subpel_variance_ = get<2>(params);
rnd(ACMRandom::DeterministicSeed());
block_size_ = width_ * height_;
src_ = new uint8_t[block_size_];
ref_ = new uint8_t[block_size_ + width_ + height_ + 1];
ASSERT_TRUE(src_ != NULL);
ASSERT_TRUE(ref_ != NULL);
}
virtual void TearDown() {
delete[] src_;
delete[] ref_;
}
protected:
void RefTest();
ACMRandom rnd;
uint8_t* src_;
uint8_t* ref_;
int width_, log2width_;
int height_, log2height_;
int block_size_;
SubpelVarianceFunctionType subpel_variance_;
};
template<typename SubpelVarianceFunctionType>
void SubpelVarianceTest<SubpelVarianceFunctionType>::RefTest() {
for (int x = 0; x < 16; ++x) {
for (int y = 0; y < 16; ++y) {
for (int j = 0; j < block_size_; j++) {
src_[j] = rnd.Rand8();
}
for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
ref_[j] = rnd.Rand8();
}
unsigned int sse1, sse2;
const unsigned int var1 = subpel_variance_(ref_, width_ + 1, x, y,
src_, width_, &sse1);
const unsigned int var2 = subpel_variance_ref(ref_, src_, log2width_,
log2height_, x, y, &sse2);
EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y;
EXPECT_EQ(var1, var2) << "at position " << x << ", " << y;
}
}
}
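A quick bound explains the extra block_size_ + width_ + height_ + 1 bytes allocated for ref_ in SetUp: with stride w + 1, the deepest read in subpel_variance_ref is ref[(w + 1) * h + w] = ref[w * h + w + h] (y = h - 1, x = w - 1, plus the +1 bilinear taps), so w * h + w + h + 1 bytes exactly cover the bottom-right tap.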
// -----------------------------------------------------------------------------
// VP8 test cases.
@@ -103,6 +226,7 @@ namespace vp8 {
typedef VarianceTest<vp8_variance_fn_t> VP8VarianceTest;
TEST_P(VP8VarianceTest, Zero) { ZeroTest(); }
TEST_P(VP8VarianceTest, Ref) { RefTest(); }
TEST_P(VP8VarianceTest, OneQuarter) { OneQuarterTest(); }
const vp8_variance_fn_t variance4x4_c = vp8_variance4x4_c;
@@ -112,11 +236,11 @@ const vp8_variance_fn_t variance16x8_c = vp8_variance16x8_c;
const vp8_variance_fn_t variance16x16_c = vp8_variance16x16_c;
INSTANTIATE_TEST_CASE_P(
C, VP8VarianceTest,
::testing::Values(make_tuple(4, 4, variance4x4_c),
make_tuple(8, 8, variance8x8_c),
make_tuple(8, 16, variance8x16_c),
make_tuple(16, 8, variance16x8_c),
make_tuple(16, 16, variance16x16_c)));
::testing::Values(make_tuple(2, 2, variance4x4_c),
make_tuple(3, 3, variance8x8_c),
make_tuple(3, 4, variance8x16_c),
make_tuple(4, 3, variance16x8_c),
make_tuple(4, 4, variance16x16_c)));
#if HAVE_MMX
const vp8_variance_fn_t variance4x4_mmx = vp8_variance4x4_mmx;
@@ -126,11 +250,11 @@ const vp8_variance_fn_t variance16x8_mmx = vp8_variance16x8_mmx;
const vp8_variance_fn_t variance16x16_mmx = vp8_variance16x16_mmx;
INSTANTIATE_TEST_CASE_P(
MMX, VP8VarianceTest,
::testing::Values(make_tuple(4, 4, variance4x4_mmx),
make_tuple(8, 8, variance8x8_mmx),
make_tuple(8, 16, variance8x16_mmx),
make_tuple(16, 8, variance16x8_mmx),
make_tuple(16, 16, variance16x16_mmx)));
::testing::Values(make_tuple(2, 2, variance4x4_mmx),
make_tuple(3, 3, variance8x8_mmx),
make_tuple(3, 4, variance8x16_mmx),
make_tuple(4, 3, variance16x8_mmx),
make_tuple(4, 4, variance16x16_mmx)));
#endif
#if HAVE_SSE2
@@ -141,11 +265,11 @@ const vp8_variance_fn_t variance16x8_wmt = vp8_variance16x8_wmt;
const vp8_variance_fn_t variance16x16_wmt = vp8_variance16x16_wmt;
INSTANTIATE_TEST_CASE_P(
SSE2, VP8VarianceTest,
::testing::Values(make_tuple(4, 4, variance4x4_wmt),
make_tuple(8, 8, variance8x8_wmt),
make_tuple(8, 16, variance8x16_wmt),
make_tuple(16, 8, variance16x8_wmt),
make_tuple(16, 16, variance16x16_wmt)));
::testing::Values(make_tuple(2, 2, variance4x4_wmt),
make_tuple(3, 3, variance8x8_wmt),
make_tuple(3, 4, variance8x16_wmt),
make_tuple(4, 3, variance16x8_wmt),
make_tuple(4, 4, variance16x16_wmt)));
#endif
#endif // CONFIG_VP8_ENCODER
@@ -158,22 +282,83 @@ namespace vp9 {
#if CONFIG_VP9_ENCODER
typedef VarianceTest<vp9_variance_fn_t> VP9VarianceTest;
typedef SubpelVarianceTest<vp9_subpixvariance_fn_t> VP9SubpelVarianceTest;
TEST_P(VP9VarianceTest, Zero) { ZeroTest(); }
TEST_P(VP9VarianceTest, Ref) { RefTest(); }
TEST_P(VP9SubpelVarianceTest, Ref) { RefTest(); }
TEST_P(VP9VarianceTest, OneQuarter) { OneQuarterTest(); }
const vp9_variance_fn_t variance4x4_c = vp9_variance4x4_c;
const vp9_variance_fn_t variance4x8_c = vp9_variance4x8_c;
const vp9_variance_fn_t variance8x4_c = vp9_variance8x4_c;
const vp9_variance_fn_t variance8x8_c = vp9_variance8x8_c;
const vp9_variance_fn_t variance8x16_c = vp9_variance8x16_c;
const vp9_variance_fn_t variance16x8_c = vp9_variance16x8_c;
const vp9_variance_fn_t variance16x16_c = vp9_variance16x16_c;
const vp9_variance_fn_t variance16x32_c = vp9_variance16x32_c;
const vp9_variance_fn_t variance32x16_c = vp9_variance32x16_c;
const vp9_variance_fn_t variance32x32_c = vp9_variance32x32_c;
const vp9_variance_fn_t variance32x64_c = vp9_variance32x64_c;
const vp9_variance_fn_t variance64x32_c = vp9_variance64x32_c;
const vp9_variance_fn_t variance64x64_c = vp9_variance64x64_c;
INSTANTIATE_TEST_CASE_P(
C, VP9VarianceTest,
::testing::Values(make_tuple(4, 4, variance4x4_c),
make_tuple(8, 8, variance8x8_c),
make_tuple(8, 16, variance8x16_c),
make_tuple(16, 8, variance16x8_c),
make_tuple(16, 16, variance16x16_c)));
::testing::Values(make_tuple(2, 2, variance4x4_c),
make_tuple(2, 3, variance4x8_c),
make_tuple(3, 2, variance8x4_c),
make_tuple(3, 3, variance8x8_c),
make_tuple(3, 4, variance8x16_c),
make_tuple(4, 3, variance16x8_c),
make_tuple(4, 4, variance16x16_c),
make_tuple(4, 5, variance16x32_c),
make_tuple(5, 4, variance32x16_c),
make_tuple(5, 5, variance32x32_c),
make_tuple(5, 6, variance32x64_c),
make_tuple(6, 5, variance64x32_c),
make_tuple(6, 6, variance64x64_c)));
const vp9_subpixvariance_fn_t subpel_variance4x4_c =
vp9_sub_pixel_variance4x4_c;
const vp9_subpixvariance_fn_t subpel_variance4x8_c =
vp9_sub_pixel_variance4x8_c;
const vp9_subpixvariance_fn_t subpel_variance8x4_c =
vp9_sub_pixel_variance8x4_c;
const vp9_subpixvariance_fn_t subpel_variance8x8_c =
vp9_sub_pixel_variance8x8_c;
const vp9_subpixvariance_fn_t subpel_variance8x16_c =
vp9_sub_pixel_variance8x16_c;
const vp9_subpixvariance_fn_t subpel_variance16x8_c =
vp9_sub_pixel_variance16x8_c;
const vp9_subpixvariance_fn_t subpel_variance16x16_c =
vp9_sub_pixel_variance16x16_c;
const vp9_subpixvariance_fn_t subpel_variance16x32_c =
vp9_sub_pixel_variance16x32_c;
const vp9_subpixvariance_fn_t subpel_variance32x16_c =
vp9_sub_pixel_variance32x16_c;
const vp9_subpixvariance_fn_t subpel_variance32x32_c =
vp9_sub_pixel_variance32x32_c;
const vp9_subpixvariance_fn_t subpel_variance32x64_c =
vp9_sub_pixel_variance32x64_c;
const vp9_subpixvariance_fn_t subpel_variance64x32_c =
vp9_sub_pixel_variance64x32_c;
const vp9_subpixvariance_fn_t subpel_variance64x64_c =
vp9_sub_pixel_variance64x64_c;
INSTANTIATE_TEST_CASE_P(
C, VP9SubpelVarianceTest,
::testing::Values(make_tuple(2, 2, subpel_variance4x4_c),
make_tuple(2, 3, subpel_variance4x8_c),
make_tuple(3, 2, subpel_variance8x4_c),
make_tuple(3, 3, subpel_variance8x8_c),
make_tuple(3, 4, subpel_variance8x16_c),
make_tuple(4, 3, subpel_variance16x8_c),
make_tuple(4, 4, subpel_variance16x16_c),
make_tuple(4, 5, subpel_variance16x32_c),
make_tuple(5, 4, subpel_variance32x16_c),
make_tuple(5, 5, subpel_variance32x32_c),
make_tuple(5, 6, subpel_variance32x64_c),
make_tuple(6, 5, subpel_variance64x32_c),
make_tuple(6, 6, subpel_variance64x64_c)));
#if HAVE_MMX
const vp9_variance_fn_t variance4x4_mmx = vp9_variance4x4_mmx;
@@ -183,26 +368,128 @@ const vp9_variance_fn_t variance16x8_mmx = vp9_variance16x8_mmx;
const vp9_variance_fn_t variance16x16_mmx = vp9_variance16x16_mmx;
INSTANTIATE_TEST_CASE_P(
MMX, VP9VarianceTest,
::testing::Values(make_tuple(4, 4, variance4x4_mmx),
make_tuple(8, 8, variance8x8_mmx),
make_tuple(8, 16, variance8x16_mmx),
make_tuple(16, 8, variance16x8_mmx),
make_tuple(16, 16, variance16x16_mmx)));
::testing::Values(make_tuple(2, 2, variance4x4_mmx),
make_tuple(3, 3, variance8x8_mmx),
make_tuple(3, 4, variance8x16_mmx),
make_tuple(4, 3, variance16x8_mmx),
make_tuple(4, 4, variance16x16_mmx)));
#endif
#if HAVE_SSE2
const vp9_variance_fn_t variance4x4_wmt = vp9_variance4x4_sse2;
const vp9_variance_fn_t variance8x8_wmt = vp9_variance8x8_sse2;
const vp9_variance_fn_t variance8x16_wmt = vp9_variance8x16_sse2;
const vp9_variance_fn_t variance16x8_wmt = vp9_variance16x8_sse2;
const vp9_variance_fn_t variance16x16_wmt = vp9_variance16x16_sse2;
const vp9_variance_fn_t variance4x4_sse2 = vp9_variance4x4_sse2;
const vp9_variance_fn_t variance4x8_sse2 = vp9_variance4x8_sse2;
const vp9_variance_fn_t variance8x4_sse2 = vp9_variance8x4_sse2;
const vp9_variance_fn_t variance8x8_sse2 = vp9_variance8x8_sse2;
const vp9_variance_fn_t variance8x16_sse2 = vp9_variance8x16_sse2;
const vp9_variance_fn_t variance16x8_sse2 = vp9_variance16x8_sse2;
const vp9_variance_fn_t variance16x16_sse2 = vp9_variance16x16_sse2;
const vp9_variance_fn_t variance16x32_sse2 = vp9_variance16x32_sse2;
const vp9_variance_fn_t variance32x16_sse2 = vp9_variance32x16_sse2;
const vp9_variance_fn_t variance32x32_sse2 = vp9_variance32x32_sse2;
const vp9_variance_fn_t variance32x64_sse2 = vp9_variance32x64_sse2;
const vp9_variance_fn_t variance64x32_sse2 = vp9_variance64x32_sse2;
const vp9_variance_fn_t variance64x64_sse2 = vp9_variance64x64_sse2;
INSTANTIATE_TEST_CASE_P(
SSE2, VP9VarianceTest,
::testing::Values(make_tuple(4, 4, variance4x4_wmt),
make_tuple(8, 8, variance8x8_wmt),
make_tuple(8, 16, variance8x16_wmt),
make_tuple(16, 8, variance16x8_wmt),
make_tuple(16, 16, variance16x16_wmt)));
::testing::Values(make_tuple(2, 2, variance4x4_sse2),
make_tuple(2, 3, variance4x8_sse2),
make_tuple(3, 2, variance8x4_sse2),
make_tuple(3, 3, variance8x8_sse2),
make_tuple(3, 4, variance8x16_sse2),
make_tuple(4, 3, variance16x8_sse2),
make_tuple(4, 4, variance16x16_sse2),
make_tuple(4, 5, variance16x32_sse2),
make_tuple(5, 4, variance32x16_sse2),
make_tuple(5, 5, variance32x32_sse2),
make_tuple(5, 6, variance32x64_sse2),
make_tuple(6, 5, variance64x32_sse2),
make_tuple(6, 6, variance64x64_sse2)));
const vp9_subpixvariance_fn_t subpel_variance4x4_sse =
vp9_sub_pixel_variance4x4_sse;
const vp9_subpixvariance_fn_t subpel_variance4x8_sse =
vp9_sub_pixel_variance4x8_sse;
const vp9_subpixvariance_fn_t subpel_variance8x4_sse2 =
vp9_sub_pixel_variance8x4_sse2;
const vp9_subpixvariance_fn_t subpel_variance8x8_sse2 =
vp9_sub_pixel_variance8x8_sse2;
const vp9_subpixvariance_fn_t subpel_variance8x16_sse2 =
vp9_sub_pixel_variance8x16_sse2;
const vp9_subpixvariance_fn_t subpel_variance16x8_sse2 =
vp9_sub_pixel_variance16x8_sse2;
const vp9_subpixvariance_fn_t subpel_variance16x16_sse2 =
vp9_sub_pixel_variance16x16_sse2;
const vp9_subpixvariance_fn_t subpel_variance16x32_sse2 =
vp9_sub_pixel_variance16x32_sse2;
const vp9_subpixvariance_fn_t subpel_variance32x16_sse2 =
vp9_sub_pixel_variance32x16_sse2;
const vp9_subpixvariance_fn_t subpel_variance32x32_sse2 =
vp9_sub_pixel_variance32x32_sse2;
const vp9_subpixvariance_fn_t subpel_variance32x64_sse2 =
vp9_sub_pixel_variance32x64_sse2;
const vp9_subpixvariance_fn_t subpel_variance64x32_sse2 =
vp9_sub_pixel_variance64x32_sse2;
const vp9_subpixvariance_fn_t subpel_variance64x64_sse2 =
vp9_sub_pixel_variance64x64_sse2;
INSTANTIATE_TEST_CASE_P(
SSE2, VP9SubpelVarianceTest,
::testing::Values(make_tuple(2, 2, subpel_variance4x4_sse),
make_tuple(2, 3, subpel_variance4x8_sse),
make_tuple(3, 2, subpel_variance8x4_sse2),
make_tuple(3, 3, subpel_variance8x8_sse2),
make_tuple(3, 4, subpel_variance8x16_sse2),
make_tuple(4, 3, subpel_variance16x8_sse2),
make_tuple(4, 4, subpel_variance16x16_sse2),
make_tuple(4, 5, subpel_variance16x32_sse2),
make_tuple(5, 4, subpel_variance32x16_sse2),
make_tuple(5, 5, subpel_variance32x32_sse2),
make_tuple(5, 6, subpel_variance32x64_sse2),
make_tuple(6, 5, subpel_variance64x32_sse2),
make_tuple(6, 6, subpel_variance64x64_sse2)));
#endif
#if HAVE_SSSE3
const vp9_subpixvariance_fn_t subpel_variance4x4_ssse3 =
vp9_sub_pixel_variance4x4_ssse3;
const vp9_subpixvariance_fn_t subpel_variance4x8_ssse3 =
vp9_sub_pixel_variance4x8_ssse3;
const vp9_subpixvariance_fn_t subpel_variance8x4_ssse3 =
vp9_sub_pixel_variance8x4_ssse3;
const vp9_subpixvariance_fn_t subpel_variance8x8_ssse3 =
vp9_sub_pixel_variance8x8_ssse3;
const vp9_subpixvariance_fn_t subpel_variance8x16_ssse3 =
vp9_sub_pixel_variance8x16_ssse3;
const vp9_subpixvariance_fn_t subpel_variance16x8_ssse3 =
vp9_sub_pixel_variance16x8_ssse3;
const vp9_subpixvariance_fn_t subpel_variance16x16_ssse3 =
vp9_sub_pixel_variance16x16_ssse3;
const vp9_subpixvariance_fn_t subpel_variance16x32_ssse3 =
vp9_sub_pixel_variance16x32_ssse3;
const vp9_subpixvariance_fn_t subpel_variance32x16_ssse3 =
vp9_sub_pixel_variance32x16_ssse3;
const vp9_subpixvariance_fn_t subpel_variance32x32_ssse3 =
vp9_sub_pixel_variance32x32_ssse3;
const vp9_subpixvariance_fn_t subpel_variance32x64_ssse3 =
vp9_sub_pixel_variance32x64_ssse3;
const vp9_subpixvariance_fn_t subpel_variance64x32_ssse3 =
vp9_sub_pixel_variance64x32_ssse3;
const vp9_subpixvariance_fn_t subpel_variance64x64_ssse3 =
vp9_sub_pixel_variance64x64_ssse3;
INSTANTIATE_TEST_CASE_P(
SSSE3, VP9SubpelVarianceTest,
::testing::Values(make_tuple(2, 2, subpel_variance4x4_ssse3),
make_tuple(2, 3, subpel_variance4x8_ssse3),
make_tuple(3, 2, subpel_variance8x4_ssse3),
make_tuple(3, 3, subpel_variance8x8_ssse3),
make_tuple(3, 4, subpel_variance8x16_ssse3),
make_tuple(4, 3, subpel_variance16x8_ssse3),
make_tuple(4, 4, subpel_variance16x16_ssse3),
make_tuple(4, 5, subpel_variance16x32_ssse3),
make_tuple(5, 4, subpel_variance32x16_ssse3),
make_tuple(5, 5, subpel_variance32x32_ssse3),
make_tuple(5, 6, subpel_variance32x64_ssse3),
make_tuple(6, 5, subpel_variance64x32_ssse3),
make_tuple(6, 6, subpel_variance64x64_ssse3)));
#endif
#endif // CONFIG_VP9_ENCODER
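With the new instantiations in place, the expanded matrix can be run selectively through gtest's filter flag, e.g. (assuming the usual libvpx test binary name):

./test_libvpx --gtest_filter='*VP9SubpelVarianceTest*'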


@@ -266,85 +266,81 @@ prototype unsigned int vp9_variance4x4 "const uint8_t *src_ptr, int source_strid
specialize vp9_variance4x4 mmx sse2
prototype unsigned int vp9_sub_pixel_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance64x64 sse2
specialize vp9_sub_pixel_variance64x64 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance64x64
prototype unsigned int vp9_sub_pixel_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance32x64
specialize vp9_sub_pixel_variance32x64 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance32x64
prototype unsigned int vp9_sub_pixel_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance64x32
specialize vp9_sub_pixel_variance64x32 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance64x32
prototype unsigned int vp9_sub_pixel_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance32x16
specialize vp9_sub_pixel_variance32x16 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance32x16
prototype unsigned int vp9_sub_pixel_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance16x32
specialize vp9_sub_pixel_variance16x32 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance16x32
prototype unsigned int vp9_sub_pixel_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance32x32 sse2
specialize vp9_sub_pixel_variance32x32 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance32x32
prototype unsigned int vp9_sub_pixel_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance16x16 sse2 mmx ssse3
specialize vp9_sub_pixel_variance16x16 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance16x16
prototype unsigned int vp9_sub_pixel_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance8x16 sse2 mmx
vp9_sub_pixel_variance8x16_sse2=vp9_sub_pixel_variance8x16_wmt
specialize vp9_sub_pixel_variance8x16 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance8x16
prototype unsigned int vp9_sub_pixel_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance16x8 sse2 mmx ssse3
vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_ssse3;
vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_wmt
specialize vp9_sub_pixel_variance16x8 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance16x8
prototype unsigned int vp9_sub_pixel_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance8x8 sse2 mmx
vp9_sub_pixel_variance8x8_sse2=vp9_sub_pixel_variance8x8_wmt
specialize vp9_sub_pixel_variance8x8 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance8x8
# TODO(jingning): need to convert 8x4/4x8 functions into mmx/sse form
prototype unsigned int vp9_sub_pixel_variance8x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance8x4
specialize vp9_sub_pixel_variance8x4 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance8x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance8x4
prototype unsigned int vp9_sub_pixel_variance4x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance4x8
specialize vp9_sub_pixel_variance4x8 sse ssse3
prototype unsigned int vp9_sub_pixel_avg_variance4x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance4x8
prototype unsigned int vp9_sub_pixel_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance4x4 sse2 mmx
vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt
specialize vp9_sub_pixel_variance4x4 sse ssse3
#vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt
prototype unsigned int vp9_sub_pixel_avg_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance4x4
@@ -390,15 +386,15 @@ prototype unsigned int vp9_sad4x4 "const uint8_t *src_ptr, int source_stride, co
specialize vp9_sad4x4 mmx sse
prototype unsigned int vp9_variance_halfpixvar16x16_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_variance_halfpixvar16x16_h mmx sse2
specialize vp9_variance_halfpixvar16x16_h sse2
vp9_variance_halfpixvar16x16_h_sse2=vp9_variance_halfpixvar16x16_h_wmt
prototype unsigned int vp9_variance_halfpixvar16x16_v "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_variance_halfpixvar16x16_v mmx sse2
specialize vp9_variance_halfpixvar16x16_v sse2
vp9_variance_halfpixvar16x16_v_sse2=vp9_variance_halfpixvar16x16_v_wmt
prototype unsigned int vp9_variance_halfpixvar16x16_hv "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_variance_halfpixvar16x16_hv mmx sse2
specialize vp9_variance_halfpixvar16x16_hv sse2
vp9_variance_halfpixvar16x16_hv_sse2=vp9_variance_halfpixvar16x16_hv_wmt
prototype unsigned int vp9_variance_halfpixvar64x64_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
@@ -507,8 +503,8 @@ specialize vp9_sad4x8x4d sse
prototype void vp9_sad4x4x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"
specialize vp9_sad4x4x4d sse
prototype unsigned int vp9_sub_pixel_mse16x16 "const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse"
specialize vp9_sub_pixel_mse16x16 sse2 mmx
#prototype unsigned int vp9_sub_pixel_mse16x16 "const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse"
#specialize vp9_sub_pixel_mse16x16 sse2 mmx
prototype unsigned int vp9_mse16x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"
specialize vp9_mse16x16 mmx sse2
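For context, these prototype/specialize lines drive libvpx's run-time CPU dispatch (rtcd) generator. A hedged sketch of the dispatch derived from "specialize vp9_sub_pixel_variance64x64 sse2 ssse3" (the generated header differs in detail; the per-ISA prototypes are omitted here):

unsigned int (*vp9_sub_pixel_variance64x64)(const uint8_t *src_ptr,
                                            int source_stride,
                                            int xoffset, int yoffset,
                                            const uint8_t *ref_ptr,
                                            int ref_stride,
                                            unsigned int *sse);

static void setup_rtcd_internal(int flags) {
  /* Start from the portable C version, then upgrade to the best
     specialization the running CPU reports. */
  vp9_sub_pixel_variance64x64 = vp9_sub_pixel_variance64x64_c;
  if (flags & HAS_SSE2)
    vp9_sub_pixel_variance64x64 = vp9_sub_pixel_variance64x64_sse2;
  if (flags & HAS_SSSE3)
    vp9_sub_pixel_variance64x64 = vp9_sub_pixel_variance64x64_ssse3;
}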

File diff suppressed because it is too large.


@@ -8,292 +8,8 @@
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
%define xmm_filter_shift 7
;void vp9_filter_block2d_bil_var_sse2
;(
; unsigned char *ref_ptr,
; int ref_pixels_per_line,
; unsigned char *src_ptr,
; int src_pixels_per_line,
; unsigned int Height,
; int xoffset,
; int yoffset,
; int *sum,
; unsigned int *sumsquared;;
;
;)
global sym(vp9_filter_block2d_bil_var_sse2) PRIVATE
sym(vp9_filter_block2d_bil_var_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 9
SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
push rbx
; end prolog
pxor xmm6, xmm6 ;
pxor xmm7, xmm7 ;
lea rsi, [GLOBAL(xmm_bi_rd)] ; rounding
movdqa xmm4, XMMWORD PTR [rsi]
lea rcx, [GLOBAL(bilinear_filters_sse2)]
movsxd rax, dword ptr arg(5) ; xoffset
cmp rax, 0 ; skip first_pass filter if xoffset=0
je filter_block2d_bil_var_sse2_sp_only
shl rax, 5 ; point to filter coeff with xoffset
lea rax, [rax + rcx] ; HFilter
movsxd rdx, dword ptr arg(6) ; yoffset
cmp rdx, 0 ; skip second_pass filter if yoffset=0
je filter_block2d_bil_var_sse2_fp_only
shl rdx, 5
lea rdx, [rdx + rcx] ; VFilter
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
pxor xmm0, xmm0 ;
movq xmm1, QWORD PTR [rsi] ;
movq xmm3, QWORD PTR [rsi+1] ;
punpcklbw xmm1, xmm0 ;
pmullw xmm1, [rax] ;
punpcklbw xmm3, xmm0
pmullw xmm3, [rax+16] ;
paddw xmm1, xmm3 ;
paddw xmm1, xmm4 ;
psraw xmm1, xmm_filter_shift ;
movdqa xmm5, xmm1
movsxd rbx, dword ptr arg(1) ;ref_pixels_per_line
lea rsi, [rsi + rbx]
%if ABI_IS_32BIT=0
movsxd r9, dword ptr arg(3) ;src_pixels_per_line
%endif
filter_block2d_bil_var_sse2_loop:
movq xmm1, QWORD PTR [rsi] ;
movq xmm3, QWORD PTR [rsi+1] ;
punpcklbw xmm1, xmm0 ;
pmullw xmm1, [rax] ;
punpcklbw xmm3, xmm0 ;
pmullw xmm3, [rax+16] ;
paddw xmm1, xmm3 ;
paddw xmm1, xmm4 ;
psraw xmm1, xmm_filter_shift ;
movdqa xmm3, xmm5 ;
movdqa xmm5, xmm1 ;
pmullw xmm3, [rdx] ;
pmullw xmm1, [rdx+16] ;
paddw xmm1, xmm3 ;
paddw xmm1, xmm4 ;
psraw xmm1, xmm_filter_shift ;
movq xmm3, QWORD PTR [rdi] ;
punpcklbw xmm3, xmm0 ;
psubw xmm1, xmm3 ;
paddw xmm6, xmm1 ;
pmaddwd xmm1, xmm1 ;
paddd xmm7, xmm1 ;
lea rsi, [rsi + rbx] ;ref_pixels_per_line
%if ABI_IS_32BIT
add rdi, dword ptr arg(3) ;src_pixels_per_line
%else
lea rdi, [rdi + r9]
%endif
sub rcx, 1 ;
jnz filter_block2d_bil_var_sse2_loop ;
jmp filter_block2d_bil_variance
filter_block2d_bil_var_sse2_sp_only:
movsxd rdx, dword ptr arg(6) ; yoffset
cmp rdx, 0 ; skip all if both xoffset=0 and yoffset=0
je filter_block2d_bil_var_sse2_full_pixel
shl rdx, 5
lea rdx, [rdx + rcx] ; VFilter
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
pxor xmm0, xmm0 ;
movq xmm1, QWORD PTR [rsi] ;
punpcklbw xmm1, xmm0 ;
movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
lea rsi, [rsi + rax]
filter_block2d_bil_sp_only_loop:
movq xmm3, QWORD PTR [rsi] ;
punpcklbw xmm3, xmm0 ;
movdqa xmm5, xmm3
pmullw xmm1, [rdx] ;
pmullw xmm3, [rdx+16] ;
paddw xmm1, xmm3 ;
paddw xmm1, xmm4 ;
psraw xmm1, xmm_filter_shift ;
movq xmm3, QWORD PTR [rdi] ;
punpcklbw xmm3, xmm0 ;
psubw xmm1, xmm3 ;
paddw xmm6, xmm1 ;
pmaddwd xmm1, xmm1 ;
paddd xmm7, xmm1 ;
movdqa xmm1, xmm5 ;
lea rsi, [rsi + rax] ;ref_pixels_per_line
lea rdi, [rdi + rbx] ;src_pixels_per_line
sub rcx, 1 ;
jnz filter_block2d_bil_sp_only_loop ;
jmp filter_block2d_bil_variance
filter_block2d_bil_var_sse2_full_pixel:
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
pxor xmm0, xmm0 ;
filter_block2d_bil_full_pixel_loop:
movq xmm1, QWORD PTR [rsi] ;
punpcklbw xmm1, xmm0 ;
movq xmm2, QWORD PTR [rdi] ;
punpcklbw xmm2, xmm0 ;
psubw xmm1, xmm2 ;
paddw xmm6, xmm1 ;
pmaddwd xmm1, xmm1 ;
paddd xmm7, xmm1 ;
lea rsi, [rsi + rax] ;ref_pixels_per_line
lea rdi, [rdi + rbx] ;src_pixels_per_line
sub rcx, 1 ;
jnz filter_block2d_bil_full_pixel_loop ;
jmp filter_block2d_bil_variance
filter_block2d_bil_var_sse2_fp_only:
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line
pxor xmm0, xmm0 ;
movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
filter_block2d_bil_fp_only_loop:
movq xmm1, QWORD PTR [rsi] ;
movq xmm3, QWORD PTR [rsi+1] ;
punpcklbw xmm1, xmm0 ;
pmullw xmm1, [rax] ;
punpcklbw xmm3, xmm0 ;
pmullw xmm3, [rax+16] ;
paddw xmm1, xmm3 ;
paddw xmm1, xmm4 ;
psraw xmm1, xmm_filter_shift ;
movq xmm3, QWORD PTR [rdi] ;
punpcklbw xmm3, xmm0 ;
psubw xmm1, xmm3 ;
paddw xmm6, xmm1 ;
pmaddwd xmm1, xmm1 ;
paddd xmm7, xmm1 ;
lea rsi, [rsi + rdx]
lea rdi, [rdi + rbx] ;src_pixels_per_line
sub rcx, 1 ;
jnz filter_block2d_bil_fp_only_loop ;
jmp filter_block2d_bil_variance
filter_block2d_bil_variance:
movdq2q mm6, xmm6 ;
movdq2q mm7, xmm7 ;
psrldq xmm6, 8
psrldq xmm7, 8
movdq2q mm2, xmm6
movdq2q mm3, xmm7
paddw mm6, mm2
paddd mm7, mm3
pxor mm3, mm3 ;
pxor mm2, mm2 ;
punpcklwd mm2, mm6 ;
punpckhwd mm3, mm6 ;
paddd mm2, mm3 ;
movq mm6, mm2 ;
psrlq mm6, 32 ;
paddd mm2, mm6 ;
psrad mm2, 16 ;
movq mm4, mm7 ;
psrlq mm4, 32 ;
paddd mm4, mm7 ;
mov rsi, arg(7) ; sum
mov rdi, arg(8) ; sumsquared
movd [rsi], mm2 ; xsum
movd [rdi], mm4 ; xxsum
; begin epilog
pop rbx
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
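In scalar terms, the kernel above computes the following (a hedged C model, not the asm's literal structure: the asm filters 8 pixels per row in parallel, caches the previous filtered row, and branches to the sp_only/fp_only/full_pixel paths when an offset is zero):

static void bil_var_model(const unsigned char *ref, int ref_stride,
                          const unsigned char *src, int src_stride,
                          unsigned int height, int xoffset, int yoffset,
                          int *sum, unsigned int *sumsquared) {
  const int h0 = 128 - 8 * xoffset, h1 = 8 * xoffset;  /* HFilter taps */
  const int v0 = 128 - 8 * yoffset, v1 = 8 * yoffset;  /* VFilter taps */
  int se = 0;
  unsigned int sse = 0;
  for (unsigned int y = 0; y < height; ++y) {
    for (int x = 0; x < 8; ++x) {  /* 8-wide column, like the asm */
      /* first pass: horizontal bilinear filter on rows y and y + 1 */
      const int a = (ref[y * ref_stride + x] * h0 +
                     ref[y * ref_stride + x + 1] * h1 + 64) >> 7;
      const int b = (ref[(y + 1) * ref_stride + x] * h0 +
                     ref[(y + 1) * ref_stride + x + 1] * h1 + 64) >> 7;
      /* second pass: vertical filter, then accumulate the difference */
      const int diff = ((a * v0 + b * v1 + 64) >> 7) - src[y * src_stride + x];
      se += diff;
      sse += diff * diff;
    }
  }
  *sum = se;
  *sumsquared = sse;
}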
;void vp9_half_horiz_vert_variance16x_h_sse2
;(
; unsigned char *ref_ptr,
@@ -619,27 +335,3 @@ sym(vp9_half_horiz_variance16x_h_sse2):
UNSHADOW_ARGS
pop rbp
ret
SECTION_RODATA
; short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64};
align 16
xmm_bi_rd:
times 8 dw 64
align 16
bilinear_filters_sse2:
dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0
dw 120, 120, 120, 120, 120, 120, 120, 120, 8, 8, 8, 8, 8, 8, 8, 8
dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16
dw 104, 104, 104, 104, 104, 104, 104, 104, 24, 24, 24, 24, 24, 24, 24, 24
dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32
dw 88, 88, 88, 88, 88, 88, 88, 88, 40, 40, 40, 40, 40, 40, 40, 40
dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48
dw 72, 72, 72, 72, 72, 72, 72, 72, 56, 56, 56, 56, 56, 56, 56, 56
dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
dw 56, 56, 56, 56, 56, 56, 56, 56, 72, 72, 72, 72, 72, 72, 72, 72
dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80
dw 40, 40, 40, 40, 40, 40, 40, 40, 88, 88, 88, 88, 88, 88, 88, 88
dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96
dw 24, 24, 24, 24, 24, 24, 24, 24, 104, 104, 104, 104, 104, 104, 104, 104
dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112
dw 8, 8, 8, 8, 8, 8, 8, 8, 120, 120, 120, 120, 120, 120, 120, 120
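Each row k = 0..15 of the table holds the tap pair (128 - 8k, 8k), so two adjacent pixels a and b are blended as pred = (a * (128 - 8k) + b * 8k + 64) >> 7, i.e. bilinear interpolation at k/16-pel with round-to-nearest; the +64 is xmm_bi_rd and the shift is xmm_filter_shift = 7.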


@@ -508,344 +508,3 @@ sym(vp9_get4x4sse_cs_mmx):
UNSHADOW_ARGS
pop rbp
ret
%define mmx_filter_shift 7
;void vp9_filter_block2d_bil4x4_var_mmx
;(
; unsigned char *ref_ptr,
; int ref_pixels_per_line,
; unsigned char *src_ptr,
; int src_pixels_per_line,
; unsigned short *HFilter,
; unsigned short *VFilter,
; int *sum,
; unsigned int *sumsquared
;)
global sym(vp9_filter_block2d_bil4x4_var_mmx) PRIVATE
sym(vp9_filter_block2d_bil4x4_var_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 8
GET_GOT rbx
push rsi
push rdi
sub rsp, 16
; end prolog
pxor mm6, mm6 ;
pxor mm7, mm7 ;
mov rax, arg(4) ;HFilter ;
mov rdx, arg(5) ;VFilter ;
mov rsi, arg(0) ;ref_ptr ;
mov rdi, arg(2) ;src_ptr ;
mov rcx, 4 ;
pxor mm0, mm0 ;
movd mm1, [rsi] ;
movd mm3, [rsi+1] ;
punpcklbw mm1, mm0 ;
pmullw mm1, [rax] ;
punpcklbw mm3, mm0 ;
pmullw mm3, [rax+8] ;
paddw mm1, mm3 ;
paddw mm1, [GLOBAL(mmx_bi_rd)] ;
psraw mm1, mmx_filter_shift ;
movq mm5, mm1
%if ABI_IS_32BIT
add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
%else
movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
add rsi, r8
%endif
.filter_block2d_bil4x4_var_mmx_loop:
movd mm1, [rsi] ;
movd mm3, [rsi+1] ;
punpcklbw mm1, mm0 ;
pmullw mm1, [rax] ;
punpcklbw mm3, mm0 ;
pmullw mm3, [rax+8] ;
paddw mm1, mm3 ;
paddw mm1, [GLOBAL(mmx_bi_rd)] ;
psraw mm1, mmx_filter_shift ;
movq mm3, mm5 ;
movq mm5, mm1 ;
pmullw mm3, [rdx] ;
pmullw mm1, [rdx+8] ;
paddw mm1, mm3 ;
paddw mm1, [GLOBAL(mmx_bi_rd)] ;
psraw mm1, mmx_filter_shift ;
movd mm3, [rdi] ;
punpcklbw mm3, mm0 ;
psubw mm1, mm3 ;
paddw mm6, mm1 ;
pmaddwd mm1, mm1 ;
paddd mm7, mm1 ;
%if ABI_IS_32BIT
add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
add rdi, dword ptr arg(3) ;src_pixels_per_line ;
%else
movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
movsxd r9, dword ptr arg(3) ;src_pixels_per_line
add rsi, r8
add rdi, r9
%endif
sub rcx, 1 ;
jnz .filter_block2d_bil4x4_var_mmx_loop ;
pxor mm3, mm3 ;
pxor mm2, mm2 ;
punpcklwd mm2, mm6 ;
punpckhwd mm3, mm6 ;
paddd mm2, mm3 ;
movq mm6, mm2 ;
psrlq mm6, 32 ;
paddd mm2, mm6 ;
psrad mm2, 16 ;
movq mm4, mm7 ;
psrlq mm4, 32 ;
paddd mm4, mm7 ;
mov rdi, arg(6) ;sum
mov rsi, arg(7) ;sumsquared
movd dword ptr [rdi], mm2 ;
movd dword ptr [rsi], mm4 ;
; begin epilog
add rsp, 16
pop rdi
pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
;void vp9_filter_block2d_bil_var_mmx
;(
; unsigned char *ref_ptr,
; int ref_pixels_per_line,
; unsigned char *src_ptr,
; int src_pixels_per_line,
; unsigned int Height,
; unsigned short *HFilter,
; unsigned short *VFilter,
; int *sum,
; unsigned int *sumsquared
;)
global sym(vp9_filter_block2d_bil_var_mmx) PRIVATE
sym(vp9_filter_block2d_bil_var_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 9
GET_GOT rbx
push rsi
push rdi
sub rsp, 16
; end prolog
pxor mm6, mm6 ;
pxor mm7, mm7 ;
mov rax, arg(5) ;HFilter ;
mov rdx, arg(6) ;VFilter ;
mov rsi, arg(0) ;ref_ptr ;
mov rdi, arg(2) ;src_ptr ;
movsxd rcx, dword ptr arg(4) ;Height ;
pxor mm0, mm0 ;
movq mm1, [rsi] ;
movq mm3, [rsi+1] ;
movq mm2, mm1 ;
movq mm4, mm3 ;
punpcklbw mm1, mm0 ;
punpckhbw mm2, mm0 ;
pmullw mm1, [rax] ;
pmullw mm2, [rax] ;
punpcklbw mm3, mm0 ;
punpckhbw mm4, mm0 ;
pmullw mm3, [rax+8] ;
pmullw mm4, [rax+8] ;
paddw mm1, mm3 ;
paddw mm2, mm4 ;
paddw mm1, [GLOBAL(mmx_bi_rd)] ;
psraw mm1, mmx_filter_shift ;
paddw mm2, [GLOBAL(mmx_bi_rd)] ;
psraw mm2, mmx_filter_shift ;
movq mm5, mm1
packuswb mm5, mm2 ;
%if ABI_IS_32BIT
add rsi, dword ptr arg(1) ;ref_pixels_per_line
%else
movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
add rsi, r8
%endif
.filter_block2d_bil_var_mmx_loop:
movq mm1, [rsi] ;
movq mm3, [rsi+1] ;
movq mm2, mm1 ;
movq mm4, mm3 ;
punpcklbw mm1, mm0 ;
punpckhbw mm2, mm0 ;
pmullw mm1, [rax] ;
pmullw mm2, [rax] ;
punpcklbw mm3, mm0 ;
punpckhbw mm4, mm0 ;
pmullw mm3, [rax+8] ;
pmullw mm4, [rax+8] ;
paddw mm1, mm3 ;
paddw mm2, mm4 ;
paddw mm1, [GLOBAL(mmx_bi_rd)] ;
psraw mm1, mmx_filter_shift ;
paddw mm2, [GLOBAL(mmx_bi_rd)] ;
psraw mm2, mmx_filter_shift ;
movq mm3, mm5 ;
movq mm4, mm5 ;
punpcklbw mm3, mm0 ;
punpckhbw mm4, mm0 ;
movq mm5, mm1 ;
packuswb mm5, mm2 ;
pmullw mm3, [rdx] ;
pmullw mm4, [rdx] ;
pmullw mm1, [rdx+8] ;
pmullw mm2, [rdx+8] ;
paddw mm1, mm3 ;
paddw mm2, mm4 ;
paddw mm1, [GLOBAL(mmx_bi_rd)] ;
paddw mm2, [GLOBAL(mmx_bi_rd)] ;
psraw mm1, mmx_filter_shift ;
psraw mm2, mmx_filter_shift ;
movq mm3, [rdi] ;
movq mm4, mm3 ;
punpcklbw mm3, mm0 ;
punpckhbw mm4, mm0 ;
psubw mm1, mm3 ;
psubw mm2, mm4 ;
paddw mm6, mm1 ;
pmaddwd mm1, mm1 ;
paddw mm6, mm2 ;
pmaddwd mm2, mm2 ;
paddd mm7, mm1 ;
paddd mm7, mm2 ;
%if ABI_IS_32BIT
add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
add rdi, dword ptr arg(3) ;src_pixels_per_line ;
%else
movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
movsxd r9, dword ptr arg(3) ;src_pixels_per_line ;
add rsi, r8
add rdi, r9
%endif
sub rcx, 1 ;
jnz .filter_block2d_bil_var_mmx_loop ;
pxor mm3, mm3 ;
pxor mm2, mm2 ;
punpcklwd mm2, mm6 ;
punpckhwd mm3, mm6 ;
paddd mm2, mm3 ;
movq mm6, mm2 ;
psrlq mm6, 32 ;
paddd mm2, mm6 ;
psrad mm2, 16 ;
movq mm4, mm7 ;
psrlq mm4, 32 ;
paddd mm4, mm7 ;
mov rdi, arg(7) ;sum
mov rsi, arg(8) ;sumsquared
movd dword ptr [rdi], mm2 ;
movd dword ptr [rsi], mm4 ;
; begin epilog
add rsp, 16
pop rdi
pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
SECTION_RODATA
;short mmx_bi_rd[4] = { 64, 64, 64, 64};
align 16
mmx_bi_rd:
times 4 dw 64


@@ -11,8 +11,6 @@
%include "vpx_ports/x86_abi_support.asm"
%define xmm_filter_shift 7
;unsigned int vp9_get_mb_ss_sse2
;(
; short *src_ptr
@@ -734,28 +732,3 @@ sym(vp9_half_horiz_variance8x_h_sse2):
UNSHADOW_ARGS
pop rbp
ret
SECTION_RODATA
; short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64};
align 16
xmm_bi_rd:
times 8 dw 64
align 16
bilinear_filters_sse2:
dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0
dw 120, 120, 120, 120, 120, 120, 120, 120, 8, 8, 8, 8, 8, 8, 8, 8
dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16
dw 104, 104, 104, 104, 104, 104, 104, 104, 24, 24, 24, 24, 24, 24, 24, 24
dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32
dw 88, 88, 88, 88, 88, 88, 88, 88, 40, 40, 40, 40, 40, 40, 40, 40
dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48
dw 72, 72, 72, 72, 72, 72, 72, 72, 56, 56, 56, 56, 56, 56, 56, 56
dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
dw 56, 56, 56, 56, 56, 56, 56, 56, 72, 72, 72, 72, 72, 72, 72, 72
dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80
dw 40, 40, 40, 40, 40, 40, 40, 40, 88, 88, 88, 88, 88, 88, 88, 88
dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96
dw 24, 24, 24, 24, 24, 24, 24, 24, 104, 104, 104, 104, 104, 104, 104, 104
dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112
dw 8, 8, 8, 8, 8, 8, 8, 8, 120, 120, 120, 120, 120, 120, 120, 120


@@ -1,372 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
%define xmm_filter_shift 7
;void vp9_filter_block2d_bil_var_ssse3
;(
; unsigned char *ref_ptr,
; int ref_pixels_per_line,
; unsigned char *src_ptr,
; int src_pixels_per_line,
; unsigned int Height,
; int xoffset,
; int yoffset,
; int *sum,
; unsigned int *sumsquared;;
;
;)
;Note: The filter coefficient at offset=0 is 128. Since the second operand
;of pmaddubsw holds signed bytes, the zero offset must be calculated separately.
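In scalar terms, the reason is the following (a sketch of the instruction's per-pair behavior, not the full kernel):

/* pmaddubsw multiplies unsigned bytes from its first operand by signed
   bytes from its second and adds adjacent products into signed words.
   A tap of 128 exceeds the int8 range (db 128 reads back as -128), which
   is why the offset=0 row cannot go through this path. */
static short pmaddubsw_pair(unsigned char a0, unsigned char a1,
                            signed char b0, signed char b1) {
  return (short)(a0 * b0 + a1 * b1);  /* hardware additionally saturates */
}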
global sym(vp9_filter_block2d_bil_var_ssse3) PRIVATE
sym(vp9_filter_block2d_bil_var_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 9
SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
; end prolog
pxor xmm6, xmm6
pxor xmm7, xmm7
lea rcx, [GLOBAL(bilinear_filters_ssse3)]
movsxd rax, dword ptr arg(5) ; xoffset
cmp rax, 0 ; skip first_pass filter if xoffset=0
je .filter_block2d_bil_var_ssse3_sp_only
shl rax, 4 ; point to filter coeff with xoffset
lea rax, [rax + rcx] ; HFilter
movsxd rdx, dword ptr arg(6) ; yoffset
cmp rdx, 0 ; skip second_pass filter if yoffset=0
je .filter_block2d_bil_var_ssse3_fp_only
shl rdx, 4
lea rdx, [rdx + rcx] ; VFilter
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
movdqu xmm0, XMMWORD PTR [rsi]
movdqu xmm1, XMMWORD PTR [rsi+1]
movdqa xmm2, xmm0
punpcklbw xmm0, xmm1
punpckhbw xmm2, xmm1
pmaddubsw xmm0, [rax]
pmaddubsw xmm2, [rax]
paddw xmm0, [GLOBAL(xmm_bi_rd)]
paddw xmm2, [GLOBAL(xmm_bi_rd)]
psraw xmm0, xmm_filter_shift
psraw xmm2, xmm_filter_shift
packuswb xmm0, xmm2
%if ABI_IS_32BIT
add rsi, dword ptr arg(1) ;ref_pixels_per_line
%else
movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
movsxd r9, dword ptr arg(3) ;src_pixels_per_line
lea rsi, [rsi + r8]
%endif
.filter_block2d_bil_var_ssse3_loop:
movdqu xmm1, XMMWORD PTR [rsi]
movdqu xmm2, XMMWORD PTR [rsi+1]
movdqa xmm3, xmm1
punpcklbw xmm1, xmm2
punpckhbw xmm3, xmm2
pmaddubsw xmm1, [rax]
pmaddubsw xmm3, [rax]
paddw xmm1, [GLOBAL(xmm_bi_rd)]
paddw xmm3, [GLOBAL(xmm_bi_rd)]
psraw xmm1, xmm_filter_shift
psraw xmm3, xmm_filter_shift
packuswb xmm1, xmm3
movdqa xmm2, xmm0
movdqa xmm0, xmm1
movdqa xmm3, xmm2
punpcklbw xmm2, xmm1
punpckhbw xmm3, xmm1
pmaddubsw xmm2, [rdx]
pmaddubsw xmm3, [rdx]
paddw xmm2, [GLOBAL(xmm_bi_rd)]
paddw xmm3, [GLOBAL(xmm_bi_rd)]
psraw xmm2, xmm_filter_shift
psraw xmm3, xmm_filter_shift
movq xmm1, QWORD PTR [rdi]
pxor xmm4, xmm4
punpcklbw xmm1, xmm4
movq xmm5, QWORD PTR [rdi+8]
punpcklbw xmm5, xmm4
psubw xmm2, xmm1
psubw xmm3, xmm5
paddw xmm6, xmm2
paddw xmm6, xmm3
pmaddwd xmm2, xmm2
pmaddwd xmm3, xmm3
paddd xmm7, xmm2
paddd xmm7, xmm3
%if ABI_IS_32BIT
add rsi, dword ptr arg(1) ;ref_pixels_per_line
add rdi, dword ptr arg(3) ;src_pixels_per_line
%else
lea rsi, [rsi + r8]
lea rdi, [rdi + r9]
%endif
sub rcx, 1
jnz .filter_block2d_bil_var_ssse3_loop
jmp .filter_block2d_bil_variance
.filter_block2d_bil_var_ssse3_sp_only:
movsxd rdx, dword ptr arg(6) ; yoffset
cmp rdx, 0 ; Both xoffset =0 and yoffset=0
je .filter_block2d_bil_var_ssse3_full_pixel
shl rdx, 4
lea rdx, [rdx + rcx] ; VFilter
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
movdqu xmm1, XMMWORD PTR [rsi]
movdqa xmm0, xmm1
%if ABI_IS_32BIT=0
movsxd r9, dword ptr arg(3) ;src_pixels_per_line
%endif
lea rsi, [rsi + rax]
.filter_block2d_bil_sp_only_loop:
movdqu xmm3, XMMWORD PTR [rsi]
movdqa xmm2, xmm1
movdqa xmm0, xmm3
punpcklbw xmm1, xmm3
punpckhbw xmm2, xmm3
pmaddubsw xmm1, [rdx]
pmaddubsw xmm2, [rdx]
paddw xmm1, [GLOBAL(xmm_bi_rd)]
paddw xmm2, [GLOBAL(xmm_bi_rd)]
psraw xmm1, xmm_filter_shift
psraw xmm2, xmm_filter_shift
movq xmm3, QWORD PTR [rdi]
pxor xmm4, xmm4
punpcklbw xmm3, xmm4
movq xmm5, QWORD PTR [rdi+8]
punpcklbw xmm5, xmm4
psubw xmm1, xmm3
psubw xmm2, xmm5
paddw xmm6, xmm1
paddw xmm6, xmm2
pmaddwd xmm1, xmm1
pmaddwd xmm2, xmm2
paddd xmm7, xmm1
paddd xmm7, xmm2
movdqa xmm1, xmm0
lea rsi, [rsi + rax] ;ref_pixels_per_line
%if ABI_IS_32BIT
add rdi, dword ptr arg(3) ;src_pixels_per_line
%else
lea rdi, [rdi + r9]
%endif
sub rcx, 1
jnz .filter_block2d_bil_sp_only_loop
jmp .filter_block2d_bil_variance
.filter_block2d_bil_var_ssse3_full_pixel:
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
pxor xmm0, xmm0
.filter_block2d_bil_full_pixel_loop:
movq xmm1, QWORD PTR [rsi]
punpcklbw xmm1, xmm0
movq xmm2, QWORD PTR [rsi+8]
punpcklbw xmm2, xmm0
movq xmm3, QWORD PTR [rdi]
punpcklbw xmm3, xmm0
movq xmm4, QWORD PTR [rdi+8]
punpcklbw xmm4, xmm0
psubw xmm1, xmm3
psubw xmm2, xmm4
paddw xmm6, xmm1
paddw xmm6, xmm2
pmaddwd xmm1, xmm1
pmaddwd xmm2, xmm2
paddd xmm7, xmm1
paddd xmm7, xmm2
lea rsi, [rsi + rax] ;ref_pixels_per_line
lea rdi, [rdi + rdx] ;src_pixels_per_line
sub rcx, 1
jnz .filter_block2d_bil_full_pixel_loop
jmp .filter_block2d_bil_variance
.filter_block2d_bil_var_ssse3_fp_only:
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line
pxor xmm0, xmm0
%if ABI_IS_32BIT=0
movsxd r9, dword ptr arg(3) ;src_pixels_per_line
%endif
.filter_block2d_bil_fp_only_loop:
movdqu xmm1, XMMWORD PTR [rsi]
movdqu xmm2, XMMWORD PTR [rsi+1]
movdqa xmm3, xmm1
punpcklbw xmm1, xmm2
punpckhbw xmm3, xmm2
pmaddubsw xmm1, [rax]
pmaddubsw xmm3, [rax]
paddw xmm1, [GLOBAL(xmm_bi_rd)]
paddw xmm3, [GLOBAL(xmm_bi_rd)]
psraw xmm1, xmm_filter_shift
psraw xmm3, xmm_filter_shift
movq xmm2, XMMWORD PTR [rdi]
pxor xmm4, xmm4
punpcklbw xmm2, xmm4
movq xmm5, QWORD PTR [rdi+8]
punpcklbw xmm5, xmm4
psubw xmm1, xmm2
psubw xmm3, xmm5
paddw xmm6, xmm1
paddw xmm6, xmm3
pmaddwd xmm1, xmm1
pmaddwd xmm3, xmm3
paddd xmm7, xmm1
paddd xmm7, xmm3
lea rsi, [rsi + rdx]
%if ABI_IS_32BIT
add rdi, dword ptr arg(3) ;src_pixels_per_line
%else
lea rdi, [rdi + r9]
%endif
sub rcx, 1
jnz .filter_block2d_bil_fp_only_loop
jmp .filter_block2d_bil_variance
.filter_block2d_bil_variance:
pxor xmm0, xmm0
pxor xmm1, xmm1
pxor xmm5, xmm5
punpcklwd xmm0, xmm6
punpckhwd xmm1, xmm6
psrad xmm0, 16
psrad xmm1, 16
paddd xmm0, xmm1
movdqa xmm1, xmm0
movdqa xmm6, xmm7
punpckldq xmm6, xmm5
punpckhdq xmm7, xmm5
paddd xmm6, xmm7
punpckldq xmm0, xmm5
punpckhdq xmm1, xmm5
paddd xmm0, xmm1
movdqa xmm7, xmm6
movdqa xmm1, xmm0
psrldq xmm7, 8
psrldq xmm1, 8
paddd xmm6, xmm7
paddd xmm0, xmm1
mov rsi, arg(7) ;[Sum]
mov rdi, arg(8) ;[SSE]
movd [rsi], xmm0
movd [rdi], xmm6
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
SECTION_RODATA
align 16
xmm_bi_rd:
times 8 dw 64
align 16
bilinear_filters_ssse3:
times 8 db 128, 0
times 8 db 120, 8
times 8 db 112, 16
times 8 db 104, 24
times 8 db 96, 32
times 8 db 88, 40
times 8 db 80, 48
times 8 db 72, 56
times 8 db 64, 64
times 8 db 56, 72
times 8 db 48, 80
times 8 db 40, 88
times 8 db 32, 96
times 8 db 24, 104
times 8 db 16, 112
times 8 db 8, 120


@@ -13,27 +13,6 @@
#include "vp9/common/vp9_pragmas.h"
#include "vpx_ports/mem.h"
extern void filter_block1d_h6_mmx
(
const unsigned char *src_ptr,
unsigned short *output_ptr,
unsigned int src_pixels_per_line,
unsigned int pixel_step,
unsigned int output_height,
unsigned int output_width,
short *vp7_filter
);
extern void filter_block1d_v6_mmx
(
const short *src_ptr,
unsigned char *output_ptr,
unsigned int pixels_per_line,
unsigned int pixel_step,
unsigned int output_height,
unsigned int output_width,
short *vp7_filter
);
extern unsigned int vp9_get_mb_ss_mmx(const short *src_ptr);
extern unsigned int vp9_get8x8var_mmx
(
@@ -53,30 +32,6 @@ extern unsigned int vp9_get4x4var_mmx
unsigned int *SSE,
int *Sum
);
extern void vp9_filter_block2d_bil4x4_var_mmx
(
const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
const short *HFilter,
const short *VFilter,
int *sum,
unsigned int *sumsquared
);
extern void vp9_filter_block2d_bil_var_mmx
(
const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
const short *HFilter,
const short *VFilter,
int *sum,
unsigned int *sumsquared
);
unsigned int vp9_variance4x4_mmx(
const unsigned char *src_ptr,
@@ -190,193 +145,3 @@ unsigned int vp9_variance8x16_mmx(
return (var - (((unsigned int)avg * avg) >> 7));
}
DECLARE_ALIGNED(16, extern const short, vp9_bilinear_filters_mmx[16][8]);
unsigned int vp9_sub_pixel_variance4x4_mmx
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse)
{
int xsum;
unsigned int xxsum;
vp9_filter_block2d_bil4x4_var_mmx(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line,
vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
&xsum, &xxsum
);
*sse = xxsum;
return (xxsum - (((unsigned int)xsum * xsum) >> 4));
}
unsigned int vp9_sub_pixel_variance8x8_mmx
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
) {
int xsum;
unsigned int xxsum;
vp9_filter_block2d_bil_var_mmx(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
&xsum, &xxsum
);
*sse = xxsum;
return (xxsum - (((unsigned int)xsum * xsum) >> 6));
}
unsigned int vp9_sub_pixel_variance16x16_mmx
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
) {
int xsum0, xsum1;
unsigned int xxsum0, xxsum1;
vp9_filter_block2d_bil_var_mmx(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
&xsum0, &xxsum0
);
vp9_filter_block2d_bil_var_mmx(
src_ptr + 8, src_pixels_per_line,
dst_ptr + 8, dst_pixels_per_line, 16,
vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
&xsum1, &xxsum1
);
xsum0 += xsum1;
xxsum0 += xxsum1;
*sse = xxsum0;
return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
}
unsigned int vp9_sub_pixel_mse16x16_mmx(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
) {
vp9_sub_pixel_variance16x16_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
return *sse;
}
unsigned int vp9_sub_pixel_variance16x8_mmx
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
) {
int xsum0, xsum1;
unsigned int xxsum0, xxsum1;
vp9_filter_block2d_bil_var_mmx(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
&xsum0, &xxsum0
);
vp9_filter_block2d_bil_var_mmx(
src_ptr + 8, src_pixels_per_line,
dst_ptr + 8, dst_pixels_per_line, 8,
vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
&xsum1, &xxsum1
);
xsum0 += xsum1;
xxsum0 += xxsum1;
*sse = xxsum0;
return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7));
}
unsigned int vp9_sub_pixel_variance8x16_mmx
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
) {
int xsum;
unsigned int xxsum;
vp9_filter_block2d_bil_var_mmx(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
&xsum, &xxsum
);
*sse = xxsum;
return (xxsum - (((unsigned int)xsum * xsum) >> 7));
}
unsigned int vp9_variance_halfpixvar16x16_h_mmx(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse) {
return vp9_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 8, 0,
ref_ptr, recon_stride, sse);
}
unsigned int vp9_variance_halfpixvar16x16_v_mmx(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse) {
return vp9_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 0, 8,
ref_ptr, recon_stride, sse);
}
unsigned int vp9_variance_halfpixvar16x16_hv_mmx(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse) {
return vp9_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 8, 8,
ref_ptr, recon_stride, sse);
}


@@ -9,29 +9,11 @@
*/
#include "vpx_config.h"
#include "vp9/encoder/vp9_variance.h"
#include "vp9/common/vp9_pragmas.h"
#include "vpx_ports/mem.h"
#define HALFNDX 8
extern void filter_block1d_h6_mmx(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
extern void filter_block1d_v6_mmx(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
extern void filter_block1d8_v6_sse2(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
extern void vp9_filter_block2d_bil4x4_var_mmx
(
const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
const short *HFilter,
const short *VFilter,
int *sum,
unsigned int *sumsquared
);
extern unsigned int vp9_get4x4var_mmx
(
const unsigned char *src_ptr,
@@ -64,18 +46,6 @@ unsigned int vp9_get8x8var_sse2
unsigned int *SSE,
int *Sum
);
void vp9_filter_block2d_bil_var_sse2
(
const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int xoffset,
int yoffset,
int *sum,
unsigned int *sumsquared
);
void vp9_half_horiz_vert_variance8x_h_sse2
(
const unsigned char *ref_ptr,
@@ -137,8 +107,6 @@ void vp9_half_vert_variance16x_h_sse2
unsigned int *sumsquared
);
DECLARE_ALIGNED(16, extern const short, vp9_bilinear_filters_mmx[16][8]);
typedef unsigned int (*get_var_sse2) (
const unsigned char *src_ptr,
int source_stride,
@@ -375,347 +343,89 @@ unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr,
return (var - (((int64_t)avg * avg) >> 11));
}
unsigned int vp9_sub_pixel_variance4x4_wmt
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
) {
int xsum;
unsigned int xxsum;
vp9_filter_block2d_bil4x4_var_mmx(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line,
vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
&xsum, &xxsum
);
*sse = xxsum;
return (xxsum - (((unsigned int)xsum * xsum) >> 4));
#define DECLS(opt1, opt2) \
int vp9_sub_pixel_variance4xh_##opt2(const uint8_t *src, \
ptrdiff_t src_stride, \
int x_offset, int y_offset, \
const uint8_t *dst, \
ptrdiff_t dst_stride, \
int height, unsigned int *sse); \
int vp9_sub_pixel_variance8xh_##opt1(const uint8_t *src, \
ptrdiff_t src_stride, \
int x_offset, int y_offset, \
const uint8_t *dst, \
ptrdiff_t dst_stride, \
int height, unsigned int *sse); \
int vp9_sub_pixel_variance16xh_##opt1(const uint8_t *src, \
ptrdiff_t src_stride, \
int x_offset, int y_offset, \
const uint8_t *dst, \
ptrdiff_t dst_stride, \
int height, unsigned int *sse)
DECLS(sse2, sse);
DECLS(ssse3, ssse3);
#undef DECLS
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
unsigned int vp9_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src, \
int src_stride, \
int x_offset, \
int y_offset, \
const uint8_t *dst, \
int dst_stride, \
unsigned int *sse_ptr) { \
unsigned int sse; \
int se = vp9_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \
y_offset, dst, dst_stride, \
h, &sse); \
if (w > wf) { \
unsigned int sse2; \
int se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 16, src_stride, \
x_offset, y_offset, \
dst + 16, dst_stride, \
h, &sse2); \
se += se2; \
sse += sse2; \
if (w > wf * 2) { \
se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
x_offset, y_offset, \
dst + 32, dst_stride, \
h, &sse2); \
se += se2; \
sse += sse2; \
se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
x_offset, y_offset, \
dst + 48, dst_stride, \
h, &sse2); \
se += se2; \
sse += sse2; \
} \
} \
*sse_ptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}
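The return line is the usual variance identity; because the block holds w * h = 2^(wlog2 + hlog2) pixels, the division by the pixel count reduces to a right shift:

\[
\mathrm{Var} = \mathrm{SSE} - \frac{\left(\sum_i d_i\right)^2}{w\,h}
             = \mathrm{SSE} - \left( se^2 \gg (\mathit{wlog2} + \mathit{hlog2}) \right)
\]

The cast parameter guards the se * se product: from 32x16 upward the square can exceed 32 bits (64x64 allows |se| up to 255 * 4096), hence the (int64_t) casts in FNS below; for 16x16 the square still fits in unsigned int, and for smaller blocks plain int suffices.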
#define FNS(opt1, opt2) \
FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
FN(16, 8, 16, 4, 3, opt1,); \
FN(8, 16, 8, 3, 4, opt1,); \
FN(8, 8, 8, 3, 3, opt1,); \
FN(8, 4, 8, 3, 2, opt1,); \
FN(4, 8, 4, 2, 3, opt2,); \
FN(4, 4, 4, 2, 2, opt2,)
unsigned int vp9_sub_pixel_variance8x8_wmt
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
) {
int xsum;
unsigned int xxsum;
if (xoffset == HALFNDX && yoffset == 0) {
vp9_half_horiz_variance8x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum, &xxsum);
} else if (xoffset == 0 && yoffset == HALFNDX) {
vp9_half_vert_variance8x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum, &xxsum);
} else if (xoffset == HALFNDX && yoffset == HALFNDX) {
vp9_half_horiz_vert_variance8x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum, &xxsum);
} else {
vp9_filter_block2d_bil_var_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
xoffset, yoffset,
&xsum, &xxsum);
}
*sse = xxsum;
return (xxsum - (((unsigned int)xsum * xsum) >> 6));
}
static void sub_pixel_variance16x16_sse2(const uint8_t *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const uint8_t *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse, int *avg) {
int xsum0, xsum1;
unsigned int xxsum0, xxsum1;
// Note: these if statements could be avoided if the calling function
// invoked the appropriate variant directly.
if (xoffset == HALFNDX && yoffset == 0) {
vp9_half_horiz_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0);
} else if (xoffset == 0 && yoffset == HALFNDX) {
vp9_half_vert_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0);
} else if (xoffset == HALFNDX && yoffset == HALFNDX) {
vp9_half_horiz_vert_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0);
} else {
vp9_filter_block2d_bil_var_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
xoffset, yoffset,
&xsum0, &xxsum0
);
vp9_filter_block2d_bil_var_sse2(
src_ptr + 8, src_pixels_per_line,
dst_ptr + 8, dst_pixels_per_line, 16,
xoffset, yoffset,
&xsum1, &xxsum1
);
xsum0 += xsum1;
xxsum0 += xxsum1;
}
*sse = xxsum0;
*avg = xsum0;
}
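The comment above suggests hoisting the half-pel dispatch out of the hot path; a hypothetical sketch of that idea (the typedef and pick_half_variant16 are invented for illustration, the three half-pel helpers are the ones declared in this file):

typedef void (*half_var_fn)(const unsigned char *ref_ptr,
                            int ref_pixels_per_line,
                            const unsigned char *src_ptr,
                            int src_pixels_per_line,
                            unsigned int height,
                            int *sum, unsigned int *sumsquared);

static half_var_fn pick_half_variant16(int xoffset, int yoffset) {
  if (xoffset == HALFNDX && yoffset == 0)
    return vp9_half_horiz_variance16x_h_sse2;
  if (xoffset == 0 && yoffset == HALFNDX)
    return vp9_half_vert_variance16x_h_sse2;
  if (xoffset == HALFNDX && yoffset == HALFNDX)
    return vp9_half_horiz_vert_variance16x_h_sse2;
  return 0;  // caller falls back to vp9_filter_block2d_bil_var_sse2
}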
unsigned int vp9_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const uint8_t *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse_ptr) {
int avg;
unsigned int sse;
sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset,
yoffset, dst_ptr, dst_pixels_per_line,
&sse, &avg);
*sse_ptr = sse;
return (sse - (((unsigned int) avg * avg) >> 8));
}
unsigned int vp9_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const uint8_t *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse_ptr) {
int avg0, avg1, avg2, avg3;
unsigned int sse0, sse1, sse2, sse3;
sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset,
yoffset, dst_ptr, dst_pixels_per_line,
&sse0, &avg0);
sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset,
yoffset, dst_ptr + 16, dst_pixels_per_line,
&sse1, &avg1);
src_ptr += 16 * src_pixels_per_line;
dst_ptr += 16 * dst_pixels_per_line;
sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset,
yoffset, dst_ptr, dst_pixels_per_line,
&sse2, &avg2);
sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset,
yoffset, dst_ptr + 16, dst_pixels_per_line,
&sse3, &avg3);
sse0 += sse1 + sse2 + sse3;
avg0 += avg1 + avg2 + avg3;
*sse_ptr = sse0;
return (sse0 - (((unsigned int) avg0 * avg0) >> 10));
}
unsigned int vp9_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const uint8_t *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse_ptr) {
int avg0, avg1, avg2, avg3, avg4;
unsigned int sse0, sse1, sse2, sse3, sse4;
sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset,
yoffset, dst_ptr, dst_pixels_per_line,
&sse0, &avg0);
sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset,
yoffset, dst_ptr + 16, dst_pixels_per_line,
&sse1, &avg1);
sub_pixel_variance16x16_sse2(src_ptr + 32, src_pixels_per_line, xoffset,
yoffset, dst_ptr + 32, dst_pixels_per_line,
&sse2, &avg2);
sub_pixel_variance16x16_sse2(src_ptr + 48, src_pixels_per_line, xoffset,
yoffset, dst_ptr + 48, dst_pixels_per_line,
&sse3, &avg3);
src_ptr += 16 * src_pixels_per_line;
dst_ptr += 16 * dst_pixels_per_line;
avg0 += avg1 + avg2 + avg3;
sse0 += sse1 + sse2 + sse3;
sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset,
yoffset, dst_ptr, dst_pixels_per_line,
&sse1, &avg1);
sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset,
yoffset, dst_ptr + 16, dst_pixels_per_line,
&sse2, &avg2);
sub_pixel_variance16x16_sse2(src_ptr + 32, src_pixels_per_line, xoffset,
yoffset, dst_ptr + 32, dst_pixels_per_line,
&sse3, &avg3);
sub_pixel_variance16x16_sse2(src_ptr + 48, src_pixels_per_line, xoffset,
yoffset, dst_ptr + 48, dst_pixels_per_line,
&sse4, &avg4);
src_ptr += 16 * src_pixels_per_line;
dst_ptr += 16 * dst_pixels_per_line;
avg0 += avg1 + avg2 + avg3 + avg4;
sse0 += sse1 + sse2 + sse3 + sse4;
sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset,
yoffset, dst_ptr, dst_pixels_per_line,
&sse1, &avg1);
sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset,
yoffset, dst_ptr + 16, dst_pixels_per_line,
&sse2, &avg2);
sub_pixel_variance16x16_sse2(src_ptr + 32, src_pixels_per_line, xoffset,
yoffset, dst_ptr + 32, dst_pixels_per_line,
&sse3, &avg3);
sub_pixel_variance16x16_sse2(src_ptr + 48, src_pixels_per_line, xoffset,
yoffset, dst_ptr + 48, dst_pixels_per_line,
&sse4, &avg4);
src_ptr += 16 * src_pixels_per_line;
dst_ptr += 16 * dst_pixels_per_line;
avg0 += avg1 + avg2 + avg3 + avg4;
sse0 += sse1 + sse2 + sse3 + sse4;
sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset,
yoffset, dst_ptr, dst_pixels_per_line,
&sse1, &avg1);
sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset,
yoffset, dst_ptr + 16, dst_pixels_per_line,
&sse2, &avg2);
sub_pixel_variance16x16_sse2(src_ptr + 32, src_pixels_per_line, xoffset,
yoffset, dst_ptr + 32, dst_pixels_per_line,
&sse3, &avg3);
sub_pixel_variance16x16_sse2(src_ptr + 48, src_pixels_per_line, xoffset,
yoffset, dst_ptr + 48, dst_pixels_per_line,
&sse4, &avg4);
avg0 += avg1 + avg2 + avg3 + avg4;
sse0 += sse1 + sse2 + sse3 + sse4;
*sse_ptr = sse0;
return (sse0 - (((unsigned int) avg0 * avg0) >> 12));
}
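The 64x64 path above is a 4x4 tiling of the 16x16 helper; an equivalent compact form (a sketch with the same accumulation as the unrolled code) makes the pattern explicit. Note that (unsigned int)avg * avg can exceed 32 bits when |avg| approaches its 255 * 4096 bound, the overflow that the (int64_t) casts in the new FN macro avoid.

// Sketch: same accumulation as the unrolled 64x64 code above.
unsigned int sse = 0;
int avg = 0;
for (int row = 0; row < 4; ++row) {
  for (int col = 0; col < 4; ++col) {
    unsigned int s;
    int a;
    sub_pixel_variance16x16_sse2(
        src_ptr + 16 * col + 16 * row * src_pixels_per_line,
        src_pixels_per_line, xoffset, yoffset,
        dst_ptr + 16 * col + 16 * row * dst_pixels_per_line,
        dst_pixels_per_line, &s, &a);
    sse += s;
    avg += a;
  }
}
*sse_ptr = sse;
return sse - (((unsigned int)avg * avg) >> 12);  // 12 = log2(64 * 64)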
unsigned int vp9_sub_pixel_mse16x16_sse2(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
) {
vp9_sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset,
yoffset, dst_ptr, dst_pixels_per_line, sse);
return *sse;
}
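Note that the sub-pixel MSE is simply the filtered SSE with no mean correction, which is why the wrapper returns *sse unchanged:

\[
\mathrm{mse}_{16\times 16} = \sum_i d_i^2 = \mathrm{SSE}
\]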
unsigned int vp9_sub_pixel_variance16x8_wmt
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
) {
int xsum0, xsum1;
unsigned int xxsum0, xxsum1;
if (xoffset == HALFNDX && yoffset == 0) {
vp9_half_horiz_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum0, &xxsum0);
} else if (xoffset == 0 && yoffset == HALFNDX) {
vp9_half_vert_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum0, &xxsum0);
} else if (xoffset == HALFNDX && yoffset == HALFNDX) {
vp9_half_horiz_vert_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum0, &xxsum0);
} else {
vp9_filter_block2d_bil_var_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
xoffset, yoffset,
&xsum0, &xxsum0);
vp9_filter_block2d_bil_var_sse2(
src_ptr + 8, src_pixels_per_line,
dst_ptr + 8, dst_pixels_per_line, 8,
xoffset, yoffset,
&xsum1, &xxsum1);
xsum0 += xsum1;
xxsum0 += xxsum1;
}
*sse = xxsum0;
return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7));
}
unsigned int vp9_sub_pixel_variance8x16_wmt
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
) {
int xsum;
unsigned int xxsum;
if (xoffset == HALFNDX && yoffset == 0) {
vp9_half_horiz_variance8x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum, &xxsum);
} else if (xoffset == 0 && yoffset == HALFNDX) {
vp9_half_vert_variance8x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum, &xxsum);
} else if (xoffset == HALFNDX && yoffset == HALFNDX) {
vp9_half_horiz_vert_variance8x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum, &xxsum);
} else {
vp9_filter_block2d_bil_var_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
xoffset, yoffset,
&xsum, &xxsum);
}
*sse = xxsum;
return (xxsum - (((unsigned int)xsum * xsum) >> 7));
}
FNS(sse2, sse);
FNS(ssse3, ssse3);
#undef FNS
#undef FN
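After instantiation, each generated entry point has the common sub-pixel variance signature; a minimal usage sketch (src/dst buffers and strides assumed to be set up elsewhere; per the reference implementation, the offsets are 1/16-pel positions in [0, 15]):

unsigned int sse;
// Hypothetical call site: 4/16-pel horizontal, 7/16-pel vertical offset.
const unsigned int var =
    vp9_sub_pixel_variance16x16_sse2(src, src_stride, 4, 7,
                                     dst, dst_stride, &sse);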
unsigned int vp9_variance_halfpixvar16x16_h_wmt(
const unsigned char *src_ptr,


@ -1,142 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vpx_config.h"
#include "vp9/encoder/vp9_variance.h"
#include "vp9/common/vp9_pragmas.h"
#include "vpx_ports/mem.h"
#define HALFNDX 8
extern void vp9_half_horiz_vert_variance16x_h_sse2
(
const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int *sum,
unsigned int *sumsquared
);
extern void vp9_half_horiz_variance16x_h_sse2
(
const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int *sum,
unsigned int *sumsquared
);
extern void vp9_half_vert_variance16x_h_sse2
(
const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int *sum,
unsigned int *sumsquared
);
extern void vp9_filter_block2d_bil_var_ssse3
(
const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int xoffset,
int yoffset,
int *sum,
unsigned int *sumsquared
);
unsigned int vp9_sub_pixel_variance16x16_ssse3
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
) {
int xsum0;
unsigned int xxsum0;
// Note: these if statements could be avoided if the calling function
// invoked the appropriate variant directly.
if (xoffset == HALFNDX && yoffset == 0) {
vp9_half_horiz_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0);
} else if (xoffset == 0 && yoffset == HALFNDX) {
vp9_half_vert_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0);
} else if (xoffset == HALFNDX && yoffset == HALFNDX) {
vp9_half_horiz_vert_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0);
} else {
vp9_filter_block2d_bil_var_ssse3(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
xoffset, yoffset,
&xsum0, &xxsum0);
}
*sse = xxsum0;
return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
}
unsigned int vp9_sub_pixel_variance16x8_ssse3
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
) {
int xsum0;
unsigned int xxsum0;
if (xoffset == HALFNDX && yoffset == 0) {
vp9_half_horiz_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum0, &xxsum0);
} else if (xoffset == 0 && yoffset == HALFNDX) {
vp9_half_vert_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum0, &xxsum0);
} else if (xoffset == HALFNDX && yoffset == HALFNDX) {
vp9_half_horiz_vert_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum0, &xxsum0);
} else {
vp9_filter_block2d_bil_var_ssse3(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
xoffset, yoffset,
&xsum0, &xxsum0);
}
*sse = xxsum0;
return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7));
}


@ -85,13 +85,12 @@ VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_impl_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_fwalsh_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance_impl_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/vp9_sad_sse3.asm
VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_sad_ssse3.asm
VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_variance_ssse3.c
VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_variance_impl_ssse3.asm
VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/vp9_sad_sse4.asm
VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_encodeopt.asm
VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/vp9_ssim_opt.asm