Merge "Implement sse2 and ssse3 versions for all sub_pixel_variance sizes."
This commit is contained in:
commit
e6cd5ed307
@ -26,12 +26,55 @@ extern "C" {
|
||||
# include "vp9_rtcd.h"
|
||||
#endif
|
||||
}
|
||||
#include "test/acm_random.h"
|
||||
|
||||
namespace {
|
||||
|
||||
using ::std::tr1::get;
|
||||
using ::std::tr1::make_tuple;
|
||||
using ::std::tr1::tuple;
|
||||
using libvpx_test::ACMRandom;
|
||||
|
||||
// Reference (plain C++) variance of a (1 << l2w) x (1 << l2h) block.
//
// Both buffers are read as tightly packed rows, i.e. with a stride equal to
// the block width.
//
//   ref, src  pixel buffers holding at least (1 << l2w) * (1 << l2h) bytes.
//   l2w, l2h  log2 of the block width and height.
//   sse_ptr   receives the sum of squared differences.
//
// Returns sse - (sum of differences)^2 / (w * h), where the division is
// carried out as a right shift by (l2w + l2h).
static unsigned int variance_ref(const uint8_t *ref, const uint8_t *src,
                                 int l2w, int l2h, unsigned int *sse_ptr) {
  int se = 0;
  unsigned int sse = 0;
  const int w = 1 << l2w;
  const int h = 1 << l2h;
  for (int y = 0; y < h; y++) {
    for (int x = 0; x < w; x++) {
      const int diff = ref[w * y + x] - src[w * y + x];
      se += diff;
      sse += diff * diff;
    }
  }
  *sse_ptr = sse;
  // Widen before squaring: se * se no longer fits in 32 bits once the block
  // is large (e.g. 64x64 with differences of up to 255 per pixel).
  return sse - ((static_cast<int64_t>(se) * se) >> (l2w + l2h));
}
|
||||
|
||||
// Reference (plain C++) sub-pixel variance of a (1 << l2w) x (1 << l2h)
// block.
//
// The prediction is built from `ref` by bilinear interpolation and then
// compared against `src`.
//
//   ref         read with a stride of (w + 1); pixel (x, y) uses the 2x2
//               neighbourhood starting at ref[(w + 1) * y + x], so the buffer
//               must hold (w + 1) * (h + 1) bytes.
//   src         read as tightly packed rows (stride w).
//   l2w, l2h    log2 of the block width and height.
//   xoff, yoff  horizontal / vertical interpolation weights in 1/16 units.
//   sse_ptr     receives the sum of squared differences.
//
// Returns sse - (sum of differences)^2 / (w * h), where the division is
// carried out as a right shift by (l2w + l2h).
//
// NOTE(review): the interpolation right-shifts values that can be negative
// (e.g. a2 < a1). That is implementation-defined before C++20; the arithmetic
// below presumably relies on an arithmetic (sign-preserving) shift -- confirm
// for any new toolchain.
static unsigned int subpel_variance_ref(const uint8_t *ref, const uint8_t *src,
                                        int l2w, int l2h, int xoff, int yoff,
                                        unsigned int *sse_ptr) {
  int se = 0;
  unsigned int sse = 0;
  const int w = 1 << l2w;
  const int h = 1 << l2h;
  const int ref_stride = w + 1;
  for (int y = 0; y < h; y++) {
    for (int x = 0; x < w; x++) {
      // bilinear interpolation at a 16th pel step
      const int a1 = ref[ref_stride * (y + 0) + x + 0];
      const int a2 = ref[ref_stride * (y + 0) + x + 1];
      const int b1 = ref[ref_stride * (y + 1) + x + 0];
      const int b2 = ref[ref_stride * (y + 1) + x + 1];
      // Horizontal pass on the upper (a) and lower (b) rows, with rounding.
      const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
      const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
      // Vertical pass between the two horizontally filtered values.
      const int r = a + (((b - a) * yoff + 8) >> 4);
      const int diff = r - src[w * y + x];
      se += diff;
      sse += diff * diff;
    }
  }
  *sse_ptr = sse;
  // Widen before squaring: se * se does not fit in 32 bits for large blocks.
  return sse - ((static_cast<int64_t>(se) * se) >> (l2w + l2h));
}
|
||||
|
||||
template<typename VarianceFunctionType>
|
||||
class VarianceTest :
|
||||
@ -39,10 +82,13 @@ class VarianceTest :
|
||||
public:
|
||||
virtual void SetUp() {
|
||||
const tuple<int, int, VarianceFunctionType>& params = this->GetParam();
|
||||
width_ = get<0>(params);
|
||||
height_ = get<1>(params);
|
||||
log2width_ = get<0>(params);
|
||||
width_ = 1 << log2width_;
|
||||
log2height_ = get<1>(params);
|
||||
height_ = 1 << log2height_;
|
||||
variance_ = get<2>(params);
|
||||
|
||||
rnd(ACMRandom::DeterministicSeed());
|
||||
block_size_ = width_ * height_;
|
||||
src_ = new uint8_t[block_size_];
|
||||
ref_ = new uint8_t[block_size_];
|
||||
@ -58,15 +104,16 @@ class VarianceTest :
|
||||
|
||||
protected:
|
||||
void ZeroTest();
|
||||
void RefTest();
|
||||
void OneQuarterTest();
|
||||
|
||||
ACMRandom rnd;
|
||||
uint8_t* src_;
|
||||
uint8_t* ref_;
|
||||
int width_;
|
||||
int height_;
|
||||
int width_, log2width_;
|
||||
int height_, log2height_;
|
||||
int block_size_;
|
||||
VarianceFunctionType variance_;
|
||||
|
||||
};
|
||||
|
||||
template<typename VarianceFunctionType>
|
||||
@ -82,6 +129,22 @@ void VarianceTest<VarianceFunctionType>::ZeroTest() {
|
||||
}
|
||||
}
|
||||
|
||||
template<typename VarianceFunctionType>
|
||||
void VarianceTest<VarianceFunctionType>::RefTest() {
|
||||
for (int i = 0; i < 10; ++i) {
|
||||
for (int j = 0; j < block_size_; j++) {
|
||||
src_[j] = rnd.Rand8();
|
||||
ref_[j] = rnd.Rand8();
|
||||
}
|
||||
unsigned int sse1, sse2;
|
||||
const unsigned int var1 = variance_(src_, width_, ref_, width_, &sse1);
|
||||
const unsigned int var2 = variance_ref(src_, ref_, log2width_,
|
||||
log2height_, &sse2);
|
||||
EXPECT_EQ(sse1, sse2);
|
||||
EXPECT_EQ(var1, var2);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename VarianceFunctionType>
|
||||
void VarianceTest<VarianceFunctionType>::OneQuarterTest() {
|
||||
memset(src_, 255, block_size_);
|
||||
@ -94,6 +157,66 @@ void VarianceTest<VarianceFunctionType>::OneQuarterTest() {
|
||||
EXPECT_EQ(expected, var);
|
||||
}
|
||||
|
||||
template<typename SubpelVarianceFunctionType>
|
||||
class SubpelVarianceTest :
|
||||
public ::testing::TestWithParam<tuple<int, int,
|
||||
SubpelVarianceFunctionType> > {
|
||||
public:
|
||||
virtual void SetUp() {
|
||||
const tuple<int, int, SubpelVarianceFunctionType>& params =
|
||||
this->GetParam();
|
||||
log2width_ = get<0>(params);
|
||||
width_ = 1 << log2width_;
|
||||
log2height_ = get<1>(params);
|
||||
height_ = 1 << log2height_;
|
||||
subpel_variance_ = get<2>(params);
|
||||
|
||||
rnd(ACMRandom::DeterministicSeed());
|
||||
block_size_ = width_ * height_;
|
||||
src_ = new uint8_t[block_size_];
|
||||
ref_ = new uint8_t[block_size_ + width_ + height_ + 1];
|
||||
ASSERT_TRUE(src_ != NULL);
|
||||
ASSERT_TRUE(ref_ != NULL);
|
||||
}
|
||||
|
||||
virtual void TearDown() {
|
||||
delete[] src_;
|
||||
delete[] ref_;
|
||||
}
|
||||
|
||||
protected:
|
||||
void RefTest();
|
||||
|
||||
ACMRandom rnd;
|
||||
uint8_t* src_;
|
||||
uint8_t* ref_;
|
||||
int width_, log2width_;
|
||||
int height_, log2height_;
|
||||
int block_size_;
|
||||
SubpelVarianceFunctionType subpel_variance_;
|
||||
};
|
||||
|
||||
template<typename SubpelVarianceFunctionType>
|
||||
void SubpelVarianceTest<SubpelVarianceFunctionType>::RefTest() {
|
||||
for (int x = 0; x < 16; ++x) {
|
||||
for (int y = 0; y < 16; ++y) {
|
||||
for (int j = 0; j < block_size_; j++) {
|
||||
src_[j] = rnd.Rand8();
|
||||
}
|
||||
for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
|
||||
ref_[j] = rnd.Rand8();
|
||||
}
|
||||
unsigned int sse1, sse2;
|
||||
const unsigned int var1 = subpel_variance_(ref_, width_ + 1, x, y,
|
||||
src_, width_, &sse1);
|
||||
const unsigned int var2 = subpel_variance_ref(ref_, src_, log2width_,
|
||||
log2height_, x, y, &sse2);
|
||||
EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y;
|
||||
EXPECT_EQ(var1, var2) << "at position " << x << ", " << y;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
// VP8 test cases.
|
||||
|
||||
@ -103,6 +226,7 @@ namespace vp8 {
|
||||
typedef VarianceTest<vp8_variance_fn_t> VP8VarianceTest;
|
||||
|
||||
TEST_P(VP8VarianceTest, Zero) { ZeroTest(); }
|
||||
TEST_P(VP8VarianceTest, Ref) { RefTest(); }
|
||||
TEST_P(VP8VarianceTest, OneQuarter) { OneQuarterTest(); }
|
||||
|
||||
const vp8_variance_fn_t variance4x4_c = vp8_variance4x4_c;
|
||||
@ -112,11 +236,11 @@ const vp8_variance_fn_t variance16x8_c = vp8_variance16x8_c;
|
||||
const vp8_variance_fn_t variance16x16_c = vp8_variance16x16_c;
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
C, VP8VarianceTest,
|
||||
::testing::Values(make_tuple(4, 4, variance4x4_c),
|
||||
make_tuple(8, 8, variance8x8_c),
|
||||
make_tuple(8, 16, variance8x16_c),
|
||||
make_tuple(16, 8, variance16x8_c),
|
||||
make_tuple(16, 16, variance16x16_c)));
|
||||
::testing::Values(make_tuple(2, 2, variance4x4_c),
|
||||
make_tuple(3, 3, variance8x8_c),
|
||||
make_tuple(3, 4, variance8x16_c),
|
||||
make_tuple(4, 3, variance16x8_c),
|
||||
make_tuple(4, 4, variance16x16_c)));
|
||||
|
||||
#if HAVE_MMX
|
||||
const vp8_variance_fn_t variance4x4_mmx = vp8_variance4x4_mmx;
|
||||
@ -126,11 +250,11 @@ const vp8_variance_fn_t variance16x8_mmx = vp8_variance16x8_mmx;
|
||||
const vp8_variance_fn_t variance16x16_mmx = vp8_variance16x16_mmx;
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
MMX, VP8VarianceTest,
|
||||
::testing::Values(make_tuple(4, 4, variance4x4_mmx),
|
||||
make_tuple(8, 8, variance8x8_mmx),
|
||||
make_tuple(8, 16, variance8x16_mmx),
|
||||
make_tuple(16, 8, variance16x8_mmx),
|
||||
make_tuple(16, 16, variance16x16_mmx)));
|
||||
::testing::Values(make_tuple(2, 2, variance4x4_mmx),
|
||||
make_tuple(3, 3, variance8x8_mmx),
|
||||
make_tuple(3, 4, variance8x16_mmx),
|
||||
make_tuple(4, 3, variance16x8_mmx),
|
||||
make_tuple(4, 4, variance16x16_mmx)));
|
||||
#endif
|
||||
|
||||
#if HAVE_SSE2
|
||||
@ -141,11 +265,11 @@ const vp8_variance_fn_t variance16x8_wmt = vp8_variance16x8_wmt;
|
||||
const vp8_variance_fn_t variance16x16_wmt = vp8_variance16x16_wmt;
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
SSE2, VP8VarianceTest,
|
||||
::testing::Values(make_tuple(4, 4, variance4x4_wmt),
|
||||
make_tuple(8, 8, variance8x8_wmt),
|
||||
make_tuple(8, 16, variance8x16_wmt),
|
||||
make_tuple(16, 8, variance16x8_wmt),
|
||||
make_tuple(16, 16, variance16x16_wmt)));
|
||||
::testing::Values(make_tuple(2, 2, variance4x4_wmt),
|
||||
make_tuple(3, 3, variance8x8_wmt),
|
||||
make_tuple(3, 4, variance8x16_wmt),
|
||||
make_tuple(4, 3, variance16x8_wmt),
|
||||
make_tuple(4, 4, variance16x16_wmt)));
|
||||
#endif
|
||||
#endif // CONFIG_VP8_ENCODER
|
||||
|
||||
@ -158,22 +282,83 @@ namespace vp9 {
|
||||
|
||||
#if CONFIG_VP9_ENCODER
|
||||
typedef VarianceTest<vp9_variance_fn_t> VP9VarianceTest;
|
||||
typedef SubpelVarianceTest<vp9_subpixvariance_fn_t> VP9SubpelVarianceTest;
|
||||
|
||||
TEST_P(VP9VarianceTest, Zero) { ZeroTest(); }
|
||||
TEST_P(VP9VarianceTest, Ref) { RefTest(); }
|
||||
TEST_P(VP9SubpelVarianceTest, Ref) { RefTest(); }
|
||||
TEST_P(VP9VarianceTest, OneQuarter) { OneQuarterTest(); }
|
||||
|
||||
const vp9_variance_fn_t variance4x4_c = vp9_variance4x4_c;
|
||||
const vp9_variance_fn_t variance4x8_c = vp9_variance4x8_c;
|
||||
const vp9_variance_fn_t variance8x4_c = vp9_variance8x4_c;
|
||||
const vp9_variance_fn_t variance8x8_c = vp9_variance8x8_c;
|
||||
const vp9_variance_fn_t variance8x16_c = vp9_variance8x16_c;
|
||||
const vp9_variance_fn_t variance16x8_c = vp9_variance16x8_c;
|
||||
const vp9_variance_fn_t variance16x16_c = vp9_variance16x16_c;
|
||||
const vp9_variance_fn_t variance16x32_c = vp9_variance16x32_c;
|
||||
const vp9_variance_fn_t variance32x16_c = vp9_variance32x16_c;
|
||||
const vp9_variance_fn_t variance32x32_c = vp9_variance32x32_c;
|
||||
const vp9_variance_fn_t variance32x64_c = vp9_variance32x64_c;
|
||||
const vp9_variance_fn_t variance64x32_c = vp9_variance64x32_c;
|
||||
const vp9_variance_fn_t variance64x64_c = vp9_variance64x64_c;
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
C, VP9VarianceTest,
|
||||
::testing::Values(make_tuple(4, 4, variance4x4_c),
|
||||
make_tuple(8, 8, variance8x8_c),
|
||||
make_tuple(8, 16, variance8x16_c),
|
||||
make_tuple(16, 8, variance16x8_c),
|
||||
make_tuple(16, 16, variance16x16_c)));
|
||||
::testing::Values(make_tuple(2, 2, variance4x4_c),
|
||||
make_tuple(2, 3, variance4x8_c),
|
||||
make_tuple(3, 2, variance8x4_c),
|
||||
make_tuple(3, 3, variance8x8_c),
|
||||
make_tuple(3, 4, variance8x16_c),
|
||||
make_tuple(4, 3, variance16x8_c),
|
||||
make_tuple(4, 4, variance16x16_c),
|
||||
make_tuple(4, 5, variance16x32_c),
|
||||
make_tuple(5, 4, variance32x16_c),
|
||||
make_tuple(5, 5, variance32x32_c),
|
||||
make_tuple(5, 6, variance32x64_c),
|
||||
make_tuple(6, 5, variance64x32_c),
|
||||
make_tuple(6, 6, variance64x64_c)));
|
||||
|
||||
const vp9_subpixvariance_fn_t subpel_variance4x4_c =
|
||||
vp9_sub_pixel_variance4x4_c;
|
||||
const vp9_subpixvariance_fn_t subpel_variance4x8_c =
|
||||
vp9_sub_pixel_variance4x8_c;
|
||||
const vp9_subpixvariance_fn_t subpel_variance8x4_c =
|
||||
vp9_sub_pixel_variance8x4_c;
|
||||
const vp9_subpixvariance_fn_t subpel_variance8x8_c =
|
||||
vp9_sub_pixel_variance8x8_c;
|
||||
const vp9_subpixvariance_fn_t subpel_variance8x16_c =
|
||||
vp9_sub_pixel_variance8x16_c;
|
||||
const vp9_subpixvariance_fn_t subpel_variance16x8_c =
|
||||
vp9_sub_pixel_variance16x8_c;
|
||||
const vp9_subpixvariance_fn_t subpel_variance16x16_c =
|
||||
vp9_sub_pixel_variance16x16_c;
|
||||
const vp9_subpixvariance_fn_t subpel_variance16x32_c =
|
||||
vp9_sub_pixel_variance16x32_c;
|
||||
const vp9_subpixvariance_fn_t subpel_variance32x16_c =
|
||||
vp9_sub_pixel_variance32x16_c;
|
||||
const vp9_subpixvariance_fn_t subpel_variance32x32_c =
|
||||
vp9_sub_pixel_variance32x32_c;
|
||||
const vp9_subpixvariance_fn_t subpel_variance32x64_c =
|
||||
vp9_sub_pixel_variance32x64_c;
|
||||
const vp9_subpixvariance_fn_t subpel_variance64x32_c =
|
||||
vp9_sub_pixel_variance64x32_c;
|
||||
const vp9_subpixvariance_fn_t subpel_variance64x64_c =
|
||||
vp9_sub_pixel_variance64x64_c;
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
C, VP9SubpelVarianceTest,
|
||||
::testing::Values(make_tuple(2, 2, subpel_variance4x4_c),
|
||||
make_tuple(2, 3, subpel_variance4x8_c),
|
||||
make_tuple(3, 2, subpel_variance8x4_c),
|
||||
make_tuple(3, 3, subpel_variance8x8_c),
|
||||
make_tuple(3, 4, subpel_variance8x16_c),
|
||||
make_tuple(4, 3, subpel_variance16x8_c),
|
||||
make_tuple(4, 4, subpel_variance16x16_c),
|
||||
make_tuple(4, 5, subpel_variance16x32_c),
|
||||
make_tuple(5, 4, subpel_variance32x16_c),
|
||||
make_tuple(5, 5, subpel_variance32x32_c),
|
||||
make_tuple(5, 6, subpel_variance32x64_c),
|
||||
make_tuple(6, 5, subpel_variance64x32_c),
|
||||
make_tuple(6, 6, subpel_variance64x64_c)));
|
||||
|
||||
#if HAVE_MMX
|
||||
const vp9_variance_fn_t variance4x4_mmx = vp9_variance4x4_mmx;
|
||||
@ -183,26 +368,128 @@ const vp9_variance_fn_t variance16x8_mmx = vp9_variance16x8_mmx;
|
||||
const vp9_variance_fn_t variance16x16_mmx = vp9_variance16x16_mmx;
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
MMX, VP9VarianceTest,
|
||||
::testing::Values(make_tuple(4, 4, variance4x4_mmx),
|
||||
make_tuple(8, 8, variance8x8_mmx),
|
||||
make_tuple(8, 16, variance8x16_mmx),
|
||||
make_tuple(16, 8, variance16x8_mmx),
|
||||
make_tuple(16, 16, variance16x16_mmx)));
|
||||
::testing::Values(make_tuple(2, 2, variance4x4_mmx),
|
||||
make_tuple(3, 3, variance8x8_mmx),
|
||||
make_tuple(3, 4, variance8x16_mmx),
|
||||
make_tuple(4, 3, variance16x8_mmx),
|
||||
make_tuple(4, 4, variance16x16_mmx)));
|
||||
#endif
|
||||
|
||||
#if HAVE_SSE2
|
||||
const vp9_variance_fn_t variance4x4_wmt = vp9_variance4x4_sse2;
|
||||
const vp9_variance_fn_t variance8x8_wmt = vp9_variance8x8_sse2;
|
||||
const vp9_variance_fn_t variance8x16_wmt = vp9_variance8x16_sse2;
|
||||
const vp9_variance_fn_t variance16x8_wmt = vp9_variance16x8_sse2;
|
||||
const vp9_variance_fn_t variance16x16_wmt = vp9_variance16x16_sse2;
|
||||
const vp9_variance_fn_t variance4x4_sse2 = vp9_variance4x4_sse2;
|
||||
const vp9_variance_fn_t variance4x8_sse2 = vp9_variance4x8_sse2;
|
||||
const vp9_variance_fn_t variance8x4_sse2 = vp9_variance8x4_sse2;
|
||||
const vp9_variance_fn_t variance8x8_sse2 = vp9_variance8x8_sse2;
|
||||
const vp9_variance_fn_t variance8x16_sse2 = vp9_variance8x16_sse2;
|
||||
const vp9_variance_fn_t variance16x8_sse2 = vp9_variance16x8_sse2;
|
||||
const vp9_variance_fn_t variance16x16_sse2 = vp9_variance16x16_sse2;
|
||||
const vp9_variance_fn_t variance16x32_sse2 = vp9_variance16x32_sse2;
|
||||
const vp9_variance_fn_t variance32x16_sse2 = vp9_variance32x16_sse2;
|
||||
const vp9_variance_fn_t variance32x32_sse2 = vp9_variance32x32_sse2;
|
||||
const vp9_variance_fn_t variance32x64_sse2 = vp9_variance32x64_sse2;
|
||||
const vp9_variance_fn_t variance64x32_sse2 = vp9_variance64x32_sse2;
|
||||
const vp9_variance_fn_t variance64x64_sse2 = vp9_variance64x64_sse2;
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
SSE2, VP9VarianceTest,
|
||||
::testing::Values(make_tuple(4, 4, variance4x4_wmt),
|
||||
make_tuple(8, 8, variance8x8_wmt),
|
||||
make_tuple(8, 16, variance8x16_wmt),
|
||||
make_tuple(16, 8, variance16x8_wmt),
|
||||
make_tuple(16, 16, variance16x16_wmt)));
|
||||
::testing::Values(make_tuple(2, 2, variance4x4_sse2),
|
||||
make_tuple(2, 3, variance4x8_sse2),
|
||||
make_tuple(3, 2, variance8x4_sse2),
|
||||
make_tuple(3, 3, variance8x8_sse2),
|
||||
make_tuple(3, 4, variance8x16_sse2),
|
||||
make_tuple(4, 3, variance16x8_sse2),
|
||||
make_tuple(4, 4, variance16x16_sse2),
|
||||
make_tuple(4, 5, variance16x32_sse2),
|
||||
make_tuple(5, 4, variance32x16_sse2),
|
||||
make_tuple(5, 5, variance32x32_sse2),
|
||||
make_tuple(5, 6, variance32x64_sse2),
|
||||
make_tuple(6, 5, variance64x32_sse2),
|
||||
make_tuple(6, 6, variance64x64_sse2)));
|
||||
|
||||
const vp9_subpixvariance_fn_t subpel_variance4x4_sse =
|
||||
vp9_sub_pixel_variance4x4_sse;
|
||||
const vp9_subpixvariance_fn_t subpel_variance4x8_sse =
|
||||
vp9_sub_pixel_variance4x8_sse;
|
||||
const vp9_subpixvariance_fn_t subpel_variance8x4_sse2 =
|
||||
vp9_sub_pixel_variance8x4_sse2;
|
||||
const vp9_subpixvariance_fn_t subpel_variance8x8_sse2 =
|
||||
vp9_sub_pixel_variance8x8_sse2;
|
||||
const vp9_subpixvariance_fn_t subpel_variance8x16_sse2 =
|
||||
vp9_sub_pixel_variance8x16_sse2;
|
||||
const vp9_subpixvariance_fn_t subpel_variance16x8_sse2 =
|
||||
vp9_sub_pixel_variance16x8_sse2;
|
||||
const vp9_subpixvariance_fn_t subpel_variance16x16_sse2 =
|
||||
vp9_sub_pixel_variance16x16_sse2;
|
||||
const vp9_subpixvariance_fn_t subpel_variance16x32_sse2 =
|
||||
vp9_sub_pixel_variance16x32_sse2;
|
||||
const vp9_subpixvariance_fn_t subpel_variance32x16_sse2 =
|
||||
vp9_sub_pixel_variance32x16_sse2;
|
||||
const vp9_subpixvariance_fn_t subpel_variance32x32_sse2 =
|
||||
vp9_sub_pixel_variance32x32_sse2;
|
||||
const vp9_subpixvariance_fn_t subpel_variance32x64_sse2 =
|
||||
vp9_sub_pixel_variance32x64_sse2;
|
||||
const vp9_subpixvariance_fn_t subpel_variance64x32_sse2 =
|
||||
vp9_sub_pixel_variance64x32_sse2;
|
||||
const vp9_subpixvariance_fn_t subpel_variance64x64_sse2 =
|
||||
vp9_sub_pixel_variance64x64_sse2;
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
SSE2, VP9SubpelVarianceTest,
|
||||
::testing::Values(make_tuple(2, 2, subpel_variance4x4_sse),
|
||||
make_tuple(2, 3, subpel_variance4x8_sse),
|
||||
make_tuple(3, 2, subpel_variance8x4_sse2),
|
||||
make_tuple(3, 3, subpel_variance8x8_sse2),
|
||||
make_tuple(3, 4, subpel_variance8x16_sse2),
|
||||
make_tuple(4, 3, subpel_variance16x8_sse2),
|
||||
make_tuple(4, 4, subpel_variance16x16_sse2),
|
||||
make_tuple(4, 5, subpel_variance16x32_sse2),
|
||||
make_tuple(5, 4, subpel_variance32x16_sse2),
|
||||
make_tuple(5, 5, subpel_variance32x32_sse2),
|
||||
make_tuple(5, 6, subpel_variance32x64_sse2),
|
||||
make_tuple(6, 5, subpel_variance64x32_sse2),
|
||||
make_tuple(6, 6, subpel_variance64x64_sse2)));
|
||||
#endif
|
||||
|
||||
#if HAVE_SSSE3
|
||||
const vp9_subpixvariance_fn_t subpel_variance4x4_ssse3 =
|
||||
vp9_sub_pixel_variance4x4_ssse3;
|
||||
const vp9_subpixvariance_fn_t subpel_variance4x8_ssse3 =
|
||||
vp9_sub_pixel_variance4x8_ssse3;
|
||||
const vp9_subpixvariance_fn_t subpel_variance8x4_ssse3 =
|
||||
vp9_sub_pixel_variance8x4_ssse3;
|
||||
const vp9_subpixvariance_fn_t subpel_variance8x8_ssse3 =
|
||||
vp9_sub_pixel_variance8x8_ssse3;
|
||||
const vp9_subpixvariance_fn_t subpel_variance8x16_ssse3 =
|
||||
vp9_sub_pixel_variance8x16_ssse3;
|
||||
const vp9_subpixvariance_fn_t subpel_variance16x8_ssse3 =
|
||||
vp9_sub_pixel_variance16x8_ssse3;
|
||||
const vp9_subpixvariance_fn_t subpel_variance16x16_ssse3 =
|
||||
vp9_sub_pixel_variance16x16_ssse3;
|
||||
const vp9_subpixvariance_fn_t subpel_variance16x32_ssse3 =
|
||||
vp9_sub_pixel_variance16x32_ssse3;
|
||||
const vp9_subpixvariance_fn_t subpel_variance32x16_ssse3 =
|
||||
vp9_sub_pixel_variance32x16_ssse3;
|
||||
const vp9_subpixvariance_fn_t subpel_variance32x32_ssse3 =
|
||||
vp9_sub_pixel_variance32x32_ssse3;
|
||||
const vp9_subpixvariance_fn_t subpel_variance32x64_ssse3 =
|
||||
vp9_sub_pixel_variance32x64_ssse3;
|
||||
const vp9_subpixvariance_fn_t subpel_variance64x32_ssse3 =
|
||||
vp9_sub_pixel_variance64x32_ssse3;
|
||||
const vp9_subpixvariance_fn_t subpel_variance64x64_ssse3 =
|
||||
vp9_sub_pixel_variance64x64_ssse3;
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
SSSE3, VP9SubpelVarianceTest,
|
||||
::testing::Values(make_tuple(2, 2, subpel_variance4x4_ssse3),
|
||||
make_tuple(2, 3, subpel_variance4x8_ssse3),
|
||||
make_tuple(3, 2, subpel_variance8x4_ssse3),
|
||||
make_tuple(3, 3, subpel_variance8x8_ssse3),
|
||||
make_tuple(3, 4, subpel_variance8x16_ssse3),
|
||||
make_tuple(4, 3, subpel_variance16x8_ssse3),
|
||||
make_tuple(4, 4, subpel_variance16x16_ssse3),
|
||||
make_tuple(4, 5, subpel_variance16x32_ssse3),
|
||||
make_tuple(5, 4, subpel_variance32x16_ssse3),
|
||||
make_tuple(5, 5, subpel_variance32x32_ssse3),
|
||||
make_tuple(5, 6, subpel_variance32x64_ssse3),
|
||||
make_tuple(6, 5, subpel_variance64x32_ssse3),
|
||||
make_tuple(6, 6, subpel_variance64x64_ssse3)));
|
||||
#endif
|
||||
#endif // CONFIG_VP9_ENCODER
|
||||
|
||||
|
@ -266,85 +266,81 @@ prototype unsigned int vp9_variance4x4 "const uint8_t *src_ptr, int source_strid
|
||||
specialize vp9_variance4x4 mmx sse2
|
||||
|
||||
prototype unsigned int vp9_sub_pixel_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
|
||||
specialize vp9_sub_pixel_variance64x64 sse2
|
||||
specialize vp9_sub_pixel_variance64x64 sse2 ssse3
|
||||
|
||||
prototype unsigned int vp9_sub_pixel_avg_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
|
||||
specialize vp9_sub_pixel_avg_variance64x64
|
||||
|
||||
prototype unsigned int vp9_sub_pixel_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
|
||||
specialize vp9_sub_pixel_variance32x64
|
||||
specialize vp9_sub_pixel_variance32x64 sse2 ssse3
|
||||
|
||||
prototype unsigned int vp9_sub_pixel_avg_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
|
||||
specialize vp9_sub_pixel_avg_variance32x64
|
||||
|
||||
prototype unsigned int vp9_sub_pixel_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
|
||||
specialize vp9_sub_pixel_variance64x32
|
||||
specialize vp9_sub_pixel_variance64x32 sse2 ssse3
|
||||
|
||||
prototype unsigned int vp9_sub_pixel_avg_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
|
||||
specialize vp9_sub_pixel_avg_variance64x32
|
||||
|
||||
prototype unsigned int vp9_sub_pixel_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
|
||||
specialize vp9_sub_pixel_variance32x16
|
||||
specialize vp9_sub_pixel_variance32x16 sse2 ssse3
|
||||
|
||||
prototype unsigned int vp9_sub_pixel_avg_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
|
||||
specialize vp9_sub_pixel_avg_variance32x16
|
||||
|
||||
prototype unsigned int vp9_sub_pixel_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
|
||||
specialize vp9_sub_pixel_variance16x32
|
||||
specialize vp9_sub_pixel_variance16x32 sse2 ssse3
|
||||
|
||||
prototype unsigned int vp9_sub_pixel_avg_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
|
||||
specialize vp9_sub_pixel_avg_variance16x32
|
||||
|
||||
prototype unsigned int vp9_sub_pixel_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
|
||||
specialize vp9_sub_pixel_variance32x32 sse2
|
||||
specialize vp9_sub_pixel_variance32x32 sse2 ssse3
|
||||
|
||||
prototype unsigned int vp9_sub_pixel_avg_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
|
||||
specialize vp9_sub_pixel_avg_variance32x32
|
||||
|
||||
prototype unsigned int vp9_sub_pixel_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
|
||||
specialize vp9_sub_pixel_variance16x16 sse2 mmx ssse3
|
||||
specialize vp9_sub_pixel_variance16x16 sse2 ssse3
|
||||
|
||||
prototype unsigned int vp9_sub_pixel_avg_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
|
||||
specialize vp9_sub_pixel_avg_variance16x16
|
||||
|
||||
prototype unsigned int vp9_sub_pixel_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
|
||||
specialize vp9_sub_pixel_variance8x16 sse2 mmx
|
||||
vp9_sub_pixel_variance8x16_sse2=vp9_sub_pixel_variance8x16_wmt
|
||||
specialize vp9_sub_pixel_variance8x16 sse2 ssse3
|
||||
|
||||
prototype unsigned int vp9_sub_pixel_avg_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
|
||||
specialize vp9_sub_pixel_avg_variance8x16
|
||||
|
||||
prototype unsigned int vp9_sub_pixel_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
|
||||
specialize vp9_sub_pixel_variance16x8 sse2 mmx ssse3
|
||||
vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_ssse3;
|
||||
vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_wmt
|
||||
specialize vp9_sub_pixel_variance16x8 sse2 ssse3
|
||||
|
||||
prototype unsigned int vp9_sub_pixel_avg_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
|
||||
specialize vp9_sub_pixel_avg_variance16x8
|
||||
|
||||
prototype unsigned int vp9_sub_pixel_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
|
||||
specialize vp9_sub_pixel_variance8x8 sse2 mmx
|
||||
vp9_sub_pixel_variance8x8_sse2=vp9_sub_pixel_variance8x8_wmt
|
||||
specialize vp9_sub_pixel_variance8x8 sse2 ssse3
|
||||
|
||||
prototype unsigned int vp9_sub_pixel_avg_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
|
||||
specialize vp9_sub_pixel_avg_variance8x8
|
||||
|
||||
# TODO(jingning): need to convert 8x4/4x8 functions into mmx/sse form
|
||||
prototype unsigned int vp9_sub_pixel_variance8x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
|
||||
specialize vp9_sub_pixel_variance8x4
|
||||
specialize vp9_sub_pixel_variance8x4 sse2 ssse3
|
||||
|
||||
prototype unsigned int vp9_sub_pixel_avg_variance8x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
|
||||
specialize vp9_sub_pixel_avg_variance8x4
|
||||
|
||||
prototype unsigned int vp9_sub_pixel_variance4x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
|
||||
specialize vp9_sub_pixel_variance4x8
|
||||
specialize vp9_sub_pixel_variance4x8 sse ssse3
|
||||
|
||||
prototype unsigned int vp9_sub_pixel_avg_variance4x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
|
||||
specialize vp9_sub_pixel_avg_variance4x8
|
||||
|
||||
prototype unsigned int vp9_sub_pixel_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
|
||||
specialize vp9_sub_pixel_variance4x4 sse2 mmx
|
||||
vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt
|
||||
specialize vp9_sub_pixel_variance4x4 sse ssse3
|
||||
#vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt
|
||||
|
||||
prototype unsigned int vp9_sub_pixel_avg_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
|
||||
specialize vp9_sub_pixel_avg_variance4x4
|
||||
@ -390,15 +386,15 @@ prototype unsigned int vp9_sad4x4 "const uint8_t *src_ptr, int source_stride, co
|
||||
specialize vp9_sad4x4 mmx sse
|
||||
|
||||
prototype unsigned int vp9_variance_halfpixvar16x16_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
|
||||
specialize vp9_variance_halfpixvar16x16_h mmx sse2
|
||||
specialize vp9_variance_halfpixvar16x16_h sse2
|
||||
vp9_variance_halfpixvar16x16_h_sse2=vp9_variance_halfpixvar16x16_h_wmt
|
||||
|
||||
prototype unsigned int vp9_variance_halfpixvar16x16_v "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
|
||||
specialize vp9_variance_halfpixvar16x16_v mmx sse2
|
||||
specialize vp9_variance_halfpixvar16x16_v sse2
|
||||
vp9_variance_halfpixvar16x16_v_sse2=vp9_variance_halfpixvar16x16_v_wmt
|
||||
|
||||
prototype unsigned int vp9_variance_halfpixvar16x16_hv "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
|
||||
specialize vp9_variance_halfpixvar16x16_hv mmx sse2
|
||||
specialize vp9_variance_halfpixvar16x16_hv sse2
|
||||
vp9_variance_halfpixvar16x16_hv_sse2=vp9_variance_halfpixvar16x16_hv_wmt
|
||||
|
||||
prototype unsigned int vp9_variance_halfpixvar64x64_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
|
||||
@ -507,8 +503,8 @@ specialize vp9_sad4x8x4d sse
|
||||
prototype void vp9_sad4x4x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"
|
||||
specialize vp9_sad4x4x4d sse
|
||||
|
||||
prototype unsigned int vp9_sub_pixel_mse16x16 "const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse"
|
||||
specialize vp9_sub_pixel_mse16x16 sse2 mmx
|
||||
#prototype unsigned int vp9_sub_pixel_mse16x16 "const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse"
|
||||
#specialize vp9_sub_pixel_mse16x16 sse2 mmx
|
||||
|
||||
prototype unsigned int vp9_mse16x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"
|
||||
specialize vp9_mse16x16 mmx sse2
|
||||
|
1061
vp9/encoder/x86/vp9_subpel_variance.asm
Normal file
1061
vp9/encoder/x86/vp9_subpel_variance.asm
Normal file
File diff suppressed because it is too large
Load Diff
@ -8,292 +8,8 @@
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
%include "vpx_ports/x86_abi_support.asm"
|
||||
|
||||
%define xmm_filter_shift 7
|
||||
|
||||
; Bilinear sub-pixel filter plus variance accumulation, 8 pixels wide (SSE2).
; For each of Height rows, 8 ref pixels are interpolated with the taps selected
; by xoffset (horizontal) and yoffset (vertical), rounded (+64, >> 7), and
; compared with 8 src pixels. The sum of the differences is written to *sum and
; the sum of the squared differences to *sumsquared.
; A zero xoffset and/or yoffset skips the corresponding filter pass.
; Paths that run the vertical pass read one more ref row than Height.
;void vp9_filter_block2d_bil_var_sse2
|
||||
;(
|
||||
; unsigned char *ref_ptr,
|
||||
; int ref_pixels_per_line,
|
||||
; unsigned char *src_ptr,
|
||||
; int src_pixels_per_line,
|
||||
; unsigned int Height,
|
||||
; int xoffset,
|
||||
; int yoffset,
|
||||
; int *sum,
|
||||
; unsigned int *sumsquared
|
||||
;
|
||||
;)
|
||||
global sym(vp9_filter_block2d_bil_var_sse2) PRIVATE
|
||||
sym(vp9_filter_block2d_bil_var_sse2):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 9
|
||||
SAVE_XMM 7
|
||||
GET_GOT rbx
|
||||
push rsi
|
||||
push rdi
|
||||
push rbx
|
||||
; end prolog
|
||||
|
||||
pxor xmm6, xmm6 ; xmm6 = running sum of differences (eight 16-bit lanes)
|
||||
pxor xmm7, xmm7 ; xmm7 = running sum of squared differences (four 32-bit lanes)
|
||||
|
||||
lea rsi, [GLOBAL(xmm_bi_rd)] ; rounding
|
||||
movdqa xmm4, XMMWORD PTR [rsi] ; xmm4 = rounding constant, kept for the whole routine
|
||||
|
||||
lea rcx, [GLOBAL(bilinear_filters_sse2)] ; rcx = base of the tap table
|
||||
movsxd rax, dword ptr arg(5) ; xoffset
|
||||
|
||||
cmp rax, 0 ; skip first_pass filter if xoffset=0
|
||||
je filter_block2d_bil_var_sse2_sp_only
|
||||
|
||||
shl rax, 5 ; point to filter coeff with xoffset (32 bytes per table row)
|
||||
lea rax, [rax + rcx] ; HFilter
|
||||
|
||||
movsxd rdx, dword ptr arg(6) ; yoffset
|
||||
|
||||
cmp rdx, 0 ; skip second_pass filter if yoffset=0
|
||||
je filter_block2d_bil_var_sse2_fp_only
|
||||
|
||||
shl rdx, 5 ; 32 bytes per table row
|
||||
lea rdx, [rdx + rcx] ; VFilter
|
||||
|
||||
mov rsi, arg(0) ;ref_ptr
|
||||
mov rdi, arg(2) ;src_ptr
|
||||
movsxd rcx, dword ptr arg(4) ;Height
|
||||
|
||||
pxor xmm0, xmm0 ; xmm0 = zero, used to widen bytes to words
|
||||
movq xmm1, QWORD PTR [rsi] ; 8 ref pixels of the first row
|
||||
movq xmm3, QWORD PTR [rsi+1] ; same row, one pixel to the right
|
||||
|
||||
punpcklbw xmm1, xmm0 ;
|
||||
pmullw xmm1, [rax] ; * first horizontal tap
|
||||
punpcklbw xmm3, xmm0
|
||||
pmullw xmm3, [rax+16] ; * second horizontal tap
|
||||
|
||||
paddw xmm1, xmm3 ;
|
||||
paddw xmm1, xmm4 ; + rounding
|
||||
psraw xmm1, xmm_filter_shift ;
|
||||
movdqa xmm5, xmm1 ; xmm5 = horizontally filtered previous row
|
||||
|
||||
movsxd rbx, dword ptr arg(1) ;ref_pixels_per_line
|
||||
lea rsi, [rsi + rbx]
|
||||
%if ABI_IS_32BIT=0
|
||||
movsxd r9, dword ptr arg(3) ;src_pixels_per_line
|
||||
%endif
|
||||
|
||||
filter_block2d_bil_var_sse2_loop:
|
||||
movq xmm1, QWORD PTR [rsi] ;
|
||||
movq xmm3, QWORD PTR [rsi+1] ;
|
||||
|
||||
punpcklbw xmm1, xmm0 ;
|
||||
pmullw xmm1, [rax] ;
|
||||
punpcklbw xmm3, xmm0 ;
|
||||
pmullw xmm3, [rax+16] ;
|
||||
|
||||
paddw xmm1, xmm3 ;
|
||||
paddw xmm1, xmm4 ;
|
||||
psraw xmm1, xmm_filter_shift ;
|
||||
|
||||
movdqa xmm3, xmm5 ; xmm3 = previous filtered row
|
||||
movdqa xmm5, xmm1 ; current row becomes "previous" for the next iteration
|
||||
|
||||
pmullw xmm3, [rdx] ; * first vertical tap
|
||||
pmullw xmm1, [rdx+16] ; * second vertical tap
|
||||
paddw xmm1, xmm3 ;
|
||||
paddw xmm1, xmm4 ; + rounding
|
||||
psraw xmm1, xmm_filter_shift ;
|
||||
|
||||
movq xmm3, QWORD PTR [rdi] ; 8 src pixels
|
||||
punpcklbw xmm3, xmm0 ;
|
||||
|
||||
psubw xmm1, xmm3 ; diff = filtered ref - src
|
||||
paddw xmm6, xmm1 ; accumulate diff
|
||||
|
||||
pmaddwd xmm1, xmm1 ; diff*diff, adjacent pairs summed to 32 bits
|
||||
paddd xmm7, xmm1 ; accumulate squared diff
|
||||
|
||||
lea rsi, [rsi + rbx] ;ref_pixels_per_line
|
||||
%if ABI_IS_32BIT
|
||||
add rdi, dword ptr arg(3) ;src_pixels_per_line
|
||||
%else
|
||||
lea rdi, [rdi + r9]
|
||||
%endif
|
||||
|
||||
sub rcx, 1 ; one row done
|
||||
jnz filter_block2d_bil_var_sse2_loop ;
|
||||
|
||||
jmp filter_block2d_bil_variance
|
||||
|
||||
filter_block2d_bil_var_sse2_sp_only:
|
||||
movsxd rdx, dword ptr arg(6) ; yoffset
|
||||
|
||||
cmp rdx, 0 ; skip all if both xoffset=0 and yoffset=0
|
||||
je filter_block2d_bil_var_sse2_full_pixel
|
||||
|
||||
shl rdx, 5
|
||||
lea rdx, [rdx + rcx] ; VFilter
|
||||
|
||||
mov rsi, arg(0) ;ref_ptr
|
||||
mov rdi, arg(2) ;src_ptr
|
||||
movsxd rcx, dword ptr arg(4) ;Height (rcx is no longer needed as the table base)
|
||||
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
|
||||
|
||||
pxor xmm0, xmm0 ;
|
||||
movq xmm1, QWORD PTR [rsi] ; first ref row, unfiltered
|
||||
punpcklbw xmm1, xmm0 ;
|
||||
|
||||
movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
|
||||
lea rsi, [rsi + rax]
|
||||
|
||||
filter_block2d_bil_sp_only_loop:
|
||||
movq xmm3, QWORD PTR [rsi] ;
|
||||
punpcklbw xmm3, xmm0 ;
|
||||
movdqa xmm5, xmm3 ; keep the unfiltered current row
|
||||
|
||||
pmullw xmm1, [rdx] ;
|
||||
pmullw xmm3, [rdx+16] ;
|
||||
paddw xmm1, xmm3 ;
|
||||
paddw xmm1, xmm4 ;
|
||||
psraw xmm1, xmm_filter_shift ;
|
||||
|
||||
movq xmm3, QWORD PTR [rdi] ;
|
||||
punpcklbw xmm3, xmm0 ;
|
||||
|
||||
psubw xmm1, xmm3 ;
|
||||
paddw xmm6, xmm1 ;
|
||||
|
||||
pmaddwd xmm1, xmm1 ;
|
||||
paddd xmm7, xmm1 ;
|
||||
|
||||
movdqa xmm1, xmm5 ; current row becomes the previous row
|
||||
lea rsi, [rsi + rax] ;ref_pixels_per_line
|
||||
lea rdi, [rdi + rbx] ;src_pixels_per_line
|
||||
|
||||
sub rcx, 1 ;
|
||||
jnz filter_block2d_bil_sp_only_loop ;
|
||||
|
||||
jmp filter_block2d_bil_variance
|
||||
|
||||
filter_block2d_bil_var_sse2_full_pixel:
|
||||
mov rsi, arg(0) ;ref_ptr
|
||||
mov rdi, arg(2) ;src_ptr
|
||||
movsxd rcx, dword ptr arg(4) ;Height
|
||||
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
|
||||
movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
|
||||
pxor xmm0, xmm0 ;
|
||||
|
||||
filter_block2d_bil_full_pixel_loop:
|
||||
movq xmm1, QWORD PTR [rsi] ;
|
||||
punpcklbw xmm1, xmm0 ;
|
||||
|
||||
movq xmm2, QWORD PTR [rdi] ;
|
||||
punpcklbw xmm2, xmm0 ;
|
||||
|
||||
psubw xmm1, xmm2 ; plain ref - src, no interpolation
|
||||
paddw xmm6, xmm1 ;
|
||||
|
||||
pmaddwd xmm1, xmm1 ;
|
||||
paddd xmm7, xmm1 ;
|
||||
|
||||
lea rsi, [rsi + rax] ;ref_pixels_per_line
|
||||
lea rdi, [rdi + rbx] ;src_pixels_per_line
|
||||
|
||||
sub rcx, 1 ;
|
||||
jnz filter_block2d_bil_full_pixel_loop ;
|
||||
|
||||
jmp filter_block2d_bil_variance
|
||||
|
||||
filter_block2d_bil_var_sse2_fp_only:
|
||||
mov rsi, arg(0) ;ref_ptr
|
||||
mov rdi, arg(2) ;src_ptr
|
||||
movsxd rcx, dword ptr arg(4) ;Height
|
||||
movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line
|
||||
|
||||
pxor xmm0, xmm0 ;
|
||||
movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
|
||||
|
||||
filter_block2d_bil_fp_only_loop:
|
||||
movq xmm1, QWORD PTR [rsi] ;
|
||||
movq xmm3, QWORD PTR [rsi+1] ;
|
||||
|
||||
punpcklbw xmm1, xmm0 ;
|
||||
pmullw xmm1, [rax] ; rax still holds HFilter from the dispatch code above
|
||||
punpcklbw xmm3, xmm0 ;
|
||||
pmullw xmm3, [rax+16] ;
|
||||
|
||||
paddw xmm1, xmm3 ;
|
||||
paddw xmm1, xmm4 ;
|
||||
psraw xmm1, xmm_filter_shift ;
|
||||
|
||||
movq xmm3, QWORD PTR [rdi] ;
|
||||
punpcklbw xmm3, xmm0 ;
|
||||
|
||||
psubw xmm1, xmm3 ;
|
||||
paddw xmm6, xmm1 ;
|
||||
|
||||
pmaddwd xmm1, xmm1 ;
|
||||
paddd xmm7, xmm1 ;
|
||||
lea rsi, [rsi + rdx]
|
||||
lea rdi, [rdi + rbx] ;src_pixels_per_line
|
||||
|
||||
sub rcx, 1 ;
|
||||
jnz filter_block2d_bil_fp_only_loop ;
|
||||
|
||||
jmp filter_block2d_bil_variance
|
||||
|
||||
filter_block2d_bil_variance:
|
||||
movdq2q mm6, xmm6 ; low 64 bits of the difference sums
|
||||
movdq2q mm7, xmm7 ; low 64 bits of the squared sums
|
||||
|
||||
psrldq xmm6, 8 ; bring the high 64 bits down
|
||||
psrldq xmm7, 8
|
||||
|
||||
movdq2q mm2, xmm6
|
||||
movdq2q mm3, xmm7
|
||||
|
||||
paddw mm6, mm2 ; four 16-bit partial sums
|
||||
paddd mm7, mm3 ; two 32-bit partial squared sums
|
||||
|
||||
pxor mm3, mm3 ;
|
||||
pxor mm2, mm2 ;
|
||||
|
||||
punpcklwd mm2, mm6 ; each word lands in the upper half of a dword
|
||||
punpckhwd mm3, mm6 ;
|
||||
|
||||
paddd mm2, mm3 ;
|
||||
movq mm6, mm2 ;
|
||||
|
||||
psrlq mm6, 32 ;
|
||||
paddd mm2, mm6 ;
|
||||
|
||||
psrad mm2, 16 ; arithmetic shift yields the total as a sign-extended 16-bit value
|
||||
movq mm4, mm7 ;
|
||||
|
||||
psrlq mm4, 32 ;
|
||||
paddd mm4, mm7 ; low dword = total of squared differences
|
||||
|
||||
mov rsi, arg(7) ; sum
|
||||
mov rdi, arg(8) ; sumsquared
|
||||
|
||||
movd [rsi], mm2 ; xsum
|
||||
movd [rdi], mm4 ; xxsum
|
||||
|
||||
; begin epilog
|
||||
pop rbx
|
||||
pop rdi
|
||||
pop rsi
|
||||
RESTORE_GOT
|
||||
RESTORE_XMM
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
|
||||
|
||||
;void vp9_half_horiz_vert_variance16x_h_sse2
|
||||
;(
|
||||
; unsigned char *ref_ptr,
|
||||
@ -619,27 +335,3 @@ sym(vp9_half_horiz_variance16x_h_sse2):
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
SECTION_RODATA
|
||||
; short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64};
; Rounding term: 64 is half of 128, the sum of the two taps in every filter row below.
|
||||
align 16
|
||||
xmm_bi_rd:
|
||||
times 8 dw 64
|
||||
align 16
|
||||
; 16 filter rows of 32 bytes each: eight copies of the first tap followed by
; eight copies of the second tap. The two taps of every row sum to 128.
; The SSE2 routine selects a row with (offset << 5) and reads the taps at +0 and +16.
bilinear_filters_sse2:
|
||||
dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0
|
||||
dw 120, 120, 120, 120, 120, 120, 120, 120, 8, 8, 8, 8, 8, 8, 8, 8
|
||||
dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16
|
||||
dw 104, 104, 104, 104, 104, 104, 104, 104, 24, 24, 24, 24, 24, 24, 24, 24
|
||||
dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32
|
||||
dw 88, 88, 88, 88, 88, 88, 88, 88, 40, 40, 40, 40, 40, 40, 40, 40
|
||||
dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48
|
||||
dw 72, 72, 72, 72, 72, 72, 72, 72, 56, 56, 56, 56, 56, 56, 56, 56
|
||||
dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
|
||||
dw 56, 56, 56, 56, 56, 56, 56, 56, 72, 72, 72, 72, 72, 72, 72, 72
|
||||
dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80
|
||||
dw 40, 40, 40, 40, 40, 40, 40, 40, 88, 88, 88, 88, 88, 88, 88, 88
|
||||
dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96
|
||||
dw 24, 24, 24, 24, 24, 24, 24, 24, 104, 104, 104, 104, 104, 104, 104, 104
|
||||
dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112
|
||||
dw 8, 8, 8, 8, 8, 8, 8, 8, 120, 120, 120, 120, 120, 120, 120, 120
|
||||
|
@ -508,344 +508,3 @@ sym(vp9_get4x4sse_cs_mmx):
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
%define mmx_filter_shift 7
|
||||
|
||||
; Bilinear sub-pixel filter plus variance accumulation for a 4x4 block (MMX).
; HFilter and VFilter each point at two taps stored as 4 words apiece
; (first tap at +0, second tap at +8). Four rows of 4 ref pixels are filtered
; horizontally then vertically (each pass rounded with mmx_bi_rd and shifted by
; mmx_filter_shift) and compared with 4 src pixels per row. The sum of the
; differences goes to *sum, the sum of squared differences to *sumsquared.
; Five ref rows are read in total (one before the loop, one per iteration).
;void vp9_filter_block2d_bil4x4_var_mmx
|
||||
;(
|
||||
; unsigned char *ref_ptr,
|
||||
; int ref_pixels_per_line,
|
||||
; unsigned char *src_ptr,
|
||||
; int src_pixels_per_line,
|
||||
; unsigned short *HFilter,
|
||||
; unsigned short *VFilter,
|
||||
; int *sum,
|
||||
; unsigned int *sumsquared
|
||||
;)
|
||||
global sym(vp9_filter_block2d_bil4x4_var_mmx) PRIVATE
|
||||
sym(vp9_filter_block2d_bil4x4_var_mmx):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 8
|
||||
GET_GOT rbx
|
||||
push rsi
|
||||
push rdi
|
||||
sub rsp, 16 ; NOTE(review): this scratch space is never referenced in the body shown here
|
||||
; end prolog
|
||||
|
||||
|
||||
pxor mm6, mm6 ; mm6 = running sum of differences (four 16-bit lanes)
|
||||
pxor mm7, mm7 ; mm7 = running sum of squared differences (two 32-bit lanes)
|
||||
|
||||
mov rax, arg(4) ;HFilter ;
|
||||
mov rdx, arg(5) ;VFilter ;
|
||||
|
||||
mov rsi, arg(0) ;ref_ptr ;
|
||||
mov rdi, arg(2) ;src_ptr ;
|
||||
|
||||
mov rcx, 4 ; fixed row count
|
||||
pxor mm0, mm0 ; mm0 = zero, used to widen bytes to words
|
||||
|
||||
movd mm1, [rsi] ; 4 ref pixels of the first row
|
||||
movd mm3, [rsi+1] ; same row, one pixel to the right
|
||||
|
||||
punpcklbw mm1, mm0 ;
|
||||
pmullw mm1, [rax] ; * first horizontal tap
|
||||
|
||||
punpcklbw mm3, mm0 ;
|
||||
pmullw mm3, [rax+8] ; * second horizontal tap
|
||||
|
||||
paddw mm1, mm3 ;
|
||||
paddw mm1, [GLOBAL(mmx_bi_rd)] ; + rounding
|
||||
|
||||
psraw mm1, mmx_filter_shift ;
|
||||
movq mm5, mm1 ; mm5 = horizontally filtered previous row
|
||||
|
||||
%if ABI_IS_32BIT
|
||||
add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
|
||||
%else
|
||||
movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
|
||||
add rsi, r8
|
||||
%endif
|
||||
|
||||
.filter_block2d_bil4x4_var_mmx_loop:
|
||||
|
||||
movd mm1, [rsi] ;
|
||||
movd mm3, [rsi+1] ;
|
||||
|
||||
punpcklbw mm1, mm0 ;
|
||||
pmullw mm1, [rax] ;
|
||||
|
||||
punpcklbw mm3, mm0 ;
|
||||
pmullw mm3, [rax+8] ;
|
||||
|
||||
paddw mm1, mm3 ;
|
||||
paddw mm1, [GLOBAL(mmx_bi_rd)] ;
|
||||
|
||||
psraw mm1, mmx_filter_shift ;
|
||||
movq mm3, mm5 ; mm3 = previous filtered row
|
||||
|
||||
movq mm5, mm1 ; current row becomes "previous" for the next iteration
|
||||
pmullw mm3, [rdx] ; * first vertical tap
|
||||
|
||||
pmullw mm1, [rdx+8] ; * second vertical tap
|
||||
paddw mm1, mm3 ;
|
||||
|
||||
|
||||
paddw mm1, [GLOBAL(mmx_bi_rd)] ;
|
||||
psraw mm1, mmx_filter_shift ;
|
||||
|
||||
movd mm3, [rdi] ; 4 src pixels
|
||||
punpcklbw mm3, mm0 ;
|
||||
|
||||
psubw mm1, mm3 ; diff = filtered ref - src
|
||||
paddw mm6, mm1 ; accumulate diff
|
||||
|
||||
pmaddwd mm1, mm1 ; diff*diff, adjacent pairs summed to 32 bits
|
||||
paddd mm7, mm1 ; accumulate squared diff
|
||||
|
||||
%if ABI_IS_32BIT
|
||||
add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
|
||||
add rdi, dword ptr arg(3) ;src_pixels_per_line ;
|
||||
%else
|
||||
movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
|
||||
movsxd r9, dword ptr arg(3) ;src_pixels_per_line
|
||||
add rsi, r8
|
||||
add rdi, r9
|
||||
%endif
|
||||
sub rcx, 1 ;
|
||||
jnz .filter_block2d_bil4x4_var_mmx_loop ;
|
||||
|
||||
|
||||
pxor mm3, mm3 ;
|
||||
pxor mm2, mm2 ;
|
||||
|
||||
punpcklwd mm2, mm6 ; each word lands in the upper half of a dword
|
||||
punpckhwd mm3, mm6 ;
|
||||
|
||||
paddd mm2, mm3 ;
|
||||
movq mm6, mm2 ;
|
||||
|
||||
psrlq mm6, 32 ;
|
||||
paddd mm2, mm6 ;
|
||||
|
||||
psrad mm2, 16 ; arithmetic shift yields the signed total
|
||||
movq mm4, mm7 ;
|
||||
|
||||
psrlq mm4, 32 ;
|
||||
paddd mm4, mm7 ; low dword = total of squared differences
|
||||
|
||||
mov rdi, arg(6) ;sum
|
||||
mov rsi, arg(7) ;sumsquared
|
||||
|
||||
movd dword ptr [rdi], mm2 ;
|
||||
movd dword ptr [rsi], mm4 ;
|
||||
|
||||
|
||||
|
||||
; begin epilog
|
||||
add rsp, 16
|
||||
pop rdi
|
||||
pop rsi
|
||||
RESTORE_GOT
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
|
||||
|
||||
|
||||
; Bilinear sub-pixel filter plus variance accumulation, 8 pixels wide (MMX).
; Each 8-pixel row is processed as two 4-word halves (mm1/mm3 low, mm2/mm4 high).
; HFilter and VFilter each point at two taps stored as 4 words apiece
; (first tap at +0, second tap at +8). The horizontally filtered previous row
; is kept packed as 8 bytes in mm5. Height rows are compared with src. The sum
; of the differences goes to *sum, the sum of squared differences to
; *sumsquared. Height + 1 ref rows are read.
;void vp9_filter_block2d_bil_var_mmx
|
||||
;(
|
||||
; unsigned char *ref_ptr,
|
||||
; int ref_pixels_per_line,
|
||||
; unsigned char *src_ptr,
|
||||
; int src_pixels_per_line,
|
||||
; unsigned int Height,
|
||||
; unsigned short *HFilter,
|
||||
; unsigned short *VFilter,
|
||||
; int *sum,
|
||||
; unsigned int *sumsquared
|
||||
;)
|
||||
global sym(vp9_filter_block2d_bil_var_mmx) PRIVATE
|
||||
sym(vp9_filter_block2d_bil_var_mmx):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 9
|
||||
GET_GOT rbx
|
||||
push rsi
|
||||
push rdi
|
||||
sub rsp, 16 ; NOTE(review): this scratch space is never referenced in the body shown here
|
||||
; end prolog
|
||||
|
||||
pxor mm6, mm6 ; mm6 = running sum of differences (four 16-bit lanes)
|
||||
pxor mm7, mm7 ; mm7 = running sum of squared differences (two 32-bit lanes)
|
||||
mov rax, arg(5) ;HFilter ;
|
||||
|
||||
mov rdx, arg(6) ;VFilter ;
|
||||
mov rsi, arg(0) ;ref_ptr ;
|
||||
|
||||
mov rdi, arg(2) ;src_ptr ;
|
||||
movsxd rcx, dword ptr arg(4) ;Height ;
|
||||
|
||||
pxor mm0, mm0 ; mm0 = zero, used to widen bytes to words
|
||||
movq mm1, [rsi] ; 8 ref pixels of the first row
|
||||
|
||||
movq mm3, [rsi+1] ; same row, one pixel to the right
|
||||
movq mm2, mm1 ;
|
||||
|
||||
movq mm4, mm3 ;
|
||||
punpcklbw mm1, mm0 ; low 4 pixels
|
||||
|
||||
punpckhbw mm2, mm0 ; high 4 pixels
|
||||
pmullw mm1, [rax] ;
|
||||
|
||||
pmullw mm2, [rax] ;
|
||||
punpcklbw mm3, mm0 ;
|
||||
|
||||
punpckhbw mm4, mm0 ;
|
||||
pmullw mm3, [rax+8] ;
|
||||
|
||||
pmullw mm4, [rax+8] ;
|
||||
paddw mm1, mm3 ;
|
||||
|
||||
paddw mm2, mm4 ;
|
||||
paddw mm1, [GLOBAL(mmx_bi_rd)] ;
|
||||
|
||||
psraw mm1, mmx_filter_shift ;
|
||||
paddw mm2, [GLOBAL(mmx_bi_rd)] ;
|
||||
|
||||
psraw mm2, mmx_filter_shift ;
|
||||
movq mm5, mm1
|
||||
|
||||
packuswb mm5, mm2 ; mm5 = filtered previous row, packed back to 8 bytes
|
||||
%if ABI_IS_32BIT
|
||||
add rsi, dword ptr arg(1) ;ref_pixels_per_line
|
||||
%else
|
||||
movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
|
||||
add rsi, r8
|
||||
%endif
|
||||
|
||||
.filter_block2d_bil_var_mmx_loop:
|
||||
|
||||
movq mm1, [rsi] ;
|
||||
movq mm3, [rsi+1] ;
|
||||
|
||||
movq mm2, mm1 ;
|
||||
movq mm4, mm3 ;
|
||||
|
||||
punpcklbw mm1, mm0 ;
|
||||
punpckhbw mm2, mm0 ;
|
||||
|
||||
pmullw mm1, [rax] ;
|
||||
pmullw mm2, [rax] ;
|
||||
|
||||
punpcklbw mm3, mm0 ;
|
||||
punpckhbw mm4, mm0 ;
|
||||
|
||||
pmullw mm3, [rax+8] ;
|
||||
pmullw mm4, [rax+8] ;
|
||||
|
||||
paddw mm1, mm3 ;
|
||||
paddw mm2, mm4 ;
|
||||
|
||||
paddw mm1, [GLOBAL(mmx_bi_rd)] ;
|
||||
psraw mm1, mmx_filter_shift ;
|
||||
|
||||
paddw mm2, [GLOBAL(mmx_bi_rd)] ;
|
||||
psraw mm2, mmx_filter_shift ;
|
||||
|
||||
movq mm3, mm5 ; unpack the previous filtered row again
|
||||
movq mm4, mm5 ;
|
||||
|
||||
punpcklbw mm3, mm0 ;
|
||||
punpckhbw mm4, mm0 ;
|
||||
|
||||
movq mm5, mm1 ; current row becomes "previous" for the next iteration
|
||||
packuswb mm5, mm2 ;
|
||||
|
||||
pmullw mm3, [rdx] ; * first vertical tap
|
||||
pmullw mm4, [rdx] ;
|
||||
|
||||
pmullw mm1, [rdx+8] ; * second vertical tap
|
||||
pmullw mm2, [rdx+8] ;
|
||||
|
||||
paddw mm1, mm3 ;
|
||||
paddw mm2, mm4 ;
|
||||
|
||||
paddw mm1, [GLOBAL(mmx_bi_rd)] ;
|
||||
paddw mm2, [GLOBAL(mmx_bi_rd)] ;
|
||||
|
||||
psraw mm1, mmx_filter_shift ;
|
||||
psraw mm2, mmx_filter_shift ;
|
||||
|
||||
movq mm3, [rdi] ; 8 src pixels
|
||||
movq mm4, mm3 ;
|
||||
|
||||
punpcklbw mm3, mm0 ;
|
||||
punpckhbw mm4, mm0 ;
|
||||
|
||||
psubw mm1, mm3 ; diff = filtered ref - src
|
||||
psubw mm2, mm4 ;
|
||||
|
||||
paddw mm6, mm1 ;
|
||||
pmaddwd mm1, mm1 ;
|
||||
|
||||
paddw mm6, mm2 ;
|
||||
pmaddwd mm2, mm2 ;
|
||||
|
||||
paddd mm7, mm1 ;
|
||||
paddd mm7, mm2 ;
|
||||
|
||||
%if ABI_IS_32BIT
|
||||
add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
|
||||
add rdi, dword ptr arg(3) ;src_pixels_per_line ;
|
||||
%else
|
||||
movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
|
||||
movsxd r9, dword ptr arg(3) ;src_pixels_per_line ;
|
||||
add rsi, r8
|
||||
add rdi, r9
|
||||
%endif
|
||||
sub rcx, 1 ;
|
||||
jnz .filter_block2d_bil_var_mmx_loop ;
|
||||
|
||||
|
||||
pxor mm3, mm3 ;
|
||||
pxor mm2, mm2 ;
|
||||
|
||||
punpcklwd mm2, mm6 ; each word lands in the upper half of a dword
|
||||
punpckhwd mm3, mm6 ;
|
||||
|
||||
paddd mm2, mm3 ;
|
||||
movq mm6, mm2 ;
|
||||
|
||||
psrlq mm6, 32 ;
|
||||
paddd mm2, mm6 ;
|
||||
|
||||
psrad mm2, 16 ; arithmetic shift yields the total as a sign-extended 16-bit value
|
||||
movq mm4, mm7 ;
|
||||
|
||||
psrlq mm4, 32 ;
|
||||
paddd mm4, mm7 ; low dword = total of squared differences
|
||||
|
||||
mov rdi, arg(7) ;sum
|
||||
mov rsi, arg(8) ;sumsquared
|
||||
|
||||
movd dword ptr [rdi], mm2 ;
|
||||
movd dword ptr [rsi], mm4 ;
|
||||
|
||||
; begin epilog
|
||||
add rsp, 16
|
||||
pop rdi
|
||||
pop rsi
|
||||
RESTORE_GOT
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
|
||||
SECTION_RODATA
|
||||
;short mmx_bi_rd[4] = { 64, 64, 64, 64};
; Rounding term added before the shift by mmx_filter_shift (7): 64 = 1 << 6.
|
||||
align 16
|
||||
mmx_bi_rd:
|
||||
times 4 dw 64
|
||||
|
@ -11,8 +11,6 @@
|
||||
|
||||
%include "vpx_ports/x86_abi_support.asm"
|
||||
|
||||
%define xmm_filter_shift 7
|
||||
|
||||
;unsigned int vp9_get_mb_ss_sse2
|
||||
;(
|
||||
; short *src_ptr
|
||||
@ -734,28 +732,3 @@ sym(vp9_half_horiz_variance8x_h_sse2):
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
|
||||
SECTION_RODATA
|
||||
; short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64};
; Rounding term: 64 is half of 128, the sum of the two taps in every filter row below.
|
||||
align 16
|
||||
xmm_bi_rd:
|
||||
times 8 dw 64
|
||||
align 16
|
||||
; 16 filter rows of 32 bytes each: eight copies of the first tap followed by
; eight copies of the second tap. The two taps of every row sum to 128.
; NOTE(review): the code that indexes this copy of the table is not part of this hunk;
; presumably row i is the filter for sub-pixel offset i — confirm.
bilinear_filters_sse2:
|
||||
dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0
|
||||
dw 120, 120, 120, 120, 120, 120, 120, 120, 8, 8, 8, 8, 8, 8, 8, 8
|
||||
dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16
|
||||
dw 104, 104, 104, 104, 104, 104, 104, 104, 24, 24, 24, 24, 24, 24, 24, 24
|
||||
dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32
|
||||
dw 88, 88, 88, 88, 88, 88, 88, 88, 40, 40, 40, 40, 40, 40, 40, 40
|
||||
dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48
|
||||
dw 72, 72, 72, 72, 72, 72, 72, 72, 56, 56, 56, 56, 56, 56, 56, 56
|
||||
dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
|
||||
dw 56, 56, 56, 56, 56, 56, 56, 56, 72, 72, 72, 72, 72, 72, 72, 72
|
||||
dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80
|
||||
dw 40, 40, 40, 40, 40, 40, 40, 40, 88, 88, 88, 88, 88, 88, 88, 88
|
||||
dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96
|
||||
dw 24, 24, 24, 24, 24, 24, 24, 24, 104, 104, 104, 104, 104, 104, 104, 104
|
||||
dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112
|
||||
dw 8, 8, 8, 8, 8, 8, 8, 8, 120, 120, 120, 120, 120, 120, 120, 120
|
||||
|
@ -1,372 +0,0 @@
|
||||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
%include "vpx_ports/x86_abi_support.asm"
|
||||
|
||||
%define xmm_filter_shift 7
|
||||
|
||||
|
||||
; Bilinear sub-pixel filter plus variance accumulation, 16 pixels wide (SSSE3).
; Neighbouring pixels are byte-interleaved and multiplied against a row of
; (tap0, tap1) byte pairs with pmaddubsw, then rounded (+64) and shifted by
; xmm_filter_shift. For each of Height rows the filtered ref is compared with
; 16 src pixels. The sum of the differences goes to *sum, the sum of squared
; differences to *sumsquared.
; A zero xoffset and/or yoffset skips the corresponding filter pass.
; Paths that run the vertical pass read one more ref row than Height.
;void vp9_filter_block2d_bil_var_ssse3
|
||||
;(
|
||||
; unsigned char *ref_ptr,
|
||||
; int ref_pixels_per_line,
|
||||
; unsigned char *src_ptr,
|
||||
; int src_pixels_per_line,
|
||||
; unsigned int Height,
|
||||
; int xoffset,
|
||||
; int yoffset,
|
||||
; int *sum,
|
||||
; unsigned int *sumsquared
|
||||
;
|
||||
;)
|
||||
;Note: The filter coefficient at offset=0 is 128. Since the second operand
|
||||
;of pmaddubsw is treated as signed bytes, the zero-offset case must be handled separately.
|
||||
global sym(vp9_filter_block2d_bil_var_ssse3) PRIVATE
|
||||
sym(vp9_filter_block2d_bil_var_ssse3):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 9
|
||||
SAVE_XMM 7
|
||||
GET_GOT rbx
|
||||
push rsi
|
||||
push rdi
|
||||
; end prolog
|
||||
|
||||
pxor xmm6, xmm6 ; xmm6 = running sum of differences (eight 16-bit lanes)
|
||||
pxor xmm7, xmm7 ; xmm7 = running sum of squared differences (four 32-bit lanes)
|
||||
|
||||
lea rcx, [GLOBAL(bilinear_filters_ssse3)] ; rcx = base of the tap table
|
||||
movsxd rax, dword ptr arg(5) ; xoffset
|
||||
|
||||
cmp rax, 0 ; skip first_pass filter if xoffset=0
|
||||
je .filter_block2d_bil_var_ssse3_sp_only
|
||||
|
||||
shl rax, 4 ; point to filter coeff with xoffset (16 bytes per table row)
|
||||
lea rax, [rax + rcx] ; HFilter
|
||||
|
||||
movsxd rdx, dword ptr arg(6) ; yoffset
|
||||
|
||||
cmp rdx, 0 ; skip second_pass filter if yoffset=0
|
||||
je .filter_block2d_bil_var_ssse3_fp_only
|
||||
|
||||
shl rdx, 4 ; 16 bytes per table row
|
||||
lea rdx, [rdx + rcx] ; VFilter
|
||||
|
||||
mov rsi, arg(0) ;ref_ptr
|
||||
mov rdi, arg(2) ;src_ptr
|
||||
movsxd rcx, dword ptr arg(4) ;Height
|
||||
|
||||
movdqu xmm0, XMMWORD PTR [rsi] ; 16 ref pixels of the first row
|
||||
movdqu xmm1, XMMWORD PTR [rsi+1] ; same row, one pixel to the right
|
||||
movdqa xmm2, xmm0
|
||||
|
||||
punpcklbw xmm0, xmm1 ; interleave (p[x], p[x+1]) byte pairs, low 8 pixels
|
||||
punpckhbw xmm2, xmm1 ; high 8 pixels
|
||||
pmaddubsw xmm0, [rax] ; p[x]*tap0 + p[x+1]*tap1 as 16-bit words
|
||||
pmaddubsw xmm2, [rax]
|
||||
|
||||
paddw xmm0, [GLOBAL(xmm_bi_rd)]
|
||||
paddw xmm2, [GLOBAL(xmm_bi_rd)]
|
||||
psraw xmm0, xmm_filter_shift
|
||||
psraw xmm2, xmm_filter_shift
|
||||
|
||||
packuswb xmm0, xmm2 ; xmm0 = filtered previous row, packed back to 16 bytes
|
||||
|
||||
%if ABI_IS_32BIT
|
||||
add rsi, dword ptr arg(1) ;ref_pixels_per_line
|
||||
%else
|
||||
movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
|
||||
movsxd r9, dword ptr arg(3) ;src_pixels_per_line
|
||||
lea rsi, [rsi + r8]
|
||||
%endif
|
||||
|
||||
.filter_block2d_bil_var_ssse3_loop:
|
||||
movdqu xmm1, XMMWORD PTR [rsi]
|
||||
movdqu xmm2, XMMWORD PTR [rsi+1]
|
||||
movdqa xmm3, xmm1
|
||||
|
||||
punpcklbw xmm1, xmm2
|
||||
punpckhbw xmm3, xmm2
|
||||
pmaddubsw xmm1, [rax]
|
||||
pmaddubsw xmm3, [rax]
|
||||
|
||||
paddw xmm1, [GLOBAL(xmm_bi_rd)]
|
||||
paddw xmm3, [GLOBAL(xmm_bi_rd)]
|
||||
psraw xmm1, xmm_filter_shift
|
||||
psraw xmm3, xmm_filter_shift
|
||||
packuswb xmm1, xmm3 ; xmm1 = horizontally filtered current row (bytes)
|
||||
|
||||
movdqa xmm2, xmm0 ; xmm2 = previous filtered row
|
||||
movdqa xmm0, xmm1 ; current row becomes "previous" for the next iteration
|
||||
movdqa xmm3, xmm2
|
||||
|
||||
punpcklbw xmm2, xmm1 ; interleave (previous, current) byte pairs
|
||||
punpckhbw xmm3, xmm1
|
||||
pmaddubsw xmm2, [rdx] ; vertical taps
|
||||
pmaddubsw xmm3, [rdx]
|
||||
|
||||
paddw xmm2, [GLOBAL(xmm_bi_rd)]
|
||||
paddw xmm3, [GLOBAL(xmm_bi_rd)]
|
||||
psraw xmm2, xmm_filter_shift
|
||||
psraw xmm3, xmm_filter_shift
|
||||
|
||||
movq xmm1, QWORD PTR [rdi] ; src pixels 0..7
|
||||
pxor xmm4, xmm4
|
||||
punpcklbw xmm1, xmm4
|
||||
movq xmm5, QWORD PTR [rdi+8] ; src pixels 8..15
|
||||
punpcklbw xmm5, xmm4
|
||||
|
||||
psubw xmm2, xmm1 ; diff = filtered ref - src
|
||||
psubw xmm3, xmm5
|
||||
paddw xmm6, xmm2
|
||||
paddw xmm6, xmm3
|
||||
pmaddwd xmm2, xmm2 ; diff*diff, adjacent pairs summed to 32 bits
|
||||
pmaddwd xmm3, xmm3
|
||||
paddd xmm7, xmm2
|
||||
paddd xmm7, xmm3
|
||||
|
||||
%if ABI_IS_32BIT
|
||||
add rsi, dword ptr arg(1) ;ref_pixels_per_line
|
||||
add rdi, dword ptr arg(3) ;src_pixels_per_line
|
||||
%else
|
||||
lea rsi, [rsi + r8]
|
||||
lea rdi, [rdi + r9]
|
||||
%endif
|
||||
|
||||
sub rcx, 1
|
||||
jnz .filter_block2d_bil_var_ssse3_loop
|
||||
|
||||
jmp .filter_block2d_bil_variance
|
||||
|
||||
.filter_block2d_bil_var_ssse3_sp_only:
|
||||
movsxd rdx, dword ptr arg(6) ; yoffset
|
||||
|
||||
cmp rdx, 0 ; Both xoffset =0 and yoffset=0
|
||||
je .filter_block2d_bil_var_ssse3_full_pixel
|
||||
|
||||
shl rdx, 4
|
||||
lea rdx, [rdx + rcx] ; VFilter
|
||||
|
||||
mov rsi, arg(0) ;ref_ptr
|
||||
mov rdi, arg(2) ;src_ptr
|
||||
movsxd rcx, dword ptr arg(4) ;Height (rcx is no longer needed as the table base)
|
||||
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
|
||||
|
||||
movdqu xmm1, XMMWORD PTR [rsi] ; first ref row, unfiltered
|
||||
movdqa xmm0, xmm1
|
||||
|
||||
%if ABI_IS_32BIT=0
|
||||
movsxd r9, dword ptr arg(3) ;src_pixels_per_line
|
||||
%endif
|
||||
|
||||
lea rsi, [rsi + rax]
|
||||
|
||||
.filter_block2d_bil_sp_only_loop:
|
||||
movdqu xmm3, XMMWORD PTR [rsi]
|
||||
movdqa xmm2, xmm1
|
||||
movdqa xmm0, xmm3 ; keep the current row for the next iteration
|
||||
|
||||
punpcklbw xmm1, xmm3 ; interleave (previous, current) byte pairs
|
||||
punpckhbw xmm2, xmm3
|
||||
pmaddubsw xmm1, [rdx]
|
||||
pmaddubsw xmm2, [rdx]
|
||||
|
||||
paddw xmm1, [GLOBAL(xmm_bi_rd)]
|
||||
paddw xmm2, [GLOBAL(xmm_bi_rd)]
|
||||
psraw xmm1, xmm_filter_shift
|
||||
psraw xmm2, xmm_filter_shift
|
||||
|
||||
movq xmm3, QWORD PTR [rdi]
|
||||
pxor xmm4, xmm4
|
||||
punpcklbw xmm3, xmm4
|
||||
movq xmm5, QWORD PTR [rdi+8]
|
||||
punpcklbw xmm5, xmm4
|
||||
|
||||
psubw xmm1, xmm3
|
||||
psubw xmm2, xmm5
|
||||
paddw xmm6, xmm1
|
||||
paddw xmm6, xmm2
|
||||
pmaddwd xmm1, xmm1
|
||||
pmaddwd xmm2, xmm2
|
||||
paddd xmm7, xmm1
|
||||
paddd xmm7, xmm2
|
||||
|
||||
movdqa xmm1, xmm0 ; current row becomes the previous row
|
||||
lea rsi, [rsi + rax] ;ref_pixels_per_line
|
||||
|
||||
%if ABI_IS_32BIT
|
||||
add rdi, dword ptr arg(3) ;src_pixels_per_line
|
||||
%else
|
||||
lea rdi, [rdi + r9]
|
||||
%endif
|
||||
|
||||
sub rcx, 1
|
||||
jnz .filter_block2d_bil_sp_only_loop
|
||||
|
||||
jmp .filter_block2d_bil_variance
|
||||
|
||||
.filter_block2d_bil_var_ssse3_full_pixel:
|
||||
mov rsi, arg(0) ;ref_ptr
|
||||
mov rdi, arg(2) ;src_ptr
|
||||
movsxd rcx, dword ptr arg(4) ;Height
|
||||
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
|
||||
movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
|
||||
pxor xmm0, xmm0 ; xmm0 = zero, used to widen bytes to words
|
||||
|
||||
.filter_block2d_bil_full_pixel_loop:
|
||||
movq xmm1, QWORD PTR [rsi]
|
||||
punpcklbw xmm1, xmm0
|
||||
movq xmm2, QWORD PTR [rsi+8]
|
||||
punpcklbw xmm2, xmm0
|
||||
|
||||
movq xmm3, QWORD PTR [rdi]
|
||||
punpcklbw xmm3, xmm0
|
||||
movq xmm4, QWORD PTR [rdi+8]
|
||||
punpcklbw xmm4, xmm0
|
||||
|
||||
psubw xmm1, xmm3 ; plain ref - src, no interpolation
|
||||
psubw xmm2, xmm4
|
||||
paddw xmm6, xmm1
|
||||
paddw xmm6, xmm2
|
||||
pmaddwd xmm1, xmm1
|
||||
pmaddwd xmm2, xmm2
|
||||
paddd xmm7, xmm1
|
||||
paddd xmm7, xmm2
|
||||
|
||||
lea rsi, [rsi + rax] ;ref_pixels_per_line
|
||||
lea rdi, [rdi + rdx] ;src_pixels_per_line
|
||||
sub rcx, 1
|
||||
jnz .filter_block2d_bil_full_pixel_loop
|
||||
|
||||
jmp .filter_block2d_bil_variance
|
||||
|
||||
.filter_block2d_bil_var_ssse3_fp_only:
|
||||
mov rsi, arg(0) ;ref_ptr
|
||||
mov rdi, arg(2) ;src_ptr
|
||||
movsxd rcx, dword ptr arg(4) ;Height
|
||||
movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line
|
||||
|
||||
pxor xmm0, xmm0
|
||||
|
||||
%if ABI_IS_32BIT=0
|
||||
movsxd r9, dword ptr arg(3) ;src_pixels_per_line
|
||||
%endif
|
||||
|
||||
.filter_block2d_bil_fp_only_loop:
|
||||
movdqu xmm1, XMMWORD PTR [rsi]
|
||||
movdqu xmm2, XMMWORD PTR [rsi+1]
|
||||
movdqa xmm3, xmm1
|
||||
|
||||
punpcklbw xmm1, xmm2
|
||||
punpckhbw xmm3, xmm2
|
||||
pmaddubsw xmm1, [rax] ; rax still holds HFilter from the dispatch code above
|
||||
pmaddubsw xmm3, [rax]
|
||||
|
||||
paddw xmm1, [GLOBAL(xmm_bi_rd)]
|
||||
paddw xmm3, [GLOBAL(xmm_bi_rd)]
|
||||
psraw xmm1, xmm_filter_shift
|
||||
psraw xmm3, xmm_filter_shift
|
||||
|
||||
movq xmm2, XMMWORD PTR [rdi] ; NOTE(review): sibling movq loads use QWORD PTR; presumably harmless if the size keywords expand to nothing — confirm
|
||||
pxor xmm4, xmm4
|
||||
punpcklbw xmm2, xmm4
|
||||
movq xmm5, QWORD PTR [rdi+8]
|
||||
punpcklbw xmm5, xmm4
|
||||
|
||||
psubw xmm1, xmm2
|
||||
psubw xmm3, xmm5
|
||||
paddw xmm6, xmm1
|
||||
paddw xmm6, xmm3
|
||||
pmaddwd xmm1, xmm1
|
||||
pmaddwd xmm3, xmm3
|
||||
paddd xmm7, xmm1
|
||||
paddd xmm7, xmm3
|
||||
|
||||
lea rsi, [rsi + rdx]
|
||||
%if ABI_IS_32BIT
|
||||
add rdi, dword ptr arg(3) ;src_pixels_per_line
|
||||
%else
|
||||
lea rdi, [rdi + r9]
|
||||
%endif
|
||||
|
||||
sub rcx, 1
|
||||
jnz .filter_block2d_bil_fp_only_loop
|
||||
|
||||
jmp .filter_block2d_bil_variance
|
||||
|
||||
.filter_block2d_bil_variance:
|
||||
pxor xmm0, xmm0
|
||||
pxor xmm1, xmm1
|
||||
pxor xmm5, xmm5
|
||||
|
||||
punpcklwd xmm0, xmm6 ; each 16-bit sum lands in the upper half of a dword
|
||||
punpckhwd xmm1, xmm6
|
||||
psrad xmm0, 16 ; sign-extend the sums to 32 bits
|
||||
psrad xmm1, 16
|
||||
paddd xmm0, xmm1
|
||||
movdqa xmm1, xmm0
|
||||
|
||||
movdqa xmm6, xmm7
|
||||
punpckldq xmm6, xmm5
|
||||
punpckhdq xmm7, xmm5
|
||||
paddd xmm6, xmm7
|
||||
|
||||
punpckldq xmm0, xmm5
|
||||
punpckhdq xmm1, xmm5
|
||||
paddd xmm0, xmm1
|
||||
|
||||
movdqa xmm7, xmm6
|
||||
movdqa xmm1, xmm0
|
||||
|
||||
psrldq xmm7, 8
|
||||
psrldq xmm1, 8
|
||||
|
||||
paddd xmm6, xmm7 ; low dword = total of squared differences
|
||||
paddd xmm0, xmm1 ; low dword = total of differences
|
||||
|
||||
mov rsi, arg(7) ;[Sum]
|
||||
mov rdi, arg(8) ;[SSE]
|
||||
|
||||
movd [rsi], xmm0
|
||||
movd [rdi], xmm6
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
pop rsi
|
||||
RESTORE_GOT
|
||||
RESTORE_XMM
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
|
||||
SECTION_RODATA
|
||||
align 16
|
||||
; Rounding term: eight words of 64, half of 128 (the sum of the two taps in every filter row below).
xmm_bi_rd:
|
||||
times 8 dw 64
|
||||
align 16
|
||||
; 16 filter rows of 16 bytes each: the (tap0, tap1) byte pair repeated 8 times,
; laid out for pmaddubsw against byte-interleaved pixel pairs. The two taps of
; every row sum to 128. The routine selects a row with (offset << 4); it branches
; around the filter when an offset is 0, so the 128 in row 0 is never used as a signed byte.
bilinear_filters_ssse3:
|
||||
times 8 db 128, 0
|
||||
times 8 db 120, 8
|
||||
times 8 db 112, 16
|
||||
times 8 db 104, 24
|
||||
times 8 db 96, 32
|
||||
times 8 db 88, 40
|
||||
times 8 db 80, 48
|
||||
times 8 db 72, 56
|
||||
times 8 db 64, 64
|
||||
times 8 db 56, 72
|
||||
times 8 db 48, 80
|
||||
times 8 db 40, 88
|
||||
times 8 db 32, 96
|
||||
times 8 db 24, 104
|
||||
times 8 db 16, 112
|
||||
times 8 db 8, 120
|
@ -13,27 +13,6 @@
|
||||
#include "vp9/common/vp9_pragmas.h"
|
||||
#include "vpx_ports/mem.h"
|
||||
|
||||
/* External filter routine; only the prototype appears here.
 * NOTE(review): no definition or call is visible in this part of the file. The
 * name suggests a horizontal 6-tap pass writing 16-bit output — confirm before relying on it. */
extern void filter_block1d_h6_mmx
|
||||
(
|
||||
const unsigned char *src_ptr,
|
||||
unsigned short *output_ptr,
|
||||
unsigned int src_pixels_per_line,
|
||||
unsigned int pixel_step,
|
||||
unsigned int output_height,
|
||||
unsigned int output_width,
|
||||
short *vp7_filter
|
||||
);
|
||||
/* External filter routine; only the prototype appears here.
 * NOTE(review): no definition or call is visible in this part of the file. The
 * name suggests a vertical 6-tap pass from 16-bit input to bytes — confirm before relying on it. */
extern void filter_block1d_v6_mmx
|
||||
(
|
||||
const short *src_ptr,
|
||||
unsigned char *output_ptr,
|
||||
unsigned int pixels_per_line,
|
||||
unsigned int pixel_step,
|
||||
unsigned int output_height,
|
||||
unsigned int output_width,
|
||||
short *vp7_filter
|
||||
);
|
||||
|
||||
extern unsigned int vp9_get_mb_ss_mmx(const short *src_ptr);
|
||||
extern unsigned int vp9_get8x8var_mmx
|
||||
(
|
||||
@ -53,30 +32,6 @@ extern unsigned int vp9_get4x4var_mmx
|
||||
unsigned int *SSE,
|
||||
int *Sum
|
||||
);
|
||||
/* Assembly routine: bilinear-filters a 4x4 block at ref_ptr with the tap sets
 * HFilter / VFilter and compares it with the block at src_ptr.
 * Writes the sum of (filtered ref - src) to *sum and the sum of the squared
 * differences to *sumsquared. */
extern void vp9_filter_block2d_bil4x4_var_mmx
|
||||
(
|
||||
const unsigned char *ref_ptr,
|
||||
int ref_pixels_per_line,
|
||||
const unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
const short *HFilter,
|
||||
const short *VFilter,
|
||||
int *sum,
|
||||
unsigned int *sumsquared
|
||||
);
|
||||
/* Assembly routine: same computation for a block 8 pixels wide and Height rows tall. */
extern void vp9_filter_block2d_bil_var_mmx
|
||||
(
|
||||
const unsigned char *ref_ptr,
|
||||
int ref_pixels_per_line,
|
||||
const unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
unsigned int Height,
|
||||
const short *HFilter,
|
||||
const short *VFilter,
|
||||
int *sum,
|
||||
unsigned int *sumsquared
|
||||
);
|
||||
|
||||
|
||||
unsigned int vp9_variance4x4_mmx(
|
||||
const unsigned char *src_ptr,
|
||||
@ -190,193 +145,3 @@ unsigned int vp9_variance8x16_mmx(
|
||||
return (var - (((unsigned int)avg * avg) >> 7));
|
||||
|
||||
}
|
||||
|
||||
DECLARE_ALIGNED(16, extern const short, vp9_bilinear_filters_mmx[16][8]);
|
||||
|
||||
unsigned int vp9_sub_pixel_variance4x4_mmx
|
||||
(
|
||||
const unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
int xoffset,
|
||||
int yoffset,
|
||||
const unsigned char *dst_ptr,
|
||||
int dst_pixels_per_line,
|
||||
unsigned int *sse)
|
||||
|
||||
{
|
||||
int xsum;
|
||||
unsigned int xxsum;
|
||||
vp9_filter_block2d_bil4x4_var_mmx(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line,
|
||||
vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
|
||||
&xsum, &xxsum
|
||||
);
|
||||
*sse = xxsum;
|
||||
return (xxsum - (((unsigned int)xsum * xsum) >> 4));
|
||||
}
|
||||
|
||||
|
||||
unsigned int vp9_sub_pixel_variance8x8_mmx
|
||||
(
|
||||
const unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
int xoffset,
|
||||
int yoffset,
|
||||
const unsigned char *dst_ptr,
|
||||
int dst_pixels_per_line,
|
||||
unsigned int *sse
|
||||
) {
|
||||
|
||||
int xsum;
|
||||
unsigned int xxsum;
|
||||
vp9_filter_block2d_bil_var_mmx(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line, 8,
|
||||
vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
|
||||
&xsum, &xxsum
|
||||
);
|
||||
*sse = xxsum;
|
||||
return (xxsum - (((unsigned int)xsum * xsum) >> 6));
|
||||
}
|
||||
|
||||
unsigned int vp9_sub_pixel_variance16x16_mmx
|
||||
(
|
||||
const unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
int xoffset,
|
||||
int yoffset,
|
||||
const unsigned char *dst_ptr,
|
||||
int dst_pixels_per_line,
|
||||
unsigned int *sse
|
||||
) {
|
||||
|
||||
int xsum0, xsum1;
|
||||
unsigned int xxsum0, xxsum1;
|
||||
|
||||
vp9_filter_block2d_bil_var_mmx(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line, 16,
|
||||
vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
|
||||
&xsum0, &xxsum0
|
||||
);
|
||||
|
||||
vp9_filter_block2d_bil_var_mmx(
|
||||
src_ptr + 8, src_pixels_per_line,
|
||||
dst_ptr + 8, dst_pixels_per_line, 16,
|
||||
vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
|
||||
&xsum1, &xxsum1
|
||||
);
|
||||
|
||||
xsum0 += xsum1;
|
||||
xxsum0 += xxsum1;
|
||||
|
||||
*sse = xxsum0;
|
||||
return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
|
||||
|
||||
|
||||
}
|
||||
|
||||
/*
 * Sub-pixel sum of squared errors for a 16x16 block (MMX).
 *
 * Delegates to vp9_sub_pixel_variance16x16_mmx() only for the value it
 * writes through sse; the variance it returns is discarded and the stored
 * sum of squared differences is returned instead.
 */
unsigned int vp9_sub_pixel_mse16x16_mmx(const unsigned char *src_ptr,
                                        int src_pixels_per_line,
                                        int xoffset,
                                        int yoffset,
                                        const unsigned char *dst_ptr,
                                        int dst_pixels_per_line,
                                        unsigned int *sse) {
  (void)vp9_sub_pixel_variance16x16_mmx(src_ptr, src_pixels_per_line,
                                        xoffset, yoffset,
                                        dst_ptr, dst_pixels_per_line,
                                        sse);
  return *sse;
}
|
||||
|
||||
unsigned int vp9_sub_pixel_variance16x8_mmx
|
||||
(
|
||||
const unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
int xoffset,
|
||||
int yoffset,
|
||||
const unsigned char *dst_ptr,
|
||||
int dst_pixels_per_line,
|
||||
unsigned int *sse
|
||||
) {
|
||||
int xsum0, xsum1;
|
||||
unsigned int xxsum0, xxsum1;
|
||||
|
||||
|
||||
vp9_filter_block2d_bil_var_mmx(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line, 8,
|
||||
vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
|
||||
&xsum0, &xxsum0
|
||||
);
|
||||
|
||||
|
||||
vp9_filter_block2d_bil_var_mmx(
|
||||
src_ptr + 8, src_pixels_per_line,
|
||||
dst_ptr + 8, dst_pixels_per_line, 8,
|
||||
vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
|
||||
&xsum1, &xxsum1
|
||||
);
|
||||
|
||||
xsum0 += xsum1;
|
||||
xxsum0 += xxsum1;
|
||||
|
||||
*sse = xxsum0;
|
||||
return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7));
|
||||
}
|
||||
|
||||
unsigned int vp9_sub_pixel_variance8x16_mmx
|
||||
(
|
||||
const unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
int xoffset,
|
||||
int yoffset,
|
||||
const unsigned char *dst_ptr,
|
||||
int dst_pixels_per_line,
|
||||
unsigned int *sse
|
||||
) {
|
||||
int xsum;
|
||||
unsigned int xxsum;
|
||||
vp9_filter_block2d_bil_var_mmx(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line, 16,
|
||||
vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
|
||||
&xsum, &xxsum
|
||||
);
|
||||
*sse = xxsum;
|
||||
return (xxsum - (((unsigned int)xsum * xsum) >> 7));
|
||||
}
|
||||
|
||||
|
||||
/*
 * 16x16 variance with a horizontal-only offset (MMX): forwards to
 * vp9_sub_pixel_variance16x16_mmx() with xoffset = 8 and yoffset = 0.
 */
unsigned int vp9_variance_halfpixvar16x16_h_mmx(const unsigned char *src_ptr,
                                                int source_stride,
                                                const unsigned char *ref_ptr,
                                                int recon_stride,
                                                unsigned int *sse) {
  const int x_off = 8;
  const int y_off = 0;

  return vp9_sub_pixel_variance16x16_mmx(src_ptr, source_stride,
                                         x_off, y_off,
                                         ref_ptr, recon_stride, sse);
}
|
||||
|
||||
|
||||
/*
 * 16x16 variance with a vertical-only offset (MMX): forwards to
 * vp9_sub_pixel_variance16x16_mmx() with xoffset = 0 and yoffset = 8.
 */
unsigned int vp9_variance_halfpixvar16x16_v_mmx(const unsigned char *src_ptr,
                                                int source_stride,
                                                const unsigned char *ref_ptr,
                                                int recon_stride,
                                                unsigned int *sse) {
  const int x_off = 0;
  const int y_off = 8;

  return vp9_sub_pixel_variance16x16_mmx(src_ptr, source_stride,
                                         x_off, y_off,
                                         ref_ptr, recon_stride, sse);
}
|
||||
|
||||
|
||||
/*
 * 16x16 variance with both a horizontal and a vertical offset (MMX):
 * forwards to vp9_sub_pixel_variance16x16_mmx() with xoffset = yoffset = 8.
 */
unsigned int vp9_variance_halfpixvar16x16_hv_mmx(const unsigned char *src_ptr,
                                                 int source_stride,
                                                 const unsigned char *ref_ptr,
                                                 int recon_stride,
                                                 unsigned int *sse) {
  const int x_off = 8;
  const int y_off = 8;

  return vp9_sub_pixel_variance16x16_mmx(src_ptr, source_stride,
                                         x_off, y_off,
                                         ref_ptr, recon_stride, sse);
}
|
||||
|
@ -9,29 +9,11 @@
|
||||
*/
|
||||
|
||||
#include "vpx_config.h"
|
||||
|
||||
#include "vp9/encoder/vp9_variance.h"
|
||||
#include "vp9/common/vp9_pragmas.h"
|
||||
#include "vpx_ports/mem.h"
|
||||
|
||||
#define HALFNDX 8
|
||||
|
||||
extern void filter_block1d_h6_mmx(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
|
||||
extern void filter_block1d_v6_mmx(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
|
||||
extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
|
||||
extern void filter_block1d8_v6_sse2(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
|
||||
|
||||
extern void vp9_filter_block2d_bil4x4_var_mmx
|
||||
(
|
||||
const unsigned char *ref_ptr,
|
||||
int ref_pixels_per_line,
|
||||
const unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
const short *HFilter,
|
||||
const short *VFilter,
|
||||
int *sum,
|
||||
unsigned int *sumsquared
|
||||
);
|
||||
|
||||
extern unsigned int vp9_get4x4var_mmx
|
||||
(
|
||||
const unsigned char *src_ptr,
|
||||
@ -64,18 +46,6 @@ unsigned int vp9_get8x8var_sse2
|
||||
unsigned int *SSE,
|
||||
int *Sum
|
||||
);
|
||||
void vp9_filter_block2d_bil_var_sse2
|
||||
(
|
||||
const unsigned char *ref_ptr,
|
||||
int ref_pixels_per_line,
|
||||
const unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
unsigned int Height,
|
||||
int xoffset,
|
||||
int yoffset,
|
||||
int *sum,
|
||||
unsigned int *sumsquared
|
||||
);
|
||||
void vp9_half_horiz_vert_variance8x_h_sse2
|
||||
(
|
||||
const unsigned char *ref_ptr,
|
||||
@ -137,8 +107,6 @@ void vp9_half_vert_variance16x_h_sse2
|
||||
unsigned int *sumsquared
|
||||
);
|
||||
|
||||
DECLARE_ALIGNED(16, extern const short, vp9_bilinear_filters_mmx[16][8]);
|
||||
|
||||
typedef unsigned int (*get_var_sse2) (
|
||||
const unsigned char *src_ptr,
|
||||
int source_stride,
|
||||
@ -375,347 +343,89 @@ unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr,
|
||||
return (var - (((int64_t)avg * avg) >> 11));
|
||||
}
|
||||
|
||||
unsigned int vp9_sub_pixel_variance4x4_wmt
|
||||
(
|
||||
const unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
int xoffset,
|
||||
int yoffset,
|
||||
const unsigned char *dst_ptr,
|
||||
int dst_pixels_per_line,
|
||||
unsigned int *sse
|
||||
) {
|
||||
int xsum;
|
||||
unsigned int xxsum;
|
||||
vp9_filter_block2d_bil4x4_var_mmx(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line,
|
||||
vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
|
||||
&xsum, &xxsum
|
||||
);
|
||||
*sse = xxsum;
|
||||
return (xxsum - (((unsigned int)xsum * xsum) >> 4));
|
||||
/*
 * Prototypes for the fixed-width, variable-height sub-pixel variance
 * kernels. Each kernel returns an int and writes a sum of squared
 * differences through `sse`. The 8- and 16-wide kernels carry the `opt1`
 * suffix; the 4-wide kernel carries the `opt2` suffix.
 */
#define DECLS(opt1, opt2) \
  int vp9_sub_pixel_variance4xh_##opt2(const uint8_t *src, \
                                       ptrdiff_t src_stride, \
                                       int x_offset, int y_offset, \
                                       const uint8_t *dst, \
                                       ptrdiff_t dst_stride, \
                                       int height, unsigned int *sse); \
  int vp9_sub_pixel_variance8xh_##opt1(const uint8_t *src, \
                                       ptrdiff_t src_stride, \
                                       int x_offset, int y_offset, \
                                       const uint8_t *dst, \
                                       ptrdiff_t dst_stride, \
                                       int height, unsigned int *sse); \
  int vp9_sub_pixel_variance16xh_##opt1(const uint8_t *src, \
                                        ptrdiff_t src_stride, \
                                        int x_offset, int y_offset, \
                                        const uint8_t *dst, \
                                        ptrdiff_t dst_stride, \
                                        int height, unsigned int *sse)

DECLS(sse2, sse);
DECLS(ssse3, ssse3);
#undef DECLS
|
||||
|
||||
/*
 * FN(w, h, wf, wlog2, hlog2, opt, cast) defines
 * vp9_sub_pixel_variance<w>x<h>_<opt>() on top of the fixed-width kernel
 * vp9_sub_pixel_variance<wf>xh_<opt>().
 *
 *   w, h         block width and height in pixels
 *   wf           width handled by one kernel call; w must be wf, 2 * wf
 *                or 4 * wf
 *   wlog2/hlog2  log2 of w and h; their sum is the shift that divides the
 *                squared sum by the pixel count
 *   opt          function-name suffix
 *   cast         optional cast applied to `se` before squaring, so the
 *                product can be widened for large blocks (may be empty)
 *
 * The kernel's return value is accumulated as the sum of differences and
 * its `sse` output as the sum of squared differences. The generated
 * function stores the total sse in *sse_ptr and returns
 * sse - se^2 / (w * h).
 *
 * Extra columns are addressed in multiples of wf rather than with the
 * literals 16/32/48, so the macro is also correct when wf is not 16.
 * For wf == 16 the offsets are unchanged.
 */
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
unsigned int vp9_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src, \
                                                     int src_stride, \
                                                     int x_offset, \
                                                     int y_offset, \
                                                     const uint8_t *dst, \
                                                     int dst_stride, \
                                                     unsigned int *sse_ptr) { \
  unsigned int sse; \
  int se = vp9_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \
                                                y_offset, dst, dst_stride, \
                                                h, &sse); \
  if (w > wf) { \
    unsigned int sse2; \
    int se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + (wf), src_stride, \
                                                   x_offset, y_offset, \
                                                   dst + (wf), dst_stride, \
                                                   h, &sse2); \
    se += se2; \
    sse += sse2; \
    if (w > wf * 2) { \
      se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 2 * (wf), src_stride, \
                                                 x_offset, y_offset, \
                                                 dst + 2 * (wf), dst_stride, \
                                                 h, &sse2); \
      se += se2; \
      sse += sse2; \
      se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 3 * (wf), src_stride, \
                                                 x_offset, y_offset, \
                                                 dst + 3 * (wf), dst_stride, \
                                                 h, &sse2); \
      se += se2; \
      sse += sse2; \
    } \
  } \
  *sse_ptr = sse; \
  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}
|
||||
|
||||
/*
 * Instantiates FN() for every supported block size.
 * Arguments per row: width, height, kernel width, log2(width),
 * log2(height), name suffix, cast applied before squaring the sum.
 * The 4-wide sizes use the `opt2` suffix, all others use `opt1`.
 * The three smallest 8-wide sizes, 16x8 and the 4-wide sizes pass an
 * empty cast.
 */
#define FNS(opt1, opt2) \
  FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
  FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
  FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
  FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
  FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
  FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
  FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
  FN(16,  8, 16, 4, 3, opt1,); \
  FN(8,  16,  8, 3, 4, opt1,); \
  FN(8,   8,  8, 3, 3, opt1,); \
  FN(8,   4,  8, 3, 2, opt1,); \
  FN(4,   8,  4, 2, 3, opt2,); \
  FN(4,   4,  4, 2, 2, opt2,)
|
||||
|
||||
unsigned int vp9_sub_pixel_variance8x8_wmt
|
||||
(
|
||||
const unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
int xoffset,
|
||||
int yoffset,
|
||||
const unsigned char *dst_ptr,
|
||||
int dst_pixels_per_line,
|
||||
unsigned int *sse
|
||||
) {
|
||||
int xsum;
|
||||
unsigned int xxsum;
|
||||
|
||||
if (xoffset == HALFNDX && yoffset == 0) {
|
||||
vp9_half_horiz_variance8x_h_sse2(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line, 8,
|
||||
&xsum, &xxsum);
|
||||
} else if (xoffset == 0 && yoffset == HALFNDX) {
|
||||
vp9_half_vert_variance8x_h_sse2(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line, 8,
|
||||
&xsum, &xxsum);
|
||||
} else if (xoffset == HALFNDX && yoffset == HALFNDX) {
|
||||
vp9_half_horiz_vert_variance8x_h_sse2(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line, 8,
|
||||
&xsum, &xxsum);
|
||||
} else {
|
||||
vp9_filter_block2d_bil_var_sse2(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line, 8,
|
||||
xoffset, yoffset,
|
||||
&xsum, &xxsum);
|
||||
}
|
||||
|
||||
*sse = xxsum;
|
||||
return (xxsum - (((unsigned int)xsum * xsum) >> 6));
|
||||
}
|
||||
|
||||
static void sub_pixel_variance16x16_sse2(const uint8_t *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
int xoffset,
|
||||
int yoffset,
|
||||
const uint8_t *dst_ptr,
|
||||
int dst_pixels_per_line,
|
||||
unsigned int *sse, int *avg) {
|
||||
int xsum0, xsum1;
|
||||
unsigned int xxsum0, xxsum1;
|
||||
|
||||
// note we could avoid these if statements if the calling function
|
||||
// just called the appropriate functions inside.
|
||||
if (xoffset == HALFNDX && yoffset == 0) {
|
||||
vp9_half_horiz_variance16x_h_sse2(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line, 16,
|
||||
&xsum0, &xxsum0);
|
||||
} else if (xoffset == 0 && yoffset == HALFNDX) {
|
||||
vp9_half_vert_variance16x_h_sse2(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line, 16,
|
||||
&xsum0, &xxsum0);
|
||||
} else if (xoffset == HALFNDX && yoffset == HALFNDX) {
|
||||
vp9_half_horiz_vert_variance16x_h_sse2(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line, 16,
|
||||
&xsum0, &xxsum0);
|
||||
} else {
|
||||
vp9_filter_block2d_bil_var_sse2(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line, 16,
|
||||
xoffset, yoffset,
|
||||
&xsum0, &xxsum0
|
||||
);
|
||||
|
||||
vp9_filter_block2d_bil_var_sse2(
|
||||
src_ptr + 8, src_pixels_per_line,
|
||||
dst_ptr + 8, dst_pixels_per_line, 16,
|
||||
xoffset, yoffset,
|
||||
&xsum1, &xxsum1
|
||||
);
|
||||
xsum0 += xsum1;
|
||||
xxsum0 += xxsum1;
|
||||
}
|
||||
|
||||
*sse = xxsum0;
|
||||
*avg = xsum0;
|
||||
}
|
||||
|
||||
/*
 * Sub-pixel variance of a 16x16 block (SSE2).
 *
 * Obtains the sum of differences and the sum of squared differences from
 * sub_pixel_variance16x16_sse2(), stores the latter in *sse_ptr and
 * returns sse - sum^2 / 256.
 */
unsigned int vp9_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr,
                                              int src_pixels_per_line,
                                              int xoffset,
                                              int yoffset,
                                              const uint8_t *dst_ptr,
                                              int dst_pixels_per_line,
                                              unsigned int *sse_ptr) {
  int diff_sum;
  unsigned int sq_sum;
  unsigned int mean_term;

  sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line,
                               xoffset, yoffset,
                               dst_ptr, dst_pixels_per_line,
                               &sq_sum, &diff_sum);
  *sse_ptr = sq_sum;

  /* 16 * 16 = 256 pixels, hence the shift by 8. */
  mean_term = ((unsigned int)diff_sum * diff_sum) >> 8;
  return sq_sum - mean_term;
}
|
||||
|
||||
/*
 * Sub-pixel variance of a 32x32 block (SSE2).
 *
 * The block is covered by four 16x16 tiles, visited row by row. Each tile
 * is handled by sub_pixel_variance16x16_sse2() and the per-tile sums are
 * accumulated. *sse_ptr receives the total sum of squared differences;
 * the return value is sse - sum^2 / 1024.
 *
 * The sum of differences over 1024 8-bit pixels can reach 255 * 1024 in
 * magnitude, so its square does not fit in 32 bits. The product is
 * therefore formed in 64 bits; the previous (unsigned int) cast wrapped
 * for strongly biased blocks and returned a wrong variance.
 */
unsigned int vp9_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr,
                                              int src_pixels_per_line,
                                              int xoffset,
                                              int yoffset,
                                              const uint8_t *dst_ptr,
                                              int dst_pixels_per_line,
                                              unsigned int *sse_ptr) {
  int total_sum = 0;
  unsigned int total_sse = 0;
  int row, col;

  for (row = 0; row < 32; row += 16) {
    for (col = 0; col < 32; col += 16) {
      int tile_sum;
      unsigned int tile_sse;

      sub_pixel_variance16x16_sse2(src_ptr + col, src_pixels_per_line,
                                   xoffset, yoffset,
                                   dst_ptr + col, dst_pixels_per_line,
                                   &tile_sse, &tile_sum);
      total_sum += tile_sum;
      total_sse += tile_sse;
    }
    /* Step both buffers down to the next row of tiles. */
    src_ptr += 16 * src_pixels_per_line;
    dst_ptr += 16 * dst_pixels_per_line;
  }

  *sse_ptr = total_sse;
  /* 32 * 32 = 1024 pixels, hence the shift by 10. */
  return (total_sse - (((int64_t)total_sum * total_sum) >> 10));
}
|
||||
|
||||
/*
 * Sub-pixel variance of a 64x64 block (SSE2).
 *
 * The block is covered by sixteen 16x16 tiles, visited row by row and,
 * within a row, left to right. Each tile is handled by
 * sub_pixel_variance16x16_sse2() and the per-tile sums are accumulated.
 * *sse_ptr receives the total sum of squared differences; the return
 * value is sse - sum^2 / 4096.
 *
 * The sum of differences over 4096 8-bit pixels can reach 255 * 4096 in
 * magnitude, so its square does not fit in 32 bits. The product is
 * therefore formed in 64 bits; the previous (unsigned int) cast wrapped
 * and returned a wrong variance.
 */
unsigned int vp9_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr,
                                              int src_pixels_per_line,
                                              int xoffset,
                                              int yoffset,
                                              const uint8_t *dst_ptr,
                                              int dst_pixels_per_line,
                                              unsigned int *sse_ptr) {
  int total_sum = 0;
  unsigned int total_sse = 0;
  int row, col;

  for (row = 0; row < 64; row += 16) {
    for (col = 0; col < 64; col += 16) {
      int tile_sum;
      unsigned int tile_sse;

      sub_pixel_variance16x16_sse2(src_ptr + col, src_pixels_per_line,
                                   xoffset, yoffset,
                                   dst_ptr + col, dst_pixels_per_line,
                                   &tile_sse, &tile_sum);
      total_sum += tile_sum;
      total_sse += tile_sse;
    }
    /* Step both buffers down to the next row of tiles. */
    src_ptr += 16 * src_pixels_per_line;
    dst_ptr += 16 * dst_pixels_per_line;
  }

  *sse_ptr = total_sse;
  /* 64 * 64 = 4096 pixels, hence the shift by 12. */
  return (total_sse - (((int64_t)total_sum * total_sum) >> 12));
}
|
||||
|
||||
/*
 * Sub-pixel sum of squared errors for a 16x16 block (SSE2).
 *
 * Calls vp9_sub_pixel_variance16x16_sse2() only for the value it writes
 * through sse; the variance it returns is discarded and the stored sum of
 * squared differences is returned.
 */
unsigned int vp9_sub_pixel_mse16x16_sse2(const unsigned char *src_ptr,
                                         int src_pixels_per_line,
                                         int xoffset,
                                         int yoffset,
                                         const unsigned char *dst_ptr,
                                         int dst_pixels_per_line,
                                         unsigned int *sse) {
  (void)vp9_sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line,
                                         xoffset, yoffset,
                                         dst_ptr, dst_pixels_per_line,
                                         sse);
  return *sse;
}
|
||||
|
||||
unsigned int vp9_sub_pixel_variance16x8_wmt
|
||||
(
|
||||
const unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
int xoffset,
|
||||
int yoffset,
|
||||
const unsigned char *dst_ptr,
|
||||
int dst_pixels_per_line,
|
||||
unsigned int *sse
|
||||
|
||||
) {
|
||||
int xsum0, xsum1;
|
||||
unsigned int xxsum0, xxsum1;
|
||||
|
||||
if (xoffset == HALFNDX && yoffset == 0) {
|
||||
vp9_half_horiz_variance16x_h_sse2(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line, 8,
|
||||
&xsum0, &xxsum0);
|
||||
} else if (xoffset == 0 && yoffset == HALFNDX) {
|
||||
vp9_half_vert_variance16x_h_sse2(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line, 8,
|
||||
&xsum0, &xxsum0);
|
||||
} else if (xoffset == HALFNDX && yoffset == HALFNDX) {
|
||||
vp9_half_horiz_vert_variance16x_h_sse2(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line, 8,
|
||||
&xsum0, &xxsum0);
|
||||
} else {
|
||||
vp9_filter_block2d_bil_var_sse2(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line, 8,
|
||||
xoffset, yoffset,
|
||||
&xsum0, &xxsum0);
|
||||
|
||||
vp9_filter_block2d_bil_var_sse2(
|
||||
src_ptr + 8, src_pixels_per_line,
|
||||
dst_ptr + 8, dst_pixels_per_line, 8,
|
||||
xoffset, yoffset,
|
||||
&xsum1, &xxsum1);
|
||||
xsum0 += xsum1;
|
||||
xxsum0 += xxsum1;
|
||||
}
|
||||
|
||||
*sse = xxsum0;
|
||||
return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7));
|
||||
}
|
||||
|
||||
unsigned int vp9_sub_pixel_variance8x16_wmt
|
||||
(
|
||||
const unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
int xoffset,
|
||||
int yoffset,
|
||||
const unsigned char *dst_ptr,
|
||||
int dst_pixels_per_line,
|
||||
unsigned int *sse
|
||||
) {
|
||||
int xsum;
|
||||
unsigned int xxsum;
|
||||
|
||||
if (xoffset == HALFNDX && yoffset == 0) {
|
||||
vp9_half_horiz_variance8x_h_sse2(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line, 16,
|
||||
&xsum, &xxsum);
|
||||
} else if (xoffset == 0 && yoffset == HALFNDX) {
|
||||
vp9_half_vert_variance8x_h_sse2(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line, 16,
|
||||
&xsum, &xxsum);
|
||||
} else if (xoffset == HALFNDX && yoffset == HALFNDX) {
|
||||
vp9_half_horiz_vert_variance8x_h_sse2(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line, 16,
|
||||
&xsum, &xxsum);
|
||||
} else {
|
||||
vp9_filter_block2d_bil_var_sse2(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line, 16,
|
||||
xoffset, yoffset,
|
||||
&xsum, &xxsum);
|
||||
}
|
||||
|
||||
*sse = xxsum;
|
||||
return (xxsum - (((unsigned int)xsum * xsum) >> 7));
|
||||
}
|
||||
FNS(sse2, sse);
|
||||
FNS(ssse3, ssse3);
|
||||
|
||||
#undef FNS
|
||||
#undef FN
|
||||
|
||||
unsigned int vp9_variance_halfpixvar16x16_h_wmt(
|
||||
const unsigned char *src_ptr,
|
||||
|
@ -1,142 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "vpx_config.h"
|
||||
#include "vp9/encoder/vp9_variance.h"
|
||||
#include "vp9/common/vp9_pragmas.h"
|
||||
#include "vpx_ports/mem.h"
|
||||
|
||||
#define HALFNDX 8
|
||||
|
||||
extern void vp9_half_horiz_vert_variance16x_h_sse2
|
||||
(
|
||||
const unsigned char *ref_ptr,
|
||||
int ref_pixels_per_line,
|
||||
const unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
unsigned int Height,
|
||||
int *sum,
|
||||
unsigned int *sumsquared
|
||||
);
|
||||
extern void vp9_half_horiz_variance16x_h_sse2
|
||||
(
|
||||
const unsigned char *ref_ptr,
|
||||
int ref_pixels_per_line,
|
||||
const unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
unsigned int Height,
|
||||
int *sum,
|
||||
unsigned int *sumsquared
|
||||
);
|
||||
extern void vp9_half_vert_variance16x_h_sse2
|
||||
(
|
||||
const unsigned char *ref_ptr,
|
||||
int ref_pixels_per_line,
|
||||
const unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
unsigned int Height,
|
||||
int *sum,
|
||||
unsigned int *sumsquared
|
||||
);
|
||||
extern void vp9_filter_block2d_bil_var_ssse3
|
||||
(
|
||||
const unsigned char *ref_ptr,
|
||||
int ref_pixels_per_line,
|
||||
const unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
unsigned int Height,
|
||||
int xoffset,
|
||||
int yoffset,
|
||||
int *sum,
|
||||
unsigned int *sumsquared
|
||||
);
|
||||
|
||||
unsigned int vp9_sub_pixel_variance16x16_ssse3
|
||||
(
|
||||
const unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
int xoffset,
|
||||
int yoffset,
|
||||
const unsigned char *dst_ptr,
|
||||
int dst_pixels_per_line,
|
||||
unsigned int *sse
|
||||
) {
|
||||
int xsum0;
|
||||
unsigned int xxsum0;
|
||||
|
||||
// note we could avoid these if statements if the calling function
|
||||
// just called the appropriate functions inside.
|
||||
if (xoffset == HALFNDX && yoffset == 0) {
|
||||
vp9_half_horiz_variance16x_h_sse2(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line, 16,
|
||||
&xsum0, &xxsum0);
|
||||
} else if (xoffset == 0 && yoffset == HALFNDX) {
|
||||
vp9_half_vert_variance16x_h_sse2(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line, 16,
|
||||
&xsum0, &xxsum0);
|
||||
} else if (xoffset == HALFNDX && yoffset == HALFNDX) {
|
||||
vp9_half_horiz_vert_variance16x_h_sse2(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line, 16,
|
||||
&xsum0, &xxsum0);
|
||||
} else {
|
||||
vp9_filter_block2d_bil_var_ssse3(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line, 16,
|
||||
xoffset, yoffset,
|
||||
&xsum0, &xxsum0);
|
||||
}
|
||||
|
||||
*sse = xxsum0;
|
||||
return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
|
||||
}
|
||||
|
||||
unsigned int vp9_sub_pixel_variance16x8_ssse3
|
||||
(
|
||||
const unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
int xoffset,
|
||||
int yoffset,
|
||||
const unsigned char *dst_ptr,
|
||||
int dst_pixels_per_line,
|
||||
unsigned int *sse
|
||||
|
||||
) {
|
||||
int xsum0;
|
||||
unsigned int xxsum0;
|
||||
|
||||
if (xoffset == HALFNDX && yoffset == 0) {
|
||||
vp9_half_horiz_variance16x_h_sse2(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line, 8,
|
||||
&xsum0, &xxsum0);
|
||||
} else if (xoffset == 0 && yoffset == HALFNDX) {
|
||||
vp9_half_vert_variance16x_h_sse2(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line, 8,
|
||||
&xsum0, &xxsum0);
|
||||
} else if (xoffset == HALFNDX && yoffset == HALFNDX) {
|
||||
vp9_half_horiz_vert_variance16x_h_sse2(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line, 8,
|
||||
&xsum0, &xxsum0);
|
||||
} else {
|
||||
vp9_filter_block2d_bil_var_ssse3(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line, 8,
|
||||
xoffset, yoffset,
|
||||
&xsum0, &xxsum0);
|
||||
}
|
||||
|
||||
*sse = xxsum0;
|
||||
return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7));
|
||||
}
|
@ -85,13 +85,12 @@ VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_impl_sse2.asm
|
||||
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad_sse2.asm
|
||||
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm
|
||||
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_fwalsh_sse2.asm
|
||||
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance.asm
|
||||
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm
|
||||
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance_impl_sse2.asm
|
||||
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
|
||||
VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/vp9_sad_sse3.asm
|
||||
VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_sad_ssse3.asm
|
||||
VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_variance_ssse3.c
|
||||
VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_variance_impl_ssse3.asm
|
||||
VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/vp9_sad_sse4.asm
|
||||
VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_encodeopt.asm
|
||||
VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/vp9_ssim_opt.asm
|
||||
|
Loading…
x
Reference in New Issue
Block a user