diff --git a/test/variance_test.cc b/test/variance_test.cc index dfa1a07c7..e7037d9d6 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -26,12 +26,55 @@ extern "C" { # include "vp9_rtcd.h" #endif } +#include "test/acm_random.h" namespace { using ::std::tr1::get; using ::std::tr1::make_tuple; using ::std::tr1::tuple; +using libvpx_test::ACMRandom; + +static unsigned int variance_ref(const uint8_t *ref, const uint8_t *src, + int l2w, int l2h, unsigned int *sse_ptr) { + int se = 0; + unsigned int sse = 0; + const int w = 1 << l2w, h = 1 << l2h; + for (int y = 0; y < h; y++) { + for (int x = 0; x < w; x++) { + int diff = ref[w * y + x] - src[w * y + x]; + se += diff; + sse += diff * diff; + } + } + *sse_ptr = sse; + return sse - (((int64_t) se * se) >> (l2w + l2h)); +} + +static unsigned int subpel_variance_ref(const uint8_t *ref, const uint8_t *src, + int l2w, int l2h, int xoff, int yoff, + unsigned int *sse_ptr) { + int se = 0; + unsigned int sse = 0; + const int w = 1 << l2w, h = 1 << l2h; + for (int y = 0; y < h; y++) { + for (int x = 0; x < w; x++) { + // bilinear interpolation at a 16th pel step + const int a1 = ref[(w + 1) * (y + 0) + x + 0]; + const int a2 = ref[(w + 1) * (y + 0) + x + 1]; + const int b1 = ref[(w + 1) * (y + 1) + x + 0]; + const int b2 = ref[(w + 1) * (y + 1) + x + 1]; + const int a = a1 + (((a2 - a1) * xoff + 8) >> 4); + const int b = b1 + (((b2 - b1) * xoff + 8) >> 4); + const int r = a + (((b - a) * yoff + 8) >> 4); + int diff = r - src[w * y + x]; + se += diff; + sse += diff * diff; + } + } + *sse_ptr = sse; + return sse - (((int64_t) se * se) >> (l2w + l2h)); +} template class VarianceTest : @@ -39,10 +82,13 @@ class VarianceTest : public: virtual void SetUp() { const tuple& params = this->GetParam(); - width_ = get<0>(params); - height_ = get<1>(params); + log2width_ = get<0>(params); + width_ = 1 << log2width_; + log2height_ = get<1>(params); + height_ = 1 << log2height_; variance_ = get<2>(params); + rnd(ACMRandom::DeterministicSeed()); block_size_ = width_ * height_; src_ = new uint8_t[block_size_]; ref_ = new uint8_t[block_size_]; @@ -58,15 +104,16 @@ class VarianceTest : protected: void ZeroTest(); + void RefTest(); void OneQuarterTest(); + ACMRandom rnd; uint8_t* src_; uint8_t* ref_; - int width_; - int height_; + int width_, log2width_; + int height_, log2height_; int block_size_; VarianceFunctionType variance_; - }; template @@ -82,6 +129,22 @@ void VarianceTest::ZeroTest() { } } +template +void VarianceTest::RefTest() { + for (int i = 0; i < 10; ++i) { + for (int j = 0; j < block_size_; j++) { + src_[j] = rnd.Rand8(); + ref_[j] = rnd.Rand8(); + } + unsigned int sse1, sse2; + const unsigned int var1 = variance_(src_, width_, ref_, width_, &sse1); + const unsigned int var2 = variance_ref(src_, ref_, log2width_, + log2height_, &sse2); + EXPECT_EQ(sse1, sse2); + EXPECT_EQ(var1, var2); + } +} + template void VarianceTest::OneQuarterTest() { memset(src_, 255, block_size_); @@ -94,6 +157,66 @@ void VarianceTest::OneQuarterTest() { EXPECT_EQ(expected, var); } +template +class SubpelVarianceTest : + public ::testing::TestWithParam > { + public: + virtual void SetUp() { + const tuple& params = + this->GetParam(); + log2width_ = get<0>(params); + width_ = 1 << log2width_; + log2height_ = get<1>(params); + height_ = 1 << log2height_; + subpel_variance_ = get<2>(params); + + rnd(ACMRandom::DeterministicSeed()); + block_size_ = width_ * height_; + src_ = new uint8_t[block_size_]; + ref_ = new uint8_t[block_size_ + width_ + height_ + 1]; + 
ASSERT_TRUE(src_ != NULL); + ASSERT_TRUE(ref_ != NULL); + } + + virtual void TearDown() { + delete[] src_; + delete[] ref_; + } + + protected: + void RefTest(); + + ACMRandom rnd; + uint8_t* src_; + uint8_t* ref_; + int width_, log2width_; + int height_, log2height_; + int block_size_; + SubpelVarianceFunctionType subpel_variance_; +}; + +template +void SubpelVarianceTest::RefTest() { + for (int x = 0; x < 16; ++x) { + for (int y = 0; y < 16; ++y) { + for (int j = 0; j < block_size_; j++) { + src_[j] = rnd.Rand8(); + } + for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) { + ref_[j] = rnd.Rand8(); + } + unsigned int sse1, sse2; + const unsigned int var1 = subpel_variance_(ref_, width_ + 1, x, y, + src_, width_, &sse1); + const unsigned int var2 = subpel_variance_ref(ref_, src_, log2width_, + log2height_, x, y, &sse2); + EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y; + EXPECT_EQ(var1, var2) << "at position " << x << ", " << y; + } + } +} + // ----------------------------------------------------------------------------- // VP8 test cases. @@ -103,6 +226,7 @@ namespace vp8 { typedef VarianceTest VP8VarianceTest; TEST_P(VP8VarianceTest, Zero) { ZeroTest(); } +TEST_P(VP8VarianceTest, Ref) { RefTest(); } TEST_P(VP8VarianceTest, OneQuarter) { OneQuarterTest(); } const vp8_variance_fn_t variance4x4_c = vp8_variance4x4_c; @@ -112,11 +236,11 @@ const vp8_variance_fn_t variance16x8_c = vp8_variance16x8_c; const vp8_variance_fn_t variance16x16_c = vp8_variance16x16_c; INSTANTIATE_TEST_CASE_P( C, VP8VarianceTest, - ::testing::Values(make_tuple(4, 4, variance4x4_c), - make_tuple(8, 8, variance8x8_c), - make_tuple(8, 16, variance8x16_c), - make_tuple(16, 8, variance16x8_c), - make_tuple(16, 16, variance16x16_c))); + ::testing::Values(make_tuple(2, 2, variance4x4_c), + make_tuple(3, 3, variance8x8_c), + make_tuple(3, 4, variance8x16_c), + make_tuple(4, 3, variance16x8_c), + make_tuple(4, 4, variance16x16_c))); #if HAVE_MMX const vp8_variance_fn_t variance4x4_mmx = vp8_variance4x4_mmx; @@ -126,11 +250,11 @@ const vp8_variance_fn_t variance16x8_mmx = vp8_variance16x8_mmx; const vp8_variance_fn_t variance16x16_mmx = vp8_variance16x16_mmx; INSTANTIATE_TEST_CASE_P( MMX, VP8VarianceTest, - ::testing::Values(make_tuple(4, 4, variance4x4_mmx), - make_tuple(8, 8, variance8x8_mmx), - make_tuple(8, 16, variance8x16_mmx), - make_tuple(16, 8, variance16x8_mmx), - make_tuple(16, 16, variance16x16_mmx))); + ::testing::Values(make_tuple(2, 2, variance4x4_mmx), + make_tuple(3, 3, variance8x8_mmx), + make_tuple(3, 4, variance8x16_mmx), + make_tuple(4, 3, variance16x8_mmx), + make_tuple(4, 4, variance16x16_mmx))); #endif #if HAVE_SSE2 @@ -141,11 +265,11 @@ const vp8_variance_fn_t variance16x8_wmt = vp8_variance16x8_wmt; const vp8_variance_fn_t variance16x16_wmt = vp8_variance16x16_wmt; INSTANTIATE_TEST_CASE_P( SSE2, VP8VarianceTest, - ::testing::Values(make_tuple(4, 4, variance4x4_wmt), - make_tuple(8, 8, variance8x8_wmt), - make_tuple(8, 16, variance8x16_wmt), - make_tuple(16, 8, variance16x8_wmt), - make_tuple(16, 16, variance16x16_wmt))); + ::testing::Values(make_tuple(2, 2, variance4x4_wmt), + make_tuple(3, 3, variance8x8_wmt), + make_tuple(3, 4, variance8x16_wmt), + make_tuple(4, 3, variance16x8_wmt), + make_tuple(4, 4, variance16x16_wmt))); #endif #endif // CONFIG_VP8_ENCODER @@ -158,22 +282,83 @@ namespace vp9 { #if CONFIG_VP9_ENCODER typedef VarianceTest VP9VarianceTest; +typedef SubpelVarianceTest VP9SubpelVarianceTest; TEST_P(VP9VarianceTest, Zero) { ZeroTest(); } +TEST_P(VP9VarianceTest, 
Ref) { RefTest(); } +TEST_P(VP9SubpelVarianceTest, Ref) { RefTest(); } TEST_P(VP9VarianceTest, OneQuarter) { OneQuarterTest(); } const vp9_variance_fn_t variance4x4_c = vp9_variance4x4_c; +const vp9_variance_fn_t variance4x8_c = vp9_variance4x8_c; +const vp9_variance_fn_t variance8x4_c = vp9_variance8x4_c; const vp9_variance_fn_t variance8x8_c = vp9_variance8x8_c; const vp9_variance_fn_t variance8x16_c = vp9_variance8x16_c; const vp9_variance_fn_t variance16x8_c = vp9_variance16x8_c; const vp9_variance_fn_t variance16x16_c = vp9_variance16x16_c; +const vp9_variance_fn_t variance16x32_c = vp9_variance16x32_c; +const vp9_variance_fn_t variance32x16_c = vp9_variance32x16_c; +const vp9_variance_fn_t variance32x32_c = vp9_variance32x32_c; +const vp9_variance_fn_t variance32x64_c = vp9_variance32x64_c; +const vp9_variance_fn_t variance64x32_c = vp9_variance64x32_c; +const vp9_variance_fn_t variance64x64_c = vp9_variance64x64_c; INSTANTIATE_TEST_CASE_P( C, VP9VarianceTest, - ::testing::Values(make_tuple(4, 4, variance4x4_c), - make_tuple(8, 8, variance8x8_c), - make_tuple(8, 16, variance8x16_c), - make_tuple(16, 8, variance16x8_c), - make_tuple(16, 16, variance16x16_c))); + ::testing::Values(make_tuple(2, 2, variance4x4_c), + make_tuple(2, 3, variance4x8_c), + make_tuple(3, 2, variance8x4_c), + make_tuple(3, 3, variance8x8_c), + make_tuple(3, 4, variance8x16_c), + make_tuple(4, 3, variance16x8_c), + make_tuple(4, 4, variance16x16_c), + make_tuple(4, 5, variance16x32_c), + make_tuple(5, 4, variance32x16_c), + make_tuple(5, 5, variance32x32_c), + make_tuple(5, 6, variance32x64_c), + make_tuple(6, 5, variance64x32_c), + make_tuple(6, 6, variance64x64_c))); + +const vp9_subpixvariance_fn_t subpel_variance4x4_c = + vp9_sub_pixel_variance4x4_c; +const vp9_subpixvariance_fn_t subpel_variance4x8_c = + vp9_sub_pixel_variance4x8_c; +const vp9_subpixvariance_fn_t subpel_variance8x4_c = + vp9_sub_pixel_variance8x4_c; +const vp9_subpixvariance_fn_t subpel_variance8x8_c = + vp9_sub_pixel_variance8x8_c; +const vp9_subpixvariance_fn_t subpel_variance8x16_c = + vp9_sub_pixel_variance8x16_c; +const vp9_subpixvariance_fn_t subpel_variance16x8_c = + vp9_sub_pixel_variance16x8_c; +const vp9_subpixvariance_fn_t subpel_variance16x16_c = + vp9_sub_pixel_variance16x16_c; +const vp9_subpixvariance_fn_t subpel_variance16x32_c = + vp9_sub_pixel_variance16x32_c; +const vp9_subpixvariance_fn_t subpel_variance32x16_c = + vp9_sub_pixel_variance32x16_c; +const vp9_subpixvariance_fn_t subpel_variance32x32_c = + vp9_sub_pixel_variance32x32_c; +const vp9_subpixvariance_fn_t subpel_variance32x64_c = + vp9_sub_pixel_variance32x64_c; +const vp9_subpixvariance_fn_t subpel_variance64x32_c = + vp9_sub_pixel_variance64x32_c; +const vp9_subpixvariance_fn_t subpel_variance64x64_c = + vp9_sub_pixel_variance64x64_c; +INSTANTIATE_TEST_CASE_P( + C, VP9SubpelVarianceTest, + ::testing::Values(make_tuple(2, 2, subpel_variance4x4_c), + make_tuple(2, 3, subpel_variance4x8_c), + make_tuple(3, 2, subpel_variance8x4_c), + make_tuple(3, 3, subpel_variance8x8_c), + make_tuple(3, 4, subpel_variance8x16_c), + make_tuple(4, 3, subpel_variance16x8_c), + make_tuple(4, 4, subpel_variance16x16_c), + make_tuple(4, 5, subpel_variance16x32_c), + make_tuple(5, 4, subpel_variance32x16_c), + make_tuple(5, 5, subpel_variance32x32_c), + make_tuple(5, 6, subpel_variance32x64_c), + make_tuple(6, 5, subpel_variance64x32_c), + make_tuple(6, 6, subpel_variance64x64_c))); #if HAVE_MMX const vp9_variance_fn_t variance4x4_mmx = vp9_variance4x4_mmx; @@ -183,26 +368,128 @@ const 
vp9_variance_fn_t variance16x8_mmx = vp9_variance16x8_mmx; const vp9_variance_fn_t variance16x16_mmx = vp9_variance16x16_mmx; INSTANTIATE_TEST_CASE_P( MMX, VP9VarianceTest, - ::testing::Values(make_tuple(4, 4, variance4x4_mmx), - make_tuple(8, 8, variance8x8_mmx), - make_tuple(8, 16, variance8x16_mmx), - make_tuple(16, 8, variance16x8_mmx), - make_tuple(16, 16, variance16x16_mmx))); + ::testing::Values(make_tuple(2, 2, variance4x4_mmx), + make_tuple(3, 3, variance8x8_mmx), + make_tuple(3, 4, variance8x16_mmx), + make_tuple(4, 3, variance16x8_mmx), + make_tuple(4, 4, variance16x16_mmx))); #endif #if HAVE_SSE2 -const vp9_variance_fn_t variance4x4_wmt = vp9_variance4x4_sse2; -const vp9_variance_fn_t variance8x8_wmt = vp9_variance8x8_sse2; -const vp9_variance_fn_t variance8x16_wmt = vp9_variance8x16_sse2; -const vp9_variance_fn_t variance16x8_wmt = vp9_variance16x8_sse2; -const vp9_variance_fn_t variance16x16_wmt = vp9_variance16x16_sse2; +const vp9_variance_fn_t variance4x4_sse2 = vp9_variance4x4_sse2; +const vp9_variance_fn_t variance4x8_sse2 = vp9_variance4x8_sse2; +const vp9_variance_fn_t variance8x4_sse2 = vp9_variance8x4_sse2; +const vp9_variance_fn_t variance8x8_sse2 = vp9_variance8x8_sse2; +const vp9_variance_fn_t variance8x16_sse2 = vp9_variance8x16_sse2; +const vp9_variance_fn_t variance16x8_sse2 = vp9_variance16x8_sse2; +const vp9_variance_fn_t variance16x16_sse2 = vp9_variance16x16_sse2; +const vp9_variance_fn_t variance16x32_sse2 = vp9_variance16x32_sse2; +const vp9_variance_fn_t variance32x16_sse2 = vp9_variance32x16_sse2; +const vp9_variance_fn_t variance32x32_sse2 = vp9_variance32x32_sse2; +const vp9_variance_fn_t variance32x64_sse2 = vp9_variance32x64_sse2; +const vp9_variance_fn_t variance64x32_sse2 = vp9_variance64x32_sse2; +const vp9_variance_fn_t variance64x64_sse2 = vp9_variance64x64_sse2; INSTANTIATE_TEST_CASE_P( SSE2, VP9VarianceTest, - ::testing::Values(make_tuple(4, 4, variance4x4_wmt), - make_tuple(8, 8, variance8x8_wmt), - make_tuple(8, 16, variance8x16_wmt), - make_tuple(16, 8, variance16x8_wmt), - make_tuple(16, 16, variance16x16_wmt))); + ::testing::Values(make_tuple(2, 2, variance4x4_sse2), + make_tuple(2, 3, variance4x8_sse2), + make_tuple(3, 2, variance8x4_sse2), + make_tuple(3, 3, variance8x8_sse2), + make_tuple(3, 4, variance8x16_sse2), + make_tuple(4, 3, variance16x8_sse2), + make_tuple(4, 4, variance16x16_sse2), + make_tuple(4, 5, variance16x32_sse2), + make_tuple(5, 4, variance32x16_sse2), + make_tuple(5, 5, variance32x32_sse2), + make_tuple(5, 6, variance32x64_sse2), + make_tuple(6, 5, variance64x32_sse2), + make_tuple(6, 6, variance64x64_sse2))); + +const vp9_subpixvariance_fn_t subpel_variance4x4_sse = + vp9_sub_pixel_variance4x4_sse; +const vp9_subpixvariance_fn_t subpel_variance4x8_sse = + vp9_sub_pixel_variance4x8_sse; +const vp9_subpixvariance_fn_t subpel_variance8x4_sse2 = + vp9_sub_pixel_variance8x4_sse2; +const vp9_subpixvariance_fn_t subpel_variance8x8_sse2 = + vp9_sub_pixel_variance8x8_sse2; +const vp9_subpixvariance_fn_t subpel_variance8x16_sse2 = + vp9_sub_pixel_variance8x16_sse2; +const vp9_subpixvariance_fn_t subpel_variance16x8_sse2 = + vp9_sub_pixel_variance16x8_sse2; +const vp9_subpixvariance_fn_t subpel_variance16x16_sse2 = + vp9_sub_pixel_variance16x16_sse2; +const vp9_subpixvariance_fn_t subpel_variance16x32_sse2 = + vp9_sub_pixel_variance16x32_sse2; +const vp9_subpixvariance_fn_t subpel_variance32x16_sse2 = + vp9_sub_pixel_variance32x16_sse2; +const vp9_subpixvariance_fn_t subpel_variance32x32_sse2 = + 
vp9_sub_pixel_variance32x32_sse2; +const vp9_subpixvariance_fn_t subpel_variance32x64_sse2 = + vp9_sub_pixel_variance32x64_sse2; +const vp9_subpixvariance_fn_t subpel_variance64x32_sse2 = + vp9_sub_pixel_variance64x32_sse2; +const vp9_subpixvariance_fn_t subpel_variance64x64_sse2 = + vp9_sub_pixel_variance64x64_sse2; +INSTANTIATE_TEST_CASE_P( + SSE2, VP9SubpelVarianceTest, + ::testing::Values(make_tuple(2, 2, subpel_variance4x4_sse), + make_tuple(2, 3, subpel_variance4x8_sse), + make_tuple(3, 2, subpel_variance8x4_sse2), + make_tuple(3, 3, subpel_variance8x8_sse2), + make_tuple(3, 4, subpel_variance8x16_sse2), + make_tuple(4, 3, subpel_variance16x8_sse2), + make_tuple(4, 4, subpel_variance16x16_sse2), + make_tuple(4, 5, subpel_variance16x32_sse2), + make_tuple(5, 4, subpel_variance32x16_sse2), + make_tuple(5, 5, subpel_variance32x32_sse2), + make_tuple(5, 6, subpel_variance32x64_sse2), + make_tuple(6, 5, subpel_variance64x32_sse2), + make_tuple(6, 6, subpel_variance64x64_sse2))); +#endif + +#if HAVE_SSSE3 +const vp9_subpixvariance_fn_t subpel_variance4x4_ssse3 = + vp9_sub_pixel_variance4x4_ssse3; +const vp9_subpixvariance_fn_t subpel_variance4x8_ssse3 = + vp9_sub_pixel_variance4x8_ssse3; +const vp9_subpixvariance_fn_t subpel_variance8x4_ssse3 = + vp9_sub_pixel_variance8x4_ssse3; +const vp9_subpixvariance_fn_t subpel_variance8x8_ssse3 = + vp9_sub_pixel_variance8x8_ssse3; +const vp9_subpixvariance_fn_t subpel_variance8x16_ssse3 = + vp9_sub_pixel_variance8x16_ssse3; +const vp9_subpixvariance_fn_t subpel_variance16x8_ssse3 = + vp9_sub_pixel_variance16x8_ssse3; +const vp9_subpixvariance_fn_t subpel_variance16x16_ssse3 = + vp9_sub_pixel_variance16x16_ssse3; +const vp9_subpixvariance_fn_t subpel_variance16x32_ssse3 = + vp9_sub_pixel_variance16x32_ssse3; +const vp9_subpixvariance_fn_t subpel_variance32x16_ssse3 = + vp9_sub_pixel_variance32x16_ssse3; +const vp9_subpixvariance_fn_t subpel_variance32x32_ssse3 = + vp9_sub_pixel_variance32x32_ssse3; +const vp9_subpixvariance_fn_t subpel_variance32x64_ssse3 = + vp9_sub_pixel_variance32x64_ssse3; +const vp9_subpixvariance_fn_t subpel_variance64x32_ssse3 = + vp9_sub_pixel_variance64x32_ssse3; +const vp9_subpixvariance_fn_t subpel_variance64x64_ssse3 = + vp9_sub_pixel_variance64x64_ssse3; +INSTANTIATE_TEST_CASE_P( + SSSE3, VP9SubpelVarianceTest, + ::testing::Values(make_tuple(2, 2, subpel_variance4x4_ssse3), + make_tuple(2, 3, subpel_variance4x8_ssse3), + make_tuple(3, 2, subpel_variance8x4_ssse3), + make_tuple(3, 3, subpel_variance8x8_ssse3), + make_tuple(3, 4, subpel_variance8x16_ssse3), + make_tuple(4, 3, subpel_variance16x8_ssse3), + make_tuple(4, 4, subpel_variance16x16_ssse3), + make_tuple(4, 5, subpel_variance16x32_ssse3), + make_tuple(5, 4, subpel_variance32x16_ssse3), + make_tuple(5, 5, subpel_variance32x32_ssse3), + make_tuple(5, 6, subpel_variance32x64_ssse3), + make_tuple(6, 5, subpel_variance64x32_ssse3), + make_tuple(6, 6, subpel_variance64x64_ssse3))); #endif #endif // CONFIG_VP9_ENCODER diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index a405aab8d..575b619a1 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -266,85 +266,81 @@ prototype unsigned int vp9_variance4x4 "const uint8_t *src_ptr, int source_strid specialize vp9_variance4x4 mmx sse2 prototype unsigned int vp9_sub_pixel_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance64x64 sse2 +specialize 
vp9_sub_pixel_variance64x64 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" specialize vp9_sub_pixel_avg_variance64x64 prototype unsigned int vp9_sub_pixel_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance32x64 +specialize vp9_sub_pixel_variance32x64 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" specialize vp9_sub_pixel_avg_variance32x64 prototype unsigned int vp9_sub_pixel_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance64x32 +specialize vp9_sub_pixel_variance64x32 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" specialize vp9_sub_pixel_avg_variance64x32 prototype unsigned int vp9_sub_pixel_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance32x16 +specialize vp9_sub_pixel_variance32x16 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" specialize vp9_sub_pixel_avg_variance32x16 prototype unsigned int vp9_sub_pixel_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance16x32 +specialize vp9_sub_pixel_variance16x32 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" specialize vp9_sub_pixel_avg_variance16x32 prototype unsigned int vp9_sub_pixel_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance32x32 sse2 +specialize vp9_sub_pixel_variance32x32 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" specialize vp9_sub_pixel_avg_variance32x32 prototype unsigned int vp9_sub_pixel_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance16x16 sse2 mmx ssse3 +specialize vp9_sub_pixel_variance16x16 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" specialize vp9_sub_pixel_avg_variance16x16 prototype unsigned int vp9_sub_pixel_variance8x16 "const uint8_t *src_ptr, int 
source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance8x16 sse2 mmx -vp9_sub_pixel_variance8x16_sse2=vp9_sub_pixel_variance8x16_wmt +specialize vp9_sub_pixel_variance8x16 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" specialize vp9_sub_pixel_avg_variance8x16 prototype unsigned int vp9_sub_pixel_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance16x8 sse2 mmx ssse3 -vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_ssse3; -vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_wmt +specialize vp9_sub_pixel_variance16x8 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" specialize vp9_sub_pixel_avg_variance16x8 prototype unsigned int vp9_sub_pixel_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance8x8 sse2 mmx -vp9_sub_pixel_variance8x8_sse2=vp9_sub_pixel_variance8x8_wmt +specialize vp9_sub_pixel_variance8x8 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" specialize vp9_sub_pixel_avg_variance8x8 # TODO(jingning): need to convert 8x4/4x8 functions into mmx/sse form prototype unsigned int vp9_sub_pixel_variance8x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance8x4 +specialize vp9_sub_pixel_variance8x4 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance8x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" specialize vp9_sub_pixel_avg_variance8x4 prototype unsigned int vp9_sub_pixel_variance4x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance4x8 +specialize vp9_sub_pixel_variance4x8 sse ssse3 prototype unsigned int vp9_sub_pixel_avg_variance4x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" specialize vp9_sub_pixel_avg_variance4x8 prototype unsigned int vp9_sub_pixel_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance4x4 sse2 mmx -vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt +specialize vp9_sub_pixel_variance4x4 sse ssse3 +#vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt prototype unsigned int vp9_sub_pixel_avg_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" specialize vp9_sub_pixel_avg_variance4x4 @@ -390,15 +386,15 @@ prototype 
unsigned int vp9_sad4x4 "const uint8_t *src_ptr, int source_stride, co specialize vp9_sad4x4 mmx sse prototype unsigned int vp9_variance_halfpixvar16x16_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance_halfpixvar16x16_h mmx sse2 +specialize vp9_variance_halfpixvar16x16_h sse2 vp9_variance_halfpixvar16x16_h_sse2=vp9_variance_halfpixvar16x16_h_wmt prototype unsigned int vp9_variance_halfpixvar16x16_v "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance_halfpixvar16x16_v mmx sse2 +specialize vp9_variance_halfpixvar16x16_v sse2 vp9_variance_halfpixvar16x16_v_sse2=vp9_variance_halfpixvar16x16_v_wmt prototype unsigned int vp9_variance_halfpixvar16x16_hv "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance_halfpixvar16x16_hv mmx sse2 +specialize vp9_variance_halfpixvar16x16_hv sse2 vp9_variance_halfpixvar16x16_hv_sse2=vp9_variance_halfpixvar16x16_hv_wmt prototype unsigned int vp9_variance_halfpixvar64x64_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" @@ -507,8 +503,8 @@ specialize vp9_sad4x8x4d sse prototype void vp9_sad4x4x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array" specialize vp9_sad4x4x4d sse -prototype unsigned int vp9_sub_pixel_mse16x16 "const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse" -specialize vp9_sub_pixel_mse16x16 sse2 mmx +#prototype unsigned int vp9_sub_pixel_mse16x16 "const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse" +#specialize vp9_sub_pixel_mse16x16 sse2 mmx prototype unsigned int vp9_mse16x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse" specialize vp9_mse16x16 mmx sse2 diff --git a/vp9/encoder/x86/vp9_subpel_variance.asm b/vp9/encoder/x86/vp9_subpel_variance.asm new file mode 100644 index 000000000..35014cec8 --- /dev/null +++ b/vp9/encoder/x86/vp9_subpel_variance.asm @@ -0,0 +1,1061 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pw_8: times 8 dw 8 +bilin_filter_m_sse2: times 8 dw 16 + times 8 dw 0 + times 8 dw 15 + times 8 dw 1 + times 8 dw 14 + times 8 dw 2 + times 8 dw 13 + times 8 dw 3 + times 8 dw 12 + times 8 dw 4 + times 8 dw 11 + times 8 dw 5 + times 8 dw 10 + times 8 dw 6 + times 8 dw 9 + times 8 dw 7 + times 16 dw 8 + times 8 dw 7 + times 8 dw 9 + times 8 dw 6 + times 8 dw 10 + times 8 dw 5 + times 8 dw 11 + times 8 dw 4 + times 8 dw 12 + times 8 dw 3 + times 8 dw 13 + times 8 dw 2 + times 8 dw 14 + times 8 dw 1 + times 8 dw 15 + +bilin_filter_m_ssse3: times 8 db 16, 0 + times 8 db 15, 1 + times 8 db 14, 2 + times 8 db 13, 3 + times 8 db 12, 4 + times 8 db 11, 5 + times 8 db 10, 6 + times 8 db 9, 7 + times 16 db 8 + times 8 db 7, 9 + times 8 db 6, 10 + times 8 db 5, 11 + times 8 db 4, 12 + times 8 db 3, 13 + times 8 db 2, 14 + times 8 db 1, 15 + +; int vp9_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride, +; int x_offset, int y_offset, +; const uint8_t *dst, ptrdiff_t dst_stride, +; int height, unsigned int *sse); +; +; This function returns the SE and stores SSE in the given pointer. + +%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse + psubw %3, %4 + psubw %1, %2 + paddw %5, %3 + pmaddwd %3, %3 + paddw %5, %1 + pmaddwd %1, %1 + paddd %6, %3 + paddd %6, %1 +%endmacro + +%macro STORE_AND_RET 0 +%if mmsize == 16 + ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit + ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg. + ; We have to sign-extend it before adding the words within the register + ; and outputing to a dword. + pcmpgtw m5, m6 ; mask for 0 > x + movhlps m3, m7 + punpcklwd m4, m6, m5 + punpckhwd m6, m5 ; sign-extend m6 word->dword + paddd m7, m3 + paddd m6, m4 + pshufd m3, m7, 0x1 + movhlps m4, m6 + paddd m7, m3 + paddd m6, m4 + mov r1, ssem ; r1 = unsigned int *sse + pshufd m4, m6, 0x1 + movd [r1], m7 ; store sse + paddd m6, m4 + movd rax, m6 ; store sum as return value +%else ; mmsize == 8 + pshufw m4, m6, 0xe + pshufw m3, m7, 0xe + paddw m6, m4 + paddd m7, m3 + pcmpgtw m5, m6 ; mask for 0 > x + mov r1, ssem ; r1 = unsigned int *sse + punpcklwd m6, m5 ; sign-extend m6 word->dword + movd [r1], m7 ; store sse + pshufw m4, m6, 0xe + paddd m6, m4 + movd rax, m6 ; store sum as return value +%endif + RET +%endmacro + +%macro SUBPEL_VARIANCE 1 ; W +%if cpuflag(ssse3) +%define bilin_filter_m bilin_filter_m_ssse3 +%define filter_idx_shift 4 +%else +%define bilin_filter_m bilin_filter_m_sse2 +%define filter_idx_shift 5 +%endif +; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses +; 11, not 13, if the registers are ordered correctly. May make a minor speed +; difference on Win64 +%ifdef PIC +cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, y_offset, \ + dst, dst_stride, height, sse +%define bilin_filter sseq +%else +cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \ + dst, dst_stride, height, sse +%define bilin_filter bilin_filter_m +%endif + ASSERT %1 <= 16 ; m6 overflows if w > 16 + pxor m6, m6 ; sum + pxor m7, m7 ; sse + ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we + ; could perhaps use it for something more productive then + pxor m5, m5 ; dedicated zero register +%if %1 < 16 + sar heightd, 1 +%endif + + ; FIXME(rbultje) replace by jumptable? 
+ test x_offsetd, x_offsetd + jnz .x_nonzero + ; x_offset == 0 + test y_offsetd, y_offsetd + jnz .x_zero_y_nonzero + + ; x_offset == 0 && y_offset == 0 +.x_zero_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + mova m1, [dstq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + punpckhbw m3, m1, m5 + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq + dec heightd +%else ; %1 < 16 + movh m0, [srcq] + movh m2, [srcq+src_strideq] + movh m1, [dstq] + movh m3, [dstq+dst_strideq] + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m3, m5 + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] + dec heightd +%endif + jg .x_zero_y_zero_loop + STORE_AND_RET + +.x_zero_y_nonzero: + cmp y_offsetd, 8 + jne .x_zero_y_nonhalf + + ; x_offset == 0 && y_offset == 0.5 +.x_zero_y_half_loop: +%if %1 == 16 + movu m0, [srcq] + movu m4, [srcq+src_strideq] + mova m1, [dstq] + pavgb m0, m4 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + punpckhbw m3, m1, m5 + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq + dec heightd +%else ; %1 < 16 + movh m0, [srcq] + movh m2, [srcq+src_strideq] + movh m4, [srcq+src_strideq*2] + movh m1, [dstq] + pavgb m0, m2 + movh m3, [dstq+dst_strideq] + pavgb m2, m4 + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m3, m5 + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] + dec heightd +%endif + jg .x_zero_y_half_loop + STORE_AND_RET + +.x_zero_y_nonhalf: + ; x_offset == 0 && y_offset == bilin interpolation +%ifdef PIC + lea bilin_filter, [bilin_filter_m] +%endif + shl y_offsetd, filter_idx_shift +%if ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+y_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+y_offsetq+16] +%endif + mova m10, [pw_8] +%define filter_y_a m8 +%define filter_y_b m9 +%define filter_rnd m10 +%else ; x86-32 or mmx + add y_offsetq, bilin_filter +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [pw_8] +%endif +.x_zero_y_other_loop: +%if %1 == 16 + movu m0, [srcq] + movu m4, [srcq+src_strideq] + mova m1, [dstq] +%if cpuflag(ssse3) + punpckhbw m2, m0, m4 + punpcklbw m0, m4 + pmaddubsw m2, filter_y_a + pmaddubsw m0, filter_y_a + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m4, m5 + punpcklbw m0, m5 + punpcklbw m4, m5 + ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can + ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of + ; instructions is the same (5), but it is 1 mul instead of 2, so might be + ; slightly faster because of pmullw latency. It would also cut our rodata + ; tables in half for this function, and save 1-2 registers on x86-64. 
+ pmullw m2, filter_y_a + pmullw m3, filter_y_b + paddw m2, filter_rnd + pmullw m0, filter_y_a + pmullw m4, filter_y_b + paddw m0, filter_rnd + paddw m2, m3 + paddw m0, m4 +%endif + psraw m2, 4 + psraw m0, 4 + punpckhbw m3, m1, m5 + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq + dec heightd +%else ; %1 < 16 + movh m0, [srcq] + movh m2, [srcq+src_strideq] + movh m4, [srcq+src_strideq*2] + movh m3, [dstq+dst_strideq] +%if cpuflag(ssse3) + movh m1, [dstq] + punpcklbw m0, m2 + punpcklbw m2, m4 + pmaddubsw m0, filter_y_a + pmaddubsw m2, filter_y_a + punpcklbw m3, m5 + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m4, m5 + pmullw m0, filter_y_a + pmullw m1, m2, filter_y_b + punpcklbw m3, m5 + paddw m0, filter_rnd + pmullw m2, filter_y_a + pmullw m4, filter_y_b + paddw m0, m1 + paddw m2, filter_rnd + movh m1, [dstq] + paddw m2, m4 +%endif + psraw m0, 4 + psraw m2, 4 + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] + dec heightd +%endif + jg .x_zero_y_other_loop +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd + STORE_AND_RET + +.x_nonzero: + cmp x_offsetd, 8 + jne .x_nonhalf + ; x_offset == 0.5 + test y_offsetd, y_offsetd + jnz .x_half_y_nonzero + + ; x_offset == 0.5 && y_offset == 0 +.x_half_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + movu m4, [srcq+1] + mova m1, [dstq] + pavgb m0, m4 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + punpckhbw m3, m1, m5 + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq + dec heightd +%else ; %1 < 16 + movh m0, [srcq] + movh m4, [srcq+1] + movh m2, [srcq+src_strideq] + movh m1, [dstq] + pavgb m0, m4 + movh m4, [srcq+src_strideq+1] + movh m3, [dstq+dst_strideq] + pavgb m2, m4 + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m3, m5 + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] + dec heightd +%endif + jg .x_half_y_zero_loop + STORE_AND_RET + +.x_half_y_nonzero: + cmp y_offsetd, 8 + jne .x_half_y_nonhalf + + ; x_offset == 0.5 && y_offset == 0.5 +%if %1 == 16 + movu m0, [srcq] + movu m3, [srcq+1] + add srcq, src_strideq + pavgb m0, m3 +.x_half_y_half_loop: + movu m4, [srcq] + movu m3, [srcq+1] + mova m1, [dstq] + pavgb m4, m3 + pavgb m0, m4 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + punpckhbw m3, m1, m5 + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + add srcq, src_strideq + add dstq, dst_strideq + dec heightd +%else ; %1 < 16 + movh m0, [srcq] + movh m3, [srcq+1] + add srcq, src_strideq + pavgb m0, m3 +.x_half_y_half_loop: + movh m2, [srcq] + movh m3, [srcq+1] + movh m4, [srcq+src_strideq] + movh m1, [srcq+src_strideq+1] + pavgb m2, m3 + pavgb m4, m1 + pavgb m0, m2 + pavgb m2, m4 + movh m1, [dstq] + movh m3, [dstq+dst_strideq] + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m3, m5 + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] + dec heightd +%endif + jg .x_half_y_half_loop + STORE_AND_RET + +.x_half_y_nonhalf: + ; x_offset == 0.5 && y_offset == bilin interpolation +%ifdef PIC + lea bilin_filter, [bilin_filter_m] +%endif + shl y_offsetd, filter_idx_shift +%if ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+y_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+y_offsetq+16] +%endif + mova 
m10, [pw_8] +%define filter_y_a m8 +%define filter_y_b m9 +%define filter_rnd m10 +%else + add y_offsetq, bilin_filter +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [pw_8] +%endif +%if %1 == 16 + movu m0, [srcq] + movu m3, [srcq+1] + add srcq, src_strideq + pavgb m0, m3 +.x_half_y_other_loop: + movu m4, [srcq] + movu m2, [srcq+1] + mova m1, [dstq] + pavgb m4, m2 +%if cpuflag(ssse3) + punpckhbw m2, m0, m4 + punpcklbw m0, m4 + pmaddubsw m2, filter_y_a + pmaddubsw m0, filter_y_a + paddw m2, filter_rnd + paddw m0, filter_rnd + psraw m2, 4 +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m4, m5 + pmullw m2, filter_y_a + pmullw m3, filter_y_b + paddw m2, filter_rnd + punpcklbw m0, m5 + paddw m2, m3 + punpcklbw m3, m4, m5 + pmullw m0, filter_y_a + pmullw m3, filter_y_b + paddw m0, filter_rnd + psraw m2, 4 + paddw m0, m3 +%endif + punpckhbw m3, m1, m5 + psraw m0, 4 + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + add srcq, src_strideq + add dstq, dst_strideq + dec heightd +%else ; %1 < 16 + movh m0, [srcq] + movh m3, [srcq+1] + add srcq, src_strideq + pavgb m0, m3 +%if notcpuflag(ssse3) + punpcklbw m0, m5 +%endif +.x_half_y_other_loop: + movh m2, [srcq] + movh m1, [srcq+1] + movh m4, [srcq+src_strideq] + movh m3, [srcq+src_strideq+1] + pavgb m2, m1 + pavgb m4, m3 + movh m3, [dstq+dst_strideq] +%if cpuflag(ssse3) + movh m1, [dstq] + punpcklbw m0, m2 + punpcklbw m2, m4 + pmaddubsw m0, filter_y_a + pmaddubsw m2, filter_y_a + punpcklbw m3, m5 + paddw m0, filter_rnd + paddw m2, filter_rnd +%else + punpcklbw m2, m5 + punpcklbw m4, m5 + pmullw m0, filter_y_a + pmullw m1, m2, filter_y_b + punpcklbw m3, m5 + paddw m0, filter_rnd + pmullw m2, filter_y_a + paddw m0, m1 + pmullw m1, m4, filter_y_b + paddw m2, filter_rnd + paddw m2, m1 + movh m1, [dstq] +%endif + psraw m0, 4 + psraw m2, 4 + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] + dec heightd +%endif + jg .x_half_y_other_loop +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd + STORE_AND_RET + +.x_nonhalf: + test y_offsetd, y_offsetd + jnz .x_nonhalf_y_nonzero + + ; x_offset == bilin interpolation && y_offset == 0 +%ifdef PIC + lea bilin_filter, [bilin_filter_m] +%endif + shl x_offsetd, filter_idx_shift +%if ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+x_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+x_offsetq+16] +%endif + mova m10, [pw_8] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_rnd m10 +%else + add x_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_rnd [pw_8] +%endif +.x_other_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + movu m4, [srcq+1] + mova m1, [dstq] +%if cpuflag(ssse3) + punpckhbw m2, m0, m4 + punpcklbw m0, m4 + pmaddubsw m2, filter_x_a + pmaddubsw m0, filter_x_a + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m4, m5 + punpcklbw m0, m5 + punpcklbw m4, m5 + pmullw m2, filter_x_a + pmullw m3, filter_x_b + paddw m2, filter_rnd + pmullw m0, filter_x_a + pmullw m4, filter_x_b + paddw m0, filter_rnd + paddw m2, m3 + paddw m0, m4 +%endif + psraw m2, 4 + psraw m0, 4 + punpckhbw m3, m1, m5 + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq + dec heightd +%else ; %1 < 16 + movh m0, [srcq] + movh m1, [srcq+1] + movh m2, [srcq+src_strideq] + movh m4, 
[srcq+src_strideq+1] + movh m3, [dstq+dst_strideq] +%if cpuflag(ssse3) + punpcklbw m0, m1 + movh m1, [dstq] + punpcklbw m2, m4 + pmaddubsw m0, filter_x_a + pmaddubsw m2, filter_x_a + punpcklbw m3, m5 + paddw m0, filter_rnd + paddw m2, filter_rnd +%else + punpcklbw m0, m5 + punpcklbw m1, m5 + punpcklbw m2, m5 + punpcklbw m4, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + punpcklbw m3, m5 + paddw m0, filter_rnd + pmullw m2, filter_x_a + pmullw m4, filter_x_b + paddw m0, m1 + paddw m2, filter_rnd + movh m1, [dstq] + paddw m2, m4 +%endif + psraw m0, 4 + psraw m2, 4 + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] + dec heightd +%endif + jg .x_other_y_zero_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_rnd + STORE_AND_RET + +.x_nonhalf_y_nonzero: + cmp y_offsetd, 8 + jne .x_nonhalf_y_nonhalf + + ; x_offset == bilin interpolation && y_offset == 0.5 +%ifdef PIC + lea bilin_filter, [bilin_filter_m] +%endif + shl x_offsetd, filter_idx_shift +%if ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+x_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+x_offsetq+16] +%endif + mova m10, [pw_8] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_rnd m10 +%else + add x_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_rnd [pw_8] +%endif +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+1] +%if cpuflag(ssse3) + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + pmaddubsw m2, filter_x_a + pmaddubsw m0, filter_x_a + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m1, m5 + punpcklbw m0, m5 + punpcklbw m1, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + paddw m0, filter_rnd + pmullw m2, filter_x_a + pmullw m3, filter_x_b + paddw m2, filter_rnd + paddw m0, m1 + paddw m2, m3 +%endif + psraw m0, 4 + psraw m2, 4 + add srcq, src_strideq + packuswb m0, m2 +.x_other_y_half_loop: + movu m4, [srcq] + movu m3, [srcq+1] +%if cpuflag(ssse3) + mova m1, [dstq] + punpckhbw m2, m4, m3 + punpcklbw m4, m3 + pmaddubsw m2, filter_x_a + pmaddubsw m4, filter_x_a + paddw m2, filter_rnd + paddw m4, filter_rnd + psraw m2, 4 + psraw m4, 4 + packuswb m4, m2 + pavgb m0, m4 + punpckhbw m3, m1, m5 + punpcklbw m1, m5 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else + punpckhbw m2, m4, m5 + punpckhbw m1, m3, m5 + punpcklbw m4, m5 + punpcklbw m3, m5 + pmullw m4, filter_x_a + pmullw m3, filter_x_b + paddw m4, filter_rnd + pmullw m2, filter_x_a + pmullw m1, filter_x_b + paddw m2, filter_rnd + paddw m4, m3 + paddw m2, m1 + mova m1, [dstq] + psraw m4, 4 + psraw m2, 4 + punpckhbw m3, m1, m5 + ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we + ; have a 1-register shortage to be able to store the backup of the bilin + ; filtered second line as words as cache for the next line. Packing into + ; a byte costs 1 pack and 2 unpacks, but saves a register. 
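The register-saving trick described in the FIXME above — caching the previous filtered row as packed bytes instead of as two word registers — looks like this in SSE2 intrinsics (a sketch for illustration, not code from the patch); it is lossless here because the filtered values after the +8, >>4 rounding still fit in [0, 255]:

/* Cache one filtered 16-pixel row in a single register as bytes (1 pack),
 * re-expanding to words when the vertical pass needs it (2 unpacks). */
#include <emmintrin.h>

static void cache_row_as_bytes(__m128i row_lo_w, __m128i row_hi_w,
                               __m128i *prev8) {
  *prev8 = _mm_packus_epi16(row_lo_w, row_hi_w);   /* 2x8 words -> 16 bytes */
}

static void reload_row_as_words(__m128i prev8,
                                __m128i *row_lo_w, __m128i *row_hi_w) {
  const __m128i zero = _mm_setzero_si128();
  *row_lo_w = _mm_unpacklo_epi8(prev8, zero);      /* low 8 bytes -> words */
  *row_hi_w = _mm_unpackhi_epi8(prev8, zero);      /* high 8 bytes -> words */
}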
+ packuswb m4, m2 + punpcklbw m1, m5 + pavgb m0, m4 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + add srcq, src_strideq + add dstq, dst_strideq + dec heightd +%else ; %1 < 16 + movh m0, [srcq] + movh m1, [srcq+1] +%if cpuflag(ssse3) + punpcklbw m0, m1 + pmaddubsw m0, filter_x_a + paddw m0, filter_rnd +%else + punpcklbw m0, m5 + punpcklbw m1, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + paddw m0, filter_rnd + paddw m0, m1 +%endif + add srcq, src_strideq + psraw m0, 4 +.x_other_y_half_loop: + movh m2, [srcq] + movh m1, [srcq+1] + movh m4, [srcq+src_strideq] + movh m3, [srcq+src_strideq+1] +%if cpuflag(ssse3) + punpcklbw m2, m1 + punpcklbw m4, m3 + pmaddubsw m2, filter_x_a + pmaddubsw m4, filter_x_a + movh m1, [dstq] + movh m3, [dstq+dst_strideq] + paddw m2, filter_rnd + paddw m4, filter_rnd +%else + punpcklbw m2, m5 + punpcklbw m1, m5 + punpcklbw m4, m5 + punpcklbw m3, m5 + pmullw m2, filter_x_a + pmullw m1, filter_x_b + paddw m2, filter_rnd + pmullw m4, filter_x_a + pmullw m3, filter_x_b + paddw m4, filter_rnd + paddw m2, m1 + movh m1, [dstq] + paddw m4, m3 + movh m3, [dstq+dst_strideq] +%endif + psraw m2, 4 + psraw m4, 4 + pavgw m0, m2 + pavgw m2, m4 + punpcklbw m3, m5 + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] + dec heightd +%endif + jg .x_other_y_half_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_rnd + STORE_AND_RET + +.x_nonhalf_y_nonhalf: +%ifdef PIC + lea bilin_filter, [bilin_filter_m] +%endif + shl x_offsetd, filter_idx_shift + shl y_offsetd, filter_idx_shift +%if ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+x_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+x_offsetq+16] +%endif + mova m10, [bilin_filter+y_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m11, [bilin_filter+y_offsetq+16] +%endif + mova m12, [pw_8] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_y_a m10 +%define filter_y_b m11 +%define filter_rnd m12 +%else + add x_offsetq, bilin_filter + add y_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [pw_8] +%endif + ; x_offset == bilin interpolation && y_offset == bilin interpolation +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+1] +%if cpuflag(ssse3) + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + pmaddubsw m2, filter_x_a + pmaddubsw m0, filter_x_a + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m1, m5 + punpcklbw m0, m5 + punpcklbw m1, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + paddw m0, filter_rnd + pmullw m2, filter_x_a + pmullw m3, filter_x_b + paddw m2, filter_rnd + paddw m0, m1 + paddw m2, m3 +%endif + psraw m0, 4 + psraw m2, 4 + add srcq, src_strideq + packuswb m0, m2 +.x_other_y_other_loop: +%if cpuflag(ssse3) + movu m4, [srcq] + movu m3, [srcq+1] + mova m1, [dstq] + punpckhbw m2, m4, m3 + punpcklbw m4, m3 + pmaddubsw m2, filter_x_a + pmaddubsw m4, filter_x_a + punpckhbw m3, m1, m5 + paddw m2, filter_rnd + paddw m4, filter_rnd + psraw m2, 4 + psraw m4, 4 + packuswb m4, m2 + punpckhbw m2, m0, m4 + punpcklbw m0, m4 + pmaddubsw m2, filter_y_a + pmaddubsw m0, filter_y_a + punpcklbw m1, m5 + paddw m2, filter_rnd + paddw m0, filter_rnd + psraw m2, 4 + psraw m0, 4 +%else + movu m3, [srcq] + movu m4, [srcq+1] + 
punpckhbw m1, m3, m5 + punpckhbw m2, m4, m5 + punpcklbw m3, m5 + punpcklbw m4, m5 + pmullw m3, filter_x_a + pmullw m4, filter_x_b + paddw m3, filter_rnd + pmullw m1, filter_x_a + pmullw m2, filter_x_b + paddw m1, filter_rnd + paddw m3, m4 + paddw m1, m2 + psraw m3, 4 + psraw m1, 4 + packuswb m4, m3, m1 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + pmullw m2, filter_y_a + pmullw m1, filter_y_b + paddw m2, filter_rnd + pmullw m0, filter_y_a + pmullw m3, filter_y_b + paddw m2, m1 + mova m1, [dstq] + paddw m0, filter_rnd + psraw m2, 4 + paddw m0, m3 + punpckhbw m3, m1, m5 + psraw m0, 4 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + add srcq, src_strideq + add dstq, dst_strideq + dec heightd +%else ; %1 < 16 + movh m0, [srcq] + movh m1, [srcq+1] +%if cpuflag(ssse3) + punpcklbw m0, m1 + pmaddubsw m0, filter_x_a + paddw m0, filter_rnd +%else + punpcklbw m0, m5 + punpcklbw m1, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + paddw m0, filter_rnd + paddw m0, m1 +%endif + psraw m0, 4 +%if cpuflag(ssse3) + packuswb m0, m0 +%endif + add srcq, src_strideq +.x_other_y_other_loop: + movh m2, [srcq] + movh m1, [srcq+1] + movh m4, [srcq+src_strideq] + movh m3, [srcq+src_strideq+1] +%if cpuflag(ssse3) + punpcklbw m2, m1 + punpcklbw m4, m3 + pmaddubsw m2, filter_x_a + pmaddubsw m4, filter_x_a + movh m3, [dstq+dst_strideq] + movh m1, [dstq] + paddw m2, filter_rnd + paddw m4, filter_rnd + psraw m2, 4 + psraw m4, 4 + packuswb m2, m2 + packuswb m4, m4 + punpcklbw m0, m2 + punpcklbw m2, m4 + pmaddubsw m0, filter_y_a + pmaddubsw m2, filter_y_a + punpcklbw m3, m5 + paddw m0, filter_rnd + paddw m2, filter_rnd + psraw m0, 4 + psraw m2, 4 + punpcklbw m1, m5 +%else + punpcklbw m2, m5 + punpcklbw m1, m5 + punpcklbw m4, m5 + punpcklbw m3, m5 + pmullw m2, filter_x_a + pmullw m1, filter_x_b + paddw m2, filter_rnd + pmullw m4, filter_x_a + pmullw m3, filter_x_b + paddw m4, filter_rnd + paddw m2, m1 + paddw m4, m3 + psraw m2, 4 + psraw m4, 4 + pmullw m0, filter_y_a + pmullw m3, m2, filter_y_b + paddw m0, filter_rnd + pmullw m2, filter_y_a + pmullw m1, m4, filter_y_b + paddw m2, filter_rnd + paddw m0, m3 + movh m3, [dstq+dst_strideq] + paddw m2, m1 + movh m1, [dstq] + psraw m0, 4 + psraw m2, 4 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] + dec heightd +%endif + jg .x_other_y_other_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd + STORE_AND_RET +%endmacro + +; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical +; between the ssse3 and non-ssse3 version. It may make sense to merge their +; code in the sense that the ssse3 version would jump to the appropriate +; location in the sse/2 version, rather than duplicating that code in the +; binary. + +INIT_MMX sse +SUBPEL_VARIANCE 4 +INIT_XMM sse2 +SUBPEL_VARIANCE 8 +SUBPEL_VARIANCE 16 + +INIT_MMX ssse3 +SUBPEL_VARIANCE 4 +INIT_XMM ssse3 +SUBPEL_VARIANCE 8 +SUBPEL_VARIANCE 16 diff --git a/vp9/encoder/x86/vp9_subpel_variance_impl_sse2.asm b/vp9/encoder/x86/vp9_subpel_variance_impl_sse2.asm index 8a2a471f5..2ecc23e55 100644 --- a/vp9/encoder/x86/vp9_subpel_variance_impl_sse2.asm +++ b/vp9/encoder/x86/vp9_subpel_variance_impl_sse2.asm @@ -8,292 +8,8 @@ ; be found in the AUTHORS file in the root of the source tree. 
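Stepping back from the per-width kernels instantiated above (SUBPEL_VARIANCE 4/8/16 for sse/sse2 and ssse3): each of them returns the signed sum (SE) and stores SSE, so a small C wrapper per block size is still needed to turn that pair into the variance the rtcd prototypes expose. The sketch below shows the shape of such a wrapper with hypothetical helper and wrapper names (the real ones live in the encoder's variance sources); the SSE - SE^2/(W*H) step is exactly what variance_ref and subpel_variance_ref in the new tests compute.

/* Hypothetical wrapper: per-width, variable-height kernel -> 16x16 entry
 * point.  Names are illustrative, not copied from the patch. */
#include <stddef.h>
#include <stdint.h>

/* the kind of helper SUBPEL_VARIANCE generates: returns SE, stores SSE */
int vp9_sub_pixel_variance16xh_sse2(const uint8_t *src, ptrdiff_t src_stride,
                                    int x_offset, int y_offset,
                                    const uint8_t *dst, ptrdiff_t dst_stride,
                                    int height, unsigned int *sse);

unsigned int vp9_sub_pixel_variance16x16_sse2(const uint8_t *src, int src_stride,
                                              int x_offset, int y_offset,
                                              const uint8_t *dst, int dst_stride,
                                              unsigned int *sse) {
  const int se = vp9_sub_pixel_variance16xh_sse2(src, src_stride,
                                                 x_offset, y_offset,
                                                 dst, dst_stride, 16, sse);
  /* variance = SSE - SE^2 / (W * H); for 16x16 the divisor is 2^8 */
  return *sse - (unsigned int)(((int64_t)se * se) >> 8);
}

VP9SubpelVarianceTest above then sweeps every (x, y) offset in the 16x16 grid against subpel_variance_ref, so wrappers of this shape are checked bit-exactly.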
; - %include "vpx_ports/x86_abi_support.asm" -%define xmm_filter_shift 7 - -;void vp9_filter_block2d_bil_var_sse2 -;( -; unsigned char *ref_ptr, -; int ref_pixels_per_line, -; unsigned char *src_ptr, -; int src_pixels_per_line, -; unsigned int Height, -; int xoffset, -; int yoffset, -; int *sum, -; unsigned int *sumsquared;; -; -;) -global sym(vp9_filter_block2d_bil_var_sse2) PRIVATE -sym(vp9_filter_block2d_bil_var_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 9 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - push rbx - ; end prolog - - pxor xmm6, xmm6 ; - pxor xmm7, xmm7 ; - - lea rsi, [GLOBAL(xmm_bi_rd)] ; rounding - movdqa xmm4, XMMWORD PTR [rsi] - - lea rcx, [GLOBAL(bilinear_filters_sse2)] - movsxd rax, dword ptr arg(5) ; xoffset - - cmp rax, 0 ; skip first_pass filter if xoffset=0 - je filter_block2d_bil_var_sse2_sp_only - - shl rax, 5 ; point to filter coeff with xoffset - lea rax, [rax + rcx] ; HFilter - - movsxd rdx, dword ptr arg(6) ; yoffset - - cmp rdx, 0 ; skip second_pass filter if yoffset=0 - je filter_block2d_bil_var_sse2_fp_only - - shl rdx, 5 - lea rdx, [rdx + rcx] ; VFilter - - mov rsi, arg(0) ;ref_ptr - mov rdi, arg(2) ;src_ptr - movsxd rcx, dword ptr arg(4) ;Height - - pxor xmm0, xmm0 ; - movq xmm1, QWORD PTR [rsi] ; - movq xmm3, QWORD PTR [rsi+1] ; - - punpcklbw xmm1, xmm0 ; - pmullw xmm1, [rax] ; - punpcklbw xmm3, xmm0 - pmullw xmm3, [rax+16] ; - - paddw xmm1, xmm3 ; - paddw xmm1, xmm4 ; - psraw xmm1, xmm_filter_shift ; - movdqa xmm5, xmm1 - - movsxd rbx, dword ptr arg(1) ;ref_pixels_per_line - lea rsi, [rsi + rbx] -%if ABI_IS_32BIT=0 - movsxd r9, dword ptr arg(3) ;src_pixels_per_line -%endif - -filter_block2d_bil_var_sse2_loop: - movq xmm1, QWORD PTR [rsi] ; - movq xmm3, QWORD PTR [rsi+1] ; - - punpcklbw xmm1, xmm0 ; - pmullw xmm1, [rax] ; - punpcklbw xmm3, xmm0 ; - pmullw xmm3, [rax+16] ; - - paddw xmm1, xmm3 ; - paddw xmm1, xmm4 ; - psraw xmm1, xmm_filter_shift ; - - movdqa xmm3, xmm5 ; - movdqa xmm5, xmm1 ; - - pmullw xmm3, [rdx] ; - pmullw xmm1, [rdx+16] ; - paddw xmm1, xmm3 ; - paddw xmm1, xmm4 ; - psraw xmm1, xmm_filter_shift ; - - movq xmm3, QWORD PTR [rdi] ; - punpcklbw xmm3, xmm0 ; - - psubw xmm1, xmm3 ; - paddw xmm6, xmm1 ; - - pmaddwd xmm1, xmm1 ; - paddd xmm7, xmm1 ; - - lea rsi, [rsi + rbx] ;ref_pixels_per_line -%if ABI_IS_32BIT - add rdi, dword ptr arg(3) ;src_pixels_per_line -%else - lea rdi, [rdi + r9] -%endif - - sub rcx, 1 ; - jnz filter_block2d_bil_var_sse2_loop ; - - jmp filter_block2d_bil_variance - -filter_block2d_bil_var_sse2_sp_only: - movsxd rdx, dword ptr arg(6) ; yoffset - - cmp rdx, 0 ; skip all if both xoffset=0 and yoffset=0 - je filter_block2d_bil_var_sse2_full_pixel - - shl rdx, 5 - lea rdx, [rdx + rcx] ; VFilter - - mov rsi, arg(0) ;ref_ptr - mov rdi, arg(2) ;src_ptr - movsxd rcx, dword ptr arg(4) ;Height - movsxd rax, dword ptr arg(1) ;ref_pixels_per_line - - pxor xmm0, xmm0 ; - movq xmm1, QWORD PTR [rsi] ; - punpcklbw xmm1, xmm0 ; - - movsxd rbx, dword ptr arg(3) ;src_pixels_per_line - lea rsi, [rsi + rax] - -filter_block2d_bil_sp_only_loop: - movq xmm3, QWORD PTR [rsi] ; - punpcklbw xmm3, xmm0 ; - movdqa xmm5, xmm3 - - pmullw xmm1, [rdx] ; - pmullw xmm3, [rdx+16] ; - paddw xmm1, xmm3 ; - paddw xmm1, xmm4 ; - psraw xmm1, xmm_filter_shift ; - - movq xmm3, QWORD PTR [rdi] ; - punpcklbw xmm3, xmm0 ; - - psubw xmm1, xmm3 ; - paddw xmm6, xmm1 ; - - pmaddwd xmm1, xmm1 ; - paddd xmm7, xmm1 ; - - movdqa xmm1, xmm5 ; - lea rsi, [rsi + rax] ;ref_pixels_per_line - lea rdi, [rdi + rbx] ;src_pixels_per_line - - sub rcx, 1 ; - jnz 
filter_block2d_bil_sp_only_loop ; - - jmp filter_block2d_bil_variance - -filter_block2d_bil_var_sse2_full_pixel: - mov rsi, arg(0) ;ref_ptr - mov rdi, arg(2) ;src_ptr - movsxd rcx, dword ptr arg(4) ;Height - movsxd rax, dword ptr arg(1) ;ref_pixels_per_line - movsxd rbx, dword ptr arg(3) ;src_pixels_per_line - pxor xmm0, xmm0 ; - -filter_block2d_bil_full_pixel_loop: - movq xmm1, QWORD PTR [rsi] ; - punpcklbw xmm1, xmm0 ; - - movq xmm2, QWORD PTR [rdi] ; - punpcklbw xmm2, xmm0 ; - - psubw xmm1, xmm2 ; - paddw xmm6, xmm1 ; - - pmaddwd xmm1, xmm1 ; - paddd xmm7, xmm1 ; - - lea rsi, [rsi + rax] ;ref_pixels_per_line - lea rdi, [rdi + rbx] ;src_pixels_per_line - - sub rcx, 1 ; - jnz filter_block2d_bil_full_pixel_loop ; - - jmp filter_block2d_bil_variance - -filter_block2d_bil_var_sse2_fp_only: - mov rsi, arg(0) ;ref_ptr - mov rdi, arg(2) ;src_ptr - movsxd rcx, dword ptr arg(4) ;Height - movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line - - pxor xmm0, xmm0 ; - movsxd rbx, dword ptr arg(3) ;src_pixels_per_line - -filter_block2d_bil_fp_only_loop: - movq xmm1, QWORD PTR [rsi] ; - movq xmm3, QWORD PTR [rsi+1] ; - - punpcklbw xmm1, xmm0 ; - pmullw xmm1, [rax] ; - punpcklbw xmm3, xmm0 ; - pmullw xmm3, [rax+16] ; - - paddw xmm1, xmm3 ; - paddw xmm1, xmm4 ; - psraw xmm1, xmm_filter_shift ; - - movq xmm3, QWORD PTR [rdi] ; - punpcklbw xmm3, xmm0 ; - - psubw xmm1, xmm3 ; - paddw xmm6, xmm1 ; - - pmaddwd xmm1, xmm1 ; - paddd xmm7, xmm1 ; - lea rsi, [rsi + rdx] - lea rdi, [rdi + rbx] ;src_pixels_per_line - - sub rcx, 1 ; - jnz filter_block2d_bil_fp_only_loop ; - - jmp filter_block2d_bil_variance - -filter_block2d_bil_variance: - movdq2q mm6, xmm6 ; - movdq2q mm7, xmm7 ; - - psrldq xmm6, 8 - psrldq xmm7, 8 - - movdq2q mm2, xmm6 - movdq2q mm3, xmm7 - - paddw mm6, mm2 - paddd mm7, mm3 - - pxor mm3, mm3 ; - pxor mm2, mm2 ; - - punpcklwd mm2, mm6 ; - punpckhwd mm3, mm6 ; - - paddd mm2, mm3 ; - movq mm6, mm2 ; - - psrlq mm6, 32 ; - paddd mm2, mm6 ; - - psrad mm2, 16 ; - movq mm4, mm7 ; - - psrlq mm4, 32 ; - paddd mm4, mm7 ; - - mov rsi, arg(7) ; sum - mov rdi, arg(8) ; sumsquared - - movd [rsi], mm2 ; xsum - movd [rdi], mm4 ; xxsum - - ; begin epilog - pop rbx - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - - ;void vp9_half_horiz_vert_variance16x_h_sse2 ;( ; unsigned char *ref_ptr, @@ -619,27 +335,3 @@ sym(vp9_half_horiz_variance16x_h_sse2): UNSHADOW_ARGS pop rbp ret - -SECTION_RODATA -; short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64}; -align 16 -xmm_bi_rd: - times 8 dw 64 -align 16 -bilinear_filters_sse2: - dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 - dw 120, 120, 120, 120, 120, 120, 120, 120, 8, 8, 8, 8, 8, 8, 8, 8 - dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 - dw 104, 104, 104, 104, 104, 104, 104, 104, 24, 24, 24, 24, 24, 24, 24, 24 - dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 - dw 88, 88, 88, 88, 88, 88, 88, 88, 40, 40, 40, 40, 40, 40, 40, 40 - dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 - dw 72, 72, 72, 72, 72, 72, 72, 72, 56, 56, 56, 56, 56, 56, 56, 56 - dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 - dw 56, 56, 56, 56, 56, 56, 56, 56, 72, 72, 72, 72, 72, 72, 72, 72 - dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 - dw 40, 40, 40, 40, 40, 40, 40, 40, 88, 88, 88, 88, 88, 88, 88, 88 - dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 - dw 24, 24, 24, 24, 24, 24, 24, 24, 104, 104, 104, 104, 104, 104, 104, 104 - dw 16, 
16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 - dw 8, 8, 8, 8, 8, 8, 8, 8, 120, 120, 120, 120, 120, 120, 120, 120 diff --git a/vp9/encoder/x86/vp9_variance_impl_mmx.asm b/vp9/encoder/x86/vp9_variance_impl_mmx.asm index 9f140c96b..d3dbefed8 100644 --- a/vp9/encoder/x86/vp9_variance_impl_mmx.asm +++ b/vp9/encoder/x86/vp9_variance_impl_mmx.asm @@ -508,344 +508,3 @@ sym(vp9_get4x4sse_cs_mmx): UNSHADOW_ARGS pop rbp ret - -%define mmx_filter_shift 7 - -;void vp9_filter_block2d_bil4x4_var_mmx -;( -; unsigned char *ref_ptr, -; int ref_pixels_per_line, -; unsigned char *src_ptr, -; int src_pixels_per_line, -; unsigned short *HFilter, -; unsigned short *VFilter, -; int *sum, -; unsigned int *sumsquared -;) -global sym(vp9_filter_block2d_bil4x4_var_mmx) PRIVATE -sym(vp9_filter_block2d_bil4x4_var_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 8 - GET_GOT rbx - push rsi - push rdi - sub rsp, 16 - ; end prolog - - - pxor mm6, mm6 ; - pxor mm7, mm7 ; - - mov rax, arg(4) ;HFilter ; - mov rdx, arg(5) ;VFilter ; - - mov rsi, arg(0) ;ref_ptr ; - mov rdi, arg(2) ;src_ptr ; - - mov rcx, 4 ; - pxor mm0, mm0 ; - - movd mm1, [rsi] ; - movd mm3, [rsi+1] ; - - punpcklbw mm1, mm0 ; - pmullw mm1, [rax] ; - - punpcklbw mm3, mm0 ; - pmullw mm3, [rax+8] ; - - paddw mm1, mm3 ; - paddw mm1, [GLOBAL(mmx_bi_rd)] ; - - psraw mm1, mmx_filter_shift ; - movq mm5, mm1 - -%if ABI_IS_32BIT - add rsi, dword ptr arg(1) ;ref_pixels_per_line ; -%else - movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ; - add rsi, r8 -%endif - -.filter_block2d_bil4x4_var_mmx_loop: - - movd mm1, [rsi] ; - movd mm3, [rsi+1] ; - - punpcklbw mm1, mm0 ; - pmullw mm1, [rax] ; - - punpcklbw mm3, mm0 ; - pmullw mm3, [rax+8] ; - - paddw mm1, mm3 ; - paddw mm1, [GLOBAL(mmx_bi_rd)] ; - - psraw mm1, mmx_filter_shift ; - movq mm3, mm5 ; - - movq mm5, mm1 ; - pmullw mm3, [rdx] ; - - pmullw mm1, [rdx+8] ; - paddw mm1, mm3 ; - - - paddw mm1, [GLOBAL(mmx_bi_rd)] ; - psraw mm1, mmx_filter_shift ; - - movd mm3, [rdi] ; - punpcklbw mm3, mm0 ; - - psubw mm1, mm3 ; - paddw mm6, mm1 ; - - pmaddwd mm1, mm1 ; - paddd mm7, mm1 ; - -%if ABI_IS_32BIT - add rsi, dword ptr arg(1) ;ref_pixels_per_line ; - add rdi, dword ptr arg(3) ;src_pixels_per_line ; -%else - movsxd r8, dword ptr arg(1) ;ref_pixels_per_line - movsxd r9, dword ptr arg(3) ;src_pixels_per_line - add rsi, r8 - add rdi, r9 -%endif - sub rcx, 1 ; - jnz .filter_block2d_bil4x4_var_mmx_loop ; - - - pxor mm3, mm3 ; - pxor mm2, mm2 ; - - punpcklwd mm2, mm6 ; - punpckhwd mm3, mm6 ; - - paddd mm2, mm3 ; - movq mm6, mm2 ; - - psrlq mm6, 32 ; - paddd mm2, mm6 ; - - psrad mm2, 16 ; - movq mm4, mm7 ; - - psrlq mm4, 32 ; - paddd mm4, mm7 ; - - mov rdi, arg(6) ;sum - mov rsi, arg(7) ;sumsquared - - movd dword ptr [rdi], mm2 ; - movd dword ptr [rsi], mm4 ; - - - - ; begin epilog - add rsp, 16 - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - - - -;void vp9_filter_block2d_bil_var_mmx -;( -; unsigned char *ref_ptr, -; int ref_pixels_per_line, -; unsigned char *src_ptr, -; int src_pixels_per_line, -; unsigned int Height, -; unsigned short *HFilter, -; unsigned short *VFilter, -; int *sum, -; unsigned int *sumsquared -;) -global sym(vp9_filter_block2d_bil_var_mmx) PRIVATE -sym(vp9_filter_block2d_bil_var_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 9 - GET_GOT rbx - push rsi - push rdi - sub rsp, 16 - ; end prolog - - pxor mm6, mm6 ; - pxor mm7, mm7 ; - mov rax, arg(5) ;HFilter ; - - mov rdx, arg(6) ;VFilter ; - mov rsi, arg(0) ;ref_ptr ; - - mov rdi, arg(2) ;src_ptr ; - movsxd 
rcx, dword ptr arg(4) ;Height ; - - pxor mm0, mm0 ; - movq mm1, [rsi] ; - - movq mm3, [rsi+1] ; - movq mm2, mm1 ; - - movq mm4, mm3 ; - punpcklbw mm1, mm0 ; - - punpckhbw mm2, mm0 ; - pmullw mm1, [rax] ; - - pmullw mm2, [rax] ; - punpcklbw mm3, mm0 ; - - punpckhbw mm4, mm0 ; - pmullw mm3, [rax+8] ; - - pmullw mm4, [rax+8] ; - paddw mm1, mm3 ; - - paddw mm2, mm4 ; - paddw mm1, [GLOBAL(mmx_bi_rd)] ; - - psraw mm1, mmx_filter_shift ; - paddw mm2, [GLOBAL(mmx_bi_rd)] ; - - psraw mm2, mmx_filter_shift ; - movq mm5, mm1 - - packuswb mm5, mm2 ; -%if ABI_IS_32BIT - add rsi, dword ptr arg(1) ;ref_pixels_per_line -%else - movsxd r8, dword ptr arg(1) ;ref_pixels_per_line - add rsi, r8 -%endif - -.filter_block2d_bil_var_mmx_loop: - - movq mm1, [rsi] ; - movq mm3, [rsi+1] ; - - movq mm2, mm1 ; - movq mm4, mm3 ; - - punpcklbw mm1, mm0 ; - punpckhbw mm2, mm0 ; - - pmullw mm1, [rax] ; - pmullw mm2, [rax] ; - - punpcklbw mm3, mm0 ; - punpckhbw mm4, mm0 ; - - pmullw mm3, [rax+8] ; - pmullw mm4, [rax+8] ; - - paddw mm1, mm3 ; - paddw mm2, mm4 ; - - paddw mm1, [GLOBAL(mmx_bi_rd)] ; - psraw mm1, mmx_filter_shift ; - - paddw mm2, [GLOBAL(mmx_bi_rd)] ; - psraw mm2, mmx_filter_shift ; - - movq mm3, mm5 ; - movq mm4, mm5 ; - - punpcklbw mm3, mm0 ; - punpckhbw mm4, mm0 ; - - movq mm5, mm1 ; - packuswb mm5, mm2 ; - - pmullw mm3, [rdx] ; - pmullw mm4, [rdx] ; - - pmullw mm1, [rdx+8] ; - pmullw mm2, [rdx+8] ; - - paddw mm1, mm3 ; - paddw mm2, mm4 ; - - paddw mm1, [GLOBAL(mmx_bi_rd)] ; - paddw mm2, [GLOBAL(mmx_bi_rd)] ; - - psraw mm1, mmx_filter_shift ; - psraw mm2, mmx_filter_shift ; - - movq mm3, [rdi] ; - movq mm4, mm3 ; - - punpcklbw mm3, mm0 ; - punpckhbw mm4, mm0 ; - - psubw mm1, mm3 ; - psubw mm2, mm4 ; - - paddw mm6, mm1 ; - pmaddwd mm1, mm1 ; - - paddw mm6, mm2 ; - pmaddwd mm2, mm2 ; - - paddd mm7, mm1 ; - paddd mm7, mm2 ; - -%if ABI_IS_32BIT - add rsi, dword ptr arg(1) ;ref_pixels_per_line ; - add rdi, dword ptr arg(3) ;src_pixels_per_line ; -%else - movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ; - movsxd r9, dword ptr arg(3) ;src_pixels_per_line ; - add rsi, r8 - add rdi, r9 -%endif - sub rcx, 1 ; - jnz .filter_block2d_bil_var_mmx_loop ; - - - pxor mm3, mm3 ; - pxor mm2, mm2 ; - - punpcklwd mm2, mm6 ; - punpckhwd mm3, mm6 ; - - paddd mm2, mm3 ; - movq mm6, mm2 ; - - psrlq mm6, 32 ; - paddd mm2, mm6 ; - - psrad mm2, 16 ; - movq mm4, mm7 ; - - psrlq mm4, 32 ; - paddd mm4, mm7 ; - - mov rdi, arg(7) ;sum - mov rsi, arg(8) ;sumsquared - - movd dword ptr [rdi], mm2 ; - movd dword ptr [rsi], mm4 ; - - ; begin epilog - add rsp, 16 - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -SECTION_RODATA -;short mmx_bi_rd[4] = { 64, 64, 64, 64}; -align 16 -mmx_bi_rd: - times 4 dw 64 diff --git a/vp9/encoder/x86/vp9_variance_impl_sse2.asm b/vp9/encoder/x86/vp9_variance_impl_sse2.asm index 896dd185d..2c5088134 100644 --- a/vp9/encoder/x86/vp9_variance_impl_sse2.asm +++ b/vp9/encoder/x86/vp9_variance_impl_sse2.asm @@ -11,8 +11,6 @@ %include "vpx_ports/x86_abi_support.asm" -%define xmm_filter_shift 7 - ;unsigned int vp9_get_mb_ss_sse2 ;( ; short *src_ptr @@ -734,28 +732,3 @@ sym(vp9_half_horiz_variance8x_h_sse2): UNSHADOW_ARGS pop rbp ret - - -SECTION_RODATA -; short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64}; -align 16 -xmm_bi_rd: - times 8 dw 64 -align 16 -bilinear_filters_sse2: - dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 - dw 120, 120, 120, 120, 120, 120, 120, 120, 8, 8, 8, 8, 8, 8, 8, 8 - dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 - dw 104, 
104, 104, 104, 104, 104, 104, 104, 24, 24, 24, 24, 24, 24, 24, 24 - dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 - dw 88, 88, 88, 88, 88, 88, 88, 88, 40, 40, 40, 40, 40, 40, 40, 40 - dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 - dw 72, 72, 72, 72, 72, 72, 72, 72, 56, 56, 56, 56, 56, 56, 56, 56 - dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 - dw 56, 56, 56, 56, 56, 56, 56, 56, 72, 72, 72, 72, 72, 72, 72, 72 - dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 - dw 40, 40, 40, 40, 40, 40, 40, 40, 88, 88, 88, 88, 88, 88, 88, 88 - dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 - dw 24, 24, 24, 24, 24, 24, 24, 24, 104, 104, 104, 104, 104, 104, 104, 104 - dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 - dw 8, 8, 8, 8, 8, 8, 8, 8, 120, 120, 120, 120, 120, 120, 120, 120 diff --git a/vp9/encoder/x86/vp9_variance_impl_ssse3.asm b/vp9/encoder/x86/vp9_variance_impl_ssse3.asm deleted file mode 100644 index 98a4a16f6..000000000 --- a/vp9/encoder/x86/vp9_variance_impl_ssse3.asm +++ /dev/null @@ -1,372 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -%define xmm_filter_shift 7 - - -;void vp9_filter_block2d_bil_var_ssse3 -;( -; unsigned char *ref_ptr, -; int ref_pixels_per_line, -; unsigned char *src_ptr, -; int src_pixels_per_line, -; unsigned int Height, -; int xoffset, -; int yoffset, -; int *sum, -; unsigned int *sumsquared;; -; -;) -;Note: The filter coefficient at offset=0 is 128. Since the second register -;for Pmaddubsw is signed bytes, we must calculate zero offset seperately. 
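The deleted SSSE3 path that follows leans on pmaddubsw, which multiplies unsigned bytes by signed bytes; the offset-0 coefficient of 128 does not fit in a signed byte (maximum +127), so that case is routed through the full-pixel loop instead of the filter, as the note above says. A minimal scalar sketch of the same two-tap step, assuming the 1/16-pel coefficient pairs (128 - 8*k, 8*k), the rounding value 64 and the shift of 7 from the table and defines removed here; the helper name is illustrative only, not part of the library:

/* One horizontal bilinear tap pair at 1/16-pel offset k (0..15).
 * Coefficients sum to 128; the result is rounded by 64 and shifted by 7,
 * matching xmm_bi_rd and xmm_filter_shift in the removed assembly.
 * k == 0 would need a coefficient of 128, which overflows a signed byte,
 * hence the separate full-pixel path in the SIMD code. */
static unsigned char bilinear_tap(unsigned char a, unsigned char b, int k) {
  const int f0 = 128 - 8 * k;
  const int f1 = 8 * k;
  return (unsigned char)((a * f0 + b * f1 + 64) >> 7);
}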
-global sym(vp9_filter_block2d_bil_var_ssse3) PRIVATE -sym(vp9_filter_block2d_bil_var_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 9 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - pxor xmm6, xmm6 - pxor xmm7, xmm7 - - lea rcx, [GLOBAL(bilinear_filters_ssse3)] - movsxd rax, dword ptr arg(5) ; xoffset - - cmp rax, 0 ; skip first_pass filter if xoffset=0 - je .filter_block2d_bil_var_ssse3_sp_only - - shl rax, 4 ; point to filter coeff with xoffset - lea rax, [rax + rcx] ; HFilter - - movsxd rdx, dword ptr arg(6) ; yoffset - - cmp rdx, 0 ; skip second_pass filter if yoffset=0 - je .filter_block2d_bil_var_ssse3_fp_only - - shl rdx, 4 - lea rdx, [rdx + rcx] ; VFilter - - mov rsi, arg(0) ;ref_ptr - mov rdi, arg(2) ;src_ptr - movsxd rcx, dword ptr arg(4) ;Height - - movdqu xmm0, XMMWORD PTR [rsi] - movdqu xmm1, XMMWORD PTR [rsi+1] - movdqa xmm2, xmm0 - - punpcklbw xmm0, xmm1 - punpckhbw xmm2, xmm1 - pmaddubsw xmm0, [rax] - pmaddubsw xmm2, [rax] - - paddw xmm0, [GLOBAL(xmm_bi_rd)] - paddw xmm2, [GLOBAL(xmm_bi_rd)] - psraw xmm0, xmm_filter_shift - psraw xmm2, xmm_filter_shift - - packuswb xmm0, xmm2 - -%if ABI_IS_32BIT - add rsi, dword ptr arg(1) ;ref_pixels_per_line -%else - movsxd r8, dword ptr arg(1) ;ref_pixels_per_line - movsxd r9, dword ptr arg(3) ;src_pixels_per_line - lea rsi, [rsi + r8] -%endif - -.filter_block2d_bil_var_ssse3_loop: - movdqu xmm1, XMMWORD PTR [rsi] - movdqu xmm2, XMMWORD PTR [rsi+1] - movdqa xmm3, xmm1 - - punpcklbw xmm1, xmm2 - punpckhbw xmm3, xmm2 - pmaddubsw xmm1, [rax] - pmaddubsw xmm3, [rax] - - paddw xmm1, [GLOBAL(xmm_bi_rd)] - paddw xmm3, [GLOBAL(xmm_bi_rd)] - psraw xmm1, xmm_filter_shift - psraw xmm3, xmm_filter_shift - packuswb xmm1, xmm3 - - movdqa xmm2, xmm0 - movdqa xmm0, xmm1 - movdqa xmm3, xmm2 - - punpcklbw xmm2, xmm1 - punpckhbw xmm3, xmm1 - pmaddubsw xmm2, [rdx] - pmaddubsw xmm3, [rdx] - - paddw xmm2, [GLOBAL(xmm_bi_rd)] - paddw xmm3, [GLOBAL(xmm_bi_rd)] - psraw xmm2, xmm_filter_shift - psraw xmm3, xmm_filter_shift - - movq xmm1, QWORD PTR [rdi] - pxor xmm4, xmm4 - punpcklbw xmm1, xmm4 - movq xmm5, QWORD PTR [rdi+8] - punpcklbw xmm5, xmm4 - - psubw xmm2, xmm1 - psubw xmm3, xmm5 - paddw xmm6, xmm2 - paddw xmm6, xmm3 - pmaddwd xmm2, xmm2 - pmaddwd xmm3, xmm3 - paddd xmm7, xmm2 - paddd xmm7, xmm3 - -%if ABI_IS_32BIT - add rsi, dword ptr arg(1) ;ref_pixels_per_line - add rdi, dword ptr arg(3) ;src_pixels_per_line -%else - lea rsi, [rsi + r8] - lea rdi, [rdi + r9] -%endif - - sub rcx, 1 - jnz .filter_block2d_bil_var_ssse3_loop - - jmp .filter_block2d_bil_variance - -.filter_block2d_bil_var_ssse3_sp_only: - movsxd rdx, dword ptr arg(6) ; yoffset - - cmp rdx, 0 ; Both xoffset =0 and yoffset=0 - je .filter_block2d_bil_var_ssse3_full_pixel - - shl rdx, 4 - lea rdx, [rdx + rcx] ; VFilter - - mov rsi, arg(0) ;ref_ptr - mov rdi, arg(2) ;src_ptr - movsxd rcx, dword ptr arg(4) ;Height - movsxd rax, dword ptr arg(1) ;ref_pixels_per_line - - movdqu xmm1, XMMWORD PTR [rsi] - movdqa xmm0, xmm1 - -%if ABI_IS_32BIT=0 - movsxd r9, dword ptr arg(3) ;src_pixels_per_line -%endif - - lea rsi, [rsi + rax] - -.filter_block2d_bil_sp_only_loop: - movdqu xmm3, XMMWORD PTR [rsi] - movdqa xmm2, xmm1 - movdqa xmm0, xmm3 - - punpcklbw xmm1, xmm3 - punpckhbw xmm2, xmm3 - pmaddubsw xmm1, [rdx] - pmaddubsw xmm2, [rdx] - - paddw xmm1, [GLOBAL(xmm_bi_rd)] - paddw xmm2, [GLOBAL(xmm_bi_rd)] - psraw xmm1, xmm_filter_shift - psraw xmm2, xmm_filter_shift - - movq xmm3, QWORD PTR [rdi] - pxor xmm4, xmm4 - punpcklbw xmm3, xmm4 - movq xmm5, QWORD PTR [rdi+8] - punpcklbw 
xmm5, xmm4 - - psubw xmm1, xmm3 - psubw xmm2, xmm5 - paddw xmm6, xmm1 - paddw xmm6, xmm2 - pmaddwd xmm1, xmm1 - pmaddwd xmm2, xmm2 - paddd xmm7, xmm1 - paddd xmm7, xmm2 - - movdqa xmm1, xmm0 - lea rsi, [rsi + rax] ;ref_pixels_per_line - -%if ABI_IS_32BIT - add rdi, dword ptr arg(3) ;src_pixels_per_line -%else - lea rdi, [rdi + r9] -%endif - - sub rcx, 1 - jnz .filter_block2d_bil_sp_only_loop - - jmp .filter_block2d_bil_variance - -.filter_block2d_bil_var_ssse3_full_pixel: - mov rsi, arg(0) ;ref_ptr - mov rdi, arg(2) ;src_ptr - movsxd rcx, dword ptr arg(4) ;Height - movsxd rax, dword ptr arg(1) ;ref_pixels_per_line - movsxd rdx, dword ptr arg(3) ;src_pixels_per_line - pxor xmm0, xmm0 - -.filter_block2d_bil_full_pixel_loop: - movq xmm1, QWORD PTR [rsi] - punpcklbw xmm1, xmm0 - movq xmm2, QWORD PTR [rsi+8] - punpcklbw xmm2, xmm0 - - movq xmm3, QWORD PTR [rdi] - punpcklbw xmm3, xmm0 - movq xmm4, QWORD PTR [rdi+8] - punpcklbw xmm4, xmm0 - - psubw xmm1, xmm3 - psubw xmm2, xmm4 - paddw xmm6, xmm1 - paddw xmm6, xmm2 - pmaddwd xmm1, xmm1 - pmaddwd xmm2, xmm2 - paddd xmm7, xmm1 - paddd xmm7, xmm2 - - lea rsi, [rsi + rax] ;ref_pixels_per_line - lea rdi, [rdi + rdx] ;src_pixels_per_line - sub rcx, 1 - jnz .filter_block2d_bil_full_pixel_loop - - jmp .filter_block2d_bil_variance - -.filter_block2d_bil_var_ssse3_fp_only: - mov rsi, arg(0) ;ref_ptr - mov rdi, arg(2) ;src_ptr - movsxd rcx, dword ptr arg(4) ;Height - movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line - - pxor xmm0, xmm0 - -%if ABI_IS_32BIT=0 - movsxd r9, dword ptr arg(3) ;src_pixels_per_line -%endif - -.filter_block2d_bil_fp_only_loop: - movdqu xmm1, XMMWORD PTR [rsi] - movdqu xmm2, XMMWORD PTR [rsi+1] - movdqa xmm3, xmm1 - - punpcklbw xmm1, xmm2 - punpckhbw xmm3, xmm2 - pmaddubsw xmm1, [rax] - pmaddubsw xmm3, [rax] - - paddw xmm1, [GLOBAL(xmm_bi_rd)] - paddw xmm3, [GLOBAL(xmm_bi_rd)] - psraw xmm1, xmm_filter_shift - psraw xmm3, xmm_filter_shift - - movq xmm2, XMMWORD PTR [rdi] - pxor xmm4, xmm4 - punpcklbw xmm2, xmm4 - movq xmm5, QWORD PTR [rdi+8] - punpcklbw xmm5, xmm4 - - psubw xmm1, xmm2 - psubw xmm3, xmm5 - paddw xmm6, xmm1 - paddw xmm6, xmm3 - pmaddwd xmm1, xmm1 - pmaddwd xmm3, xmm3 - paddd xmm7, xmm1 - paddd xmm7, xmm3 - - lea rsi, [rsi + rdx] -%if ABI_IS_32BIT - add rdi, dword ptr arg(3) ;src_pixels_per_line -%else - lea rdi, [rdi + r9] -%endif - - sub rcx, 1 - jnz .filter_block2d_bil_fp_only_loop - - jmp .filter_block2d_bil_variance - -.filter_block2d_bil_variance: - pxor xmm0, xmm0 - pxor xmm1, xmm1 - pxor xmm5, xmm5 - - punpcklwd xmm0, xmm6 - punpckhwd xmm1, xmm6 - psrad xmm0, 16 - psrad xmm1, 16 - paddd xmm0, xmm1 - movdqa xmm1, xmm0 - - movdqa xmm6, xmm7 - punpckldq xmm6, xmm5 - punpckhdq xmm7, xmm5 - paddd xmm6, xmm7 - - punpckldq xmm0, xmm5 - punpckhdq xmm1, xmm5 - paddd xmm0, xmm1 - - movdqa xmm7, xmm6 - movdqa xmm1, xmm0 - - psrldq xmm7, 8 - psrldq xmm1, 8 - - paddd xmm6, xmm7 - paddd xmm0, xmm1 - - mov rsi, arg(7) ;[Sum] - mov rdi, arg(8) ;[SSE] - - movd [rsi], xmm0 - movd [rdi], xmm6 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -SECTION_RODATA -align 16 -xmm_bi_rd: - times 8 dw 64 -align 16 -bilinear_filters_ssse3: - times 8 db 128, 0 - times 8 db 120, 8 - times 8 db 112, 16 - times 8 db 104, 24 - times 8 db 96, 32 - times 8 db 88, 40 - times 8 db 80, 48 - times 8 db 72, 56 - times 8 db 64, 64 - times 8 db 56, 72 - times 8 db 48, 80 - times 8 db 40, 88 - times 8 db 32, 96 - times 8 db 24, 104 - times 8 db 16, 112 - times 8 db 8, 120 diff --git 
a/vp9/encoder/x86/vp9_variance_mmx.c b/vp9/encoder/x86/vp9_variance_mmx.c index bad1cfa74..d1415606e 100644 --- a/vp9/encoder/x86/vp9_variance_mmx.c +++ b/vp9/encoder/x86/vp9_variance_mmx.c @@ -13,27 +13,6 @@ #include "vp9/common/vp9_pragmas.h" #include "vpx_ports/mem.h" -extern void filter_block1d_h6_mmx -( - const unsigned char *src_ptr, - unsigned short *output_ptr, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - short *vp7_filter -); -extern void filter_block1d_v6_mmx -( - const short *src_ptr, - unsigned char *output_ptr, - unsigned int pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - short *vp7_filter -); - extern unsigned int vp9_get_mb_ss_mmx(const short *src_ptr); extern unsigned int vp9_get8x8var_mmx ( @@ -53,30 +32,6 @@ extern unsigned int vp9_get4x4var_mmx unsigned int *SSE, int *Sum ); -extern void vp9_filter_block2d_bil4x4_var_mmx -( - const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - const short *HFilter, - const short *VFilter, - int *sum, - unsigned int *sumsquared -); -extern void vp9_filter_block2d_bil_var_mmx -( - const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - unsigned int Height, - const short *HFilter, - const short *VFilter, - int *sum, - unsigned int *sumsquared -); - unsigned int vp9_variance4x4_mmx( const unsigned char *src_ptr, @@ -190,193 +145,3 @@ unsigned int vp9_variance8x16_mmx( return (var - (((unsigned int)avg * avg) >> 7)); } - -DECLARE_ALIGNED(16, extern const short, vp9_bilinear_filters_mmx[16][8]); - -unsigned int vp9_sub_pixel_variance4x4_mmx -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse) - -{ - int xsum; - unsigned int xxsum; - vp9_filter_block2d_bil4x4_var_mmx( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, - vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset], - &xsum, &xxsum - ); - *sse = xxsum; - return (xxsum - (((unsigned int)xsum * xsum) >> 4)); -} - - -unsigned int vp9_sub_pixel_variance8x8_mmx -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) { - - int xsum; - unsigned int xxsum; - vp9_filter_block2d_bil_var_mmx( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset], - &xsum, &xxsum - ); - *sse = xxsum; - return (xxsum - (((unsigned int)xsum * xsum) >> 6)); -} - -unsigned int vp9_sub_pixel_variance16x16_mmx -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) { - - int xsum0, xsum1; - unsigned int xxsum0, xxsum1; - - vp9_filter_block2d_bil_var_mmx( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset], - &xsum0, &xxsum0 - ); - - vp9_filter_block2d_bil_var_mmx( - src_ptr + 8, src_pixels_per_line, - dst_ptr + 8, dst_pixels_per_line, 16, - vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset], - &xsum1, &xxsum1 - ); - - xsum0 += xsum1; - xxsum0 += xxsum1; - - *sse = xxsum0; - return (xxsum0 - 
(((unsigned int)xsum0 * xsum0) >> 8)); - - -} - -unsigned int vp9_sub_pixel_mse16x16_mmx( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) { - vp9_sub_pixel_variance16x16_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse); - return *sse; -} - -unsigned int vp9_sub_pixel_variance16x8_mmx -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) { - int xsum0, xsum1; - unsigned int xxsum0, xxsum1; - - - vp9_filter_block2d_bil_var_mmx( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset], - &xsum0, &xxsum0 - ); - - - vp9_filter_block2d_bil_var_mmx( - src_ptr + 8, src_pixels_per_line, - dst_ptr + 8, dst_pixels_per_line, 8, - vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset], - &xsum1, &xxsum1 - ); - - xsum0 += xsum1; - xxsum0 += xxsum1; - - *sse = xxsum0; - return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7)); -} - -unsigned int vp9_sub_pixel_variance8x16_mmx -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) { - int xsum; - unsigned int xxsum; - vp9_filter_block2d_bil_var_mmx( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset], - &xsum, &xxsum - ); - *sse = xxsum; - return (xxsum - (((unsigned int)xsum * xsum) >> 7)); -} - - -unsigned int vp9_variance_halfpixvar16x16_h_mmx( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - return vp9_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 8, 0, - ref_ptr, recon_stride, sse); -} - - -unsigned int vp9_variance_halfpixvar16x16_v_mmx( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - return vp9_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 0, 8, - ref_ptr, recon_stride, sse); -} - - -unsigned int vp9_variance_halfpixvar16x16_hv_mmx( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - return vp9_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 8, 8, - ref_ptr, recon_stride, sse); -} diff --git a/vp9/encoder/x86/vp9_variance_sse2.c b/vp9/encoder/x86/vp9_variance_sse2.c index 67ca9257c..68c805e23 100644 --- a/vp9/encoder/x86/vp9_variance_sse2.c +++ b/vp9/encoder/x86/vp9_variance_sse2.c @@ -9,29 +9,11 @@ */ #include "vpx_config.h" + #include "vp9/encoder/vp9_variance.h" #include "vp9/common/vp9_pragmas.h" #include "vpx_ports/mem.h" -#define HALFNDX 8 - -extern void filter_block1d_h6_mmx(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter); -extern void filter_block1d_v6_mmx(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter); -extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int 
pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter); -extern void filter_block1d8_v6_sse2(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter); - -extern void vp9_filter_block2d_bil4x4_var_mmx -( - const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - const short *HFilter, - const short *VFilter, - int *sum, - unsigned int *sumsquared -); - extern unsigned int vp9_get4x4var_mmx ( const unsigned char *src_ptr, @@ -64,18 +46,6 @@ unsigned int vp9_get8x8var_sse2 unsigned int *SSE, int *Sum ); -void vp9_filter_block2d_bil_var_sse2 -( - const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - unsigned int Height, - int xoffset, - int yoffset, - int *sum, - unsigned int *sumsquared -); void vp9_half_horiz_vert_variance8x_h_sse2 ( const unsigned char *ref_ptr, @@ -137,8 +107,6 @@ void vp9_half_vert_variance16x_h_sse2 unsigned int *sumsquared ); -DECLARE_ALIGNED(16, extern const short, vp9_bilinear_filters_mmx[16][8]); - typedef unsigned int (*get_var_sse2) ( const unsigned char *src_ptr, int source_stride, @@ -375,347 +343,89 @@ unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr, return (var - (((int64_t)avg * avg) >> 11)); } -unsigned int vp9_sub_pixel_variance4x4_wmt -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) { - int xsum; - unsigned int xxsum; - vp9_filter_block2d_bil4x4_var_mmx( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, - vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset], - &xsum, &xxsum - ); - *sse = xxsum; - return (xxsum - (((unsigned int)xsum * xsum) >> 4)); +#define DECLS(opt1, opt2) \ +int vp9_sub_pixel_variance4xh_##opt2(const uint8_t *src, \ + ptrdiff_t src_stride, \ + int x_offset, int y_offset, \ + const uint8_t *dst, \ + ptrdiff_t dst_stride, \ + int height, unsigned int *sse); \ +int vp9_sub_pixel_variance8xh_##opt1(const uint8_t *src, \ + ptrdiff_t src_stride, \ + int x_offset, int y_offset, \ + const uint8_t *dst, \ + ptrdiff_t dst_stride, \ + int height, unsigned int *sse); \ +int vp9_sub_pixel_variance16xh_##opt1(const uint8_t *src, \ + ptrdiff_t src_stride, \ + int x_offset, int y_offset, \ + const uint8_t *dst, \ + ptrdiff_t dst_stride, \ + int height, unsigned int *sse) + +DECLS(sse2, sse); +DECLS(ssse3, ssse3); +#undef DECLS + +#define FN(w, h, wf, wlog2, hlog2, opt, cast) \ +unsigned int vp9_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src, \ + int src_stride, \ + int x_offset, \ + int y_offset, \ + const uint8_t *dst, \ + int dst_stride, \ + unsigned int *sse_ptr) { \ + unsigned int sse; \ + int se = vp9_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \ + y_offset, dst, dst_stride, \ + h, &sse); \ + if (w > wf) { \ + unsigned int sse2; \ + int se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 16, src_stride, \ + x_offset, y_offset, \ + dst + 16, dst_stride, \ + h, &sse2); \ + se += se2; \ + sse += sse2; \ + if (w > wf * 2) { \ + se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \ + x_offset, y_offset, \ + dst + 32, dst_stride, \ + h, &sse2); \ + se += se2; \ + sse += sse2; \ + se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \ + x_offset, y_offset, \ + dst + 48, 
dst_stride, \ + h, &sse2); \ + se += se2; \ + sse += sse2; \ + } \ + } \ + *sse_ptr = sse; \ + return sse - ((cast se * se) >> (wlog2 + hlog2)); \ } +#define FNS(opt1, opt2) \ +FN(64, 64, 16, 6, 6, opt1, (int64_t)); \ +FN(64, 32, 16, 6, 5, opt1, (int64_t)); \ +FN(32, 64, 16, 5, 6, opt1, (int64_t)); \ +FN(32, 32, 16, 5, 5, opt1, (int64_t)); \ +FN(32, 16, 16, 5, 4, opt1, (int64_t)); \ +FN(16, 32, 16, 4, 5, opt1, (int64_t)); \ +FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \ +FN(16, 8, 16, 4, 3, opt1,); \ +FN(8, 16, 8, 3, 4, opt1,); \ +FN(8, 8, 8, 3, 3, opt1,); \ +FN(8, 4, 8, 3, 2, opt1,); \ +FN(4, 8, 4, 2, 3, opt2,); \ +FN(4, 4, 4, 2, 2, opt2,) -unsigned int vp9_sub_pixel_variance8x8_wmt -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) { - int xsum; - unsigned int xxsum; - - if (xoffset == HALFNDX && yoffset == 0) { - vp9_half_horiz_variance8x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - &xsum, &xxsum); - } else if (xoffset == 0 && yoffset == HALFNDX) { - vp9_half_vert_variance8x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - &xsum, &xxsum); - } else if (xoffset == HALFNDX && yoffset == HALFNDX) { - vp9_half_horiz_vert_variance8x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - &xsum, &xxsum); - } else { - vp9_filter_block2d_bil_var_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - xoffset, yoffset, - &xsum, &xxsum); - } - - *sse = xxsum; - return (xxsum - (((unsigned int)xsum * xsum) >> 6)); -} - -static void sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse, int *avg) { - int xsum0, xsum1; - unsigned int xxsum0, xxsum1; - - // note we could avoid these if statements if the calling function - // just called the appropriate functions inside. 
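In place of the hand-written per-size wrappers deleted in this file, the FN()/FNS() macros above stamp out one thin wrapper per block size from the wf-wide column kernels declared by DECLS() (the kernels themselves live in the vp9_subpel_variance.asm file added to vp9cx.mk at the end of this patch). As an illustration only, FN(32, 32, 16, 5, 5, sse2, (int64_t)) expands to roughly the following once the constant w > wf branches fold away:

unsigned int vp9_sub_pixel_variance32x32_sse2(const uint8_t *src, int src_stride,
                                              int x_offset, int y_offset,
                                              const uint8_t *dst, int dst_stride,
                                              unsigned int *sse_ptr) {
  unsigned int sse, sse2;
  /* Left 16-wide column over the full 32-row height... */
  int se = vp9_sub_pixel_variance16xh_sse2(src, src_stride, x_offset, y_offset,
                                           dst, dst_stride, 32, &sse);
  /* ...then the right column at +16, accumulating sum and SSE. */
  int se2 = vp9_sub_pixel_variance16xh_sse2(src + 16, src_stride,
                                            x_offset, y_offset,
                                            dst + 16, dst_stride, 32, &sse2);
  se += se2;
  sse += sse2;
  *sse_ptr = sse;
  /* variance = SSE - sum^2 / (32 * 32); the 64-bit cast guards se * se. */
  return sse - (((int64_t)se * se) >> (5 + 5));
}

The 64-wide sizes follow the same pattern with two further column calls at +32 and +48, and the 4-wide sizes go through the separate 4xh kernel selected by the second FNS() argument.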
- if (xoffset == HALFNDX && yoffset == 0) { - vp9_half_horiz_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - &xsum0, &xxsum0); - } else if (xoffset == 0 && yoffset == HALFNDX) { - vp9_half_vert_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - &xsum0, &xxsum0); - } else if (xoffset == HALFNDX && yoffset == HALFNDX) { - vp9_half_horiz_vert_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - &xsum0, &xxsum0); - } else { - vp9_filter_block2d_bil_var_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - xoffset, yoffset, - &xsum0, &xxsum0 - ); - - vp9_filter_block2d_bil_var_sse2( - src_ptr + 8, src_pixels_per_line, - dst_ptr + 8, dst_pixels_per_line, 16, - xoffset, yoffset, - &xsum1, &xxsum1 - ); - xsum0 += xsum1; - xxsum0 += xxsum1; - } - - *sse = xxsum0; - *avg = xsum0; -} - -unsigned int vp9_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse_ptr) { - int avg; - unsigned int sse; - - sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset, - yoffset, dst_ptr, dst_pixels_per_line, - &sse, &avg); - *sse_ptr = sse; - - return (sse - (((unsigned int) avg * avg) >> 8)); -} - -unsigned int vp9_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse_ptr) { - int avg0, avg1, avg2, avg3; - unsigned int sse0, sse1, sse2, sse3; - - sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset, - yoffset, dst_ptr, dst_pixels_per_line, - &sse0, &avg0); - sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset, - yoffset, dst_ptr + 16, dst_pixels_per_line, - &sse1, &avg1); - src_ptr += 16 * src_pixels_per_line; - dst_ptr += 16 * dst_pixels_per_line; - sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset, - yoffset, dst_ptr, dst_pixels_per_line, - &sse2, &avg2); - sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset, - yoffset, dst_ptr + 16, dst_pixels_per_line, - &sse3, &avg3); - sse0 += sse1 + sse2 + sse3; - avg0 += avg1 + avg2 + avg3; - *sse_ptr = sse0; - - return (sse0 - (((unsigned int) avg0 * avg0) >> 10)); -} - -unsigned int vp9_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse_ptr) { - int avg0, avg1, avg2, avg3, avg4; - unsigned int sse0, sse1, sse2, sse3, sse4; - - sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset, - yoffset, dst_ptr, dst_pixels_per_line, - &sse0, &avg0); - sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset, - yoffset, dst_ptr + 16, dst_pixels_per_line, - &sse1, &avg1); - sub_pixel_variance16x16_sse2(src_ptr + 32, src_pixels_per_line, xoffset, - yoffset, dst_ptr + 32, dst_pixels_per_line, - &sse2, &avg2); - sub_pixel_variance16x16_sse2(src_ptr + 48, src_pixels_per_line, xoffset, - yoffset, dst_ptr + 48, dst_pixels_per_line, - &sse3, &avg3); - src_ptr += 16 * src_pixels_per_line; - dst_ptr += 16 * dst_pixels_per_line; - avg0 += avg1 + avg2 + avg3; - sse0 += sse1 + sse2 + sse3; - sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset, - yoffset, dst_ptr, dst_pixels_per_line, - &sse1, &avg1); - sub_pixel_variance16x16_sse2(src_ptr + 16, 
src_pixels_per_line, xoffset, - yoffset, dst_ptr + 16, dst_pixels_per_line, - &sse2, &avg2); - sub_pixel_variance16x16_sse2(src_ptr + 32, src_pixels_per_line, xoffset, - yoffset, dst_ptr + 32, dst_pixels_per_line, - &sse3, &avg3); - sub_pixel_variance16x16_sse2(src_ptr + 48, src_pixels_per_line, xoffset, - yoffset, dst_ptr + 48, dst_pixels_per_line, - &sse4, &avg4); - src_ptr += 16 * src_pixels_per_line; - dst_ptr += 16 * dst_pixels_per_line; - avg0 += avg1 + avg2 + avg3 + avg4; - sse0 += sse1 + sse2 + sse3 + sse4; - sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset, - yoffset, dst_ptr, dst_pixels_per_line, - &sse1, &avg1); - sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset, - yoffset, dst_ptr + 16, dst_pixels_per_line, - &sse2, &avg2); - sub_pixel_variance16x16_sse2(src_ptr + 32, src_pixels_per_line, xoffset, - yoffset, dst_ptr + 32, dst_pixels_per_line, - &sse3, &avg3); - sub_pixel_variance16x16_sse2(src_ptr + 48, src_pixels_per_line, xoffset, - yoffset, dst_ptr + 48, dst_pixels_per_line, - &sse4, &avg4); - src_ptr += 16 * src_pixels_per_line; - dst_ptr += 16 * dst_pixels_per_line; - avg0 += avg1 + avg2 + avg3 + avg4; - sse0 += sse1 + sse2 + sse3 + sse4; - sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset, - yoffset, dst_ptr, dst_pixels_per_line, - &sse1, &avg1); - sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset, - yoffset, dst_ptr + 16, dst_pixels_per_line, - &sse2, &avg2); - sub_pixel_variance16x16_sse2(src_ptr + 32, src_pixels_per_line, xoffset, - yoffset, dst_ptr + 32, dst_pixels_per_line, - &sse3, &avg3); - sub_pixel_variance16x16_sse2(src_ptr + 48, src_pixels_per_line, xoffset, - yoffset, dst_ptr + 48, dst_pixels_per_line, - &sse4, &avg4); - avg0 += avg1 + avg2 + avg3 + avg4; - sse0 += sse1 + sse2 + sse3 + sse4; - *sse_ptr = sse0; - - return (sse0 - (((unsigned int) avg0 * avg0) >> 12)); -} - -unsigned int vp9_sub_pixel_mse16x16_sse2( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) { - vp9_sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset, - yoffset, dst_ptr, dst_pixels_per_line, sse); - return *sse; -} - -unsigned int vp9_sub_pixel_variance16x8_wmt -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse - -) { - int xsum0, xsum1; - unsigned int xxsum0, xxsum1; - - if (xoffset == HALFNDX && yoffset == 0) { - vp9_half_horiz_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - &xsum0, &xxsum0); - } else if (xoffset == 0 && yoffset == HALFNDX) { - vp9_half_vert_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - &xsum0, &xxsum0); - } else if (xoffset == HALFNDX && yoffset == HALFNDX) { - vp9_half_horiz_vert_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - &xsum0, &xxsum0); - } else { - vp9_filter_block2d_bil_var_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - xoffset, yoffset, - &xsum0, &xxsum0); - - vp9_filter_block2d_bil_var_sse2( - src_ptr + 8, src_pixels_per_line, - dst_ptr + 8, dst_pixels_per_line, 8, - xoffset, yoffset, - &xsum1, &xxsum1); - xsum0 += xsum1; - xxsum0 += xxsum1; - } - - *sse = xxsum0; - return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7)); -} - -unsigned int vp9_sub_pixel_variance8x16_wmt -( - 
const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) { - int xsum; - unsigned int xxsum; - - if (xoffset == HALFNDX && yoffset == 0) { - vp9_half_horiz_variance8x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - &xsum, &xxsum); - } else if (xoffset == 0 && yoffset == HALFNDX) { - vp9_half_vert_variance8x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - &xsum, &xxsum); - } else if (xoffset == HALFNDX && yoffset == HALFNDX) { - vp9_half_horiz_vert_variance8x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - &xsum, &xxsum); - } else { - vp9_filter_block2d_bil_var_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - xoffset, yoffset, - &xsum, &xxsum); - } - - *sse = xxsum; - return (xxsum - (((unsigned int)xsum * xsum) >> 7)); -} +FNS(sse2, sse); +FNS(ssse3, ssse3); +#undef FNS +#undef FN unsigned int vp9_variance_halfpixvar16x16_h_wmt( const unsigned char *src_ptr, diff --git a/vp9/encoder/x86/vp9_variance_ssse3.c b/vp9/encoder/x86/vp9_variance_ssse3.c deleted file mode 100644 index 882acad78..000000000 --- a/vp9/encoder/x86/vp9_variance_ssse3.c +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "vpx_config.h" -#include "vp9/encoder/vp9_variance.h" -#include "vp9/common/vp9_pragmas.h" -#include "vpx_ports/mem.h" - -#define HALFNDX 8 - -extern void vp9_half_horiz_vert_variance16x_h_sse2 -( - const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - unsigned int Height, - int *sum, - unsigned int *sumsquared -); -extern void vp9_half_horiz_variance16x_h_sse2 -( - const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - unsigned int Height, - int *sum, - unsigned int *sumsquared -); -extern void vp9_half_vert_variance16x_h_sse2 -( - const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - unsigned int Height, - int *sum, - unsigned int *sumsquared -); -extern void vp9_filter_block2d_bil_var_ssse3 -( - const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - unsigned int Height, - int xoffset, - int yoffset, - int *sum, - unsigned int *sumsquared -); - -unsigned int vp9_sub_pixel_variance16x16_ssse3 -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) { - int xsum0; - unsigned int xxsum0; - - // note we could avoid these if statements if the calling function - // just called the appropriate functions inside. 
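The cast argument in the FNS() list above is sized to the worst case of the summed pixel differences: |se| can reach 255*w*h, so se*se needs 64 bits for blocks of 512 pixels or more (32x16, 16x32 and up), just squeezes into a 32-bit unsigned at 16x16 (65280^2 is about 4.26e9), and fits in a plain int at 16x8 and below, which is why those rows pass (int64_t), (unsigned int), and nothing respectively. A small standalone check of those bounds, illustrative only and not part of the library:

#include <stdint.h>
#include <stdio.h>

int main(void) {
  /* Worst case: every pixel pair differs by the full 8-bit range (255). */
  const int sizes[][2] = { { 16, 8 }, { 16, 16 }, { 32, 16 }, { 64, 64 } };
  for (int i = 0; i < 4; ++i) {
    const int64_t se_max = 255LL * sizes[i][0] * sizes[i][1];
    const int64_t sq = se_max * se_max;
    printf("%2dx%-2d  max se^2 = %13lld  fits int:%d  fits uint32:%d\n",
           sizes[i][0], sizes[i][1], (long long)sq,
           sq <= INT32_MAX, sq <= (int64_t)UINT32_MAX);
  }
  return 0;
}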
- if (xoffset == HALFNDX && yoffset == 0) { - vp9_half_horiz_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - &xsum0, &xxsum0); - } else if (xoffset == 0 && yoffset == HALFNDX) { - vp9_half_vert_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - &xsum0, &xxsum0); - } else if (xoffset == HALFNDX && yoffset == HALFNDX) { - vp9_half_horiz_vert_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - &xsum0, &xxsum0); - } else { - vp9_filter_block2d_bil_var_ssse3( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - xoffset, yoffset, - &xsum0, &xxsum0); - } - - *sse = xxsum0; - return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8)); -} - -unsigned int vp9_sub_pixel_variance16x8_ssse3 -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse - -) { - int xsum0; - unsigned int xxsum0; - - if (xoffset == HALFNDX && yoffset == 0) { - vp9_half_horiz_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - &xsum0, &xxsum0); - } else if (xoffset == 0 && yoffset == HALFNDX) { - vp9_half_vert_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - &xsum0, &xxsum0); - } else if (xoffset == HALFNDX && yoffset == HALFNDX) { - vp9_half_horiz_vert_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - &xsum0, &xxsum0); - } else { - vp9_filter_block2d_bil_var_ssse3( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - xoffset, yoffset, - &xsum0, &xxsum0); - } - - *sse = xxsum0; - return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7)); -} diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index 4bed6c0d7..9ce91544f 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -85,13 +85,12 @@ VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_impl_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_fwalsh_sse2.asm +VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance_impl_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/vp9_sad_sse3.asm VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_sad_ssse3.asm -VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_variance_ssse3.c -VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_variance_impl_ssse3.asm VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/vp9_sad_sse4.asm VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_encodeopt.asm VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/vp9_ssim_opt.asm